include/xapian/unicode.h

Go to the documentation of this file.
00001 
00004 /* Copyright (C) 2006,2007,2008 Olly Betts
00005  *
00006  * This program is free software; you can redistribute it and/or modify
00007  * it under the terms of the GNU General Public License as published by
00008  * the Free Software Foundation; either version 2 of the License, or
00009  * (at your option) any later version.
00010  *
00011  * This program is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  * GNU General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU General Public License
00017  * along with this program; if not, write to the Free Software
00018  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
00019  */
00020 
00021 #ifndef XAPIAN_INCLUDED_UNICODE_H
00022 #define XAPIAN_INCLUDED_UNICODE_H
00023 
00024 #include <xapian/visibility.h>
00025 
00026 #include <string>
00027 
00028 namespace Xapian {
00029 
00033 class XAPIAN_VISIBILITY_DEFAULT Utf8Iterator {
00034     const unsigned char *p;
00035     const unsigned char *end;
00036     mutable unsigned seqlen;
00037 
00038     void calculate_sequence_length() const;
00039 
00040     unsigned get_char() const;
00041 
00042     Utf8Iterator(const unsigned char *p_, const unsigned char *end_, unsigned seqlen_)
00043         : p(p_), end(end_), seqlen(seqlen_) { }
00044 
00045   public:
00047     const char * raw() const {
00048         return reinterpret_cast<const char *>(p ? p : end);
00049     }
00050 
00052     size_t left() const { return p ? end - p : 0; }
00053 
00065     void assign(const char *p_, size_t len) {
00066         if (len) {
00067             p = reinterpret_cast<const unsigned char*>(p_);
00068             end = p + len;
00069             seqlen = 0;
00070         } else {
00071             p = NULL;
00072         }
00073     }
00074 
00085     void assign(const std::string &s) { assign(s.data(), s.size()); }
00086 
00095     explicit Utf8Iterator(const char *p_);
00096 
00107     Utf8Iterator(const char *p_, size_t len) { assign(p_, len); }
00108 
00118     Utf8Iterator(const std::string &s) { assign(s.data(), s.size()); }
00119 
00125     Utf8Iterator() : p(NULL), end(0), seqlen(0) { }
00126 
00131     unsigned operator*() const;
00132 
00137     Utf8Iterator operator++(int) {
00138         // If we've not calculated seqlen yet, do so.
00139         if (seqlen == 0) calculate_sequence_length();
00140         const unsigned char *old_p = p;
00141         unsigned old_seqlen = seqlen;
00142         p += seqlen;
00143         if (p == end) p = NULL;
00144         seqlen = 0;
00145         return Utf8Iterator(old_p, end, old_seqlen);
00146     }
00147 
00152     Utf8Iterator & operator++() {
00153         if (seqlen == 0) calculate_sequence_length();
00154         p += seqlen;
00155         if (p == end) p = NULL;
00156         seqlen = 0;
00157         return *this;
00158     }
00159 
00164     bool operator==(const Utf8Iterator &other) const { return p == other.p; }
00165 
00170     bool operator!=(const Utf8Iterator &other) const { return p != other.p; }
00171 
00173 
00174     typedef std::input_iterator_tag iterator_category;
00175     typedef unsigned value_type;
00176     typedef size_t difference_type;
00177     typedef const unsigned * pointer;
00178     typedef const unsigned & reference;
00180 };
00181 
00182 namespace Unicode {
00183 
/** Character categories, named after the Unicode general categories.
 *
 *  The numeric values matter: they are what is stored in the low five bits
 *  of the packed character info word, and they are used as bit positions
 *  when building category masks, so enumerators must not be reordered.
 */
typedef enum {
    UNASSIGNED,                 /**< Unassigned (Cn) (value 0) */
    UPPERCASE_LETTER,           /**< Uppercase letter (Lu) (value 1) */
    LOWERCASE_LETTER,           /**< Lowercase letter (Ll) (value 2) */
    TITLECASE_LETTER,           /**< Titlecase letter (Lt) (value 3) */
    MODIFIER_LETTER,            /**< Modifier letter (Lm) (value 4) */
    OTHER_LETTER,               /**< Other letter (Lo) (value 5) */
    NON_SPACING_MARK,           /**< Non-spacing mark (Mn) (value 6) */
    ENCLOSING_MARK,             /**< Enclosing mark (Me) (value 7) */
    COMBINING_SPACING_MARK,     /**< Combining spacing mark (Mc) (value 8) */
    DECIMAL_DIGIT_NUMBER,       /**< Decimal digit number (Nd) (value 9) */
    LETTER_NUMBER,              /**< Letter number (Nl) (value 10) */
    OTHER_NUMBER,               /**< Other number (No) (value 11) */
    SPACE_SEPARATOR,            /**< Space separator (Zs) (value 12) */
    LINE_SEPARATOR,             /**< Line separator (Zl) (value 13) */
    PARAGRAPH_SEPARATOR,        /**< Paragraph separator (Zp) (value 14) */
    CONTROL,                    /**< Control (Cc) (value 15) */
    FORMAT,                     /**< Format (Cf) (value 16) */
    PRIVATE_USE,                /**< Private use (Co) (value 17) */
    SURROGATE,                  /**< Surrogate (Cs) (value 18) */
    CONNECTOR_PUNCTUATION,      /**< Connector punctuation (Pc) (value 19) */
    DASH_PUNCTUATION,           /**< Dash punctuation (Pd) (value 20) */
    OPEN_PUNCTUATION,           /**< Open punctuation (Ps) (value 21) */
    CLOSE_PUNCTUATION,          /**< Close punctuation (Pe) (value 22) */
    INITIAL_QUOTE_PUNCTUATION,  /**< Initial quote punctuation (Pi) (value 23) */
    FINAL_QUOTE_PUNCTUATION,    /**< Final quote punctuation (Pf) (value 24) */
    OTHER_PUNCTUATION,          /**< Other punctuation (Po) (value 25) */
    MATH_SYMBOL,                /**< Math symbol (Sm) (value 26) */
    CURRENCY_SYMBOL,            /**< Currency symbol (Sc) (value 27) */
    MODIFIER_SYMBOL,            /**< Modifier symbol (Sk) (value 28) */
    OTHER_SYMBOL                /**< Other symbol (So) (value 29) */
} category;
00217 
00218 namespace Internal {
00224     XAPIAN_VISIBILITY_DEFAULT
00225     int get_character_info(unsigned ch);
00226 
00230     inline int get_case_type(int info) { return ((info & 0xe0) >> 5); }
00231 
00233     inline category get_category(int info) { return static_cast<category>(info & 0x1f); }
00234 
00238     inline int get_delta(int info) {
00239         /* It's implementation defined if sign extension happens on right shift
00240          * of a signed int, hence the conditional (hopefully the compiler will
00241          * spot this and optimise it to a sign-extending shift on architectures
00242          * with a suitable instruction).
00243          */
00244         return (info >= 0) ? (info >> 15) : (~(~info >> 15));
00245     }
00246 }
00247 
00257 XAPIAN_VISIBILITY_DEFAULT
00258 unsigned nonascii_to_utf8(unsigned ch, char * buf);
00259 
00267 inline unsigned to_utf8(unsigned ch, char *buf) {
00268     if (ch < 128) {
00269         *buf = static_cast<unsigned char>(ch);
00270         return 1;
00271     }
00272     return Xapian::Unicode::nonascii_to_utf8(ch, buf);
00273 }
00274 
00278 inline void append_utf8(std::string &s, unsigned ch) {
00279     char buf[4];
00280     s.append(buf, to_utf8(ch, buf));
00281 }
00282 
00284 inline category get_category(unsigned ch) {
00285     // Categorise non-Unicode values as UNASSIGNED.
00286     if (ch >= 0x110000) return Xapian::Unicode::UNASSIGNED;
00287     return Internal::get_category(Internal::get_character_info(ch));
00288 }
00289 
00291 inline bool is_wordchar(unsigned ch) {
00292     const unsigned int WORDCHAR_MASK =
00293             (1 << Xapian::Unicode::UPPERCASE_LETTER) |
00294             (1 << Xapian::Unicode::LOWERCASE_LETTER) |
00295             (1 << Xapian::Unicode::TITLECASE_LETTER) |
00296             (1 << Xapian::Unicode::MODIFIER_LETTER) |
00297             (1 << Xapian::Unicode::OTHER_LETTER) |
00298             (1 << Xapian::Unicode::DECIMAL_DIGIT_NUMBER) |
00299             (1 << Xapian::Unicode::LETTER_NUMBER) |
00300             (1 << Xapian::Unicode::OTHER_NUMBER) |
00301             (1 << Xapian::Unicode::CONNECTOR_PUNCTUATION);
00302     return ((WORDCHAR_MASK >> get_category(ch)) & 1);
00303 }
00304 
00306 inline bool is_whitespace(unsigned ch) {
00307     const unsigned int WHITESPACE_MASK =
00308             (1 << Xapian::Unicode::CONTROL) | // For TAB, CR, LF, FF.
00309             (1 << Xapian::Unicode::SPACE_SEPARATOR) |
00310             (1 << Xapian::Unicode::LINE_SEPARATOR) |
00311             (1 << Xapian::Unicode::PARAGRAPH_SEPARATOR);
00312     return ((WHITESPACE_MASK >> get_category(ch)) & 1);
00313 }
00314 
00316 inline bool is_currency(unsigned ch) {
00317     return (get_category(ch) == Xapian::Unicode::CURRENCY_SYMBOL);
00318 }
00319 
00321 inline unsigned tolower(unsigned ch) {
00322     int info;
00323     // Leave non-Unicode values unchanged.
00324     if (ch >= 0x110000 || !(Internal::get_case_type((info = Xapian::Unicode::Internal::get_character_info(ch))) & 2))
00325         return ch;
00326     return ch + Internal::get_delta(info);
00327 }
00328 
00330 inline unsigned toupper(unsigned ch) {
00331     int info;
00332     // Leave non-Unicode values unchanged.
00333     if (ch >= 0x110000 || !(Internal::get_case_type((info = Xapian::Unicode::Internal::get_character_info(ch))) & 4))
00334         return ch;
00335     return ch - Internal::get_delta(info);
00336 }
00337 
00339 inline std::string
00340 tolower(const std::string &term)
00341 {
00342     std::string result;
00343     result.reserve(term.size());
00344     for (Utf8Iterator i(term); i != Utf8Iterator(); ++i) {
00345         append_utf8(result, tolower(*i));
00346     }
00347     return result;
00348 }
00349 
00351 inline std::string
00352 toupper(const std::string &term)
00353 {
00354     std::string result;
00355     result.reserve(term.size());
00356     for (Utf8Iterator i(term); i != Utf8Iterator(); ++i) {
00357         append_utf8(result, toupper(*i));
00358     }
00359     return result;
00360 }
00361 
00362 }
00363 
00364 }
00365 
00366 #endif // XAPIAN_INCLUDED_UNICODE_H

Documentation for Xapian (version 1.0.10).
Generated on 23 Dec 2008 by Doxygen 1.5.2.