Xapian: Internal Source Documentation: xapian-core: unicode/utf8itor.cc Source File

00001 /* utf8itor.cc: iterate over a utf8 string.
00002  *
00003  * Copyright (C) 2006,2007 Olly Betts
00004  *
00005  * This program is free software; you can redistribute it and/or modify
00006  * it under the terms of the GNU General Public License as published by
00007  * the Free Software Foundation; either version 2 of the License, or
00008  * (at your option) any later version.
00009  *
00010  * This program is distributed in the hope that it will be useful,
00011  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00012  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013  * GNU General Public License for more details.
00014  *
00015  * You should have received a copy of the GNU General Public License
00016  * along with this program; if not, write to the Free Software
00017  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
00018  */
00019 
00020 #include <config.h>
00021 
00022 #include <xapian/unicode.h>
00023 
00024 #include <string.h>
00025 
// Return true if ch is NOT a UTF-8 continuation byte, i.e. its top two bits
// are anything other than binary 10.
inline bool bad_cont(unsigned char ch)
{
    return (ch >> 6) != 0x2;
}
00027 
00028 namespace Xapian {
00029 
00030 namespace Unicode {
00031 
// Encode the code point ch as UTF-8 into buf and return the number of bytes
// written (2, 3 or 4), or 0 if ch is above 0x10ffff (in which case buf is
// left untouched).
//
// buf should be at least 4 bytes.  No terminating zero byte is written.
//
// Values below 0x80 are not special-cased: they fall into the 2 byte branch,
// so callers are presumably expected to handle ASCII themselves.
unsigned
nonascii_to_utf8(unsigned ch, char * buf)
{
    // The explicit casts make the narrowing to char deliberate - every byte
    // stored here has its top bit set.
    if (ch < 0x800) {
        buf[0] = static_cast<char>(0xc0 | (ch >> 6));
        buf[1] = static_cast<char>(0x80 | (ch & 0x3f));
        return 2;
    }
    if (ch < 0x10000) {
        buf[0] = static_cast<char>(0xe0 | (ch >> 12));
        buf[1] = static_cast<char>(0x80 | ((ch >> 6) & 0x3f));
        buf[2] = static_cast<char>(0x80 | (ch & 0x3f));
        return 3;
    }
    if (ch <= 0x10ffff) {
        buf[0] = static_cast<char>(0xf0 | (ch >> 18));
        buf[1] = static_cast<char>(0x80 | ((ch >> 12) & 0x3f));
        buf[2] = static_cast<char>(0x80 | ((ch >> 6) & 0x3f));
        buf[3] = static_cast<char>(0x80 | (ch & 0x3f));
        return 4;
    }
    // Unicode doesn't specify any characters above 0x10ffff.
    // Should we be presented with such a numeric character
    // entity or similar, we just replace it with nothing.
    return 0;
}
00059 
00060 }
00061 
00062 Utf8Iterator::Utf8Iterator(const char *p_)
00063 {
00064     assign(p_, strlen(p_));
00065 }
00066 
00067 void
00068 Utf8Iterator::calculate_sequence_length() const
00069 {
00070     // Handle invalid UTF-8, overlong sequences, and truncated sequences as
00071     // if the text was actually in ISO-8859-1 since we need to do something
00072     // with it, and this seems the most likely reason why we'd have invalid
00073     // UTF-8.
00074 
00075     unsigned char ch = *p;
00076 
00077     seqlen = 1;
00078     // Single byte encoding (0x00-0x7f) or overlong sequence (0x80-0xc1).
00079     //
00080     // (0xc0 and 0xc1 would start 2 byte sequences for characters which are
00081     // representable in a single byte, and we should not decode these.)
00082     if (ch < 0xc2) return;
00083 
00084     if (ch < 0xe0) {
00085         if (p + 1 == end || // Not enough bytes
00086             (p[1] & 0xc0) != 0x80) // Overlong encoding
00087             return;
00088         seqlen = 2;
00089         return;
00090     }
00091     if (ch < 0xf0) {
00092         if (end - p < 3 || // Not enough bytes
00093             bad_cont(p[1]) || bad_cont(p[2]) || // Invalid
00094             (p[0] == 0xe0 && p[1] < 0xa0)) // Overlong encoding
00095             return;
00096         seqlen = 3;
00097         return;
00098     }
00099     if (ch >= 0xf5 || // Code value above Unicode
00100         end - p < 4 || // Not enough bytes
00101         bad_cont(p[1]) || bad_cont(p[2]) || bad_cont(p[3]) || // Invalid
00102         (p[0] == 0xf0 && p[1] < 0x90) || // Overlong encoding
00103         (p[0] == 0xf4 && p[1] >= 0x90)) // Code value above Unicode
00104         return;
00105     seqlen = 4;
00106     return;
00107 }
00108 
00109 unsigned Utf8Iterator::operator*() const {
00110     if (p == NULL) return unsigned(-1);
00111     if (seqlen == 0) calculate_sequence_length();
00112     unsigned char ch = *p;
00113     if (seqlen == 1) return ch;
00114     if (seqlen == 2) return ((ch & 0x1f) << 6) | (p[1] & 0x3f);
00115     if (seqlen == 3)
00116         return ((ch & 0x0f) << 12) | ((p[1] & 0x3f) << 6) | (p[2] & 0x3f);
00117     return ((ch & 0x07) << 18) | ((p[1] & 0x3f) << 12) |
00118             ((p[2] & 0x3f) << 6) | (p[3] & 0x3f);
00119 }
00120 
00121 }