00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <config.h>
00021
00022 #include <xapian/unicode.h>
00023
00024 #include <string.h>
00025
/** Test whether a byte fails to be a UTF-8 continuation byte.
 *
 *  A continuation byte has the bit pattern 10xxxxxx, i.e. its top two bits
 *  are exactly binary 10.
 *
 *  @param ch  The byte to examine.
 *
 *  @return  true if @a ch is NOT of the form 10xxxxxx, false if it is.
 */
inline bool bad_cont(unsigned char ch)
{
    // Discard the six payload bits and compare what's left with binary 10.
    return (ch >> 6) != 2;
}
00027
00028 namespace Xapian {
00029
00030 namespace Unicode {
00031
00032
/** Encode a non-ASCII code point as a UTF-8 byte sequence.
 *
 *  @param ch   The code point to encode.  As the name says, this is meant
 *		for values of 0x80 and above - smaller values are not given a
 *		single byte form here (they would come out as two bytes).
 *  @param buf  Where to write the encoded bytes.  Up to 4 bytes are written
 *		and no terminating nul is added.
 *
 *  @return  The number of bytes written to @a buf (2, 3 or 4), or 0 if
 *	     @a ch is above 0x10ffff, in which case @a buf is left untouched.
 */
unsigned
nonascii_to_utf8(unsigned ch, char * buf)
{
    if (ch < 0x800) {
	// 110xxxxx 10xxxxxx
	buf[0] = char(0xc0 | (ch >> 6));
	buf[1] = char(0x80 | (ch & 0x3f));
	return 2;
    }
    if (ch < 0x10000) {
	// 1110xxxx 10xxxxxx 10xxxxxx
	buf[0] = char(0xe0 | (ch >> 12));
	buf[1] = char(0x80 | ((ch >> 6) & 0x3f));
	buf[2] = char(0x80 | (ch & 0x3f));
	return 3;
    }
    // UTF-8 stops at U+10FFFF.  Accepting everything which fits in 21 bits
    // would emit sequences starting 0xf4 0x90 or with a lead byte of 0xf5 or
    // above, which are not valid UTF-8 and which Utf8Iterator refuses to
    // decode as a single character.
    if (ch < 0x110000) {
	// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	buf[0] = char(0xf0 | (ch >> 18));
	buf[1] = char(0x80 | ((ch >> 12) & 0x3f));
	buf[2] = char(0x80 | ((ch >> 6) & 0x3f));
	buf[3] = char(0x80 | (ch & 0x3f));
	return 4;
    }

    // Not a Unicode code point, so there's nothing we can sensibly emit.
    return 0;
}
00059
00060 }
00061
00062 Utf8Iterator::Utf8Iterator(const char *p_)
00063 {
00064 assign(p_, strlen(p_));
00065 }
00066
00067 void
00068 Utf8Iterator::calculate_sequence_length() const
00069 {
00070
00071
00072
00073
00074
00075 unsigned char ch = *p;
00076
00077 seqlen = 1;
00078
00079
00080
00081
00082 if (ch < 0xc2) return;
00083
00084 if (ch < 0xe0) {
00085 if (p + 1 == end ||
00086 (p[1] & 0xc0) != 0x80)
00087 return;
00088 seqlen = 2;
00089 return;
00090 }
00091 if (ch < 0xf0) {
00092 if (end - p < 3 ||
00093 bad_cont(p[1]) || bad_cont(p[2]) ||
00094 (p[0] == 0xe0 && p[1] < 0xa0))
00095 return;
00096 seqlen = 3;
00097 return;
00098 }
00099 if (ch >= 0xf5 ||
00100 end - p < 4 ||
00101 bad_cont(p[1]) || bad_cont(p[2]) || bad_cont(p[3]) ||
00102 (p[0] == 0xf0 && p[1] < 0x90) ||
00103 (p[0] == 0xf4 && p[1] >= 0x90))
00104 return;
00105 seqlen = 4;
00106 return;
00107 }
00108
00109 unsigned Utf8Iterator::operator*() const {
00110 if (p == NULL) return unsigned(-1);
00111 if (seqlen == 0) calculate_sequence_length();
00112 unsigned char ch = *p;
00113 if (seqlen == 1) return ch;
00114 if (seqlen == 2) return ((ch & 0x1f) << 6) | (p[1] & 0x3f);
00115 if (seqlen == 3)
00116 return ((ch & 0x0f) << 12) | ((p[1] & 0x3f) << 6) | (p[2] & 0x3f);
00117 return ((ch & 0x07) << 18) | ((p[1] & 0x3f) << 12) |
00118 ((p[2] & 0x3f) << 6) | (p[3] & 0x3f);
00119 }
00120
00121 }