00001
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <config.h>
00022
00023 #include "termgenerator_internal.h"
00024
00025 #include <xapian/document.h>
00026 #include <xapian/queryparser.h>
00027 #include <xapian/unicode.h>
00028
00029 #include "stringutils.h"
00030
00031 #include <algorithm>
00032 #include <string>
00033
00034 using namespace std;
00035
00036 namespace Xapian {
00037
00038
00039
/// Upper bound on the size (in bytes of UTF-8) of a term that index_text()
/// will add; anything longer is skipped rather than truncated.
static const unsigned int MAX_PROB_TERM_LENGTH = 64;
00041
00042
00043
00044
00045
00046 inline bool
00047 U_isupper(unsigned ch) {
00048 return (ch < 128 && C_isupper((unsigned char)ch));
00049 }
00050
00051 inline unsigned check_wordchar(unsigned ch) {
00052 if (Unicode::is_wordchar(ch)) return Unicode::tolower(ch);
00053 return 0;
00054 }
00055
00056 inline bool
00057 should_stem(const std::string & term)
00058 {
00059 const unsigned int SHOULD_STEM_MASK =
00060 (1 << Unicode::LOWERCASE_LETTER) |
00061 (1 << Unicode::TITLECASE_LETTER) |
00062 (1 << Unicode::MODIFIER_LETTER) |
00063 (1 << Unicode::OTHER_LETTER);
00064 Utf8Iterator u(term);
00065 return ((SHOULD_STEM_MASK >> Unicode::get_category(*u)) & 1);
00066 }
00067
/** Check whether @a ch may join two word characters into a single term.
 *
 *  @param ch  Unicode code point found between two word characters.
 *  @return    The character to append to the term, or 0 if @a ch is not an
 *             accepted infix.  The typographic quotes U+2019 and U+201B are
 *             normalised to an ASCII apostrophe.
 */
inline unsigned check_infix(unsigned ch) {
    // Accepted as-is: apostrophe, ampersand, U+00B7 MIDDLE DOT,
    // U+05F4 HEBREW PUNCTUATION GERSHAYIM and U+2027 HYPHENATION POINT.
    if (ch == '\'' || ch == '&' || ch == 0xb7 || ch == 0x5f4 || ch == 0x2027) {
        return ch;
    }
    // U+2019 RIGHT SINGLE QUOTATION MARK (also the typographic apostrophe)
    // and U+201B SINGLE HIGH-REVERSED-9 QUOTATION MARK both map to '\''.
    if (ch == 0x2019 || ch == 0x201b) return '\'';
    return 0;
}
00081
/** Check whether @a ch may join two digits into a single term.
 *
 *  index_text() uses this instead of check_infix() when the characters on
 *  both sides of @a ch are decimal digits.
 *
 *  @param ch  Unicode code point found between two digits.
 *  @return    @a ch unchanged if it is an accepted numeric infix, else 0.
 */
inline unsigned check_infix_digit(unsigned ch) {
    switch (ch) {
        case ',':
        case '.':
        case ';':
        case 0x037e: // GREEK QUESTION MARK
        case 0x0589: // ARMENIAN FULL STOP
        case 0x060D: // ARABIC DATE SEPARATOR
        case 0x07F8: // NKO COMMA
        case 0x2044: // FRACTION SLASH
        case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
        case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
        case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
            return ch;
    }
    return 0;
}
00100
00101 inline bool
00102 is_digit(unsigned ch) {
00103 return (Unicode::get_category(ch) == Unicode::DECIMAL_DIGIT_NUMBER);
00104 }
00105
/** Check whether @a ch may trail a term (so "c++" and "c#" stay intact).
 *
 *  @param ch  Unicode code point following the end of a term.
 *  @return    @a ch unchanged if it is '+' or '#', otherwise 0.
 */
inline unsigned check_suffix(unsigned ch) {
    if (ch == '+' || ch == '#') return ch;
    return 0;
}
00111
00112
// How index_text() treats terms that the stopper reports as stopwords.

// The stopper is never consulted (chosen when no stopper is set).
#define STOPWORDS_NONE 0
// Stopwords are dropped entirely: neither unstemmed nor stemmed form is added.
#define STOPWORDS_IGNORE 1
// Stopwords are added in unstemmed form only; no stemmed ("Z") term is added.
#define STOPWORDS_INDEX_UNSTEMMED_ONLY 2
00116
00117 void
00118 TermGenerator::Internal::index_text(Utf8Iterator itor, termcount weight,
00119 const string & prefix, bool with_positions)
00120 {
00121 int stop_mode = STOPWORDS_INDEX_UNSTEMMED_ONLY;
00122
00123 if (!stopper) stop_mode = STOPWORDS_NONE;
00124
00125 while (true) {
00126
00127 unsigned ch;
00128 while (true) {
00129 if (itor == Utf8Iterator()) return;
00130 ch = check_wordchar(*itor);
00131 if (ch) break;
00132 ++itor;
00133 }
00134
00135 string term;
00136
00137
00138 if (U_isupper(*itor)) {
00139 const Utf8Iterator end;
00140 Utf8Iterator p = itor;
00141 do {
00142 Unicode::append_utf8(term, Unicode::tolower(*p++));
00143 } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
00144
00145
00146 if (term.size() > 1) {
00147
00148
00149 if (p == end || !Unicode::is_wordchar(*p)) {
00150 itor = p;
00151 goto endofterm;
00152 }
00153 }
00154 term.resize(0);
00155 }
00156
00157 while (true) {
00158 unsigned prevch;
00159 do {
00160 Unicode::append_utf8(term, ch);
00161 prevch = ch;
00162 if (++itor == Utf8Iterator()) goto endofterm;
00163 ch = check_wordchar(*itor);
00164 } while (ch);
00165
00166 Utf8Iterator next(itor);
00167 ++next;
00168 if (next == Utf8Iterator()) break;
00169 unsigned nextch = check_wordchar(*next);
00170 if (!nextch) break;
00171 unsigned infix_ch = *itor;
00172 if (is_digit(prevch) && is_digit(*next)) {
00173 infix_ch = check_infix_digit(infix_ch);
00174 } else {
00175
00176 infix_ch = check_infix(infix_ch);
00177 }
00178 if (!infix_ch) break;
00179 Unicode::append_utf8(term, infix_ch);
00180 ch = nextch;
00181 itor = next;
00182 }
00183
00184 {
00185 size_t len = term.size();
00186 unsigned count = 0;
00187 while ((ch = check_suffix(*itor))) {
00188 if (++count > 3) {
00189 term.resize(len);
00190 break;
00191 }
00192 Unicode::append_utf8(term, ch);
00193 if (++itor == Utf8Iterator()) goto endofterm;
00194 }
00195 }
00196
00197 endofterm:
00198 if (term.size() > MAX_PROB_TERM_LENGTH) continue;
00199
00200 if (stop_mode == STOPWORDS_IGNORE && (*stopper)(term)) continue;
00201
00202 if (with_positions) {
00203 doc.add_posting(prefix + term, ++termpos, weight);
00204 } else {
00205 doc.add_term(prefix + term, weight);
00206 }
00207 if ((flags & FLAG_SPELLING) && prefix.empty()) db.add_spelling(term);
00208
00209 if (!stemmer.internal.get()) continue;
00210
00211 if (stop_mode == STOPWORDS_INDEX_UNSTEMMED_ONLY && (*stopper)(term))
00212 continue;
00213
00214
00215
00216 if (!should_stem(term)) continue;
00217
00218
00219 string stem("Z");
00220 stem += prefix;
00221 stem += stemmer(term);
00222 doc.add_term(stem, weight);
00223 }
00224 }
00225
00226 }