00001
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054 #include <config.h>
00055
00056 #include <xapian/error.h>
00057
00058 #include "omassert.h"
00059 #include "steminternal.h"
00060
00061 #if 0
00062 #include <stdio.h>
00063 #endif
00064 #include <stdlib.h>
00065 #include <string.h>
00066
00067 #include <string>
00068
00069 using namespace std;
00070
00071 #define CREATE_SIZE 16
00072
00073 extern symbol * create_s() {
00074 void * mem = malloc(HEAD + (CREATE_SIZE + 1) * sizeof(symbol));
00075 if (mem == NULL) throw std::bad_alloc();
00076 symbol * p = (symbol *) (HEAD + (char *) mem);
00077 SET_CAPACITY(p, CREATE_SIZE);
00078 SET_SIZE(p, CREATE_SIZE);
00079 return p;
00080 }
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090 extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) {
00091 if (n >= 0) {
00092 for (; n > 0; n--) {
00093 if (c >= l) return -1;
00094 if (p[c++] >= 0xC0) {
00095 while (c < l) {
00096
00097 if (p[c] >> 6 != 2) break;
00098 c++;
00099 }
00100 }
00101 }
00102 } else {
00103 for (; n < 0; n++) {
00104 if (c <= lb) return -1;
00105 if (p[--c] >= 0x80) {
00106 while (c > lb) {
00107 if (p[c] >= 0xC0) break;
00108 c--;
00109 }
00110 }
00111 }
00112 }
00113 return c;
00114 }
00115
00116
00117
00118
00119
00120 static symbol * increase_size(symbol * p, int n) {
00121 int new_size = n + 20;
00122 void * mem = realloc((char *) p - HEAD,
00123 HEAD + (new_size + 1) * sizeof(symbol));
00124 if (mem == NULL) {
00125 throw std::bad_alloc();
00126 }
00127 symbol * q = (symbol *) (HEAD + (char *)mem);
00128 SET_CAPACITY(q, new_size);
00129 return q;
00130 }
00131
00132 namespace Xapian {
00133
00134 Stem::Internal::Internal()
00135 : p(create_s()), c(0), l(0), lb(0), bra(0), ket(0)
00136 {
00137 }
00138
00139 Stem::Internal::~Internal()
00140 {
00141 lose_s(p);
00142 }
00143
00144 string
00145 Stem::Internal::operator()(const string & word)
00146 {
00147 const symbol * s = reinterpret_cast<const symbol *>(word.data());
00148 replace_s(0, l, word.size(), s);
00149 c = 0;
00150 if (stem() < 0) {
00151
00152 throw Xapian::InternalError("stemming exception!");
00153 }
00154 return string(reinterpret_cast<const char *>(p), l);
00155 }
00156
00157
00158
00159 int Stem::Internal::get_utf8(int * slot) {
00160 int b0, b1;
00161 int tmp = c;
00162 if (tmp >= l) return 0;
00163 b0 = p[tmp++];
00164 if (b0 < 0xC0 || tmp == l) {
00165 * slot = b0; return 1;
00166 }
00167 b1 = p[tmp++];
00168 if (b0 < 0xE0 || tmp == l) {
00169 * slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2;
00170 }
00171 * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (p[tmp] & 0x3F); return 3;
00172 }
00173
00174 int Stem::Internal::get_b_utf8(int * slot) {
00175 int b0, b1;
00176 int tmp = c;
00177 if (tmp <= lb) return 0;
00178 b0 = p[--tmp];
00179 if (b0 < 0x80 || tmp == lb) {
00180 * slot = b0; return 1;
00181 }
00182 b1 = p[--tmp];
00183 if (b1 >= 0xC0 || tmp == lb) {
00184 * slot = (b1 & 0x1F) << 6 | (b0 & 0x3F); return 2;
00185 }
00186 * slot = (p[tmp] & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3;
00187 }
00188
00189 int Stem::Internal::in_grouping_U(const unsigned char * s, int min, int max, int repeat) {
00190 do {
00191 int ch;
00192 int w = get_utf8(&ch);
00193 if (!w) return -1;
00194 if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
00195 return w;
00196 c += w;
00197 } while (repeat);
00198 return 0;
00199 }
00200
00201 int Stem::Internal::in_grouping_b_U(const unsigned char * s, int min, int max, int repeat) {
00202 do {
00203 int ch;
00204 int w = get_b_utf8(&ch);
00205 if (!w) return -1;
00206 if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
00207 return w;
00208 c -= w;
00209 } while (repeat);
00210 return 0;
00211 }
00212
00213 int Stem::Internal::out_grouping_U(const unsigned char * s, int min, int max, int repeat) {
00214 do {
00215 int ch;
00216 int w = get_utf8(&ch);
00217 if (!w) return -1;
00218 if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
00219 return w;
00220 c += w;
00221 } while (repeat);
00222 return 0;
00223 }
00224
00225 int Stem::Internal::out_grouping_b_U(const unsigned char * s, int min, int max, int repeat) {
00226 do {
00227 int ch;
00228 int w = get_b_utf8(&ch);
00229 if (!w) return -1;
00230 if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
00231 return w;
00232 c -= w;
00233 } while (repeat);
00234 return 0;
00235 }
00236
00237 int Stem::Internal::eq_s(int s_size, const symbol * s) {
00238 if (l - c < s_size || memcmp(p + c, s, s_size * sizeof(symbol)) != 0)
00239 return 0;
00240 c += s_size;
00241 return 1;
00242 }
00243
00244 int Stem::Internal::eq_s_b(int s_size, const symbol * s) {
00245 if (c - lb < s_size || memcmp(p + c - s_size, s, s_size * sizeof(symbol)) != 0)
00246 return 0;
00247 c -= s_size;
00248 return 1;
00249 }
00250
00251 int Stem::Internal::find_among(const struct among * v, int v_size, const unsigned char * fnum, const among_function * f) {
00252 int i = 0;
00253 int j = v_size;
00254
00255 const symbol * q = p + c;
00256 int c_orig = c;
00257
00258 int common_i = 0;
00259 int common_j = 0;
00260
00261 int first_key_inspected = 0;
00262
00263 while(1) {
00264 int k = i + ((j - i) >> 1);
00265 int diff = 0;
00266 int common = common_i < common_j ? common_i : common_j;
00267 const struct among * w = v + k;
00268 for (int x = common; x < w->s_size; x++) {
00269 if (c_orig + common == l) { diff = -1; break; }
00270 diff = q[common] - w->s[x];
00271 if (diff != 0) break;
00272 common++;
00273 }
00274 if (diff < 0) { j = k; common_j = common; }
00275 else { i = k; common_i = common; }
00276 if (j - i <= 1) {
00277 if (i > 0) break;
00278 if (j == i) break;
00279
00280
00281
00282
00283
00284 if (first_key_inspected) break;
00285 first_key_inspected = 1;
00286 }
00287 }
00288 while(1) {
00289 const struct among * w = v + i;
00290 if (common_i >= w->s_size) {
00291 c = c_orig + w->s_size;
00292 if (!fnum || !fnum[i]) return w->result;
00293 {
00294 int res = f[fnum[i] - 1](this);
00295 c = c_orig + w->s_size;
00296 if (res) return w->result;
00297 }
00298 }
00299 i = w->substring_i;
00300 if (i < 0) return 0;
00301 }
00302 }
00303
00304
00305 int Stem::Internal::find_among_b(const struct among * v, int v_size, const unsigned char * fnum, const among_function * f) {
00306 int i = 0;
00307 int j = v_size;
00308
00309 const symbol * q = p + c - 1;
00310 int c_orig = c;
00311
00312 int common_i = 0;
00313 int common_j = 0;
00314
00315 int first_key_inspected = 0;
00316
00317 while(1) {
00318 int k = i + ((j - i) >> 1);
00319 int diff = 0;
00320 int common = common_i < common_j ? common_i : common_j;
00321 const struct among * w = v + k;
00322 for (int x = w->s_size - 1 - common; x >= 0; x--) {
00323 if (c_orig - common == lb) { diff = -1; break; }
00324 diff = q[- common] - w->s[x];
00325 if (diff != 0) break;
00326 common++;
00327 }
00328 if (diff < 0) { j = k; common_j = common; }
00329 else { i = k; common_i = common; }
00330 if (j - i <= 1) {
00331 if (i > 0) break;
00332 if (j == i) break;
00333 if (first_key_inspected) break;
00334 first_key_inspected = 1;
00335 }
00336 }
00337 while(1) {
00338 const struct among * w = v + i;
00339 if (common_i >= w->s_size) {
00340 c = c_orig - w->s_size;
00341 if (!fnum || !fnum[i]) return w->result;
00342 {
00343 int res = f[fnum[i] - 1](this);
00344 c = c_orig - w->s_size;
00345 if (res) return w->result;
00346 }
00347 }
00348 i = w->substring_i;
00349 if (i < 0) return 0;
00350 }
00351 }
00352
00353 int
00354 Stem::Internal::replace_s(int c_bra, int c_ket, int s_size, const symbol * s)
00355 {
00356 int adjustment;
00357 int len;
00358 Assert(p);
00359 adjustment = s_size - (c_ket - c_bra);
00360 len = SIZE(p);
00361 if (adjustment != 0) {
00362 if (adjustment + len > CAPACITY(p)) {
00363 p = increase_size(p, adjustment + len);
00364 }
00365 memmove(p + c_ket + adjustment,
00366 p + c_ket,
00367 (len - c_ket) * sizeof(symbol));
00368 SET_SIZE(p, adjustment + len);
00369 l += adjustment;
00370 if (c >= c_ket)
00371 c += adjustment;
00372 else
00373 if (c > c_bra)
00374 c = c_bra;
00375 }
00376 if (s_size != 0) memmove(p + c_bra, s, s_size * sizeof(symbol));
00377 return adjustment;
00378 }
00379
00380 int Stem::Internal::slice_check() {
00381 Assert(p);
00382 if (bra < 0 || bra > ket || ket > l) {
00383 #if 0
00384 fprintf(stderr, "faulty slice operation:\n");
00385 debug(z, -1, 0);
00386 #endif
00387 return -1;
00388 }
00389 return 0;
00390 }
00391
00392 int Stem::Internal::slice_from_s(int s_size, const symbol * s) {
00393 if (slice_check()) return -1;
00394 replace_s(bra, ket, s_size, s);
00395 return 0;
00396 }
00397
00398 void Stem::Internal::insert_s(int c_bra, int c_ket, int s_size, const symbol * s) {
00399 int adjustment = replace_s(c_bra, c_ket, s_size, s);
00400 if (c_bra <= bra) bra += adjustment;
00401 if (c_bra <= ket) ket += adjustment;
00402 }
00403
00404 symbol * Stem::Internal::slice_to(symbol * v) {
00405 if (slice_check()) return NULL;
00406 {
00407 int len = ket - bra;
00408 if (CAPACITY(v) < len) {
00409 v = increase_size(v, len);
00410 }
00411 memmove(v, p + bra, len * sizeof(symbol));
00412 SET_SIZE(v, len);
00413 }
00414 return v;
00415 }
00416
00417 symbol * Stem::Internal::assign_to(symbol * v) {
00418 int len = l;
00419 if (CAPACITY(v) < len) {
00420 v = increase_size(v, len);
00421 }
00422 memmove(v, p, len * sizeof(symbol));
00423 SET_SIZE(v, len);
00424 return v;
00425 }
00426
00427 #if 0
00428 void Stem::Internal::debug(int number, int line_count) {
00429 int i;
00430 int limit = SIZE(p);
00431
00432 if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit);
00433 for (i = 0; i <= limit; i++) {
00434 if (lb == i) printf("{");
00435 if (bra == i) printf("[");
00436 if (c == i) printf("|");
00437 if (ket == i) printf("]");
00438 if (l == i) printf("}");
00439 if (i < limit)
00440 { int ch = p[i];
00441 if (ch == 0) ch = '#';
00442 printf("%c", ch);
00443 }
00444 }
00445 printf("'\n");
00446 }
00447 #endif
00448 }