languages/steminternal.cc

Go to the documentation of this file.
00001 
00004 /* Derived from snowball's runtime/api.c:
00005  *
00006  * Copyright (c) 2001, Dr Martin Porter
00007  * Copyright (c) 2004,2005, Richard Boulton
00008  * Copyright (c) 2006,2007 Olly Betts
00009  * All rights reserved.
00010  *
00011  * Redistribution and use in source and binary forms, with or without
00012  * modification, are permitted provided that the following conditions are met:
00013  *
00014  *   * Redistributions of source code must retain the above copyright notice,
00015  *     this list of conditions and the following disclaimer.
00016  *
00017  *   * Redistributions in binary form must reproduce the above copyright
00018  *     notice, this list of conditions and the following disclaimer in the
00019  *     documentation and/or other materials provided with the distribution.
00020  *
00021  *   * Neither the name of the <ORGANIZATION> nor the names of its contributors
00022  *     may be used to endorse or promote products derived from this software
00023  *     without specific prior written permission.
00024  *
00025  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00026  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00028  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
00029  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00030  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00031  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00032  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00033  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00034  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00035  * POSSIBILITY OF SUCH DAMAGE.
00036  */
00037 /* Copyright (C) 2007 Olly Betts
00038  *
00039  * This program is free software; you can redistribute it and/or
00040  * modify it under the terms of the GNU General Public License as
00041  * published by the Free Software Foundation; either version 2 of the
00042  * License, or (at your option) any later version.
00043  *
00044  * This program is distributed in the hope that it will be useful,
00045  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00046  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00047  * GNU General Public License for more details.
00048  *
00049  * You should have received a copy of the GNU General Public License
00050  * along with this program; if not, write to the Free Software
00051  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
00052  */
00053 
00054 #include <config.h>
00055 
00056 #include <xapian/error.h>
00057 
00058 #include "omassert.h"
00059 #include "steminternal.h"
00060 
00061 #if 0
00062 #include <stdio.h>
00063 #endif
00064 #include <stdlib.h>
00065 #include <string.h>
00066 
00067 #include <string>
00068 
00069 using namespace std;
00070 
00071 #define CREATE_SIZE 16
00072 
00073 extern symbol * create_s() {
00074     void * mem = malloc(HEAD + (CREATE_SIZE + 1) * sizeof(symbol));
00075     if (mem == NULL) throw std::bad_alloc();
00076     symbol * p = (symbol *) (HEAD + (char *) mem);
00077     SET_CAPACITY(p, CREATE_SIZE);
00078     SET_SIZE(p, CREATE_SIZE);
00079     return p;
00080 }
00081 
00082 /*
00083    new_c = skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c
00084    if n +ve, or n characters backwards from p + c - 1 if n -ve. new_c is the new
00085    value of the cursor c, or -1 on failure.
00086 
00087    -- used to implement hop and next in the utf8 case.
00088 */
00089 
00090 extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) {
00091     if (n >= 0) {
00092         for (; n > 0; n--) {
00093             if (c >= l) return -1;
00094             if (p[c++] >= 0xC0) {   /* 1100 0000 */
00095                 while (c < l) {
00096                     /* break unless p[c] is 10------ */
00097                     if (p[c] >> 6 != 2) break;
00098                     c++;
00099                 }
00100             }
00101         }
00102     } else {
00103         for (; n < 0; n++) {
00104             if (c <= lb) return -1;
00105             if (p[--c] >= 0x80) {   /* 1000 0000 */
00106                 while (c > lb) {
00107                     if (p[c] >= 0xC0) break; /* 1100 0000 */
00108                     c--;
00109                 }
00110             }
00111         }
00112     }
00113     return c;
00114 }
00115 
00116 
00117 /* Increase the size of the buffer pointed to by p to at least n symbols.
00118  * If insufficient memory, throw std::bad_alloc().
00119  */
00120 static symbol * increase_size(symbol * p, int n) {
00121     int new_size = n + 20;
00122     void * mem = realloc((char *) p - HEAD,
00123                          HEAD + (new_size + 1) * sizeof(symbol));
00124     if (mem == NULL) {
00125         throw std::bad_alloc();
00126     }
00127     symbol * q = (symbol *) (HEAD + (char *)mem);
00128     SET_CAPACITY(q, new_size);
00129     return q;
00130 }
00131 
00132 namespace Xapian {
00133 
00134 Stem::Internal::Internal()
00135     : p(create_s()), c(0), l(0), lb(0), bra(0), ket(0)
00136 {
00137 }
00138 
00139 Stem::Internal::~Internal()
00140 {
00141     lose_s(p);
00142 }
00143 
00144 string
00145 Stem::Internal::operator()(const string & word)
00146 {
00147     const symbol * s = reinterpret_cast<const symbol *>(word.data());
00148     replace_s(0, l, word.size(), s);
00149     c = 0;
00150     if (stem() < 0) {
00151         // FIXME: Is there a better choice of exception class?
00152         throw Xapian::InternalError("stemming exception!");
00153     }
00154     return string(reinterpret_cast<const char *>(p), l);
00155 }
00156 
00157 /* Code for character groupings: utf8 cases */
00158 
00159 int Stem::Internal::get_utf8(int * slot) {
00160     int b0, b1;
00161     int tmp = c;
00162     if (tmp >= l) return 0;
00163     b0 = p[tmp++];
00164     if (b0 < 0xC0 || tmp == l) {   /* 1100 0000 */
00165         * slot = b0; return 1;
00166     }
00167     b1 = p[tmp++];
00168     if (b0 < 0xE0 || tmp == l) {   /* 1110 0000 */
00169         * slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2;
00170     }
00171     * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (p[tmp] & 0x3F); return 3;
00172 }
00173 
00174 int Stem::Internal::get_b_utf8(int * slot) {
00175     int b0, b1;
00176     int tmp = c;
00177     if (tmp <= lb) return 0;
00178     b0 = p[--tmp];
00179     if (b0 < 0x80 || tmp == lb) {   /* 1000 0000 */
00180         * slot = b0; return 1;
00181     }
00182     b1 = p[--tmp];
00183     if (b1 >= 0xC0 || tmp == lb) {   /* 1100 0000 */
00184         * slot = (b1 & 0x1F) << 6 | (b0 & 0x3F); return 2;
00185     }
00186     * slot = (p[tmp] & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3;
00187 }
00188 
00189 int Stem::Internal::in_grouping_U(const unsigned char * s, int min, int max, int repeat) {
00190     do {
00191         int ch;
00192         int w = get_utf8(&ch);
00193         if (!w) return -1;
00194         if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
00195             return w;
00196         c += w;
00197     } while (repeat);
00198     return 0;
00199 }
00200 
00201 int Stem::Internal::in_grouping_b_U(const unsigned char * s, int min, int max, int repeat) {
00202     do {
00203         int ch;
00204         int w = get_b_utf8(&ch);
00205         if (!w) return -1;
00206         if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
00207             return w;
00208         c -= w;
00209     } while (repeat);
00210     return 0;
00211 }
00212 
00213 int Stem::Internal::out_grouping_U(const unsigned char * s, int min, int max, int repeat) {
00214     do {
00215         int ch;
00216         int w = get_utf8(&ch);
00217         if (!w) return -1;
00218         if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
00219             /* FIXME: try adding this so gopast in generated code is simpler: if (repeat == 2) c += w; */ return w;
00220         c += w;
00221     } while (repeat);
00222     return 0;
00223 }
00224 
00225 int Stem::Internal::out_grouping_b_U(const unsigned char * s, int min, int max, int repeat) {
00226     do {
00227         int ch;
00228         int w = get_b_utf8(&ch);
00229         if (!w) return -1;
00230         if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
00231             return w;
00232         c -= w;
00233     } while (repeat);
00234     return 0;
00235 }
00236 
00237 int Stem::Internal::eq_s(int s_size, const symbol * s) {
00238     if (l - c < s_size || memcmp(p + c, s, s_size * sizeof(symbol)) != 0)
00239         return 0;
00240     c += s_size;
00241     return 1;
00242 }
00243 
00244 int Stem::Internal::eq_s_b(int s_size, const symbol * s) {
00245     if (c - lb < s_size || memcmp(p + c - s_size, s, s_size * sizeof(symbol)) != 0)
00246         return 0;
00247     c -= s_size;
00248     return 1;
00249 }
00250 
00251 int Stem::Internal::find_among(const struct among * v, int v_size, const unsigned char * fnum, const among_function * f) {
00252     int i = 0;
00253     int j = v_size;
00254 
00255     const symbol * q = p + c;
00256     int c_orig = c;
00257 
00258     int common_i = 0;
00259     int common_j = 0;
00260 
00261     int first_key_inspected = 0;
00262 
00263     while(1) {
00264         int k = i + ((j - i) >> 1);
00265         int diff = 0;
00266         int common = common_i < common_j ? common_i : common_j; /* smaller */
00267         const struct among * w = v + k;
00268         for (int x = common; x < w->s_size; x++) {
00269             if (c_orig + common == l) { diff = -1; break; }
00270             diff = q[common] - w->s[x];
00271             if (diff != 0) break;
00272             common++;
00273         }
00274         if (diff < 0) { j = k; common_j = common; }
00275                  else { i = k; common_i = common; }
00276         if (j - i <= 1) {
00277             if (i > 0) break; /* v->s has been inspected */
00278             if (j == i) break; /* only one item in v */
00279 
00280             /* - but now we need to go round once more to get
00281                v->s inspected. This looks messy, but is actually
00282                the optimal approach.  */
00283 
00284             if (first_key_inspected) break;
00285             first_key_inspected = 1;
00286         }
00287     }
00288     while(1) {
00289         const struct among * w = v + i;
00290         if (common_i >= w->s_size) {
00291             c = c_orig + w->s_size;
00292             if (!fnum || !fnum[i]) return w->result;
00293             {
00294                 int res = f[fnum[i] - 1](this);
00295                 c = c_orig + w->s_size;
00296                 if (res) return w->result;
00297             }
00298         }
00299         i = w->substring_i;
00300         if (i < 0) return 0;
00301     }
00302 }
00303 
00304 /* find_among_b is for backwards processing. Same comments apply */
00305 int Stem::Internal::find_among_b(const struct among * v, int v_size, const unsigned char * fnum, const among_function * f) {
00306     int i = 0;
00307     int j = v_size;
00308 
00309     const symbol * q = p + c - 1;
00310     int c_orig = c;
00311 
00312     int common_i = 0;
00313     int common_j = 0;
00314 
00315     int first_key_inspected = 0;
00316 
00317     while(1) {
00318         int k = i + ((j - i) >> 1);
00319         int diff = 0;
00320         int common = common_i < common_j ? common_i : common_j;
00321         const struct among * w = v + k;
00322         for (int x = w->s_size - 1 - common; x >= 0; x--) {
00323             if (c_orig - common == lb) { diff = -1; break; }
00324             diff = q[- common] - w->s[x];
00325             if (diff != 0) break;
00326             common++;
00327         }
00328         if (diff < 0) { j = k; common_j = common; }
00329                  else { i = k; common_i = common; }
00330         if (j - i <= 1) {
00331             if (i > 0) break;
00332             if (j == i) break;
00333             if (first_key_inspected) break;
00334             first_key_inspected = 1;
00335         }
00336     }
00337     while(1) {
00338         const struct among * w = v + i;
00339         if (common_i >= w->s_size) {
00340             c = c_orig - w->s_size;
00341             if (!fnum || !fnum[i]) return w->result;
00342             {
00343                 int res = f[fnum[i] - 1](this);
00344                 c = c_orig - w->s_size;
00345                 if (res) return w->result;
00346             }
00347         }
00348         i = w->substring_i;
00349         if (i < 0) return 0;
00350     }
00351 }
00352 
00353 int
00354 Stem::Internal::replace_s(int c_bra, int c_ket, int s_size, const symbol * s)
00355 {
00356     int adjustment;
00357     int len;
00358     Assert(p);
00359     adjustment = s_size - (c_ket - c_bra);
00360     len = SIZE(p);
00361     if (adjustment != 0) {
00362         if (adjustment + len > CAPACITY(p)) {
00363             p = increase_size(p, adjustment + len);
00364         }
00365         memmove(p + c_ket + adjustment,
00366                 p + c_ket,
00367                 (len - c_ket) * sizeof(symbol));
00368         SET_SIZE(p, adjustment + len);
00369         l += adjustment;
00370         if (c >= c_ket)
00371             c += adjustment;
00372         else
00373             if (c > c_bra)
00374                 c = c_bra;
00375     }
00376     if (s_size != 0) memmove(p + c_bra, s, s_size * sizeof(symbol));
00377     return adjustment;
00378 }
00379 
00380 int Stem::Internal::slice_check() {
00381     Assert(p);
00382     if (bra < 0 || bra > ket || ket > l) {
00383 #if 0
00384         fprintf(stderr, "faulty slice operation:\n");
00385         debug(z, -1, 0);
00386 #endif
00387         return -1;
00388     }
00389     return 0;
00390 }
00391 
00392 int Stem::Internal::slice_from_s(int s_size, const symbol * s) {
00393     if (slice_check()) return -1;
00394     replace_s(bra, ket, s_size, s);
00395     return 0;
00396 }
00397 
00398 void Stem::Internal::insert_s(int c_bra, int c_ket, int s_size, const symbol * s) {
00399     int adjustment = replace_s(c_bra, c_ket, s_size, s);
00400     if (c_bra <= bra) bra += adjustment;
00401     if (c_bra <= ket) ket += adjustment;
00402 }
00403 
00404 symbol * Stem::Internal::slice_to(symbol * v) {
00405     if (slice_check()) return NULL;
00406     {
00407         int len = ket - bra;
00408         if (CAPACITY(v) < len) {
00409             v = increase_size(v, len);
00410         }
00411         memmove(v, p + bra, len * sizeof(symbol));
00412         SET_SIZE(v, len);
00413     }
00414     return v;
00415 }
00416 
00417 symbol * Stem::Internal::assign_to(symbol * v) {
00418     int len = l;
00419     if (CAPACITY(v) < len) {
00420         v = increase_size(v, len);
00421     }
00422     memmove(v, p, len * sizeof(symbol));
00423     SET_SIZE(v, len);
00424     return v;
00425 }
00426 
00427 #if 0
00428 void Stem::Internal::debug(int number, int line_count) {
00429     int i;
00430     int limit = SIZE(p);
00431     /*if (number >= 0) printf("%3d (line %4d): '", number, line_count);*/
00432     if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit);
00433     for (i = 0; i <= limit; i++) {
00434         if (lb == i) printf("{");
00435         if (bra == i) printf("[");
00436         if (c == i) printf("|");
00437         if (ket == i) printf("]");
00438         if (l == i) printf("}");
00439         if (i < limit)
00440         {   int ch = p[i];
00441             if (ch == 0) ch = '#';
00442             printf("%c", ch);
00443         }
00444     }
00445     printf("'\n");
00446 }
00447 #endif
00448 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.