Xapian: Internal Source Documentation: xapian-core: queryparser/termgenerator

00001 
00004 /* Copyright (C) 2007 Olly Betts
00005  *
00006  * This program is free software; you can redistribute it and/or modify
00007  * it under the terms of the GNU General Public License as published by
00008  * the Free Software Foundation; either version 2 of the License, or
00009  * (at your option) any later version.
00010  *
00011  * This program is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  * GNU General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU General Public License
00017  * along with this program; if not, write to the Free Software
00018  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
00019  */
00020 
00021 #include <config.h>
00022 
00023 #include "termgenerator_internal.h"
00024 
00025 #include <xapian/document.h>
00026 #include <xapian/queryparser.h>
00027 #include <xapian/unicode.h>
00028 
00029 #include "stringutils.h"
00030 
00031 #include <algorithm>
00032 #include <string>
00033 
00034 using namespace std;
00035 
00036 namespace Xapian {
00037 
// Put a limit on the size of terms to help prevent the index being bloated
// by useless junk terms.
//
// FIXME: the threshold is currently measured in bytes of the UTF-8
// representation, not in Unicode characters - what actually makes most sense
// here?
static const unsigned int MAX_PROB_TERM_LENGTH = 64;

// FIXME: Add API to allow control of how stemming is used?
00045 
00046 inline bool
00047 U_isupper(unsigned ch) {
00048     return (ch < 128 && C_isupper((unsigned char)ch));
00049 }
00050 
00051 inline unsigned check_wordchar(unsigned ch) {
00052     if (Unicode::is_wordchar(ch)) return Unicode::tolower(ch);
00053     return 0;
00054 }
00055 
00056 inline bool
00057 should_stem(const std::string & term)
00058 {
00059     const unsigned int SHOULD_STEM_MASK =
00060         (1 << Unicode::LOWERCASE_LETTER) |
00061         (1 << Unicode::TITLECASE_LETTER) |
00062         (1 << Unicode::MODIFIER_LETTER) |
00063         (1 << Unicode::OTHER_LETTER);
00064     Utf8Iterator u(term);
00065     return ((SHOULD_STEM_MASK >> Unicode::get_category(*u)) & 1);
00066 }
00067 
// Check whether ch may appear inside a term between two word characters
// (e.g. the '&' in AT&T or the apostrophe in "don't").
//
// Returns the character to append to the term (which may differ from ch, as
// typographic quotes are normalised to an ASCII apostrophe), or 0 if ch is
// not an accepted infix character.
inline unsigned check_infix(unsigned ch) {
    switch (ch) {
        // Unicode includes all these except '&' in its word boundary rules,
        // as well as 0x2019 (which we handle below) and ':' (for Swedish
        // apparently, but we ignore this for now as it's problematic in
        // real world cases).
        case '\'':
        case '&':
        case 0xb7:   // MIDDLE DOT
        case 0x5f4:  // HEBREW PUNCTUATION GERSHAYIM
        case 0x2027: // HYPHENATION POINT
            return ch;
        case 0x2019: // Unicode apostrophe and single closing quote.
        case 0x201b: // Unicode single opening quote with the tail rising.
            return '\'';
    }
    return 0;
}
00081 
// Check whether ch may appear inside a term between two digits (e.g. the
// '.' in 3.14 or the ',' in 1,000).
//
// Returns ch unchanged if it is in the accepted list, otherwise 0.
inline unsigned check_infix_digit(unsigned ch) {
    // This list of characters comes from Unicode's word identifying algorithm.
    switch (ch) {
        case ',':
        case '.':
        case ';':
        case 0x037e: // GREEK QUESTION MARK
        case 0x0589: // ARMENIAN FULL STOP
        case 0x060D: // ARABIC DATE SEPARATOR
        case 0x07F8: // NKO COMMA
        case 0x2044: // FRACTION SLASH
        case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
        case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
        case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
            return ch;
    }
    return 0;
}
00100 
00101 inline bool
00102 is_digit(unsigned ch) {
00103     return (Unicode::get_category(ch) == Unicode::DECIMAL_DIGIT_NUMBER);
00104 }
00105 
// Check whether ch may be kept as a trailing suffix on a term.
//
// Only '+' and '#' are accepted; they are returned unchanged.  Any other
// character yields 0.
inline unsigned check_suffix(unsigned ch) {
    if (ch == '+' || ch == '#') return ch;
    // FIXME: what about '-'?
    return 0;
}
00111 
// Stopword handling modes used by index_text().
// FIXME: add API for this:
// No stopword checks are made.
#define STOPWORDS_NONE 0
// Terms the stopper matches are skipped entirely.
#define STOPWORDS_IGNORE 1
// Terms the stopper matches are still indexed, but no stemmed form is added.
#define STOPWORDS_INDEX_UNSTEMMED_ONLY 2
00116 
00117 void
00118 TermGenerator::Internal::index_text(Utf8Iterator itor, termcount weight,
00119                                     const string & prefix, bool with_positions)
00120 {
00121     int stop_mode = STOPWORDS_INDEX_UNSTEMMED_ONLY;
00122 
00123     if (!stopper) stop_mode = STOPWORDS_NONE;
00124 
00125     while (true) {
00126         // Advance to the start of the next term.
00127         unsigned ch;
00128         while (true) {
00129             if (itor == Utf8Iterator()) return;
00130             ch = check_wordchar(*itor);
00131             if (ch) break;
00132             ++itor;
00133         }
00134 
00135         string term;
00136         // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
00137         // Don't worry if there's a trailing '.' or not.
00138         if (U_isupper(*itor)) {
00139             const Utf8Iterator end;
00140             Utf8Iterator p = itor;
00141             do {
00142                 Unicode::append_utf8(term, Unicode::tolower(*p++));
00143             } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
00144             // One letter does not make an acronym!  If we handled a single
00145             // uppercase letter here, we wouldn't catch M&S below.
00146             if (term.size() > 1) {
00147                 // Check there's not a (lower case) letter or digit
00148                 // immediately after it.
00149                 if (p == end || !Unicode::is_wordchar(*p)) {
00150                     itor = p;
00151                     goto endofterm;
00152                 }
00153             }
00154             term.resize(0);
00155         }
00156 
00157         while (true) {
00158             unsigned prevch;
00159             do {
00160                 Unicode::append_utf8(term, ch);
00161                 prevch = ch;
00162                 if (++itor == Utf8Iterator()) goto endofterm;
00163                 ch = check_wordchar(*itor);
00164             } while (ch);
00165 
00166             Utf8Iterator next(itor);
00167             ++next;
00168             if (next == Utf8Iterator()) break;
00169             unsigned nextch = check_wordchar(*next);
00170             if (!nextch) break;
00171             unsigned infix_ch = *itor;
00172             if (is_digit(prevch) && is_digit(*next)) {
00173                 infix_ch = check_infix_digit(infix_ch);
00174             } else {
00175                 // Handle things like '&' in AT&T, apostrophes, etc.
00176                 infix_ch = check_infix(infix_ch);
00177             }
00178             if (!infix_ch) break;
00179             Unicode::append_utf8(term, infix_ch);
00180             ch = nextch;
00181             itor = next;
00182         }
00183 
00184         {
00185             size_t len = term.size();
00186             unsigned count = 0;
00187             while ((ch = check_suffix(*itor))) {
00188                 if (++count > 3) {
00189                     term.resize(len);
00190                     break;
00191                 }
00192                 Unicode::append_utf8(term, ch);
00193                 if (++itor == Utf8Iterator()) goto endofterm;
00194             }
00195         }
00196 
00197 endofterm:
00198         if (term.size() > MAX_PROB_TERM_LENGTH) continue;
00199 
00200         if (stop_mode == STOPWORDS_IGNORE && (*stopper)(term)) continue;
00201 
00202         if (with_positions) {
00203             doc.add_posting(prefix + term, ++termpos, weight);
00204         } else {
00205             doc.add_term(prefix + term, weight);
00206         }
00207         if ((flags & FLAG_SPELLING) && prefix.empty()) db.add_spelling(term);
00208 
00209         if (!stemmer.internal.get()) continue;
00210 
00211         if (stop_mode == STOPWORDS_INDEX_UNSTEMMED_ONLY && (*stopper)(term))
00212             continue;
00213 
00214         // Note, this uses the lowercased term, but that's OK as we only
00215         // want to avoid stemming terms starting with a digit.
00216         if (!should_stem(term)) continue;
00217 
00218         // Add stemmed form without positional information.
00219         string stem("Z");
00220         stem += prefix;
00221         stem += stemmer(term);
00222         doc.add_term(stem, weight);
00223     }
00224 }
00225 
00226 }
queryparser/termgenerator_internal.cc