backends/flint/flint_spelling.cc

Go to the documentation of this file.
00001 
00004 /* Copyright (C) 2004,2005,2006,2007 Olly Betts
00005  *
00006  * This program is free software; you can redistribute it and/or modify
00007  * it under the terms of the GNU General Public License as published by
00008  * the Free Software Foundation; either version 2 of the License, or
00009  * (at your option) any later version.
00010  *
00011  * This program is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  * GNU General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU General Public License
00017  * along with this program; if not, write to the Free Software
00018  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
00019  */
00020 
00021 #include <config.h>
00022 
00023 #include <xapian/error.h>
00024 #include <xapian/types.h>
00025 
00026 #include "expandweight.h"
00027 #include "flint_spelling.h"
00028 #include "flint_utils.h"
00029 #include "omassert.h"
00030 #include "ortermlist.h"
00031 
00032 #include <algorithm>
00033 #include <map>
00034 #include <queue>
00035 #include <vector>
00036 #include <set>
00037 #include <string>
00038 
00039 using namespace std;
00040 
00041 // We XOR the length values with this so that they are more likely to coincide
00042 // with lower case ASCII letters, which are likely to be common.  This means
00043 // that zlib should do a better job of compressing tag values - in tests, this
00044 // gave 5% better compression.
00045 #define MAGIC_XOR_VALUE 96
00046 
00047 class PrefixCompressedStringItor {
00048     const unsigned char * p;
00049     size_t left;
00050     string current;
00051 
00052     PrefixCompressedStringItor(const unsigned char * p_, size_t left_,
00053                                const string &current_)
00054         : p(p_), left(left_), current(current_) { }
00055 
00056   public:
00057     PrefixCompressedStringItor(const std::string & s)
00058         : p(reinterpret_cast<const unsigned char *>(s.data())),
00059           left(s.size()) {
00060         if (left) {
00061             operator++();
00062         } else {
00063             p = NULL;
00064         }
00065     }
00066 
00067     const string & operator*() const {
00068         return current;
00069     }
00070 
00071     PrefixCompressedStringItor operator++(int) {
00072         const unsigned char * old_p = p;
00073         size_t old_left = left;
00074         string old_current = current;
00075         operator++();
00076         return PrefixCompressedStringItor(old_p, old_left, old_current);
00077     }
00078 
00079     PrefixCompressedStringItor & operator++() {
00080         if (left == 0) {
00081             p = NULL;
00082         } else {
00083             if (!current.empty()) {
00084                 current.resize(*p++ ^ MAGIC_XOR_VALUE);
00085                 --left;
00086             }
00087             size_t add;
00088             if (left == 0 || (add = *p ^ MAGIC_XOR_VALUE) >= left)
00089                 throw Xapian::DatabaseCorruptError("Bad spelling data (too little left)");
00090             current.append(reinterpret_cast<const char *>(p + 1), add);
00091             p += add + 1;
00092             left -= add + 1;
00093         }
00094         return *this;
00095     }
00096 
00097     bool at_end() const {
00098         return p == NULL;
00099     }
00100 };
00101 
00102 class PrefixCompressedStringWriter {
00103     string current;
00104     string & out;
00105 
00106   public:
00107     PrefixCompressedStringWriter(string & out_) : out(out_) { }
00108 
00109     void append(const string & word) {
00110         // If this isn't the first entry, see how much of the previous one
00111         // we can reuse.
00112         if (!current.empty()) {
00113             size_t len = min(current.size(), word.size());
00114             size_t i;
00115             for (i = 0; i < len; ++i) {
00116                 if (current[i] != word[i]) break;
00117             }
00118             out += char(i ^ MAGIC_XOR_VALUE);
00119             out += char((word.size() - i) ^ MAGIC_XOR_VALUE);
00120             out.append(word.data() + i, word.size() - i);
00121         } else {
00122             out += char(word.size() ^ MAGIC_XOR_VALUE);
00123             out += word;
00124         }
00125         current = word;
00126     }
00127 };
00128 
00129 void
00130 FlintSpellingTable::merge_changes()
00131 {
00132     map<fragment, set<string> >::const_iterator i;
00133     for (i = termlist_deltas.begin(); i != termlist_deltas.end(); ++i) {
00134         string key = i->first;
00135         const set<string> & changes = i->second;
00136 
00137         set<string>::const_iterator d = changes.begin();
00138         Assert(d != changes.end());
00139 
00140         string updated;
00141         string current;
00142         PrefixCompressedStringWriter out(updated);
00143         if (get_exact_entry(key, current)) {
00144             PrefixCompressedStringItor in(current);
00145             updated.reserve(current.size()); // FIXME plus some?
00146             while (!in.at_end()) {
00147                 const string & word = *in;
00148                 if (word < *d) {
00149                     out.append(word);
00150                     ++in;
00151                 } else if (word > *d) {
00152                     out.append(*d++);
00153                     if (d == changes.end()) break;
00154                     continue;
00155                 } else {
00156                     // If an existing entry is in the changes list, that means
00157                     // we should remove it.
00158                     ++in;
00159                     ++d;
00160                 }
00161             }
00162             if (!in.at_end()) {
00163                 // FIXME : easy to optimise this to a fix-up and substring copy.
00164                 while (!in.at_end()) {
00165                     out.append(*in++);
00166                 }
00167             }
00168         }
00169         while (d != changes.end()) {
00170             out.append(*d++);
00171         }
00172         if (!updated.empty()) {
00173             add(key, updated);
00174         } else {
00175             del(key);
00176         }
00177     }
00178     termlist_deltas.clear();
00179 
00180     map<string, Xapian::termcount>::const_iterator j;
00181     for (j = wordfreq_changes.begin(); j != wordfreq_changes.end(); ++j) {
00182         string key = "W" + j->first;
00183         if (j->second) {
00184             add(key, pack_uint_last(j->second));
00185         } else {
00186             del(key);
00187         }
00188     }
00189     wordfreq_changes.clear();
00190 }
00191 
00192 void
00193 FlintSpellingTable::add_fragment(fragment frag, const string & word)
00194 {
00195     map<fragment, set<string> >::iterator i = termlist_deltas.find(frag);
00196     if (i == termlist_deltas.end()) {
00197         i = termlist_deltas.insert(make_pair(frag, set<string>())).first;
00198     }
00199     i->second.insert(word);
00200 }
00201 
00202 void
00203 FlintSpellingTable::add_word(const string & word, Xapian::termcount freqinc)
00204 {
00205     if (word.size() <= 1) return;
00206 
00207     map<string, Xapian::termcount>::iterator i = wordfreq_changes.find(word);
00208     if (i != wordfreq_changes.end()) {
00209         // Word "word" already exists and has been modified.
00210         if (i->second) {
00211             i->second += freqinc;
00212             return;
00213         }
00214     } else {
00215         string key = "W" + word;
00216         string data;
00217         if (get_exact_entry(key, data)) {
00218             // Word "word" already exists, so increment its count.
00219             Xapian::termcount freq;
00220             const char * p = data.data();
00221             if (!unpack_uint_last(&p, p + data.size(), &freq) || freq == 0) {
00222                 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
00223             }
00224             wordfreq_changes[word] = freq + freqinc;
00225             return;
00226         }
00227     }
00228 
00229     // New word - need to create trigrams for it.
00230     if (i != wordfreq_changes.end()) {
00231         i->second = freqinc;
00232     } else {
00233         wordfreq_changes[word] = freqinc;
00234     }
00235 
00236     fragment buf;
00237     // Head:
00238     buf[0] = 'H';
00239     buf[1] = word[0];
00240     buf[2] = word[1];
00241     buf[3] = '\0';
00242     add_fragment(buf, word);
00243 
00244     // Tail:
00245     buf[0] = 'T';
00246     buf[1] = word[word.size() - 2];
00247     buf[2] = word[word.size() - 1];
00248     buf[3] = '\0';
00249     add_fragment(buf, word);
00250 
00251     if (word.size() <= 4) {
00252         // We also generate 'bookends' for two, three, and four character
00253         // terms so we can handle transposition of the middle two characters
00254         // of a four character word, substitution or deletion of the middle
00255         // character of a three character word, or insertion in the middle of a
00256         // two character word.
00257         // 'Bookends':
00258         buf[0] = 'B';
00259         buf[1] = word[0];
00260         buf[3] = '\0';
00261         add_fragment(buf, word);
00262     }
00263     if (word.size() > 2) {
00264         // Middles:
00265         buf[0] = 'M';
00266         for (size_t start = 0; start <= word.size() - 3; ++start) {
00267             memcpy(buf.data + 1, word.data() + start, 3);
00268             add_fragment(buf, word);
00269         }
00270     }
00271 }
00272 
00273 void
00274 FlintSpellingTable::remove_fragment(fragment frag, const string & word)
00275 {
00276     map<fragment, set<string> >::iterator i = termlist_deltas.find(frag);
00277     if (i != termlist_deltas.end()) {
00278         i->second.erase(word);
00279     }
00280 }
00281 
00282 void
00283 FlintSpellingTable::remove_word(const string & word, Xapian::termcount freqdec)
00284 {
00285     map<string, Xapian::termcount>::iterator i = wordfreq_changes.find(word);
00286     if (i != wordfreq_changes.end()) {
00287         if (i->second == 0) {
00288             // Word has already been deleted.
00289             return;
00290         }
00291         // Word "word" exists and has been modified.
00292         if (freqdec < i->second) {
00293             i->second -= freqdec;
00294             return;
00295         }
00296     }
00297 
00298     {
00299         string key = "W" + word;
00300         string data;
00301         if (!get_exact_entry(key, data)) {
00302             // This word doesn't exist.
00303             return;
00304         }
00305 
00306         Xapian::termcount freq;
00307         const char *p = data.data();
00308         if (!unpack_uint_last(&p, p + data.size(), &freq)) {
00309             throw Xapian::DatabaseCorruptError("Bad spelling word freq");
00310         }
00311         if (freqdec < freq) {
00312             wordfreq_changes[word] = freq - freqdec;
00313             return;
00314         }
00315     }
00316 
00317     // Mark word as deleted, and remove its fragment entries.
00318     wordfreq_changes[word] = 0;
00319 
00320     fragment buf;
00321     // Head:
00322     buf[0] = 'H';
00323     buf[1] = word[0];
00324     buf[2] = word[1];
00325     buf[3] = '\0';
00326     remove_fragment(buf, word);
00327 
00328     // Tail:
00329     buf[0] = 'T';
00330     buf[1] = word[word.size() - 2];
00331     buf[2] = word[word.size() - 1];
00332     buf[3] = '\0';
00333     remove_fragment(buf, word);
00334 
00335     if (word.size() <= 4) {
00336         // 'Bookends':
00337         buf[0] = 'B';
00338         buf[1] = word[0];
00339         buf[3] = '\0';
00340         remove_fragment(buf, word);
00341     }
00342     if (word.size() > 2) {
00343         // Middles:
00344         buf[0] = 'M';
00345         for (size_t start = 0; start <= word.size() - 3; ++start) {
00346             memcpy(buf.data + 1, word.data() + start, 3);
00347             remove_fragment(buf, word);
00348         }
00349     }
00350 }
00351 
00352 struct TermListGreaterApproxSize {
00353     bool operator()(const TermList *a, const TermList *b) {
00354         return a->get_approx_size() > b->get_approx_size();
00355     }
00356 };
00357 
00358 TermList *
00359 FlintSpellingTable::open_termlist(const string & word)
00360 {
00361     if (word.size() <= 1) return NULL;
00362 
00363     // Merge any pending changes to disk, but don't call commit() so they
00364     // won't be switched live.
00365     if (!wordfreq_changes.empty()) merge_changes();
00366 
00367     // Build a priority queue of TermList objects which returns those of
00368     // greatest approximate size first.
00369     priority_queue<TermList*, vector<TermList*>, TermListGreaterApproxSize> pq;
00370     try {
00371         string data;
00372         fragment buf;
00373 
00374         // Head:
00375         buf[0] = 'H';
00376         buf[1] = word[0];
00377         buf[2] = word[1];
00378         if (get_exact_entry(string(buf), data))
00379             pq.push(new FlintSpellingTermList(data));
00380 
00381         // Tail:
00382         buf[0] = 'T';
00383         buf[1] = word[word.size() - 2];
00384         buf[2] = word[word.size() - 1];
00385         if (get_exact_entry(string(buf), data))
00386             pq.push(new FlintSpellingTermList(data));
00387 
00388         if (word.size() <= 4) {
00389             // We also generate 'bookends' for two, three, and four character
00390             // terms so we can handle transposition of the middle two
00391             // characters of a four character word, substitution or deletion of
00392             // the middle character of a three character word, or insertion in
00393             // the middle of a two character word.
00394             buf[0] = 'B';
00395             buf[1] = word[0];
00396             buf[3] = '\0';
00397             if (get_exact_entry(string(buf), data))
00398                 pq.push(new FlintSpellingTermList(data));
00399         }
00400         if (word.size() > 2) {
00401             // Middles:
00402             buf[0] = 'M';
00403             for (size_t start = 0; start <= word.size() - 3; ++start) {
00404                 memcpy(buf.data + 1, word.data() + start, 3);
00405                 if (get_exact_entry(string(buf), data))
00406                     pq.push(new FlintSpellingTermList(data));
00407             }
00408 
00409             if (word.size() == 3) {
00410                 // For three letter words, we generate the two "single
00411                 // transposition" forms too, so that we can produce good
00412                 // spelling suggestions.
00413                 // ABC -> BAC
00414                 buf[1] = word[1];
00415                 buf[2] = word[0];
00416                 if (get_exact_entry(string(buf), data))
00417                     pq.push(new FlintSpellingTermList(data));
00418                 // ABC -> ACB
00419                 buf[1] = word[0];
00420                 buf[2] = word[2];
00421                 buf[3] = word[1];
00422                 if (get_exact_entry(string(buf), data))
00423                     pq.push(new FlintSpellingTermList(data));
00424             }
00425         } else {
00426             Assert(word.size() == 2);
00427             // For two letter words, we generate H and T terms for the
00428             // transposed form so that we can produce good spelling
00429             // suggestions.
00430             // AB -> BA
00431             buf[0] = 'H';
00432             buf[1] = word[1];
00433             buf[2] = word[0];
00434             if (get_exact_entry(string(buf), data))
00435                 pq.push(new FlintSpellingTermList(data));
00436             buf[0] = 'T';
00437             if (get_exact_entry(string(buf), data))
00438                 pq.push(new FlintSpellingTermList(data));
00439         }
00440 
00441         if (pq.empty()) return NULL;
00442 
00443         // Build up an OrTermList tree by combine leaves and/or branches in
00444         // pairs.  The tree is balanced by the approximated sizes of the leaf
00445         // FlintSpellingTermList objects - the way the tree is built are very
00446         // similar to how an optimal Huffman code is often constructed.
00447         //
00448         // Balancing the tree like this should tend to minimise the amount of
00449         // work done.
00450         while (pq.size() > 1) {
00451             // Build the tree such that left is always >= right so that
00452             // OrTermList can rely on this when trying to minimise work.
00453             TermList * termlist = pq.top();
00454             pq.pop();
00455 
00456             termlist = new OrTermList(pq.top(), termlist);
00457             pq.pop();
00458             pq.push(termlist);
00459         }
00460 
00461         return pq.top();
00462     } catch (...) {
00463         // Make sure we delete all the TermList objects to avoid leaking
00464         // memory.
00465         while (!pq.empty()) {
00466             delete pq.top();
00467             pq.pop();
00468         }
00469         throw;
00470     }
00471 }
00472 
00473 Xapian::doccount
00474 FlintSpellingTable::get_word_frequency(const string & word) const
00475 {
00476     map<string, Xapian::termcount>::const_iterator i;
00477     i = wordfreq_changes.find(word);
00478     if (i != wordfreq_changes.end()) {
00479         // Modified frequency for word:
00480         return i->second;
00481     }
00482 
00483     string key = "W" + word;
00484     string data;
00485     if (get_exact_entry(key, data)) {
00486         // Word "word" already exists.
00487         Xapian::termcount freq;
00488         const char *p = data.data();
00489         if (!unpack_uint_last(&p, p + data.size(), &freq)) {
00490             throw Xapian::DatabaseCorruptError("Bad spelling word freq");
00491         }
00492         return freq;
00493     }
00494 
00495     return 0;
00496 }
00497 
00499 
00500 Xapian::termcount
00501 FlintSpellingTermList::get_approx_size() const
00502 {
00503     // This is only used to decide how to build a OR-tree of TermList objects
00504     // so we just need to return "sizes" which are ordered roughly correctly.
00505     return data.size();
00506 }
00507 
00508 std::string
00509 FlintSpellingTermList::get_termname() const
00510 {
00511     return current_term;
00512 }
00513 
00514 Xapian::termcount
00515 FlintSpellingTermList::get_wdf() const
00516 {
00517     return 1;
00518 }
00519 
00520 Xapian::doccount
00521 FlintSpellingTermList::get_termfreq() const
00522 {
00523     return 1;
00524 }
00525 
00526 Xapian::termcount
00527 FlintSpellingTermList::get_collection_freq() const
00528 {
00529     return 1;
00530 }
00531 
00532 TermList *
00533 FlintSpellingTermList::next()
00534 {
00535     if (p == data.size()) {
00536         p = 0;
00537         data.resize(0);
00538         return NULL;
00539     }
00540     if (!current_term.empty()) {
00541         if (p == data.size())
00542             throw Xapian::DatabaseCorruptError("Bad spelling termlist");
00543         current_term.resize(byte(data[p++]) ^ MAGIC_XOR_VALUE);
00544     }
00545     size_t add;
00546     if (p == data.size() ||
00547         (add = byte(data[p]) ^ MAGIC_XOR_VALUE) >= data.size() - p)
00548         throw Xapian::DatabaseCorruptError("Bad spelling termlist");
00549     current_term.append(data.data() + p + 1, add);
00550     p += add + 1;
00551     return NULL;
00552 }
00553 
00554 bool
00555 FlintSpellingTermList::at_end() const
00556 {
00557     return data.empty();
00558 }
00559 
00560 Xapian::termcount
00561 FlintSpellingTermList::positionlist_count() const
00562 {
00563     throw Xapian::UnimplementedError("FlintSpellingTermList::positionlist_count() not implemented");
00564 }
00565 
00566 Xapian::PositionIterator
00567 FlintSpellingTermList::positionlist_begin() const
00568 {
00569     throw Xapian::UnimplementedError("FlintSpellingTermList::positionlist_begin() not implemented");
00570 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.