00001
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <config.h>
00022
00023 #include <xapian/error.h>
00024 #include <xapian/types.h>
00025
00026 #include "expandweight.h"
00027 #include "flint_spelling.h"
00028 #include "flint_utils.h"
00029 #include "omassert.h"
00030 #include "ortermlist.h"
00031
00032 #include <algorithm>
00033 #include <map>
00034 #include <queue>
00035 #include <vector>
00036 #include <set>
00037 #include <string>
00038
00039 using namespace std;
00040
00041
00042
00043
00044
00045 #define MAGIC_XOR_VALUE 96
00046
00047 class PrefixCompressedStringItor {
00048 const unsigned char * p;
00049 size_t left;
00050 string current;
00051
00052 PrefixCompressedStringItor(const unsigned char * p_, size_t left_,
00053 const string ¤t_)
00054 : p(p_), left(left_), current(current_) { }
00055
00056 public:
00057 PrefixCompressedStringItor(const std::string & s)
00058 : p(reinterpret_cast<const unsigned char *>(s.data())),
00059 left(s.size()) {
00060 if (left) {
00061 operator++();
00062 } else {
00063 p = NULL;
00064 }
00065 }
00066
00067 const string & operator*() const {
00068 return current;
00069 }
00070
00071 PrefixCompressedStringItor operator++(int) {
00072 const unsigned char * old_p = p;
00073 size_t old_left = left;
00074 string old_current = current;
00075 operator++();
00076 return PrefixCompressedStringItor(old_p, old_left, old_current);
00077 }
00078
00079 PrefixCompressedStringItor & operator++() {
00080 if (left == 0) {
00081 p = NULL;
00082 } else {
00083 if (!current.empty()) {
00084 current.resize(*p++ ^ MAGIC_XOR_VALUE);
00085 --left;
00086 }
00087 size_t add;
00088 if (left == 0 || (add = *p ^ MAGIC_XOR_VALUE) >= left)
00089 throw Xapian::DatabaseCorruptError("Bad spelling data (too little left)");
00090 current.append(reinterpret_cast<const char *>(p + 1), add);
00091 p += add + 1;
00092 left -= add + 1;
00093 }
00094 return *this;
00095 }
00096
00097 bool at_end() const {
00098 return p == NULL;
00099 }
00100 };
00101
00102 class PrefixCompressedStringWriter {
00103 string current;
00104 string & out;
00105
00106 public:
00107 PrefixCompressedStringWriter(string & out_) : out(out_) { }
00108
00109 void append(const string & word) {
00110
00111
00112 if (!current.empty()) {
00113 size_t len = min(current.size(), word.size());
00114 size_t i;
00115 for (i = 0; i < len; ++i) {
00116 if (current[i] != word[i]) break;
00117 }
00118 out += char(i ^ MAGIC_XOR_VALUE);
00119 out += char((word.size() - i) ^ MAGIC_XOR_VALUE);
00120 out.append(word.data() + i, word.size() - i);
00121 } else {
00122 out += char(word.size() ^ MAGIC_XOR_VALUE);
00123 out += word;
00124 }
00125 current = word;
00126 }
00127 };
00128
00129 void
00130 FlintSpellingTable::merge_changes()
00131 {
00132 map<fragment, set<string> >::const_iterator i;
00133 for (i = termlist_deltas.begin(); i != termlist_deltas.end(); ++i) {
00134 string key = i->first;
00135 const set<string> & changes = i->second;
00136
00137 set<string>::const_iterator d = changes.begin();
00138 Assert(d != changes.end());
00139
00140 string updated;
00141 string current;
00142 PrefixCompressedStringWriter out(updated);
00143 if (get_exact_entry(key, current)) {
00144 PrefixCompressedStringItor in(current);
00145 updated.reserve(current.size());
00146 while (!in.at_end()) {
00147 const string & word = *in;
00148 if (word < *d) {
00149 out.append(word);
00150 ++in;
00151 } else if (word > *d) {
00152 out.append(*d++);
00153 if (d == changes.end()) break;
00154 continue;
00155 } else {
00156
00157
00158 ++in;
00159 ++d;
00160 }
00161 }
00162 if (!in.at_end()) {
00163
00164 while (!in.at_end()) {
00165 out.append(*in++);
00166 }
00167 }
00168 }
00169 while (d != changes.end()) {
00170 out.append(*d++);
00171 }
00172 if (!updated.empty()) {
00173 add(key, updated);
00174 } else {
00175 del(key);
00176 }
00177 }
00178 termlist_deltas.clear();
00179
00180 map<string, Xapian::termcount>::const_iterator j;
00181 for (j = wordfreq_changes.begin(); j != wordfreq_changes.end(); ++j) {
00182 string key = "W" + j->first;
00183 if (j->second) {
00184 add(key, pack_uint_last(j->second));
00185 } else {
00186 del(key);
00187 }
00188 }
00189 wordfreq_changes.clear();
00190 }
00191
00192 void
00193 FlintSpellingTable::add_fragment(fragment frag, const string & word)
00194 {
00195 map<fragment, set<string> >::iterator i = termlist_deltas.find(frag);
00196 if (i == termlist_deltas.end()) {
00197 i = termlist_deltas.insert(make_pair(frag, set<string>())).first;
00198 }
00199 i->second.insert(word);
00200 }
00201
00202 void
00203 FlintSpellingTable::add_word(const string & word, Xapian::termcount freqinc)
00204 {
00205 if (word.size() <= 1) return;
00206
00207 map<string, Xapian::termcount>::iterator i = wordfreq_changes.find(word);
00208 if (i != wordfreq_changes.end()) {
00209
00210 if (i->second) {
00211 i->second += freqinc;
00212 return;
00213 }
00214 } else {
00215 string key = "W" + word;
00216 string data;
00217 if (get_exact_entry(key, data)) {
00218
00219 Xapian::termcount freq;
00220 const char * p = data.data();
00221 if (!unpack_uint_last(&p, p + data.size(), &freq) || freq == 0) {
00222 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
00223 }
00224 wordfreq_changes[word] = freq + freqinc;
00225 return;
00226 }
00227 }
00228
00229
00230 if (i != wordfreq_changes.end()) {
00231 i->second = freqinc;
00232 } else {
00233 wordfreq_changes[word] = freqinc;
00234 }
00235
00236 fragment buf;
00237
00238 buf[0] = 'H';
00239 buf[1] = word[0];
00240 buf[2] = word[1];
00241 buf[3] = '\0';
00242 add_fragment(buf, word);
00243
00244
00245 buf[0] = 'T';
00246 buf[1] = word[word.size() - 2];
00247 buf[2] = word[word.size() - 1];
00248 buf[3] = '\0';
00249 add_fragment(buf, word);
00250
00251 if (word.size() <= 4) {
00252
00253
00254
00255
00256
00257
00258 buf[0] = 'B';
00259 buf[1] = word[0];
00260 buf[3] = '\0';
00261 add_fragment(buf, word);
00262 }
00263 if (word.size() > 2) {
00264
00265 buf[0] = 'M';
00266 for (size_t start = 0; start <= word.size() - 3; ++start) {
00267 memcpy(buf.data + 1, word.data() + start, 3);
00268 add_fragment(buf, word);
00269 }
00270 }
00271 }
00272
00273 void
00274 FlintSpellingTable::remove_fragment(fragment frag, const string & word)
00275 {
00276 map<fragment, set<string> >::iterator i = termlist_deltas.find(frag);
00277 if (i != termlist_deltas.end()) {
00278 i->second.erase(word);
00279 }
00280 }
00281
00282 void
00283 FlintSpellingTable::remove_word(const string & word, Xapian::termcount freqdec)
00284 {
00285 map<string, Xapian::termcount>::iterator i = wordfreq_changes.find(word);
00286 if (i != wordfreq_changes.end()) {
00287 if (i->second == 0) {
00288
00289 return;
00290 }
00291
00292 if (freqdec < i->second) {
00293 i->second -= freqdec;
00294 return;
00295 }
00296 }
00297
00298 {
00299 string key = "W" + word;
00300 string data;
00301 if (!get_exact_entry(key, data)) {
00302
00303 return;
00304 }
00305
00306 Xapian::termcount freq;
00307 const char *p = data.data();
00308 if (!unpack_uint_last(&p, p + data.size(), &freq)) {
00309 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
00310 }
00311 if (freqdec < freq) {
00312 wordfreq_changes[word] = freq - freqdec;
00313 return;
00314 }
00315 }
00316
00317
00318 wordfreq_changes[word] = 0;
00319
00320 fragment buf;
00321
00322 buf[0] = 'H';
00323 buf[1] = word[0];
00324 buf[2] = word[1];
00325 buf[3] = '\0';
00326 remove_fragment(buf, word);
00327
00328
00329 buf[0] = 'T';
00330 buf[1] = word[word.size() - 2];
00331 buf[2] = word[word.size() - 1];
00332 buf[3] = '\0';
00333 remove_fragment(buf, word);
00334
00335 if (word.size() <= 4) {
00336
00337 buf[0] = 'B';
00338 buf[1] = word[0];
00339 buf[3] = '\0';
00340 remove_fragment(buf, word);
00341 }
00342 if (word.size() > 2) {
00343
00344 buf[0] = 'M';
00345 for (size_t start = 0; start <= word.size() - 3; ++start) {
00346 memcpy(buf.data + 1, word.data() + start, 3);
00347 remove_fragment(buf, word);
00348 }
00349 }
00350 }
00351
00352 struct TermListGreaterApproxSize {
00353 bool operator()(const TermList *a, const TermList *b) {
00354 return a->get_approx_size() > b->get_approx_size();
00355 }
00356 };
00357
00358 TermList *
00359 FlintSpellingTable::open_termlist(const string & word)
00360 {
00361 if (word.size() <= 1) return NULL;
00362
00363
00364
00365 if (!wordfreq_changes.empty()) merge_changes();
00366
00367
00368
00369 priority_queue<TermList*, vector<TermList*>, TermListGreaterApproxSize> pq;
00370 try {
00371 string data;
00372 fragment buf;
00373
00374
00375 buf[0] = 'H';
00376 buf[1] = word[0];
00377 buf[2] = word[1];
00378 if (get_exact_entry(string(buf), data))
00379 pq.push(new FlintSpellingTermList(data));
00380
00381
00382 buf[0] = 'T';
00383 buf[1] = word[word.size() - 2];
00384 buf[2] = word[word.size() - 1];
00385 if (get_exact_entry(string(buf), data))
00386 pq.push(new FlintSpellingTermList(data));
00387
00388 if (word.size() <= 4) {
00389
00390
00391
00392
00393
00394 buf[0] = 'B';
00395 buf[1] = word[0];
00396 buf[3] = '\0';
00397 if (get_exact_entry(string(buf), data))
00398 pq.push(new FlintSpellingTermList(data));
00399 }
00400 if (word.size() > 2) {
00401
00402 buf[0] = 'M';
00403 for (size_t start = 0; start <= word.size() - 3; ++start) {
00404 memcpy(buf.data + 1, word.data() + start, 3);
00405 if (get_exact_entry(string(buf), data))
00406 pq.push(new FlintSpellingTermList(data));
00407 }
00408
00409 if (word.size() == 3) {
00410
00411
00412
00413
00414 buf[1] = word[1];
00415 buf[2] = word[0];
00416 if (get_exact_entry(string(buf), data))
00417 pq.push(new FlintSpellingTermList(data));
00418
00419 buf[1] = word[0];
00420 buf[2] = word[2];
00421 buf[3] = word[1];
00422 if (get_exact_entry(string(buf), data))
00423 pq.push(new FlintSpellingTermList(data));
00424 }
00425 } else {
00426 Assert(word.size() == 2);
00427
00428
00429
00430
00431 buf[0] = 'H';
00432 buf[1] = word[1];
00433 buf[2] = word[0];
00434 if (get_exact_entry(string(buf), data))
00435 pq.push(new FlintSpellingTermList(data));
00436 buf[0] = 'T';
00437 if (get_exact_entry(string(buf), data))
00438 pq.push(new FlintSpellingTermList(data));
00439 }
00440
00441 if (pq.empty()) return NULL;
00442
00443
00444
00445
00446
00447
00448
00449
00450 while (pq.size() > 1) {
00451
00452
00453 TermList * termlist = pq.top();
00454 pq.pop();
00455
00456 termlist = new OrTermList(pq.top(), termlist);
00457 pq.pop();
00458 pq.push(termlist);
00459 }
00460
00461 return pq.top();
00462 } catch (...) {
00463
00464
00465 while (!pq.empty()) {
00466 delete pq.top();
00467 pq.pop();
00468 }
00469 throw;
00470 }
00471 }
00472
00473 Xapian::doccount
00474 FlintSpellingTable::get_word_frequency(const string & word) const
00475 {
00476 map<string, Xapian::termcount>::const_iterator i;
00477 i = wordfreq_changes.find(word);
00478 if (i != wordfreq_changes.end()) {
00479
00480 return i->second;
00481 }
00482
00483 string key = "W" + word;
00484 string data;
00485 if (get_exact_entry(key, data)) {
00486
00487 Xapian::termcount freq;
00488 const char *p = data.data();
00489 if (!unpack_uint_last(&p, p + data.size(), &freq)) {
00490 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
00491 }
00492 return freq;
00493 }
00494
00495 return 0;
00496 }
00497
00499
00500 Xapian::termcount
00501 FlintSpellingTermList::get_approx_size() const
00502 {
00503
00504
00505 return data.size();
00506 }
00507
00508 std::string
00509 FlintSpellingTermList::get_termname() const
00510 {
00511 return current_term;
00512 }
00513
00514 Xapian::termcount
00515 FlintSpellingTermList::get_wdf() const
00516 {
00517 return 1;
00518 }
00519
00520 Xapian::doccount
00521 FlintSpellingTermList::get_termfreq() const
00522 {
00523 return 1;
00524 }
00525
00526 Xapian::termcount
00527 FlintSpellingTermList::get_collection_freq() const
00528 {
00529 return 1;
00530 }
00531
00532 TermList *
00533 FlintSpellingTermList::next()
00534 {
00535 if (p == data.size()) {
00536 p = 0;
00537 data.resize(0);
00538 return NULL;
00539 }
00540 if (!current_term.empty()) {
00541 if (p == data.size())
00542 throw Xapian::DatabaseCorruptError("Bad spelling termlist");
00543 current_term.resize(byte(data[p++]) ^ MAGIC_XOR_VALUE);
00544 }
00545 size_t add;
00546 if (p == data.size() ||
00547 (add = byte(data[p]) ^ MAGIC_XOR_VALUE) >= data.size() - p)
00548 throw Xapian::DatabaseCorruptError("Bad spelling termlist");
00549 current_term.append(data.data() + p + 1, add);
00550 p += add + 1;
00551 return NULL;
00552 }
00553
00554 bool
00555 FlintSpellingTermList::at_end() const
00556 {
00557 return data.empty();
00558 }
00559
00560 Xapian::termcount
00561 FlintSpellingTermList::positionlist_count() const
00562 {
00563 throw Xapian::UnimplementedError("FlintSpellingTermList::positionlist_count() not implemented");
00564 }
00565
00566 Xapian::PositionIterator
00567 FlintSpellingTermList::positionlist_begin() const
00568 {
00569 throw Xapian::UnimplementedError("FlintSpellingTermList::positionlist_begin() not implemented");
00570 }