00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include <config.h>
00025 #include <stdio.h>
00026
00027 #include "omdebug.h"
00028
00029 #include "emptypostlist.h"
00030 #include "expandweight.h"
00031 #include "inmemory_database.h"
00032 #include "inmemory_document.h"
00033 #include "inmemory_alltermslist.h"
00034 #include "utils.h"
00035
00036 #include <string>
00037 #include <vector>
00038 #include <map>
00039 #include <list>
00040
00041 #include <xapian/error.h>
00042 #include <xapian/valueiterator.h>
00043
00044 using std::make_pair;
00045
00046 inline void
00047 InMemoryTerm::add_posting(const InMemoryPosting & post)
00048 {
00049
00050 vector<InMemoryPosting>::iterator p;
00051 p = lower_bound(docs.begin(), docs.end(),
00052 post, InMemoryPostingLessThan());
00053 if (p == docs.end() || InMemoryPostingLessThan()(post, *p)) {
00054 docs.insert(p, post);
00055 } else if (!p->valid) {
00056 *p = post;
00057 } else {
00058 (*p).merge(post);
00059 }
00060 }
00061
00062 inline void
00063 InMemoryDoc::add_posting(const InMemoryTermEntry & post)
00064 {
00065
00066 vector<InMemoryTermEntry>::iterator p;
00067 p = lower_bound(terms.begin(), terms.end(),
00068 post, InMemoryTermEntryLessThan());
00069 if (p == terms.end() || InMemoryTermEntryLessThan()(post, *p)) {
00070 terms.insert(p, post);
00071 } else {
00072 (*p).merge(post);
00073 }
00074 }
00075
00077
00079
00080 InMemoryPostList::InMemoryPostList(Xapian::Internal::RefCntPtr<const InMemoryDatabase> db_,
00081 const InMemoryTerm & term)
00082 : pos(term.docs.begin()),
00083 end(term.docs.end()),
00084 termfreq(term.term_freq),
00085 started(false),
00086 db(db_)
00087 {
00088
00089 Assert(pos != end);
00090 while (pos != end && !pos->valid) ++pos;
00091 }
00092
00093 Xapian::doccount
00094 InMemoryPostList::get_termfreq() const
00095 {
00096 return termfreq;
00097 }
00098
00099 Xapian::docid
00100 InMemoryPostList::get_docid() const
00101 {
00102
00103 Assert(started);
00104 Assert(!at_end());
00105
00106 return (*pos).did;
00107 }
00108
00109 PostList *
00110 InMemoryPostList::next(Xapian::weight )
00111 {
00112 if (started) {
00113 Assert(!at_end());
00114 ++pos;
00115 while (pos != end && !pos->valid) ++pos;
00116 } else {
00117 started = true;
00118 }
00119 return NULL;
00120 }
00121
00122 PostList *
00123 InMemoryPostList::skip_to(Xapian::docid did, Xapian::weight w_min)
00124 {
00125
00126
00127
00128
00129
00130
00131
00132 started = true;
00133 Assert(!at_end());
00134 while (!at_end() && (*pos).did < did) {
00135 (void) next(w_min);
00136 }
00137 return NULL;
00138 }
00139
00140 bool
00141 InMemoryPostList::at_end() const
00142 {
00143 return (pos == end);
00144 }
00145
00146 string
00147 InMemoryPostList::get_description() const
00148 {
00149 return "InMemoryPostList" + om_tostring(termfreq);
00150 }
00151
00152 Xapian::doclength
00153 InMemoryPostList::get_doclength() const
00154 {
00155 return db->get_doclength(get_docid());
00156 }
00157
00158 PositionList *
00159 InMemoryPostList::read_position_list()
00160 {
00161 mypositions.set_data(pos->positions);
00162 return &mypositions;
00163 }
00164
00165 PositionList *
00166 InMemoryPostList::open_position_list() const
00167 {
00168 return new InMemoryPositionList(pos->positions);
00169 }
00170
00171 Xapian::termcount
00172 InMemoryPostList::get_wdf() const
00173 {
00174 return (*pos).wdf;
00175 }
00176
00178
00180
00181 InMemoryTermList::InMemoryTermList(Xapian::Internal::RefCntPtr<const InMemoryDatabase> db_,
00182 Xapian::docid did_,
00183 const InMemoryDoc & doc,
00184 Xapian::doclength len)
00185 : pos(doc.terms.begin()), end(doc.terms.end()), terms(doc.terms.size()),
00186 started(false), db(db_), did(did_)
00187 {
00188 DEBUGLINE(DB, "InMemoryTermList::InMemoryTermList(): " <<
00189 terms << " terms starting from " << pos->tname);
00190 document_length = len;
00191 }
00192
00193 Xapian::termcount
00194 InMemoryTermList::get_wdf() const
00195 {
00196 Assert(started);
00197 Assert(!at_end());
00198 return (*pos).wdf;
00199 }
00200
00201 Xapian::doccount
00202 InMemoryTermList::get_termfreq() const
00203 {
00204 Assert(started);
00205 Assert(!at_end());
00206
00207 return db->get_termfreq((*pos).tname);
00208 }
00209
00210 Xapian::termcount
00211 InMemoryTermList::get_approx_size() const
00212 {
00213 return terms;
00214 }
00215
00216 void
00217 InMemoryTermList::accumulate_stats(Xapian::Internal::ExpandStats & stats) const
00218 {
00219 Assert(started);
00220 Assert(!at_end());
00221 stats.accumulate(InMemoryTermList::get_wdf(), document_length,
00222 InMemoryTermList::get_termfreq(),
00223 db->get_doccount());
00224 }
00225
00226 string
00227 InMemoryTermList::get_termname() const
00228 {
00229 Assert(started);
00230 Assert(!at_end());
00231 return (*pos).tname;
00232 }
00233
00234 TermList *
00235 InMemoryTermList::next()
00236 {
00237 if (started) {
00238 Assert(!at_end());
00239 pos++;
00240 } else {
00241 started = true;
00242 }
00243 return NULL;
00244 }
00245
00246 bool
00247 InMemoryTermList::at_end() const
00248 {
00249 Assert(started);
00250 return (pos == end);
00251 }
00252
00253 Xapian::termcount
00254 InMemoryTermList::positionlist_count() const
00255 {
00256 return db->positionlist_count(did, (*pos).tname);
00257 }
00258
00259 Xapian::PositionIterator
00260 InMemoryTermList::positionlist_begin() const
00261 {
00262 return Xapian::PositionIterator(db->open_position_list(did, (*pos).tname));
00263 }
00264
00266
00268
00269 InMemoryAllDocsPostList::InMemoryAllDocsPostList(Xapian::Internal::RefCntPtr<const InMemoryDatabase> db_)
00270 : did(0), db(db_)
00271 {
00272 }
00273
00274 Xapian::doccount
00275 InMemoryAllDocsPostList::get_termfreq() const
00276 {
00277 return db->totdocs;
00278 }
00279
00280 Xapian::docid
00281 InMemoryAllDocsPostList::get_docid() const
00282 {
00283 Assert(did > 0);
00284 Assert(did <= db->termlists.size());
00285 Assert(db->termlists[did - 1].is_valid);
00286 return did;
00287 }
00288
00289 Xapian::doclength
00290 InMemoryAllDocsPostList::get_doclength() const
00291 {
00292 return db->get_doclength(did);
00293 }
00294
00295 Xapian::termcount
00296 InMemoryAllDocsPostList::get_wdf() const
00297 {
00298 return 1;
00299 }
00300
00301 PositionList *
00302 InMemoryAllDocsPostList::read_position_list()
00303 {
00304 throw Xapian::UnimplementedError("Can't open position list for all docs iterator");
00305 }
00306
00307 PositionList *
00308 InMemoryAllDocsPostList::open_position_list() const
00309 {
00310 throw Xapian::UnimplementedError("Can't open position list for all docs iterator");
00311 }
00312
00313 PostList *
00314 InMemoryAllDocsPostList::next(Xapian::weight )
00315 {
00316 Assert(!at_end());
00317 do {
00318 ++did;
00319 } while (did <= db->termlists.size() && !db->termlists[did - 1].is_valid);
00320 return NULL;
00321 }
00322
00323 PostList *
00324 InMemoryAllDocsPostList::skip_to(Xapian::docid did_, Xapian::weight )
00325 {
00326 Assert(!at_end());
00327 if (did <= did_) {
00328 did = did_;
00329 while (did <= db->termlists.size() && !db->termlists[did - 1].is_valid) {
00330 ++did;
00331 }
00332 }
00333 return NULL;
00334 }
00335
00336 bool
00337 InMemoryAllDocsPostList::at_end() const
00338 {
00339 return (did > db->termlists.size());
00340 }
00341
00342 string
00343 InMemoryAllDocsPostList::get_description() const
00344 {
00345 return "InMemoryAllDocsPostList" + om_tostring(did);
00346 }
00347
00349
00351
00352 InMemoryDatabase::InMemoryDatabase()
00353 : totdocs(0), totlen(0), positions_present(false)
00354 {
00355
00356 transaction_state = TRANSACTION_UNIMPLEMENTED;
00357 }
00358
00359 InMemoryDatabase::~InMemoryDatabase()
00360 {
00361 dtor_called();
00362 }
00363
00364 LeafPostList *
00365 InMemoryDatabase::open_post_list(const string & tname) const
00366 {
00367 if (tname.empty()) {
00368 if (termlists.empty())
00369 return new EmptyPostList();
00370 Xapian::Internal::RefCntPtr<const InMemoryDatabase> ptrtothis(this);
00371 return new InMemoryAllDocsPostList(ptrtothis);
00372 }
00373 map<string, InMemoryTerm>::const_iterator i = postlists.find(tname);
00374 if (i == postlists.end() || i->second.term_freq == 0)
00375 return new EmptyPostList();
00376
00377 Xapian::Internal::RefCntPtr<const InMemoryDatabase> ptrtothis(this);
00378 LeafPostList * pl = new InMemoryPostList(ptrtothis, i->second);
00379 Assert(!pl->at_end());
00380 return pl;
00381 }
00382
00383 bool
00384 InMemoryDatabase::doc_exists(Xapian::docid did) const
00385 {
00386 return (did > 0 && did <= termlists.size() && termlists[did - 1].is_valid);
00387 }
00388
00389 Xapian::doccount
00390 InMemoryDatabase::get_termfreq(const string & tname) const
00391 {
00392 map<string, InMemoryTerm>::const_iterator i = postlists.find(tname);
00393 if (i == postlists.end()) return 0;
00394 return i->second.term_freq;
00395 }
00396
00397 Xapian::termcount
00398 InMemoryDatabase::get_collection_freq(const string &tname) const
00399 {
00400 map<string, InMemoryTerm>::const_iterator i = postlists.find(tname);
00401 if (i == postlists.end()) return 0;
00402 return i->second.collection_freq;
00403 }
00404
00405 Xapian::doccount
00406 InMemoryDatabase::get_doccount() const
00407 {
00408 return totdocs;
00409 }
00410
00411 Xapian::docid
00412 InMemoryDatabase::get_lastdocid() const
00413 {
00414 return termlists.size();
00415 }
00416
00417 Xapian::doclength
00418 InMemoryDatabase::get_avlength() const
00419 {
00420 if (totdocs == 0) return 0;
00421 return Xapian::doclength(totlen) / totdocs;
00422 }
00423
00424 Xapian::doclength
00425 InMemoryDatabase::get_doclength(Xapian::docid did) const
00426 {
00427 if (!doc_exists(did)) {
00428 throw Xapian::DocNotFoundError(string("Docid ") + om_tostring(did) +
00429 string(" not found"));
00430 }
00431 return doclengths[did - 1];
00432 }
00433
00434 TermList *
00435 InMemoryDatabase::open_term_list(Xapian::docid did) const
00436 {
00437 if (did == 0) throw Xapian::InvalidArgumentError("Docid 0 invalid");
00438 if (!doc_exists(did)) {
00439
00440 throw Xapian::DocNotFoundError(string("Docid ") + om_tostring(did) +
00441 string(" not found"));
00442 }
00443 return new InMemoryTermList(Xapian::Internal::RefCntPtr<const InMemoryDatabase>(this), did,
00444 termlists[did - 1], get_doclength(did));
00445 }
00446
00447 Xapian::Document::Internal *
00448 InMemoryDatabase::open_document(Xapian::docid did, bool ) const
00449 {
00450
00451 if (did == 0) throw Xapian::InvalidArgumentError("Docid 0 invalid");
00452 if (!doc_exists(did)) {
00453
00454 throw Xapian::DocNotFoundError(string("Docid ") + om_tostring(did) +
00455 string(" not found"));
00456 }
00457 return new InMemoryDocument(this, did, doclists[did - 1],
00458 valuelists[did - 1]);
00459 }
00460
00461 std::string
00462 InMemoryDatabase::get_metadata(const std::string & key) const
00463 {
00464 map<string, string>::const_iterator i = metadata.find(key);
00465 if (i == metadata.end())
00466 return string();
00467 return i->second;
00468 }
00469
00470 void
00471 InMemoryDatabase::set_metadata(const std::string & key,
00472 const std::string & value)
00473 {
00474 if (!value.empty()) {
00475 metadata[key] = value;
00476 } else {
00477 metadata.erase(key);
00478 }
00479 }
00480
00481 Xapian::termcount
00482 InMemoryDatabase::positionlist_count(Xapian::docid did,
00483 const string & tname) const
00484 {
00485 if (!doc_exists(did)) {
00486 return 0;
00487 }
00488 const InMemoryDoc &doc = termlists[did-1];
00489
00490 vector<InMemoryTermEntry>::const_iterator i;
00491 for (i = doc.terms.begin(); i != doc.terms.end(); ++i) {
00492 if (i->tname == tname) {
00493 return i->positions.size();
00494 }
00495 }
00496 return 0;
00497 }
00498
00499 PositionList *
00500 InMemoryDatabase::open_position_list(Xapian::docid did,
00501 const string & tname) const
00502 {
00503 if (!doc_exists(did)) {
00504 throw Xapian::DocNotFoundError("Document id " + om_tostring(did) +
00505 " doesn't exist in inmemory database");
00506 }
00507 const InMemoryDoc &doc = termlists[did-1];
00508
00509 vector<InMemoryTermEntry>::const_iterator i;
00510 for (i = doc.terms.begin(); i != doc.terms.end(); ++i) {
00511 if (i->tname == tname) {
00512 return new InMemoryPositionList(i->positions);
00513 }
00514 }
00515 throw Xapian::RangeError("No positionlist for term in document.");
00516 }
00517
00518 void
00519 InMemoryDatabase::add_values(Xapian::docid did,
00520 const map<Xapian::valueno, string> &values_)
00521 {
00522 if (did > valuelists.size()) {
00523 valuelists.resize(did);
00524 }
00525 valuelists[did-1] = values_;
00526 }
00527
00528
00529 void
00530 InMemoryDatabase::flush()
00531 {
00532 }
00533
00534
00535 void
00536 InMemoryDatabase::cancel()
00537 {
00538 }
00539
00540 void
00541 InMemoryDatabase::delete_document(Xapian::docid did)
00542 {
00543 if (!doc_exists(did)) {
00544 throw Xapian::DocNotFoundError(string("Docid ") + om_tostring(did) +
00545 string(" not found"));
00546 }
00547 termlists[did-1].is_valid = false;
00548 doclists[did-1] = "";
00549 valuelists[did-1].clear();
00550 totlen -= doclengths[did-1];
00551 doclengths[did-1] = 0;
00552 totdocs--;
00553
00554
00555 if (totdocs == 0) positions_present = false;
00556
00557 vector<InMemoryTermEntry>::const_iterator i;
00558 for (i = termlists[did - 1].terms.begin();
00559 i != termlists[did - 1].terms.end();
00560 ++i) {
00561 map<string, InMemoryTerm>::iterator t = postlists.find(i->tname);
00562 Assert(t != postlists.end());
00563 t->second.collection_freq -= i->wdf;
00564 --t->second.term_freq;
00565 vector<InMemoryPosting>::iterator posting = t->second.docs.begin();
00566 while (posting != t->second.docs.end()) {
00567
00568
00569
00570 if (posting->did == did) posting->valid = false;
00571 ++posting;
00572 }
00573 }
00574 termlists[did-1].terms.clear();
00575 }
00576
00577 void
00578 InMemoryDatabase::replace_document(Xapian::docid did,
00579 const Xapian::Document & document)
00580 {
00581 DEBUGLINE(DB, "InMemoryDatabase::replace_document(): replacing doc "
00582 << did);
00583
00584 if (doc_exists(did)) {
00585 doclists[did - 1] = "";
00586 valuelists[did - 1].clear();
00587 totlen -= doclengths[did - 1];
00588 totdocs--;
00589 } else if (did > termlists.size()) {
00590 termlists.resize(did);
00591 termlists[did - 1].is_valid = true;
00592 doclengths.resize(did);
00593 doclists.resize(did);
00594 valuelists.resize(did);
00595 } else {
00596 termlists[did - 1].is_valid = true;
00597 }
00598
00599 vector<InMemoryTermEntry>::const_iterator i;
00600 for (i = termlists[did - 1].terms.begin();
00601 i != termlists[did - 1].terms.end();
00602 ++i) {
00603 map<string, InMemoryTerm>::iterator t = postlists.find(i->tname);
00604 Assert(t != postlists.end());
00605 t->second.collection_freq -= i->wdf;
00606 --t->second.term_freq;
00607 vector<InMemoryPosting>::iterator posting = t->second.docs.begin();
00608 while (posting != t->second.docs.end()) {
00609
00610
00611
00612 if (posting->did == did) posting->valid = false;
00613 ++posting;
00614 }
00615 }
00616
00617 doclengths[did - 1] = 0;
00618 doclists[did - 1] = document.get_data();
00619
00620 finish_add_doc(did, document);
00621 }
00622
00623 Xapian::docid
00624 InMemoryDatabase::add_document(const Xapian::Document & document)
00625 {
00626 Xapian::docid did = make_doc(document.get_data());
00627
00628 DEBUGLINE(DB, "InMemoryDatabase::add_document(): adding doc " << did);
00629
00630 finish_add_doc(did, document);
00631
00632 return did;
00633 }
00634
00635 void
00636 InMemoryDatabase::finish_add_doc(Xapian::docid did, const Xapian::Document &document)
00637 {
00638 {
00639 map<Xapian::valueno, string> values;
00640 Xapian::ValueIterator k = document.values_begin();
00641 Xapian::ValueIterator k_end = document.values_end();
00642 for ( ; k != k_end; ++k) {
00643 values.insert(make_pair(k.get_valueno(), *k));
00644 DEBUGLINE(DB, "InMemoryDatabase::finish_add_doc(): adding value "
00645 << k.get_valueno() << " -> " << *k);
00646 }
00647 add_values(did, values);
00648 }
00649
00650 InMemoryDoc doc(true);
00651 Xapian::TermIterator i = document.termlist_begin();
00652 Xapian::TermIterator i_end = document.termlist_end();
00653 for ( ; i != i_end; ++i) {
00654 make_term(*i);
00655
00656 DEBUGLINE(DB, "InMemoryDatabase::finish_add_doc(): adding term "
00657 << *i);
00658 Xapian::PositionIterator j = i.positionlist_begin();
00659 Xapian::PositionIterator j_end = i.positionlist_end();
00660
00661 if (j == j_end) {
00662
00663 make_posting(&doc, *i, did, 0, i.get_wdf(), false);
00664 } else {
00665 positions_present = true;
00666 for ( ; j != j_end; ++j) {
00667 make_posting(&doc, *i, did, *j, i.get_wdf());
00668 }
00669 }
00670
00671 Assert(did > 0 && did <= doclengths.size());
00672 doclengths[did - 1] += i.get_wdf();
00673 totlen += i.get_wdf();
00674 postlists[*i].collection_freq += i.get_wdf();
00675 ++postlists[*i].term_freq;
00676 }
00677 swap(termlists[did - 1], doc);
00678
00679 totdocs++;
00680 }
00681
00682 void
00683 InMemoryDatabase::make_term(const string & tname)
00684 {
00685 postlists[tname];
00686 }
00687
00688 Xapian::docid
00689 InMemoryDatabase::make_doc(const string & docdata)
00690 {
00691 termlists.push_back(InMemoryDoc(true));
00692 doclengths.push_back(0);
00693 doclists.push_back(docdata);
00694
00695 AssertEqParanoid(termlists.size(), doclengths.size());
00696
00697 return termlists.size();
00698 }
00699
00700 void InMemoryDatabase::make_posting(InMemoryDoc * doc,
00701 const string & tname,
00702 Xapian::docid did,
00703 Xapian::termpos position,
00704 Xapian::termcount wdf,
00705 bool use_position)
00706 {
00707 Assert(doc);
00708 Assert(postlists.find(tname) != postlists.end());
00709 Assert(did > 0 && did <= termlists.size());
00710 Assert(did > 0 && did <= doclengths.size());
00711 Assert(doc_exists(did));
00712
00713
00714 InMemoryPosting posting;
00715 posting.did = did;
00716 if (use_position) {
00717 posting.positions.push_back(position);
00718 }
00719 posting.wdf = wdf;
00720 posting.valid = true;
00721
00722
00723 postlists[tname].add_posting(posting);
00724
00725
00726 InMemoryTermEntry termentry;
00727 termentry.tname = tname;
00728 if (use_position) {
00729 termentry.positions.push_back(position);
00730 }
00731 termentry.wdf = wdf;
00732
00733
00734 doc->add_posting(termentry);
00735 }
00736
00737 bool
00738 InMemoryDatabase::term_exists(const string & tname) const
00739 {
00740 Assert(!tname.empty());
00741 map<string, InMemoryTerm>::const_iterator i = postlists.find(tname);
00742 if (i == postlists.end()) return false;
00743 return (i->second.term_freq != 0);
00744 }
00745
00746 bool
00747 InMemoryDatabase::has_positions() const
00748 {
00749 return positions_present;
00750 }
00751
00752 TermList *
00753 InMemoryDatabase::open_allterms(const string & prefix) const
00754 {
00755 return new InMemoryAllTermsList(&postlists,
00756 Xapian::Internal::RefCntPtr<const InMemoryDatabase>(this),
00757 prefix);
00758 }