00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026 #include <config.h>
00027
00028 #include "multimatch.h"
00029 #include "submatch.h"
00030 #include "localmatch.h"
00031 #include "emptysubmatch.h"
00032 #include "omdebug.h"
00033 #include "omenquireinternal.h"
00034
00035 #include "emptypostlist.h"
00036 #include "branchpostlist.h"
00037 #include "leafpostlist.h"
00038 #include "mergepostlist.h"
00039
00040 #include "document.h"
00041 #include "omqueryinternal.h"
00042
00043 #include "submatch.h"
00044 #include "stats.h"
00045
00046 #include "msetcmp.h"
00047
00048 #include <xapian/errorhandler.h>
00049 #include <xapian/version.h>
00050
00051 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00052 #include "remotesubmatch.h"
00053 #include "remote-database.h"
00054 #endif
00055
00056 #include <algorithm>
00057 #include <cfloat>
00058 #include <queue>
00059 #include <vector>
00060 #include <map>
00061 #include <set>
00062
00063 using namespace std;
00064
00065 const Xapian::Enquire::Internal::sort_setting REL =
00066 Xapian::Enquire::Internal::REL;
00067 const Xapian::Enquire::Internal::sort_setting REL_VAL =
00068 Xapian::Enquire::Internal::REL_VAL;
00069 const Xapian::Enquire::Internal::sort_setting VAL =
00070 Xapian::Enquire::Internal::VAL;
00071 #if 0 // VAL_REL isn't currently used which causes a warning with SGI CC.
00072 const Xapian::Enquire::Internal::sort_setting VAL_REL =
00073 Xapian::Enquire::Internal::VAL_REL;
00074 #endif
00075
00083 static void
00084 split_rset_by_db(const Xapian::RSet * rset,
00085 Xapian::doccount number_of_subdbs,
00086 vector<Xapian::RSet> & subrsets)
00087 {
00088 DEBUGCALL_STATIC(MATCH, void, "split_rset_by_db",
00089 (rset ? *rset : Xapian::RSet()) <<
00090 ", " << number_of_subdbs <<
00091 ", [subrsets(size=" << subrsets.size() << "]");
00092 if (rset) {
00093 if (number_of_subdbs == 1) {
00094
00095 subrsets.push_back(*rset);
00096 } else {
00097
00098
00099 subrsets.reserve(number_of_subdbs);
00100 for (size_t i = 0; i < number_of_subdbs; ++i) {
00101 subrsets.push_back(Xapian::RSet());
00102 }
00103
00104 const set<Xapian::docid> & items = rset->internal->get_items();
00105 set<Xapian::docid>::const_iterator j;
00106 for (j = items.begin(); j != items.end(); ++j) {
00107 Xapian::doccount local_docid = (*j - 1) / number_of_subdbs + 1;
00108 Xapian::doccount subdatabase = (*j - 1) % number_of_subdbs;
00109 subrsets[subdatabase].add_document(local_docid);
00110 }
00111 }
00112 } else {
00113
00114 subrsets.resize(number_of_subdbs);
00115 }
00116 Assert(subrsets.size() == number_of_subdbs);
00117 }
00118
00137 static void
00138 prepare_sub_matches(std::vector<Xapian::Internal::RefCntPtr<SubMatch> > & leaves,
00139 Xapian::ErrorHandler * errorhandler,
00140 Stats & stats)
00141 {
00142 DEBUGCALL_STATIC(MATCH, void, "prepare_sub_matches",
00143 "[leaves(size=" << leaves.size() << ")], " <<
00144 errorhandler << ", " << stats);
00145
00146 vector<bool> prepared;
00147 prepared.resize(leaves.size(), false);
00148 size_t unprepared = leaves.size();
00149 bool nowait = true;
00150 while (unprepared) {
00151 for (size_t leaf = 0; leaf < leaves.size(); ++leaf) {
00152 if (prepared[leaf]) continue;
00153 try {
00154 if (leaves[leaf]->prepare_match(nowait, stats)) {
00155 prepared[leaf] = true;
00156 --unprepared;
00157 }
00158 } catch (Xapian::Error & e) {
00159 if (!errorhandler) throw;
00160
00161 DEBUGLINE(EXCEPTION, "Calling error handler for prepare_match() on a SubMatch.");
00162 (*errorhandler)(e);
00163
00164 leaves[leaf] = new EmptySubMatch();
00165 prepared[leaf] = true;
00166 --unprepared;
00167 }
00168 }
00169
00170
00171 nowait = false;
00172 }
00173 }
00174
00176
00178 MultiMatch::MultiMatch(const Xapian::Database &db_,
00179 const Xapian::Query::Internal * query_,
00180 Xapian::termcount qlen,
00181 const Xapian::RSet * omrset,
00182 Xapian::valueno collapse_key_,
00183 int percent_cutoff_, Xapian::weight weight_cutoff_,
00184 Xapian::Enquire::docid_order order_,
00185 Xapian::valueno sort_key_,
00186 Xapian::Enquire::Internal::sort_setting sort_by_,
00187 bool sort_value_forward_,
00188 const Xapian::Sorter * sorter_,
00189 Xapian::ErrorHandler * errorhandler_,
00190 Stats & stats,
00191 const Xapian::Weight * weight_)
00192 : db(db_), query(query_),
00193 collapse_key(collapse_key_), percent_cutoff(percent_cutoff_),
00194 weight_cutoff(weight_cutoff_), order(order_),
00195 sort_key(sort_key_), sort_by(sort_by_),
00196 sort_value_forward(sort_value_forward_), sorter(sorter_),
00197 errorhandler(errorhandler_), weight(weight_),
00198 is_remote(db.internal.size())
00199 {
00200 DEBUGCALL(MATCH, void, "MultiMatch", db_ << ", " << query_ << ", " <<
00201 qlen << ", " << (omrset ? *omrset : Xapian::RSet()) << ", " <<
00202 collapse_key_ << ", " <<
00203 percent_cutoff_ << ", " << weight_cutoff_ << ", " <<
00204 int(order_) << ", " << sort_key_ << ", " <<
00205 int(sort_by_) << ", " << sort_value_forward_ << ", " <<
00206 sorter_ << ", " <<
00207 errorhandler_ << ", " << stats << ", [weight_]");
00208
00209 if (!query) return;
00210 query->validate_query();
00211
00212 Xapian::doccount number_of_subdbs = db.internal.size();
00213 vector<Xapian::RSet> subrsets;
00214 split_rset_by_db(omrset, number_of_subdbs, subrsets);
00215
00216 for (size_t i = 0; i != number_of_subdbs; ++i) {
00217 Xapian::Database::Internal *subdb = db.internal[i].get();
00218 Assert(subdb);
00219 Xapian::Internal::RefCntPtr<SubMatch> smatch;
00220 try {
00221
00222 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00223 RemoteDatabase *rem_db = subdb->as_remotedatabase();
00224 if (rem_db) {
00225 is_remote[i] = true;
00226 rem_db->set_query(query, qlen, collapse_key, order, sort_key,
00227 sort_by, sort_value_forward, percent_cutoff,
00228 weight_cutoff, weight, subrsets[i]);
00229 bool decreasing_relevance =
00230 (sort_by == REL || sort_by == REL_VAL);
00231 smatch = new RemoteSubMatch(rem_db, decreasing_relevance);
00232 } else {
00233 #endif
00234 smatch = new LocalSubMatch(subdb, query, qlen, subrsets[i], weight);
00235 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00236 }
00237 #endif
00238 } catch (Xapian::Error & e) {
00239 if (!errorhandler) throw;
00240 DEBUGLINE(EXCEPTION, "Calling error handler for creation of a SubMatch from a database and query.");
00241 (*errorhandler)(e);
00242
00243 smatch = new EmptySubMatch;
00244 }
00245 leaves.push_back(smatch);
00246 }
00247
00248 prepare_sub_matches(leaves, errorhandler, stats);
00249 }
00250
00251 string
00252 MultiMatch::get_collapse_key(PostList *pl, Xapian::docid did,
00253 Xapian::valueno keyno, Xapian::Internal::RefCntPtr<Xapian::Document::Internal> &doc)
00254 {
00255 DEBUGCALL(MATCH, string, "MultiMatch::get_collapse_key", pl << ", " << did << ", " << keyno << ", [doc]");
00256 const string *key = pl->get_collapse_key();
00257 if (key) RETURN(*key);
00258 if (doc.get() == 0) {
00259 unsigned int multiplier = db.internal.size();
00260 Assert(multiplier != 0);
00261 Xapian::doccount n = (did - 1) % multiplier;
00262 Xapian::docid m = (did - 1) / multiplier + 1;
00263
00264 doc = db.internal[n]->open_document(m, true);
00265 }
00266 RETURN(doc->get_value(keyno));
00267 }
00268
00269 Xapian::weight
00270 MultiMatch::getorrecalc_maxweight(PostList *pl)
00271 {
00272 DEBUGCALL(MATCH, Xapian::weight, "MultiMatch::getorrecalc_maxweight", pl);
00273 Xapian::weight wt;
00274 if (recalculate_w_max) {
00275 DEBUGLINE(MATCH, "recalculating max weight");
00276 wt = pl->recalc_maxweight();
00277 recalculate_w_max = false;
00278 } else {
00279 wt = pl->get_maxweight();
00280 AssertEqDoubleParanoid(wt, pl->recalc_maxweight());
00281 }
00282 DEBUGLINE(MATCH, "max possible doc weight = " << wt);
00283 RETURN(wt);
00284 }
00285
00286 void
00287 MultiMatch::get_mset(Xapian::doccount first, Xapian::doccount maxitems,
00288 Xapian::doccount check_at_least,
00289 Xapian::MSet & mset,
00290 const Stats & stats,
00291 const Xapian::MatchDecider *mdecider,
00292 const Xapian::MatchDecider *matchspy)
00293 {
00294 DEBUGCALL(MATCH, void, "MultiMatch::get_mset", first << ", " << maxitems
00295 << ", " << check_at_least << ", ...");
00296 if (check_at_least < maxitems) check_at_least = maxitems;
00297
00298 if (!query) {
00299 mset = Xapian::MSet();
00300 return;
00301 }
00302
00303 Assert(!leaves.empty());
00304
00305
00306
00307 if (leaves.size() == 1 && is_remote[0]) {
00308 RemoteSubMatch * rem_match;
00309 rem_match = static_cast<RemoteSubMatch*>(leaves[0].get());
00310 rem_match->start_match(first, maxitems, check_at_least, stats);
00311 rem_match->get_mset(mset);
00312 return;
00313 }
00314
00315
00316 {
00317 vector<Xapian::Internal::RefCntPtr<SubMatch> >::iterator leaf;
00318 for (leaf = leaves.begin(); leaf != leaves.end(); ++leaf) {
00319 try {
00320 (*leaf)->start_match(0, first + maxitems,
00321 first + check_at_least, stats);
00322 } catch (Xapian::Error & e) {
00323 if (!errorhandler) throw;
00324 DEBUGLINE(EXCEPTION, "Calling error handler for "
00325 "start_match() on a SubMatch.");
00326 (*errorhandler)(e);
00327
00328 *leaf = Xapian::Internal::RefCntPtr<SubMatch>(new EmptySubMatch());
00329 }
00330 }
00331 }
00332
00333
00334 vector<PostList *> postlists;
00335 map<string, Xapian::MSet::Internal::TermFreqAndWeight> termfreqandwts;
00336 map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts_ptr;
00337 termfreqandwts_ptr = &termfreqandwts;
00338
00339
00340
00341
00342
00343 Xapian::doccount definite_matches_not_seen = 0;
00344 for (size_t i = 0; i != leaves.size(); ++i) {
00345 PostList *pl;
00346 try {
00347 pl = leaves[i]->get_postlist_and_term_info(this,
00348 termfreqandwts_ptr);
00349 if (termfreqandwts_ptr && !termfreqandwts.empty())
00350 termfreqandwts_ptr = NULL;
00351 if (is_remote[i]) {
00352 if (pl->get_termfreq_min() > first + maxitems) {
00353 DEBUGLINE(MATCH, "Found " <<
00354 pl->get_termfreq_min() - (first + maxitems)
00355 << " definite matches in remote "
00356 "submatch which aren't passed to local "
00357 "match");
00358 definite_matches_not_seen += pl->get_termfreq_min();
00359 definite_matches_not_seen -= first + maxitems;
00360 }
00361 }
00362 } catch (Xapian::Error & e) {
00363 if (!errorhandler) throw;
00364 DEBUGLINE(EXCEPTION, "Calling error handler for "
00365 "get_term_info() on a SubMatch.");
00366 (*errorhandler)(e);
00367
00368
00369 leaves[i] = new EmptySubMatch();
00370 pl = new EmptyPostList;
00371 }
00372 postlists.push_back(pl);
00373 }
00374 Assert(!postlists.empty());
00375
00376
00377 PostList *pl;
00378 if (postlists.size() == 1) {
00379 pl = postlists.front();
00380 } else {
00381 pl = new MergePostList(postlists, this, errorhandler);
00382 }
00383
00384 DEBUGLINE(MATCH, "pl = (" << pl->get_description() << ")");
00385
00386
00387 Xapian::doccount docs_matched = 0;
00388 Xapian::weight greatest_wt = 0;
00389 vector<Xapian::Internal::MSetItem> items;
00390
00391
00392 const Xapian::weight max_weight = pl->recalc_maxweight();
00393
00394 DEBUGLINE(MATCH, "pl = (" << pl->get_description() << ")");
00395 recalculate_w_max = false;
00396
00397 Xapian::doccount matches_upper_bound = pl->get_termfreq_max();
00398 Xapian::doccount matches_lower_bound = 0;
00399 Xapian::doccount matches_estimated = pl->get_termfreq_est();
00400
00401 if (mdecider == NULL && matchspy == NULL) {
00402
00403
00404
00405 matches_lower_bound = pl->get_termfreq_min();
00406 }
00407
00408
00409
00410 if (check_at_least == 0) {
00411 delete pl;
00412 if (collapse_key != Xapian::BAD_VALUENO) {
00413
00414
00415 if (matches_lower_bound > 1) matches_lower_bound = 1;
00416 }
00417
00418 mset = Xapian::MSet(new Xapian::MSet::Internal(
00419 first,
00420 matches_upper_bound,
00421 matches_lower_bound,
00422 matches_estimated,
00423 max_weight, greatest_wt, items,
00424 termfreqandwts,
00425 0));
00426 return;
00427 }
00428
00429
00430
00431
00432 Xapian::doccount duplicates_found = 0;
00433 Xapian::doccount documents_considered = 0;
00434
00435
00436 Xapian::doccount decider_considered = 0;
00437
00438 Xapian::doccount decider_denied = 0;
00439
00440
00441
00442
00443 Xapian::doccount null_collapse_values = 0;
00444
00445
00446
00447 Xapian::doccount max_msize = first + maxitems;
00448 items.reserve(max_msize + 1);
00449
00450
00451
00452 Xapian::Internal::MSetItem min_item(0.0, 0);
00453
00454
00455 Xapian::weight min_weight = weight_cutoff;
00456
00457
00458 Xapian::weight percent_cutoff_factor = percent_cutoff / 100.0;
00459
00460
00461 percent_cutoff_factor -= DBL_EPSILON;
00462
00463
00464 map<string, pair<Xapian::Internal::MSetItem,Xapian::weight> > collapse_tab;
00465
00467 bool sort_forward = (order != Xapian::Enquire::DESCENDING);
00468 MSetCmp mcmp(get_msetcmp_function(sort_by, sort_forward, sort_value_forward));
00469
00470
00471
00472
00473
00474
00475
00476
00477
00478
00479
00480
00481
00482
00483 bool is_heap = false;
00484
00485 while (true) {
00486 if (rare(recalculate_w_max)) {
00487 if (min_weight > 0.0) {
00488 if (rare(getorrecalc_maxweight(pl) < min_weight)) {
00489 DEBUGLINE(MATCH, "*** TERMINATING EARLY (1)");
00490 break;
00491 }
00492 }
00493 }
00494
00495 if (rare(next_handling_prune(pl, min_weight, this))) {
00496 DEBUGLINE(MATCH, "*** REPLACING ROOT");
00497
00498 if (min_weight > 0.0) {
00499
00500
00501
00502 if (rare(getorrecalc_maxweight(pl) < min_weight)) {
00503 DEBUGLINE(MATCH, "*** TERMINATING EARLY (2)");
00504 break;
00505 }
00506 }
00507 }
00508
00509 if (rare(pl->at_end())) {
00510 DEBUGLINE(MATCH, "Reached end of potential matches");
00511 break;
00512 }
00513
00514
00515
00516
00517 Xapian::weight wt = 0.0;
00518 bool calculated_weight = false;
00519 if (sort_by != VAL || min_weight > 0.0) {
00520 wt = pl->get_weight();
00521 if (wt < min_weight) {
00522 DEBUGLINE(MATCH, "Rejecting potential match due to insufficient weight");
00523 continue;
00524 }
00525 calculated_weight = true;
00526 }
00527
00528 Xapian::Internal::RefCntPtr<Xapian::Document::Internal> doc;
00529 Xapian::docid did = pl->get_docid();
00530 DEBUGLINE(MATCH, "Candidate document id " << did << " wt " << wt);
00531 Xapian::Internal::MSetItem new_item(wt, did);
00532 if (sort_by != REL) {
00533 const unsigned int multiplier = db.internal.size();
00534 Assert(multiplier != 0);
00535 Xapian::doccount n = (new_item.did - 1) % multiplier;
00536 Xapian::docid m = (new_item.did - 1) / multiplier + 1;
00537 doc = db.internal[n]->open_document(m, true);
00538 if (sorter) {
00539 new_item.sort_key = (*sorter)(Xapian::Document(doc.get()));
00540 } else {
00541 new_item.sort_key = doc->get_value(sort_key);
00542 }
00543
00544
00545
00546
00547
00548 if (!mcmp(new_item, min_item)) {
00549 if (matchspy == NULL && mdecider == NULL &&
00550 collapse_key == Xapian::BAD_VALUENO) {
00551
00552
00553 DEBUGLINE(MATCH, "Making note of match item which sorts lower than min_item");
00554 ++docs_matched;
00555 continue;
00556 }
00557 if (docs_matched >= check_at_least) {
00558
00559 DEBUGLINE(MATCH, "Dropping candidate which sorts lower than min_item");
00560 continue;
00561 }
00562
00563
00564
00565 DEBUGLINE(MATCH, "Keeping candidate which sorts lower than min_item for further investigation");
00566 }
00567 }
00568
00569
00570 if (matchspy != NULL || mdecider != NULL) {
00571 const unsigned int multiplier = db.internal.size();
00572 Assert(multiplier != 0);
00573 Xapian::doccount n = (did - 1) % multiplier;
00574
00575
00576 if (!is_remote[n]) {
00577 if (doc.get() == 0) {
00578 Xapian::docid m = (did - 1) / multiplier + 1;
00579 doc = db.internal[n]->open_document(m, true);
00580 Assert(doc.get());
00581 }
00582 Xapian::Document mydoc(doc.get());
00583
00584 ++decider_considered;
00585 if (matchspy && !matchspy->operator()(mydoc)) {
00586 ++decider_denied;
00587 continue;
00588 }
00589 if (mdecider && !mdecider->operator()(mydoc)) {
00590 ++decider_denied;
00591 continue;
00592 }
00593 }
00594 }
00595
00596 if (!calculated_weight) {
00597
00598 wt = pl->get_weight();
00599 new_item.wt = wt;
00600 }
00601
00602 bool pushback = true;
00603 documents_considered++;
00604
00605
00606 if (collapse_key != Xapian::BAD_VALUENO) {
00607 new_item.collapse_key = get_collapse_key(pl, did, collapse_key,
00608 doc);
00609
00610
00611 if (new_item.collapse_key.empty()) {
00612 ++null_collapse_values;
00613 } else {
00614 map<string, pair<Xapian::Internal::MSetItem, Xapian::weight> >::iterator oldkey;
00615 oldkey = collapse_tab.find(new_item.collapse_key);
00616 if (oldkey == collapse_tab.end()) {
00617 DEBUGLINE(MATCH, "collapsem: new key: " <<
00618 new_item.collapse_key);
00619
00620 collapse_tab.insert(make_pair(new_item.collapse_key,
00621 make_pair(new_item, Xapian::weight(0))));
00622 } else {
00623 ++duplicates_found;
00624 Xapian::Internal::MSetItem &old_item = oldkey->second.first;
00625
00626 if (mcmp(old_item, new_item)) {
00627 DEBUGLINE(MATCH, "collapsem: better exists: " <<
00628 new_item.collapse_key);
00629
00630 ++old_item.collapse_count;
00631
00632 if (new_item.wt > oldkey->second.second) {
00633 oldkey->second.second = new_item.wt;
00634 }
00635 continue;
00636 }
00637
00638
00639 new_item.collapse_count = old_item.collapse_count + 1;
00640
00641
00642
00643 Assert(!items.empty());
00644
00645
00646
00647
00648
00649 Xapian::weight old_wt = old_item.wt;
00650 if (old_wt >= min_weight && mcmp(old_item, min_item)) {
00651
00652
00653 Xapian::docid olddid = old_item.did;
00654 vector<Xapian::Internal::MSetItem>::iterator i;
00655 for (i = items.begin(); i != items.end(); ++i) {
00656 if (i->did == olddid) {
00657 DEBUGLINE(MATCH, "collapsem: removing " <<
00658 olddid << ": " <<
00659 new_item.collapse_key);
00660
00661
00662
00663
00664
00665
00666 *i = new_item;
00667 pushback = false;
00668 is_heap = false;
00669 break;
00670 }
00671 }
00672 }
00673
00674
00675 oldkey->second = make_pair(new_item, old_wt);
00676 }
00677 }
00678 }
00679
00680
00681 if (pushback) {
00682 ++docs_matched;
00683 if (items.size() >= max_msize) {
00684 items.push_back(new_item);
00685 if (!is_heap) {
00686 is_heap = true;
00687 make_heap(items.begin(), items.end(), mcmp);
00688 } else {
00689 push_heap<vector<Xapian::Internal::MSetItem>::iterator,
00690 MSetCmp>(items.begin(), items.end(), mcmp);
00691 }
00692 pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
00693 MSetCmp>(items.begin(), items.end(), mcmp);
00694 items.pop_back();
00695
00696 min_item = items.front();
00697 if (sort_by == REL || sort_by == REL_VAL) {
00698 if (docs_matched >= check_at_least) {
00699 if (sort_by == REL) {
00700
00701
00702
00703 if (rare(max_weight == 0 && sort_forward)) {
00704
00705
00706
00707
00708
00709 if (leaves.size() == 1) break;
00710 }
00711 }
00712 if (min_item.wt > min_weight) {
00713 DEBUGLINE(MATCH, "Setting min_weight to " <<
00714 min_item.wt << " from " << min_weight);
00715 min_weight = min_item.wt;
00716 }
00717 }
00718 }
00719 if (rare(getorrecalc_maxweight(pl) < min_weight)) {
00720 DEBUGLINE(MATCH, "*** TERMINATING EARLY (3)");
00721 break;
00722 }
00723 } else {
00724 items.push_back(new_item);
00725 is_heap = false;
00726 if (sort_by == REL && items.size() == max_msize) {
00727 if (docs_matched >= check_at_least) {
00728
00729
00730
00731 if (rare(max_weight == 0 && sort_forward)) {
00732
00733
00734
00735
00736
00737 if (leaves.size() == 1) break;
00738 }
00739 }
00740 }
00741 }
00742 }
00743
00744
00745 if (wt > greatest_wt) {
00746 greatest_wt = wt;
00747 if (percent_cutoff) {
00748 Xapian::weight w = wt * percent_cutoff_factor;
00749 if (w > min_weight) {
00750 min_weight = w;
00751 if (!is_heap) {
00752 is_heap = true;
00753 make_heap<vector<Xapian::Internal::MSetItem>::iterator,
00754 MSetCmp>(items.begin(), items.end(), mcmp);
00755 }
00756 while (!items.empty() && items.front().wt < min_weight) {
00757 pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
00758 MSetCmp>(items.begin(), items.end(), mcmp);
00759 Assert(items.back().wt < min_weight);
00760 items.pop_back();
00761 }
00762 #ifdef XAPIAN_DEBUG_PARANOID
00763 vector<Xapian::Internal::MSetItem>::const_iterator i;
00764 for (i = items.begin(); i != items.end(); ++i) {
00765 Assert(i->wt >= min_weight);
00766 }
00767 #endif
00768 }
00769 }
00770 }
00771 }
00772
00773
00774 delete pl;
00775
00776 double percent_scale = 0;
00777 if (!items.empty() && greatest_wt > 0) {
00778
00779
00780 vector<Xapian::Internal::MSetItem>::const_iterator best;
00781 best = min_element(items.begin(), items.end(), mcmp);
00782
00783 unsigned int multiplier = db.internal.size();
00784 Assert(multiplier != 0);
00785 Xapian::doccount n = (best->did - 1) % multiplier;
00786
00787
00788
00789 if (is_remote[n]) {
00790 RemoteSubMatch * rem_match;
00791 rem_match = static_cast<RemoteSubMatch*>(leaves[n].get());
00792 percent_scale = rem_match->get_percent_factor();
00793 } else if (termfreqandwts.size() > 1) {
00794 Xapian::termcount matching_terms = 0;
00795 map<string,
00796 Xapian::MSet::Internal::TermFreqAndWeight>::const_iterator i;
00797
00798 Xapian::TermIterator docterms = db.termlist_begin(best->did);
00799 Xapian::TermIterator docterms_end = db.termlist_end(best->did);
00800 while (docterms != docterms_end) {
00801 i = termfreqandwts.find(*docterms);
00802 if (i != termfreqandwts.end()) {
00803 percent_scale += i->second.termweight;
00804 ++matching_terms;
00805 if (matching_terms == termfreqandwts.size()) break;
00806 }
00807 ++docterms;
00808 }
00809
00810 i = termfreqandwts.find("");
00811 if (i != termfreqandwts.end()) {
00812 percent_scale += i->second.termweight;
00813 ++matching_terms;
00814 }
00815 if (matching_terms < termfreqandwts.size()) {
00816
00817 double denom = 0;
00818 for (i = termfreqandwts.begin(); i != termfreqandwts.end(); ++i)
00819 denom += i->second.termweight;
00820
00821 DEBUGLINE(MATCH, "denom = " << denom << " percent_scale = " << percent_scale);
00822 Assert(percent_scale <= denom);
00823 denom *= greatest_wt;
00824 Assert(denom > 0);
00825 percent_scale /= denom;
00826 } else {
00827
00828 percent_scale = 1.0 / greatest_wt;
00829 }
00830 } else {
00831
00832
00833 percent_scale = 1.0 / greatest_wt;
00834 }
00835 Assert(percent_scale > 0);
00836 if (percent_cutoff) {
00837
00838
00839
00840
00841
00842 Xapian::weight min_wt = percent_cutoff_factor / percent_scale;
00843 if (!is_heap) {
00844 is_heap = true;
00845 make_heap<vector<Xapian::Internal::MSetItem>::iterator,
00846 MSetCmp>(items.begin(), items.end(), mcmp);
00847 }
00848 while (!items.empty() && items.front().wt < min_wt) {
00849 pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
00850 MSetCmp>(items.begin(), items.end(), mcmp);
00851 Assert(items.back().wt < min_wt);
00852 items.pop_back();
00853 }
00854 #ifdef XAPIAN_DEBUG_PARANOID
00855 vector<Xapian::Internal::MSetItem>::const_iterator j;
00856 for (j = items.begin(); j != items.end(); ++j) {
00857 Assert(j->wt >= min_wt);
00858 }
00859 #endif
00860 }
00861 percent_scale *= 100.0;
00862 }
00863
00864 DEBUGLINE(MATCH,
00865 "docs_matched = " << docs_matched <<
00866 ", definite_matches_not_seen = " << definite_matches_not_seen <<
00867 ", matches_lower_bound = " << matches_lower_bound <<
00868 ", matches_estimated = " << matches_estimated <<
00869 ", matches_upper_bound = " << matches_upper_bound);
00870
00871
00872
00873 docs_matched += definite_matches_not_seen;
00874
00875 if (items.size() < max_msize) {
00876
00877
00878 DEBUGLINE(MATCH, "items.size() = " << items.size() <<
00879 ", max_msize = " << max_msize << ", setting bounds equal");
00880 Assert(definite_matches_not_seen == 0);
00881 Assert(percent_cutoff || docs_matched == items.size());
00882 matches_lower_bound = matches_upper_bound = matches_estimated
00883 = items.size();
00884 } else if (docs_matched < check_at_least) {
00885
00886
00887 DEBUGLINE(MATCH, "Setting bounds equal");
00888 matches_lower_bound = matches_upper_bound = matches_estimated
00889 = docs_matched;
00890 } else {
00891 Assert(matches_estimated >= matches_lower_bound);
00892 Assert(matches_estimated <= matches_upper_bound);
00893
00894
00895
00896
00897 double estimate_scale = 1.0;
00898
00899 if (collapse_key != Xapian::BAD_VALUENO) {
00900
00901
00902
00903
00904 DEBUGLINE(MATCH, "Adjusting bounds due to collapse_key");
00905 matches_lower_bound = null_collapse_values + collapse_tab.size();
00906
00907
00908
00909
00910 if (documents_considered > 0) {
00911 double unique = double(documents_considered - duplicates_found);
00912 double unique_rate = unique / double(documents_considered);
00913 estimate_scale *= unique_rate;
00914 }
00915
00916
00917
00918 matches_upper_bound -= duplicates_found;
00919
00920 DEBUGLINE(MATCH, "matches_lower_bound=" << matches_lower_bound <<
00921 ", matches_estimated=" << matches_estimated <<
00922 "*" << estimate_scale <<
00923 ", matches_upper_bound=" << matches_upper_bound);
00924 }
00925
00926 if (matchspy || mdecider) {
00927 if (collapse_key == Xapian::BAD_VALUENO && !percent_cutoff) {
00928
00929
00930 matches_lower_bound = max(docs_matched, matches_lower_bound);
00931 }
00932
00933
00934
00935
00936 if (decider_considered > 0) {
00937 double accept = double(decider_considered - decider_denied);
00938 double accept_rate = accept / double(decider_considered);
00939 estimate_scale *= accept_rate;
00940 }
00941
00942
00943
00944
00945
00946 matches_upper_bound -= decider_denied;
00947 }
00948
00949 if (percent_cutoff) {
00950 estimate_scale *= (1.0 - percent_cutoff_factor);
00951
00952
00953
00954
00955
00956
00957
00958
00959
00960 matches_lower_bound = items.size();
00961
00962
00963 DEBUGLINE(MATCH, "Adjusted bounds due to percent_cutoff (" <<
00964 percent_cutoff << "): now have matches_estimated=" <<
00965 matches_estimated << ", matches_lower_bound=" <<
00966 matches_lower_bound);
00967 }
00968
00969 if (estimate_scale != 1.0) {
00970 matches_estimated =
00971 Xapian::doccount(matches_estimated * estimate_scale + 0.5);
00972 if (matches_estimated < matches_lower_bound)
00973 matches_estimated = matches_lower_bound;
00974 }
00975
00976 if (collapse_key != Xapian::BAD_VALUENO || matchspy || mdecider) {
00977 DEBUGLINE(MATCH, "Clamping estimate between bounds: "
00978 "matches_lower_bound = " << matches_lower_bound <<
00979 ", matches_estimated = " << matches_estimated <<
00980 ", matches_upper_bound = " << matches_upper_bound);
00981 Assert(matches_lower_bound <= matches_upper_bound);
00982 matches_estimated = max(matches_estimated, matches_lower_bound);
00983 matches_estimated = min(matches_estimated, matches_upper_bound);
00984 } else if (!percent_cutoff) {
00985 Assert(docs_matched <= matches_upper_bound);
00986 if (docs_matched > matches_lower_bound)
00987 matches_lower_bound = docs_matched;
00988 if (docs_matched > matches_estimated)
00989 matches_estimated = docs_matched;
00990 }
00991 }
00992
00993 DEBUGLINE(MATCH, items.size() << " items in potential mset");
00994
00995 if (first > 0) {
00996
00997 if (items.size() <= first) {
00998 items.clear();
00999 } else {
01000 DEBUGLINE(MATCH, "finding " << first << "th");
01001
01002
01003
01004
01005 vector<Xapian::Internal::MSetItem>::reverse_iterator nth;
01006 nth = items.rbegin() + first;
01007 nth_element(items.rbegin(), nth, items.rend(), mcmp);
01008
01009 items.erase(items.begin() + items.size() - first, items.end());
01010 }
01011 }
01012
01013 DEBUGLINE(MATCH, "sorting " << items.size() << " entries");
01014
01015
01016 sort(items.begin(), items.end(), mcmp);
01017
01018 if (!items.empty()) {
01019 DEBUGLINE(MATCH, "min weight in mset = " << items.back().wt);
01020 DEBUGLINE(MATCH, "max weight in mset = " << items[0].wt);
01021 }
01022
01023 Assert(matches_estimated >= matches_lower_bound);
01024 Assert(matches_estimated <= matches_upper_bound);
01025
01026
01027
01028
01029
01030
01031
01032
01033
01034 if (collapse_key != Xapian::BAD_VALUENO && !items.empty() &&
01035 !collapse_tab.empty()) {
01036
01037
01038 Xapian::weight min_wt = percent_cutoff_factor / (percent_scale / 100);
01039 vector<Xapian::Internal::MSetItem>::iterator i;
01040 for (i = items.begin(); i != items.end() && !collapse_tab.empty(); ++i) {
01041
01042 if ( !i->collapse_key.empty()) {
01043 map<string, pair<Xapian::Internal::MSetItem, Xapian::weight> >::iterator key;
01044 key = collapse_tab.find(i->collapse_key);
01045
01046
01047
01048
01049 Assert(key != collapse_tab.end());
01050
01051
01052
01053 if (key->second.second < min_wt)
01054 i->collapse_count = 0;
01055 else
01056 i->collapse_count = key->second.first.collapse_count;
01057
01058
01059 collapse_tab.erase(key);
01060 }
01061 }
01062 }
01063
01064 mset = Xapian::MSet(new Xapian::MSet::Internal(
01065 first,
01066 matches_upper_bound,
01067 matches_lower_bound,
01068 matches_estimated,
01069 max_weight, greatest_wt, items,
01070 termfreqandwts,
01071 percent_scale));
01072 }
01073
01074
01075
01076 void
01077 MultiMatch::recalc_maxweight()
01078 {
01079 DEBUGCALL(MATCH, void, "MultiMatch::recalc_maxweight", "");
01080 recalculate_w_max = true;
01081 }