matcher/multimatch.cc

Go to the documentation of this file.
00001 /* multimatch.cc
00002  *
00003  * Copyright 1999,2000,2001 BrightStation PLC
00004  * Copyright 2001,2002 Ananova Ltd
00005  * Copyright 2002,2003,2004,2005,2006,2007,2008 Olly Betts
00006  * Copyright 2003 Orange PCS Ltd
00007  * Copyright 2003 Sam Liddicott
00008  * Copyright 2007,2008 Lemur Consulting Ltd
00009  *
00010  * This program is free software; you can redistribute it and/or
00011  * modify it under the terms of the GNU General Public License as
00012  * published by the Free Software Foundation; either version 2 of the
00013  * License, or (at your option) any later version.
00014  *
00015  * This program is distributed in the hope that it will be useful,
00016  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00017  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00018  * GNU General Public License for more details.
00019  *
00020  * You should have received a copy of the GNU General Public License
00021  * along with this program; if not, write to the Free Software
00022  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
00023  * USA
00024  */
00025 
00026 #include <config.h>
00027 
00028 #include "multimatch.h"
00029 #include "submatch.h"
00030 #include "localmatch.h"
00031 #include "emptysubmatch.h"
00032 #include "omdebug.h"
00033 #include "omenquireinternal.h"
00034 
00035 #include "emptypostlist.h"
00036 #include "branchpostlist.h"
00037 #include "leafpostlist.h"
00038 #include "mergepostlist.h"
00039 
00040 #include "document.h"
00041 #include "omqueryinternal.h"
00042 
00043 #include "submatch.h"
00044 #include "stats.h"
00045 
00046 #include "msetcmp.h"
00047 
00048 #include <xapian/errorhandler.h>
00049 #include <xapian/version.h> // For XAPIAN_HAS_REMOTE_BACKEND
00050 
00051 #ifdef XAPIAN_HAS_REMOTE_BACKEND
00052 #include "remotesubmatch.h"
00053 #include "remote-database.h"
00054 #endif /* XAPIAN_HAS_REMOTE_BACKEND */
00055 
00056 #include <algorithm>
00057 #include <cfloat> // For DBL_EPSILON.
00058 #include <queue>
00059 #include <vector>
00060 #include <map>
00061 #include <set>
00062 
00063 using namespace std;
00064 
// Local shorthands for the Enquire sort settings, to keep the matcher code
// below readable.
const Xapian::Enquire::Internal::sort_setting REL =
        Xapian::Enquire::Internal::REL;
const Xapian::Enquire::Internal::sort_setting REL_VAL =
        Xapian::Enquire::Internal::REL_VAL;
const Xapian::Enquire::Internal::sort_setting VAL =
        Xapian::Enquire::Internal::VAL;
#if 0 // VAL_REL isn't currently used which causes a warning with SGI CC.
const Xapian::Enquire::Internal::sort_setting VAL_REL =
        Xapian::Enquire::Internal::VAL_REL;
#endif
00075 
00083 static void
00084 split_rset_by_db(const Xapian::RSet * rset,
00085                  Xapian::doccount number_of_subdbs,
00086                  vector<Xapian::RSet> & subrsets)
00087 {
00088     DEBUGCALL_STATIC(MATCH, void, "split_rset_by_db",
00089                      (rset ? *rset : Xapian::RSet()) <<
00090                      ", " << number_of_subdbs <<
00091                      ", [subrsets(size=" << subrsets.size() << "]");
00092     if (rset) {
00093         if (number_of_subdbs == 1) {
00094             // The common case of a single database is easy to handle.
00095             subrsets.push_back(*rset);
00096         } else {
00097             // Can't just use vector::resize() here, since that creates N
00098             // copies of the same RSet!
00099             subrsets.reserve(number_of_subdbs);
00100             for (size_t i = 0; i < number_of_subdbs; ++i) {
00101                 subrsets.push_back(Xapian::RSet());
00102             }
00103 
00104             const set<Xapian::docid> & items = rset->internal->get_items();
00105             set<Xapian::docid>::const_iterator j;
00106             for (j = items.begin(); j != items.end(); ++j) {
00107                 Xapian::doccount local_docid = (*j - 1) / number_of_subdbs + 1;
00108                 Xapian::doccount subdatabase = (*j - 1) % number_of_subdbs;
00109                 subrsets[subdatabase].add_document(local_docid);
00110             }
00111         }
00112     } else {
00113         // NB vector::resize() creates N copies of the same empty RSet.
00114         subrsets.resize(number_of_subdbs);
00115     }
00116     Assert(subrsets.size() == number_of_subdbs);
00117 }
00118 
00137 static void
00138 prepare_sub_matches(std::vector<Xapian::Internal::RefCntPtr<SubMatch> > & leaves,
00139                     Xapian::ErrorHandler * errorhandler,
00140                     Stats & stats)
00141 {
00142     DEBUGCALL_STATIC(MATCH, void, "prepare_sub_matches",
00143                      "[leaves(size=" << leaves.size() << ")], " <<
00144                      errorhandler << ", " << stats);
00145     // We use a vector<bool> to track which SubMatches we're already prepared.
00146     vector<bool> prepared;
00147     prepared.resize(leaves.size(), false);
00148     size_t unprepared = leaves.size();
00149     bool nowait = true;
00150     while (unprepared) {
00151         for (size_t leaf = 0; leaf < leaves.size(); ++leaf) {
00152             if (prepared[leaf]) continue;
00153             try {
00154                 if (leaves[leaf]->prepare_match(nowait, stats)) {
00155                     prepared[leaf] = true;
00156                     --unprepared;
00157                 }
00158             } catch (Xapian::Error & e) {
00159                 if (!errorhandler) throw;
00160 
00161                 DEBUGLINE(EXCEPTION, "Calling error handler for prepare_match() on a SubMatch.");
00162                 (*errorhandler)(e);
00163                 // Continue match without this sub-match.
00164                 leaves[leaf] = new EmptySubMatch();
00165                 prepared[leaf] = true;
00166                 --unprepared;
00167             }
00168         }
00169         // Use blocking IO on subsequent passes, so that we don't go into
00170         // a tight loop.
00171         nowait = false;
00172     }
00173 }
00174 
00176 // Initialisation and cleaning up //
/** Construct a MultiMatch over a (possibly multi-) database.
 *
 *  Creates one SubMatch per sub-database: a RemoteSubMatch for remote
 *  backends (when XAPIAN_HAS_REMOTE_BACKEND is defined), a LocalSubMatch
 *  otherwise, or an EmptySubMatch if creation throws and the error handler
 *  accepts the error.  The RSet is split across the sub-databases, and all
 *  SubMatches are prepared before the constructor returns.
 *
 *  If query_ is NULL the object is left in a state where get_mset() will
 *  return an empty MSet.
 */
MultiMatch::MultiMatch(const Xapian::Database &db_,
                       const Xapian::Query::Internal * query_,
                       Xapian::termcount qlen,
                       const Xapian::RSet * omrset,
                       Xapian::valueno collapse_key_,
                       int percent_cutoff_, Xapian::weight weight_cutoff_,
                       Xapian::Enquire::docid_order order_,
                       Xapian::valueno sort_key_,
                       Xapian::Enquire::Internal::sort_setting sort_by_,
                       bool sort_value_forward_,
                       const Xapian::Sorter * sorter_,
                       Xapian::ErrorHandler * errorhandler_,
                       Stats & stats,
                       const Xapian::Weight * weight_)
        : db(db_), query(query_),
          collapse_key(collapse_key_), percent_cutoff(percent_cutoff_),
          weight_cutoff(weight_cutoff_), order(order_),
          sort_key(sort_key_), sort_by(sort_by_),
          sort_value_forward(sort_value_forward_), sorter(sorter_),
          errorhandler(errorhandler_), weight(weight_),
          // One flag per sub-database: set below for remote backends.
          is_remote(db.internal.size())
{
    DEBUGCALL(MATCH, void, "MultiMatch", db_ << ", " << query_ << ", " <<
              qlen << ", " << (omrset ? *omrset : Xapian::RSet()) << ", " <<
              collapse_key_ << ", " <<
              percent_cutoff_ << ", " << weight_cutoff_ << ", " <<
              int(order_) << ", " << sort_key_ << ", " <<
              int(sort_by_) << ", " << sort_value_forward_ << ", " <<
              sorter_ << ", " <<
              errorhandler_ << ", " << stats << ", [weight_]");

    if (!query) return;
    query->validate_query();

    Xapian::doccount number_of_subdbs = db.internal.size();
    vector<Xapian::RSet> subrsets;
    split_rset_by_db(omrset, number_of_subdbs, subrsets);

    for (size_t i = 0; i != number_of_subdbs; ++i) {
        Xapian::Database::Internal *subdb = db.internal[i].get();
        Assert(subdb);
        Xapian::Internal::RefCntPtr<SubMatch> smatch;
        try {
            // There is currently only one special case, for network databases.
#ifdef XAPIAN_HAS_REMOTE_BACKEND
            RemoteDatabase *rem_db = subdb->as_remotedatabase();
            if (rem_db) {
                is_remote[i] = true;
                // The remote server applies the query settings itself.
                rem_db->set_query(query, qlen, collapse_key, order, sort_key,
                                  sort_by, sort_value_forward, percent_cutoff,
                                  weight_cutoff, weight, subrsets[i]);
                bool decreasing_relevance =
                    (sort_by == REL || sort_by == REL_VAL);
                smatch = new RemoteSubMatch(rem_db, decreasing_relevance);
            } else {
#endif /* XAPIAN_HAS_REMOTE_BACKEND */
                smatch = new LocalSubMatch(subdb, query, qlen, subrsets[i], weight);
#ifdef XAPIAN_HAS_REMOTE_BACKEND
            }
#endif /* XAPIAN_HAS_REMOTE_BACKEND */
        } catch (Xapian::Error & e) {
            if (!errorhandler) throw;
            DEBUGLINE(EXCEPTION, "Calling error handler for creation of a SubMatch from a database and query.");
            (*errorhandler)(e);
            // Continue match without this sub-postlist.
            smatch = new EmptySubMatch;
        }
        leaves.push_back(smatch);
    }

    prepare_sub_matches(leaves, errorhandler, stats);
}
00250 
00251 string
00252 MultiMatch::get_collapse_key(PostList *pl, Xapian::docid did,
00253                              Xapian::valueno keyno, Xapian::Internal::RefCntPtr<Xapian::Document::Internal> &doc)
00254 {
00255     DEBUGCALL(MATCH, string, "MultiMatch::get_collapse_key", pl << ", " << did << ", " << keyno << ", [doc]");
00256     const string *key = pl->get_collapse_key();
00257     if (key) RETURN(*key);
00258     if (doc.get() == 0) {
00259         unsigned int multiplier = db.internal.size();
00260         Assert(multiplier != 0);
00261         Xapian::doccount n = (did - 1) % multiplier; // which actual database
00262         Xapian::docid m = (did - 1) / multiplier + 1; // real docid in that database
00263 
00264         doc = db.internal[n]->open_document(m, true);
00265     }
00266     RETURN(doc->get_value(keyno));
00267 }
00268 
00269 Xapian::weight
00270 MultiMatch::getorrecalc_maxweight(PostList *pl)
00271 {
00272     DEBUGCALL(MATCH, Xapian::weight, "MultiMatch::getorrecalc_maxweight", pl);
00273     Xapian::weight wt;
00274     if (recalculate_w_max) {
00275         DEBUGLINE(MATCH, "recalculating max weight");
00276         wt = pl->recalc_maxweight();
00277         recalculate_w_max = false;
00278     } else {
00279         wt = pl->get_maxweight();
00280         AssertEqDoubleParanoid(wt, pl->recalc_maxweight());
00281     }
00282     DEBUGLINE(MATCH, "max possible doc weight = " << wt);
00283     RETURN(wt);
00284 }
00285 
00286 void
00287 MultiMatch::get_mset(Xapian::doccount first, Xapian::doccount maxitems,
00288                      Xapian::doccount check_at_least,
00289                      Xapian::MSet & mset,
00290                      const Stats & stats,
00291                      const Xapian::MatchDecider *mdecider,
00292                      const Xapian::MatchDecider *matchspy)
00293 {
00294     DEBUGCALL(MATCH, void, "MultiMatch::get_mset", first << ", " << maxitems
00295               << ", " << check_at_least << ", ...");
00296     if (check_at_least < maxitems) check_at_least = maxitems;
00297 
00298     if (!query) {
00299         mset = Xapian::MSet(); // FIXME: mset.get_firstitem() will return 0 not first
00300         return;
00301     }
00302 
00303     Assert(!leaves.empty());
00304 
00305     // If there's only one database and it's remote, we can just unserialise
00306     // its MSet and return that.
00307     if (leaves.size() == 1 && is_remote[0]) {
00308         RemoteSubMatch * rem_match;
00309         rem_match = static_cast<RemoteSubMatch*>(leaves[0].get());
00310         rem_match->start_match(first, maxitems, check_at_least, stats);
00311         rem_match->get_mset(mset);
00312         return;
00313     }
00314 
00315     // Start matchers.
00316     {
00317         vector<Xapian::Internal::RefCntPtr<SubMatch> >::iterator leaf;
00318         for (leaf = leaves.begin(); leaf != leaves.end(); ++leaf) {
00319             try {
00320                 (*leaf)->start_match(0, first + maxitems,
00321                                      first + check_at_least, stats);
00322             } catch (Xapian::Error & e) {
00323                 if (!errorhandler) throw;
00324                 DEBUGLINE(EXCEPTION, "Calling error handler for "
00325                           "start_match() on a SubMatch.");
00326                 (*errorhandler)(e);
00327                 // Continue match without this sub-match.
00328                 *leaf = Xapian::Internal::RefCntPtr<SubMatch>(new EmptySubMatch());
00329             }
00330         }
00331     }
00332 
00333     // Get postlists and term info
00334     vector<PostList *> postlists;
00335     map<string, Xapian::MSet::Internal::TermFreqAndWeight> termfreqandwts;
00336     map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts_ptr;
00337     termfreqandwts_ptr = &termfreqandwts;
00338 
00339     // Keep a count of matches which we know exist, but we won't see.  This
00340     // occurs when a submatch is remote, and returns a lower bound on the
00341     // number of matching documents which is higher than the number of
00342     // documents it returns (because it wasn't asked for more documents).
00343     Xapian::doccount definite_matches_not_seen = 0;
00344     for (size_t i = 0; i != leaves.size(); ++i) {
00345         PostList *pl;
00346         try {
00347             pl = leaves[i]->get_postlist_and_term_info(this,
00348                                                        termfreqandwts_ptr);
00349             if (termfreqandwts_ptr && !termfreqandwts.empty())
00350                 termfreqandwts_ptr = NULL;
00351             if (is_remote[i]) {
00352                 if (pl->get_termfreq_min() > first + maxitems) {
00353                     DEBUGLINE(MATCH, "Found " <<
00354                               pl->get_termfreq_min() - (first + maxitems)
00355                               << " definite matches in remote "
00356                               "submatch which aren't passed to local "
00357                               "match");
00358                     definite_matches_not_seen += pl->get_termfreq_min();
00359                     definite_matches_not_seen -= first + maxitems;
00360                 }
00361             }
00362         } catch (Xapian::Error & e) {
00363             if (!errorhandler) throw;
00364             DEBUGLINE(EXCEPTION, "Calling error handler for "
00365                       "get_term_info() on a SubMatch.");
00366             (*errorhandler)(e);
00367             // FIXME: check if *ALL* the remote servers have failed!
00368             // Continue match without this sub-match.
00369             leaves[i] = new EmptySubMatch();
00370             pl = new EmptyPostList;
00371         }
00372         postlists.push_back(pl);
00373     }
00374     Assert(!postlists.empty());
00375 
00376     // Get a single combined postlist
00377     PostList *pl;
00378     if (postlists.size() == 1) {
00379         pl = postlists.front();
00380     } else {
00381         pl = new MergePostList(postlists, this, errorhandler);
00382     }
00383 
00384     DEBUGLINE(MATCH, "pl = (" << pl->get_description() << ")");
00385 
00386     // Empty result set
00387     Xapian::doccount docs_matched = 0;
00388     Xapian::weight greatest_wt = 0;
00389     vector<Xapian::Internal::MSetItem> items;
00390 
00391     // maximum weight a document could possibly have
00392     const Xapian::weight max_weight = pl->recalc_maxweight();
00393 
00394     DEBUGLINE(MATCH, "pl = (" << pl->get_description() << ")");
00395     recalculate_w_max = false;
00396 
00397     Xapian::doccount matches_upper_bound = pl->get_termfreq_max();
00398     Xapian::doccount matches_lower_bound = 0;
00399     Xapian::doccount matches_estimated   = pl->get_termfreq_est();
00400 
00401     if (mdecider == NULL && matchspy == NULL) {
00402         // If we have a matcher decider or match spy, the lower bound must be
00403         // set to 0 as we could discard all hits.  Otherwise set it to the
00404         // minimum number of entries which the postlist could return.
00405         matches_lower_bound = pl->get_termfreq_min();
00406     }
00407 
00408     // Check if any results have been asked for (might just be wanting
00409     // maxweight).
00410     if (check_at_least == 0) {
00411         delete pl;
00412         if (collapse_key != Xapian::BAD_VALUENO) {
00413             // Lower bound must be set to no more than 1, since it's possible
00414             // that all hits will be collapsed to a single hit.
00415             if (matches_lower_bound > 1) matches_lower_bound = 1;
00416         }
00417 
00418         mset = Xapian::MSet(new Xapian::MSet::Internal(
00419                                            first,
00420                                            matches_upper_bound,
00421                                            matches_lower_bound,
00422                                            matches_estimated,
00423                                            max_weight, greatest_wt, items,
00424                                            termfreqandwts,
00425                                            0));
00426         return;
00427     }
00428 
00429     // duplicates_found and documents_considered are used solely to keep track
00430     // of how frequently collapses are occurring, so that the matches_estimated
00431     // can be adjusted accordingly.
00432     Xapian::doccount duplicates_found = 0;
00433     Xapian::doccount documents_considered = 0;
00434 
00435     // Number of documents considered by a decider or matchspy.
00436     Xapian::doccount decider_considered = 0;
00437     // Number of documents denied by the decider or matchspy.
00438     Xapian::doccount decider_denied = 0;
00439 
00440     // We keep track of the number of null collapse values because this allows
00441     // us to provide a better value for matches_lower_bound if there are null
00442     // collapse values.
00443     Xapian::doccount null_collapse_values = 0;
00444 
00445     // Set max number of results that we want - this is used to decide
00446     // when to throw away unwanted items.
00447     Xapian::doccount max_msize = first + maxitems;
00448     items.reserve(max_msize + 1);
00449 
00450     // Tracks the minimum item currently eligible for the MSet - we compare
00451     // candidate items against this.
00452     Xapian::Internal::MSetItem min_item(0.0, 0);
00453 
00454     // Minimum weight an item must have to be worth considering.
00455     Xapian::weight min_weight = weight_cutoff;
00456 
00457     // Factor to multiply maximum weight seen by to get the cutoff weight.
00458     Xapian::weight percent_cutoff_factor = percent_cutoff / 100.0;
00459     // Corresponding correction to that in omenquire.cc to account for excess
00460     // precision on x86.
00461     percent_cutoff_factor -= DBL_EPSILON;
00462 
00463     // Table of keys which have been seen already, for collapsing.
00464     map<string, pair<Xapian::Internal::MSetItem,Xapian::weight> > collapse_tab;
00465 
00467     bool sort_forward = (order != Xapian::Enquire::DESCENDING);
00468     MSetCmp mcmp(get_msetcmp_function(sort_by, sort_forward, sort_value_forward));
00469 
00470     // Perform query
00471 
00472     // We form the mset in two stages.  In the first we fill up our working
00473     // mset.  Adding a new document does not remove another.
00474     //
00475     // In the second, we consider documents which rank higher than the current
00476     // lowest ranking document in the mset.  Each document added expels the
00477     // current lowest ranking document.
00478     //
00479     // If a percentage cutoff is in effect, it can cause the matcher to return
00480     // from the second stage from the first.
00481 
00482     // Is the mset a valid heap?
00483     bool is_heap = false;
00484 
00485     while (true) {
00486         if (rare(recalculate_w_max)) {
00487             if (min_weight > 0.0) {
00488                 if (rare(getorrecalc_maxweight(pl) < min_weight)) {
00489                     DEBUGLINE(MATCH, "*** TERMINATING EARLY (1)");
00490                     break;
00491                 }
00492             }
00493         }
00494 
00495         if (rare(next_handling_prune(pl, min_weight, this))) {
00496             DEBUGLINE(MATCH, "*** REPLACING ROOT");
00497 
00498             if (min_weight > 0.0) {
00499                 // No need for a full recalc (unless we've got to do one
00500                 // because of a prune elsewhere) - we're just switching to a
00501                 // subtree.
00502                 if (rare(getorrecalc_maxweight(pl) < min_weight)) {
00503                     DEBUGLINE(MATCH, "*** TERMINATING EARLY (2)");
00504                     break;
00505                 }
00506             }
00507         }
00508 
00509         if (rare(pl->at_end())) {
00510             DEBUGLINE(MATCH, "Reached end of potential matches");
00511             break;
00512         }
00513 
00514         // Only calculate the weight if we need it for mcmp, or there's a
00515         // percentage or weight cutoff in effect.  Otherwise we calculate it
00516         // below if we haven't already rejected this candidate.
00517         Xapian::weight wt = 0.0;
00518         bool calculated_weight = false;
00519         if (sort_by != VAL || min_weight > 0.0) {
00520             wt = pl->get_weight();
00521             if (wt < min_weight) {
00522                 DEBUGLINE(MATCH, "Rejecting potential match due to insufficient weight");
00523                 continue;
00524             }
00525             calculated_weight = true;
00526         }
00527 
00528         Xapian::Internal::RefCntPtr<Xapian::Document::Internal> doc;
00529         Xapian::docid did = pl->get_docid();
00530         DEBUGLINE(MATCH, "Candidate document id " << did << " wt " << wt);
00531         Xapian::Internal::MSetItem new_item(wt, did);
00532         if (sort_by != REL) {
00533             const unsigned int multiplier = db.internal.size();
00534             Assert(multiplier != 0);
00535             Xapian::doccount n = (new_item.did - 1) % multiplier; // which actual database
00536             Xapian::docid m = (new_item.did - 1) / multiplier + 1; // real docid in that database
00537             doc = db.internal[n]->open_document(m, true);
00538             if (sorter) {
00539                 new_item.sort_key = (*sorter)(Xapian::Document(doc.get()));
00540             } else {
00541                 new_item.sort_key = doc->get_value(sort_key);
00542             }
00543 
00544             // We're sorting by value (in part at least), so compare the item
00545             // against the lowest currently in the proto-mset.  If sort_by is
00546             // VAL, then new_item.wt won't yet be set, but that doesn't
00547             // matter since it's not used by the sort function.
00548             if (!mcmp(new_item, min_item)) {
00549                 if (matchspy == NULL && mdecider == NULL &&
00550                     collapse_key == Xapian::BAD_VALUENO) {
00551                     // Document was definitely suitable for mset - no more
00552                     // processing needed.
00553                     DEBUGLINE(MATCH, "Making note of match item which sorts lower than min_item");
00554                     ++docs_matched;
00555                     continue;
00556                 }
00557                 if (docs_matched >= check_at_least) {
00558                     // We've seen enough items - we can drop this one.
00559                     DEBUGLINE(MATCH, "Dropping candidate which sorts lower than min_item");
00560                     continue;
00561                 }
00562                 // We can't drop the item, because we need to show it
00563                 // to the matchspy, test whether the mdecider would
00564                 // accept it, and/or test whether it would be collapsed.
00565                 DEBUGLINE(MATCH, "Keeping candidate which sorts lower than min_item for further investigation");
00566             }
00567         }
00568 
00569         // Use the match spy and/or decision functors (if specified).
00570         if (matchspy != NULL || mdecider != NULL) {
00571             const unsigned int multiplier = db.internal.size();
00572             Assert(multiplier != 0);
00573             Xapian::doccount n = (did - 1) % multiplier; // which actual database
00574             // If the results are from a remote database, then the functor will
00575             // already have been applied there so we can skip this step.
00576             if (!is_remote[n]) {
00577                 if (doc.get() == 0) {
00578                     Xapian::docid m = (did - 1) / multiplier + 1; // real docid in that database
00579                     doc = db.internal[n]->open_document(m, true);
00580                     Assert(doc.get());
00581                 }
00582                 Xapian::Document mydoc(doc.get());
00583 
00584                 ++decider_considered;
00585                 if (matchspy && !matchspy->operator()(mydoc)) {
00586                     ++decider_denied;
00587                     continue;
00588                 }
00589                 if (mdecider && !mdecider->operator()(mydoc)) {
00590                     ++decider_denied;
00591                     continue;
00592                 }
00593             }
00594         }
00595 
00596         if (!calculated_weight) {
00597             // we didn't calculate the weight above, but now we will need it
00598             wt = pl->get_weight();
00599             new_item.wt = wt;
00600         }
00601 
00602         bool pushback = true;
00603         documents_considered++;
00604 
00605         // Perform collapsing on key if requested.
00606         if (collapse_key != Xapian::BAD_VALUENO) {
00607             new_item.collapse_key = get_collapse_key(pl, did, collapse_key,
00608                                                      doc);
00609 
00610             // Don't collapse on null key
00611             if (new_item.collapse_key.empty()) {
00612                 ++null_collapse_values;
00613             } else {
00614                 map<string, pair<Xapian::Internal::MSetItem, Xapian::weight> >::iterator oldkey;
00615                 oldkey = collapse_tab.find(new_item.collapse_key);
00616                 if (oldkey == collapse_tab.end()) {
00617                     DEBUGLINE(MATCH, "collapsem: new key: " <<
00618                               new_item.collapse_key);
00619                     // Key not been seen before
00620                     collapse_tab.insert(make_pair(new_item.collapse_key,
00621                                         make_pair(new_item, Xapian::weight(0))));
00622                 } else {
00623                     ++duplicates_found;
00624                     Xapian::Internal::MSetItem &old_item = oldkey->second.first;
00625                     // FIXME: what about the (sort_by != REL) case here?
00626                     if (mcmp(old_item, new_item)) {
00627                         DEBUGLINE(MATCH, "collapsem: better exists: " <<
00628                                   new_item.collapse_key);
00629                         // There's already a better match with this key
00630                         ++old_item.collapse_count;
00631                         // But maybe the weight is worth noting
00632                         if (new_item.wt > oldkey->second.second) {
00633                             oldkey->second.second = new_item.wt;
00634                         }
00635                         continue;
00636                     }
00637                     // Make a note of the updated collapse count in the
00638                     // replacement item
00639                     new_item.collapse_count = old_item.collapse_count + 1;
00640 
00641                     // There was a previous item in the collapse tab so
00642                     // the MSet can't be empty.
00643                     Assert(!items.empty());
00644 
00645                     // This is best potential MSet entry with this key which
00646                     // we've seen so far.  Check if the previous best entry
00647                     // with this key might still be in the proto-MSet.  If it
00648                     // might be, we need to check through for it.
00649                     Xapian::weight old_wt = old_item.wt;
00650                     if (old_wt >= min_weight && mcmp(old_item, min_item)) {
00651                         // Scan through (unsorted) MSet looking for entry.
00652                         // FIXME: more efficient way than just scanning?
00653                         Xapian::docid olddid = old_item.did;
00654                         vector<Xapian::Internal::MSetItem>::iterator i;
00655                         for (i = items.begin(); i != items.end(); ++i) {
00656                             if (i->did == olddid) {
00657                                 DEBUGLINE(MATCH, "collapsem: removing " <<
00658                                           olddid << ": " <<
00659                                           new_item.collapse_key);
00660                                 // We can replace an arbitrary element in
00661                                 // O(log N) but have to do it by hand (in this
00662                                 // case the new elt is bigger, so we just swap
00663                                 // down the tree).
00664                                 // FIXME: implement this, and clean up is_heap
00665                                 // handling
00666                                 *i = new_item;
00667                                 pushback = false;
00668                                 is_heap = false;
00669                                 break;
00670                             }
00671                         }
00672                     }
00673 
00674                     // Keep the old weight as it is now second best so far
00675                     oldkey->second = make_pair(new_item, old_wt);
00676                 }
00677             }
00678         }
00679 
00680         // OK, actually add the item to the mset.
00681         if (pushback) {
00682             ++docs_matched;
00683             if (items.size() >= max_msize) {
00684                 items.push_back(new_item);
00685                 if (!is_heap) {
00686                     is_heap = true;
00687                     make_heap(items.begin(), items.end(), mcmp);
00688                 } else {
00689                     push_heap<vector<Xapian::Internal::MSetItem>::iterator,
00690                               MSetCmp>(items.begin(), items.end(), mcmp);
00691                 }
00692                 pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
00693                          MSetCmp>(items.begin(), items.end(), mcmp);
00694                 items.pop_back();
00695 
00696                 min_item = items.front();
00697                 if (sort_by == REL || sort_by == REL_VAL) {
00698                     if (docs_matched >= check_at_least) {
00699                         if (sort_by == REL) {
00700                             // We're done if this is a forward boolean match
00701                             // with only one database (bodgetastic, FIXME
00702                             // better if we can!)
00703                             if (rare(max_weight == 0 && sort_forward)) {
00704                                 // In the multi database case, MergePostList
00705                                 // currently processes each database
00706                                 // sequentially (which actually may well be
00707                                 // more efficient) so the docids in general
00708                                 // won't arrive in order.
00709                                 if (leaves.size() == 1) break;
00710                             }
00711                         }
00712                         if (min_item.wt > min_weight) {
00713                             DEBUGLINE(MATCH, "Setting min_weight to " <<
00714                                       min_item.wt << " from " << min_weight);
00715                             min_weight = min_item.wt;
00716                         }
00717                     }
00718                 }
00719                 if (rare(getorrecalc_maxweight(pl) < min_weight)) {
00720                     DEBUGLINE(MATCH, "*** TERMINATING EARLY (3)");
00721                     break;
00722                 }
00723             } else {
00724                 items.push_back(new_item);
00725                 is_heap = false;
00726                 if (sort_by == REL && items.size() == max_msize) {
00727                     if (docs_matched >= check_at_least) {
00728                         // We're done if this is a forward boolean match
00729                         // with only one database (bodgetastic, FIXME
00730                         // better if we can!)
00731                         if (rare(max_weight == 0 && sort_forward)) {
00732                             // In the multi database case, MergePostList
00733                             // currently processes each database
00734                             // sequentially (which actually may well be
00735                             // more efficient) so the docids in general
00736                             // won't arrive in order.
00737                             if (leaves.size() == 1) break;
00738                         }
00739                     }
00740                 }
00741             }
00742         }
00743 
00744         // Keep a track of the greatest weight we've seen.
00745         if (wt > greatest_wt) {
00746             greatest_wt = wt;
00747             if (percent_cutoff) {
00748                 Xapian::weight w = wt * percent_cutoff_factor;
00749                 if (w > min_weight) {
00750                     min_weight = w;
00751                     if (!is_heap) {
00752                         is_heap = true;
00753                         make_heap<vector<Xapian::Internal::MSetItem>::iterator,
00754                                   MSetCmp>(items.begin(), items.end(), mcmp);
00755                     }
00756                     while (!items.empty() && items.front().wt < min_weight) {
00757                         pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
00758                                  MSetCmp>(items.begin(), items.end(), mcmp);
00759                         Assert(items.back().wt < min_weight);
00760                         items.pop_back();
00761                     }
00762 #ifdef XAPIAN_DEBUG_PARANOID
00763                     vector<Xapian::Internal::MSetItem>::const_iterator i;
00764                     for (i = items.begin(); i != items.end(); ++i) {
00765                         Assert(i->wt >= min_weight);
00766                     }
00767 #endif
00768                 }
00769             }
00770         }
00771     }
00772 
00773     // done with posting list tree
00774     delete pl;
00775 
00776     double percent_scale = 0;
00777     if (!items.empty() && greatest_wt > 0) {
00778         // Find the document with the highest weight, then total up the
00779         // weights for the terms it contains
00780         vector<Xapian::Internal::MSetItem>::const_iterator best;
00781         best = min_element(items.begin(), items.end(), mcmp);
00782 
00783         unsigned int multiplier = db.internal.size();
00784         Assert(multiplier != 0);
00785         Xapian::doccount n = (best->did - 1) % multiplier; // which actual database
00786         // If the top result is from a remote database, then we can
00787         // just use the percentage scaling calculated for that
00788         // database.
00789         if (is_remote[n]) {
00790             RemoteSubMatch * rem_match;
00791             rem_match = static_cast<RemoteSubMatch*>(leaves[n].get());
00792             percent_scale = rem_match->get_percent_factor();
00793         } else if (termfreqandwts.size() > 1) {
00794             Xapian::termcount matching_terms = 0;
00795             map<string,
00796                 Xapian::MSet::Internal::TermFreqAndWeight>::const_iterator i;
00797 
00798             Xapian::TermIterator docterms = db.termlist_begin(best->did);
00799             Xapian::TermIterator docterms_end = db.termlist_end(best->did);
00800             while (docterms != docterms_end) {
00801                 i = termfreqandwts.find(*docterms);
00802                 if (i != termfreqandwts.end()) {
00803                     percent_scale += i->second.termweight;
00804                     ++matching_terms;
00805                     if (matching_terms == termfreqandwts.size()) break;
00806                 }
00807                 ++docterms;
00808             }
00809             // Special case for MatchAll queries
00810             i = termfreqandwts.find("");
00811             if (i != termfreqandwts.end()) {
00812                 percent_scale += i->second.termweight;
00813                 ++matching_terms;
00814             }
00815             if (matching_terms < termfreqandwts.size()) {
00816                 // OK, work out weight corresponding to 100%
00817                 double denom = 0;
00818                 for (i = termfreqandwts.begin(); i != termfreqandwts.end(); ++i)
00819                     denom += i->second.termweight;
00820 
00821                 DEBUGLINE(MATCH, "denom = " << denom << " percent_scale = " << percent_scale);
00822                 Assert(percent_scale <= denom);
00823                 denom *= greatest_wt;
00824                 Assert(denom > 0);
00825                 percent_scale /= denom;
00826             } else {
00827                 // If all the terms match, the 2 sums of weights cancel
00828                 percent_scale = 1.0 / greatest_wt;
00829             }
00830         } else {
00831             // If there's only a single term in the query, the top document
00832             // must score 100%.
00833             percent_scale = 1.0 / greatest_wt;
00834         }
00835         Assert(percent_scale > 0);
00836         if (percent_cutoff) {
00837             // FIXME: better to sort and binary chop maybe?  we
00838             // could use the sort above to find "best" too.
00839             // Or we could just use a linear scan here instead.
00840 
00841             // trim the mset to the correct answer...
00842             Xapian::weight min_wt = percent_cutoff_factor / percent_scale;
00843             if (!is_heap) {
00844                 is_heap = true;
00845                 make_heap<vector<Xapian::Internal::MSetItem>::iterator,
00846                           MSetCmp>(items.begin(), items.end(), mcmp);
00847             }
00848             while (!items.empty() && items.front().wt < min_wt) {
00849                 pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
00850                          MSetCmp>(items.begin(), items.end(), mcmp);
00851                 Assert(items.back().wt < min_wt);
00852                 items.pop_back();
00853             }
00854 #ifdef XAPIAN_DEBUG_PARANOID
00855             vector<Xapian::Internal::MSetItem>::const_iterator j;
00856             for (j = items.begin(); j != items.end(); ++j) {
00857                 Assert(j->wt >= min_wt);
00858             }
00859 #endif
00860         }
00861         percent_scale *= 100.0;
00862     }
00863 
00864     DEBUGLINE(MATCH,
00865               "docs_matched = " << docs_matched <<
00866               ", definite_matches_not_seen = " << definite_matches_not_seen <<
00867               ", matches_lower_bound = " << matches_lower_bound <<
00868               ", matches_estimated = " << matches_estimated <<
00869               ", matches_upper_bound = " << matches_upper_bound);
00870 
00871     // Adjust docs_matched to take account of documents which matched remotely
00872     // but weren't sent across.
00873     docs_matched += definite_matches_not_seen;
00874 
00875     if (items.size() < max_msize) {
00876         // We have fewer items in the mset than we tried to get for it, so we
00877         // must have all the matches in it.
00878         DEBUGLINE(MATCH, "items.size() = " << items.size() <<
00879                   ", max_msize = " << max_msize << ", setting bounds equal");
00880         Assert(definite_matches_not_seen == 0);
00881         Assert(percent_cutoff || docs_matched == items.size());
00882         matches_lower_bound = matches_upper_bound = matches_estimated
00883             = items.size();
00884     } else if (docs_matched < check_at_least) {
00885         // We have seen fewer matches than we checked for, so we must have seen
00886         // all the matches.
00887         DEBUGLINE(MATCH, "Setting bounds equal");
00888         matches_lower_bound = matches_upper_bound = matches_estimated
00889             = docs_matched;
00890     } else {
00891         Assert(matches_estimated >= matches_lower_bound);
00892         Assert(matches_estimated <= matches_upper_bound);
00893 
00894         // We can end up scaling the estimate more than once, so collect
00895         // the scale factors and apply them in one go to avoid rounding
00896         // more than once.
00897         double estimate_scale = 1.0;
00898 
00899         if (collapse_key != Xapian::BAD_VALUENO) {
00900             // Lower bound must be set to no more than the number of null
00901             // collapse values seen plus the number of unique non-null collapse
00902             // values seen, since it's possible that all further hits will be
00903             // collapsed to values we've already seen.
00904             DEBUGLINE(MATCH, "Adjusting bounds due to collapse_key");
00905             matches_lower_bound = null_collapse_values + collapse_tab.size();
00906 
00907             // The estimate for the number of hits can be modified by
00908             // multiplying it by the rate at which we've been finding
00909             // unique documents.
00910             if (documents_considered > 0) {
00911                 double unique = double(documents_considered - duplicates_found);
00912                 double unique_rate = unique / double(documents_considered);
00913                 estimate_scale *= unique_rate;
00914             }
00915 
00916             // We can safely reduce the upper bound by the number of
00917             // duplicates we've seen.
00918             matches_upper_bound -= duplicates_found;
00919 
00920             DEBUGLINE(MATCH, "matches_lower_bound=" << matches_lower_bound <<
00921                       ", matches_estimated=" << matches_estimated <<
00922                       "*" << estimate_scale <<
00923                       ", matches_upper_bound=" << matches_upper_bound);
00924         }
00925 
00926         if (matchspy || mdecider) {
00927             if (collapse_key == Xapian::BAD_VALUENO && !percent_cutoff) {
00928                 // We're not collapsing or doing a percentage cutoff, so
00929                 // docs_matched is a lower bound on the total number of matches.
00930                 matches_lower_bound = max(docs_matched, matches_lower_bound);
00931             }
00932 
00933             // The estimate for the number of hits can be modified by
00934             // multiplying it by the rate at which the match decider has
00935             // been accepting documents.
00936             if (decider_considered > 0) {
00937                 double accept = double(decider_considered - decider_denied);
00938                 double accept_rate = accept / double(decider_considered);
00939                 estimate_scale *= accept_rate;
00940             }
00941 
00942             // If a document is denied by a match decider, it is not possible
00943             // for it to found to be a duplicate, so it is safe to also reduce
00944             // the upper bound by the number of documents denied by a match
00945             // decider.
00946             matches_upper_bound -= decider_denied;
00947         }
00948 
00949         if (percent_cutoff) {
00950             estimate_scale *= (1.0 - percent_cutoff_factor);
00951             // another approach:
00952             // Xapian::doccount new_est = items.size() * (1 - percent_cutoff_factor) / (1 - min_weight / greatest_wt);
00953             // and another: items.size() + (1 - greatest_wt * percent_cutoff_factor / min_weight) * (matches_estimated - items.size());
00954 
00955             // Very likely an underestimate, but we can't really do better
00956             // without checking further matches...  Only possibility would be
00957             // to track how many docs made the min weight test but didn't make
00958             // the candidate set since the last greatest_wt change, which we
00959             // could use if the top documents matched all the prob terms.
00960             matches_lower_bound = items.size();
00961             // matches_upper_bound could be reduced by the number of documents
00962             // which fail the min weight test
00963             DEBUGLINE(MATCH, "Adjusted bounds due to percent_cutoff (" <<
00964                       percent_cutoff << "): now have matches_estimated=" <<
00965                       matches_estimated << ", matches_lower_bound=" <<
00966                       matches_lower_bound);
00967         }
00968 
00969         if (estimate_scale != 1.0) {
00970             matches_estimated =
00971                 Xapian::doccount(matches_estimated * estimate_scale + 0.5);
00972             if (matches_estimated < matches_lower_bound)
00973                 matches_estimated = matches_lower_bound;
00974         }
00975 
00976         if (collapse_key != Xapian::BAD_VALUENO || matchspy || mdecider) {
00977             DEBUGLINE(MATCH, "Clamping estimate between bounds: "
00978                       "matches_lower_bound = " << matches_lower_bound <<
00979                       ", matches_estimated = " << matches_estimated <<
00980                       ", matches_upper_bound = " << matches_upper_bound);
00981             Assert(matches_lower_bound <= matches_upper_bound);
00982             matches_estimated = max(matches_estimated, matches_lower_bound);
00983             matches_estimated = min(matches_estimated, matches_upper_bound);
00984         } else if (!percent_cutoff) {
00985             Assert(docs_matched <= matches_upper_bound);
00986             if (docs_matched > matches_lower_bound)
00987                 matches_lower_bound = docs_matched;
00988             if (docs_matched > matches_estimated)
00989                 matches_estimated = docs_matched;
00990         }
00991     }
00992 
00993     DEBUGLINE(MATCH, items.size() << " items in potential mset");
00994 
00995     if (first > 0) {
00996         // Remove unwanted leading entries
00997         if (items.size() <= first) {
00998             items.clear();
00999         } else {
01000             DEBUGLINE(MATCH, "finding " << first << "th");
01001             // We perform nth_element() on reverse iterators so that the
01002             // unwanted elements end up at the end of items, which means
01003             // that the call to erase() to remove them doesn't have to copy
01004             // any elements.
01005             vector<Xapian::Internal::MSetItem>::reverse_iterator nth;
01006             nth = items.rbegin() + first;
01007             nth_element(items.rbegin(), nth, items.rend(), mcmp);
01008             // Erase the trailing ``first'' elements
01009             items.erase(items.begin() + items.size() - first, items.end());
01010         }
01011     }
01012 
01013     DEBUGLINE(MATCH, "sorting " << items.size() << " entries");
01014 
01015     // Need a stable sort, but this is provided by comparison operator
01016     sort(items.begin(), items.end(), mcmp);
01017 
01018     if (!items.empty()) {
01019         DEBUGLINE(MATCH, "min weight in mset = " << items.back().wt);
01020         DEBUGLINE(MATCH, "max weight in mset = " << items[0].wt);
01021     }
01022 
01023     Assert(matches_estimated >= matches_lower_bound);
01024     Assert(matches_estimated <= matches_upper_bound);
01025 
01026     // We may need to qualify any collapse_count to see if the highest weight
01027     // collapsed item would have qualified percent_cutoff
01028     // We WILL need to restore collapse_count to the mset by taking from
01029     // collapse_tab; this is what comes of copying around whole objects
01030     // instead of taking references, we find it hard to update collapse_count
01031     // of an item that has already been pushed-back as we don't know where it
01032     // is any more.  If we keep or find references we won't need to mess with
01033     // is_heap so much maybe?
01034     if (collapse_key != Xapian::BAD_VALUENO && /*percent_cutoff &&*/ !items.empty() &&
01035         !collapse_tab.empty()) {
01036         // Nicked this formula from above, but for some reason percent_scale
01037         // has since been multiplied by 100 so we take that into account
01038         Xapian::weight min_wt = percent_cutoff_factor / (percent_scale / 100);
01039         vector<Xapian::Internal::MSetItem>::iterator i;
01040         for (i = items.begin(); i != items.end() && !collapse_tab.empty(); ++i) {
01041             // Is this a collapsed hit?
01042             if (/*i->collapse_count > 0 &&*/ !i->collapse_key.empty()) {
01043                 map<string, pair<Xapian::Internal::MSetItem, Xapian::weight> >::iterator key;
01044                 key = collapse_tab.find(i->collapse_key);
01045                 // Because we collapse, each collapse key can only occur once
01046                 // in the items, we remove from collapse_tab here as processed
01047                 // so we can quit early.  Therefore each time we find an item
01048                 // with a collapse_key the collapse_key must be in collapse_tab
01049                 Assert(key != collapse_tab.end());
01050                 // If weight of top collapsed item is not relevant enough
01051                 // then collapse count is bogus in every way
01052                 // FIXME: Should this be <=?
01053                 if (key->second.second < min_wt)
01054                     i->collapse_count = 0;
01055                 else
01056                     i->collapse_count = key->second.first.collapse_count;
01057                 // When collapse_tab is finally empty we can finish this process
01058                 // without examining any further hits
01059                 collapse_tab.erase(key);
01060             }
01061         }
01062     }
01063 
01064     mset = Xapian::MSet(new Xapian::MSet::Internal(
01065                                        first,
01066                                        matches_upper_bound,
01067                                        matches_lower_bound,
01068                                        matches_estimated,
01069                                        max_weight, greatest_wt, items,
01070                                        termfreqandwts,
01071                                        percent_scale));
01072 }
01073 
01074 // This method is called by branch postlists when they rebalance
01075 // in order to recalculate the weights in the tree
01076 void
01077 MultiMatch::recalc_maxweight()
01078 {
01079     DEBUGCALL(MATCH, void, "MultiMatch::recalc_maxweight", "");
01080     recalculate_w_max = true;
01081 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.