matcher/exactphrasepostlist.cc

Go to the documentation of this file.
00001 
00021 #include <config.h>
00022 
00023 #include "exactphrasepostlist.h"
00024 #include "positionlist.h"
00025 #include "omdebug.h"
00026 
00027 #include <algorithm>
00028 #include <vector>
00029 
00030 using namespace std;
00031 
00032 ExactPhrasePostList::ExactPhrasePostList(PostList *source_,
00033                                          const vector<PostList*> &terms_)
00034         : SelectPostList(source_), terms(terms_)
00035 {
00036     size_t n = terms_.size();
00037     poslists = new PositionList*[n];
00038     try {
00039         order = new unsigned[n];
00040     } catch (...) {
00041         delete [] poslists;
00042         throw;
00043     }
00044     for (size_t i = 0; i < n; ++i) order[i] = i;
00045 }
00046 
00047 ExactPhrasePostList::~ExactPhrasePostList()
00048 {
00049     delete [] poslists;
00050     delete [] order;
00051 }
00052 
00053 void
00054 ExactPhrasePostList::start_position_list(unsigned i)
00055 {
00056     unsigned index = order[i];
00057     poslists[i] = terms[index]->read_position_list();
00058     poslists[i]->index = index;
00059 }
00060 
00061 class TermCompare {
00062     vector<PostList *> & terms;
00063 
00064   public:
00065     TermCompare(vector<PostList *> & terms_) : terms(terms_) { }
00066 
00067     bool operator()(unsigned a, unsigned b) const {
00068         return terms[a]->get_wdf() < terms[b]->get_wdf();
00069     }
00070 };
00071 
00072 bool
00073 ExactPhrasePostList::test_doc()
00074 {
00075     DEBUGCALL(MATCH, bool, "ExactPhrasePostList::test_doc", "");
00076 
00077     if (terms.size() <= 1) RETURN(true);
00078 
00079     // We often don't need to read all the position lists, so rather than using
00080     // the shortest position lists first, we approximate by using the terms
00081     // with the lowest wdf first.  This will typically give the same or a very
00082     // similar order.
00083     sort(order, order + terms.size(), TermCompare(terms));
00084 
00085     // If the first term we check only occurs too close to the start of the
00086     // document, we only need to read one term's positions.  E.g. search for
00087     // "ripe mango" when the only occurrence of 'mango' in the current document
00088     // is at position 0.
00089     start_position_list(0);
00090     poslists[0]->skip_to(poslists[0]->index);
00091     if (poslists[0]->at_end()) RETURN(false);
00092 
00093     // If we get here, we'll need to read the positionlists for at least two
00094     // terms, so check the true positionlist length for the two terms with the
00095     // lowest wdf and if necessary swap them so the true shorter one is first.
00096     start_position_list(1);
00097     if (poslists[0]->get_size() < poslists[1]->get_size()) {
00098         poslists[1]->skip_to(poslists[1]->index);
00099         if (poslists[1]->at_end()) RETURN(false);
00100         swap(poslists[0], poslists[1]);
00101     }
00102 
00103     unsigned read_hwm = 1;
00104     Xapian::termpos idx0 = poslists[0]->index;
00105     do {
00106         Xapian::termpos base = poslists[0]->get_position() - idx0;
00107         unsigned i = 1;
00108         while (true) {
00109             if (i > read_hwm) {
00110                 read_hwm = i;
00111                 start_position_list(i);
00112                 // FIXME: consider comparing with poslist[0] and swapping
00113                 // if less common.  Should we allow for the number of positions
00114                 // we've read from poslist[0] already?
00115             }
00116             Xapian::termpos required = base + poslists[i]->index;
00117             poslists[i]->skip_to(required);
00118             if (poslists[i]->at_end()) RETURN(false);
00119             if (poslists[i]->get_position() != required) break;
00120             if (++i == terms.size()) RETURN(true);
00121         }
00122         poslists[0]->next();
00123     } while (!poslists[0]->at_end());
00124     RETURN(false);
00125 }
00126 
00127 Xapian::termcount
00128 ExactPhrasePostList::get_wdf() const
00129 {
00130     // Calculate the an estimate for the wdf of an exact phrase postlist.
00131     //
00132     // We use the minimum wdf of a sub-postlist as our estimate.  See the
00133     // comment in NearPostList::get_wdf for justification of this estimate.
00134     //
00135     // We divide the value calculated for a NearPostList by 3, as a very rough
00136     // heuristic to represent the fact that the words must occur exactly in
00137     // order, and phrases are therefore rarer than near matches and (non-exact)
00138     // phrase matches.
00139 
00140     std::vector<PostList *>::const_iterator i = terms.begin();
00141     Xapian::termcount wdf = (*i)->get_wdf();
00142     for (; i != terms.end(); i++) {
00143         wdf = std::min(wdf, (*i)->get_wdf());
00144     }
00145     return wdf;
00146 }
00147 
00148 Xapian::doccount
00149 ExactPhrasePostList::get_termfreq_est() const
00150 {
00151     // It's hard to estimate how many times the exact phrase will occur as
00152     // it depends a lot on the phrase, but usually the exact phrase will
00153     // occur significantly less often than the individual terms.
00154     return source->get_termfreq_est() / 4;
00155 }
00156 
00157 string
00158 ExactPhrasePostList::get_description() const
00159 {
00160     return "(ExactPhrase " + source->get_description() + ")";
00161 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.