matcher/phrasepostlist.cc

Go to the documentation of this file.
00001 /* phrasepostlist.cc: Return only items where terms are near or form a phrase
00002  *
00003  * Copyright 1999,2000,2001 BrightStation PLC
00004  * Copyright 2002 Ananova Ltd
00005  * Copyright 2002,2003,2004,2007 Olly Betts
00006  *
00007  * This program is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU General Public License as
00009  * published by the Free Software Foundation; either version 2 of the
00010  * License, or (at your option) any later version.
00011  *
00012  * This program is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
00020  * USA
00021  */
00022 
00023 #include <config.h>
00024 
00025 #include "phrasepostlist.h"
00026 #include "positionlist.h"
00027 #include "omassert.h"
00028 #include "omdebug.h"
00029 #include "utils.h"
00030 
00031 #include <algorithm>
00032 
00036 class PositionListCmpLt {
00037     public:
00040         bool operator()(const PositionList *a, const PositionList *b) {
00041             return a->get_size() < b->get_size();
00042         }
00043 };
00044 
00045 
00048 bool
00049 NearPostList::test_doc()
00050 {
00051     DEBUGCALL(MATCH, bool, "NearPostList::test_doc", "");
00052     std::vector<PositionList *> plists;
00053 
00054     std::vector<PostList *>::iterator i;
00055     for (i = terms.begin(); i != terms.end(); i++) {
00056         PositionList * p = (*i)->read_position_list();
00057         // If p is NULL, the backend doesn't support positionlists
00058         if (!p) return false;
00059         plists.push_back(p);
00060     }
00061 
00062     std::sort(plists.begin(), plists.end(), PositionListCmpLt());
00063 
00064     Xapian::termpos pos;
00065     do {
00066         plists[0]->next();
00067         if (plists[0]->at_end()) RETURN(false);
00068         pos = plists[0]->get_position();
00069     } while (!do_test(plists, 1, pos, pos));
00070 
00071     RETURN(true);
00072 }
00073 
00074 bool
00075 NearPostList::do_test(std::vector<PositionList *> &plists, Xapian::termcount i,
00076                       Xapian::termcount min, Xapian::termcount max)
00077 {
00078     DEBUGCALL(MATCH, bool, "NearPostList::do_test", "[plists], " << i << ", " << min << ", " << max);
00079     DEBUGLINE(MATCH, "docid = " << get_docid() << ", window = " << window);
00080     Xapian::termcount tmp = max + 1;
00081     // take care to avoid underflow
00082     if (window <= tmp) tmp -= window; else tmp = 0;
00083     plists[i]->skip_to(tmp);
00084     while (!plists[i]->at_end()) {
00085         Xapian::termpos pos = plists[i]->get_position();
00086         DEBUGLINE(MATCH, "[" << i << "]: " << max - window + 1 << " " << min
00087                   << " " << pos << " " << max << " " << min + window - 1);
00088         if (pos > min + window - 1) RETURN(false);
00089         if (i + 1 == plists.size()) RETURN(true);
00090         if (pos < min) min = pos;
00091         else if (pos > max) max = pos;
00092         if (do_test(plists, i + 1, min, max)) RETURN(true);
00093         plists[i]->next();
00094     }
00095     RETURN(false);
00096 }
00097 
00098 Xapian::termcount
00099 NearPostList::get_wdf() const
00100 {
00101     // Calculate an estimate for the wdf of a near postlist.
00102     //
00103     // The natural definition of the wdf for a near postlist is the number of
00104     // times the terms occur in a group within the windowsize in the document -
00105     // in other words, the number of times a routine like do_test() could find
00106     // a match, if it kept going after finding the first one.  However,
00107     // calculating this would be expensive (in CPU time at least - the position
00108     // lists are probably already read from disk), and the benefit is dubious
00109     // (given that the estimate here is probably only going to be used for
00110     // calculating the weight for synonym postlists, and it's not clear that
00111     // the above definition is exactly appropriate in this situation, anyway).
00112     //
00113     // Instead, we could do an estimate of this value, based on the the lengths
00114     // of the position lists.  Rather than having to open the position lists,
00115     // we could use the wdfs, which will be the same value unless the wdfs have
00116     // been artificially inflated - in which case we probably want to use the
00117     // inflated value anyway (it will have been inflated to represent the fact
00118     // that this term is more important than others, so we should make use of
00119     // this hint).
00120     //
00121     // A reasonable way to calculate an estimate would be to consider the
00122     // document to be split into W (overlapping) windows, each 1 term apart,
00123     // and of the same length as the windowsize used for the phrase.  Then,
00124     // calculate the probability that each term is found in this window (as
00125     // Prob = wdf / doclen), multiply these probabilities to get the
00126     // probability that they're all found in the window, and then multiply by
00127     // the windowsize again to get an estimate.
00128     //
00129     // However, this requires the doclen, which won't always be available (ie,
00130     // if the weighting scheme doesn't use it, we won't have read it from
00131     // disk).
00132     //
00133     // Rather than force an access to disk to get the doclen, we use an even
00134     // rougher estimate: the minimum wdf in the sub-lists.  This is actually
00135     // the upper bound for the wdf (since the phrase can only occur the same
00136     // number of times as the least frequent of its constituent terms).
00137     //
00138     // If this estimate proves to give bad results, we can revisit this and try
00139     // a better approximation.
00140 
00141     std::vector<PostList *>::const_iterator i = terms.begin();
00142     Xapian::termcount wdf = (*i)->get_wdf();
00143     for (; i != terms.end(); i++) {
00144         wdf = std::min(wdf, (*i)->get_wdf());
00145     }
00146 
00147     // Ensure that we always return a wdf of at least 1, since we know there
00148     // was at least one occurrence of the phrase.
00149     return std::max(wdf, 1u);
00150 }
00151 
00152 std::string
00153 NearPostList::get_description() const
00154 {
00155     return "(Near " + om_tostring(window) + " " + source->get_description() + ")";
00156 }
00157 
00158 
00159 
00162 bool
00163 PhrasePostList::test_doc()
00164 {
00165     DEBUGCALL(MATCH, bool, "PhrasePostList::test_doc", "");
00166     std::vector<PositionList *> plists;
00167 
00168     std::vector<PostList *>::iterator i;
00169     for (i = terms.begin(); i != terms.end(); i++) {
00170         PositionList * p = (*i)->read_position_list();
00171         // If p is NULL, the backend doesn't support positionlists
00172         if (!p) return false;
00173         p->index = i - terms.begin();
00174         plists.push_back(p);
00175     }
00176 
00177     std::sort(plists.begin(), plists.end(), PositionListCmpLt());
00178 
00179     Xapian::termpos pos;
00180     Xapian::termpos idx, min;
00181     do {
00182         plists[0]->next();
00183         if (plists[0]->at_end()) {
00184             DEBUGLINE(MATCH, "--MISS--");
00185             RETURN(false);
00186         }
00187         pos = plists[0]->get_position();
00188         idx = plists[0]->index;
00189         min = pos + plists.size() - idx;
00190         if (min > window) min -= window; else min = 0;
00191     } while (!do_test(plists, 1, min, pos + window - idx));
00192     DEBUGLINE(MATCH, "**HIT**");
00193     RETURN(true);
00194 }
00195 
00196 bool
00197 PhrasePostList::do_test(std::vector<PositionList *> &plists, Xapian::termcount i,
00198                         Xapian::termcount min, Xapian::termcount max)
00199 {
00200     DEBUGCALL(MATCH, bool, "PhrasePostList::do_test", "[plists],  " << i << ", " << min << ", " << max);
00201     DEBUGLINE(MATCH, "docid = " << get_docid() << ", window = " << window);
00202     Xapian::termpos idxi = plists[i]->index;
00203     DEBUGLINE(MATCH, "my idx in phrase is " << idxi);
00204 
00205     Xapian::termpos mymin = min + idxi;
00206     Xapian::termpos mymax = max - plists.size() + idxi;
00207     DEBUGLINE(MATCH, "MIN = " << mymin << " MAX = " << mymax);
00208     // FIXME: this is worst case O(n^2) where n = length of phrase
00209     // Can we do better?
00210     for (Xapian::termcount j = 0; j < i; j++) {
00211         Xapian::termpos idxj = plists[j]->index;
00212         if (idxj > idxi) {
00213             Xapian::termpos tmp = plists[j]->get_position() + idxj - idxi;
00214             DEBUGLINE(MATCH, "ABOVE " << tmp);
00215             if (tmp < mymax) mymax = tmp;
00216         } else {
00217             AssertRel(idxi, !=, idxj);
00218             Xapian::termpos tmp = plists[j]->get_position() + idxi - idxj;
00219             DEBUGLINE(MATCH, "BELOW " << tmp);
00220             if (tmp > mymin) mymin = tmp;
00221         }
00222         DEBUGLINE(MATCH, "min = " << mymin << " max = " << mymax);
00223     }
00224     plists[i]->skip_to(mymin);
00225 
00226     while (!plists[i]->at_end()) {
00227         Xapian::termpos pos = plists[i]->get_position();
00228         DEBUGLINE(MATCH, " " << mymin << " " << pos << " " << mymax);
00229         if (pos > mymax) RETURN(false);
00230         if (i + 1 == plists.size()) RETURN(true);
00231         Xapian::termpos tmp = pos + window - idxi;
00232         if (tmp < max) max = tmp;
00233         tmp = pos + plists.size() - idxi;
00234         if (tmp > window) {
00235             tmp -= window;
00236             if (tmp > min) min = tmp;
00237         }
00238         if (do_test(plists, i + 1, min, max)) RETURN(true);
00239         plists[i]->next();
00240     }
00241     RETURN(false);
00242 }
00243 
00244 Xapian::termcount
00245 PhrasePostList::get_wdf() const
00246 {
00247     // Calculate the an estimate for the wdf of a phrase postlist.
00248     //
00249     // We use the minimum wdf of a sub-postlist as our estimate.  See the
00250     // comment in NearPostList::get_wdf for justification of this estimate.
00251     //
00252     // We divide the value calculated for a NearPostList by 2, as a very rough
00253     // heuristic to represent the fact that the words must occur in order, and
00254     // phrases are therefore rarer than near matches.
00255 
00256     std::vector<PostList *>::const_iterator i = terms.begin();
00257     Xapian::termcount wdf = (*i)->get_wdf();
00258     for (; i != terms.end(); i++) {
00259         wdf = std::min(wdf, (*i)->get_wdf());
00260     }
00261 
00262     // Ensure that we always return a wdf of at least 1, since we know there
00263     // was at least one occurrence of the phrase.
00264     return std::max(wdf / 2, 1u);
00265 }
00266 
00267 std::string
00268 PhrasePostList::get_description() const
00269 {
00270     return "(Phrase " + om_tostring(window) + " "
00271            + source->get_description() + ")";
00272 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.