matcher/andpostlist.cc

Go to the documentation of this file.
00001 /* andpostlist.cc: Return only items which are in both sublists
00002  *
00003  * Copyright 1999,2000,2001 BrightStation PLC
00004  * Copyright 2002 Ananova Ltd
00005  * Copyright 2003,2004,2007 Olly Betts
00006  * Copyright 2007 Lemur Consulting Ltd
00007  *
00008  * This program is free software; you can redistribute it and/or
00009  * modify it under the terms of the GNU General Public License as
00010  * published by the Free Software Foundation; either version 2 of the
00011  * License, or (at your option) any later version.
00012  *
00013  * This program is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016  * GNU General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU General Public License
00019  * along with this program; if not, write to the Free Software
00020  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
00021  * USA
00022  */
00023 
00024 #include <config.h>
00025 
00026 #include "andpostlist.h"
00027 #include "omassert.h"
00028 #include "omdebug.h"
00029 
00030 inline void
00031 AndPostList::process_next_or_skip_to(Xapian::weight w_min, PostList *ret)
00032 {
00033     DEBUGCALL(MATCH, void, "AndPostList::process_next_or_skip_to",
00034               w_min << ", " << ret);
00035     head = 0;
00036     handle_prune(r, ret);
00037     DEBUGLINE(MATCH, "r at_end = " << r->at_end());
00038     if (r->at_end()) return;
00039 
00040     // r has just been advanced by next or skip_to so must be > head
00041     // (and head is the current position of l)
00042     Xapian::docid rhead = r->get_docid();
00043     DEBUGLINE(MATCH, "rhead " << rhead);
00044     DEBUGLINE(MATCH, "w_min " << w_min << " rmax " << rmax);
00045     skip_to_handling_prune(l, rhead, w_min - rmax, matcher);
00046     DEBUGLINE(MATCH, "l at_end = " << l->at_end());
00047     if (l->at_end()) return;
00048 
00049     Xapian::docid lhead = l->get_docid();
00050     DEBUGLINE(MATCH, "lhead " << lhead);
00051 
00052     while (lhead != rhead) {
00053         if (lhead < rhead) {
00054             // FIXME: CSE these w_min values?
00055             // But note that lmax and rmax may change on recalc_maxweight...
00056             skip_to_handling_prune(l, rhead, w_min - rmax, matcher);
00057             DEBUGLINE(MATCH, "l at_end = " << l->at_end());
00058             if (l->at_end()) {
00059                 head = 0;
00060                 return;
00061             }
00062             lhead = l->get_docid();
00063             DEBUGLINE(MATCH, "lhead " << lhead);
00064         } else {
00065             skip_to_handling_prune(r, lhead, w_min - lmax, matcher);
00066             DEBUGLINE(MATCH, "r at_end = " << r->at_end());
00067             if (r->at_end()) {
00068                 head = 0;
00069                 return;
00070             }
00071             rhead = r->get_docid();
00072             DEBUGLINE(MATCH, "rhead " << rhead);
00073         }
00074     }
00075 
00076     head = lhead;
00077     return;
00078 }
00079 
00080 AndPostList::AndPostList(PostList *left_, PostList *right_,
00081                          MultiMatch *matcher_,
00082                          Xapian::doccount dbsize_,
00083                          bool replacement)
00084         : BranchPostList(left_, right_, matcher_), head(0), lmax(0), rmax(0),
00085           dbsize(dbsize_)
00086 {
00087     DEBUGCALL(MATCH, void, "AndPostList", left_ << ", " << right_ << ", " << matcher_ << ", " << dbsize_ << ", " << replacement);
00088     if (l->get_termfreq_est() > r->get_termfreq_est()) swap(l, r);
00089     if (replacement) {
00090         // Initialise the maxweights from the kids so we can avoid forcing
00091         // a full maxweight recalc
00092         lmax = l->get_maxweight();
00093         rmax = r->get_maxweight();
00094     }
00095 }
00096 
00097 PostList *
00098 AndPostList::next(Xapian::weight w_min)
00099 {
00100     DEBUGCALL(MATCH, PostList *, "AndPostList::next", w_min);
00101     process_next_or_skip_to(w_min, r->next(w_min - lmax));
00102     RETURN(NULL);
00103 }
00104 
00105 PostList *
00106 AndPostList::skip_to(Xapian::docid did, Xapian::weight w_min)
00107 {
00108     DEBUGCALL(MATCH, PostList *, "AndPostList::skip_to", did << ", " << w_min);
00109     if (did > head)
00110         process_next_or_skip_to(w_min, r->skip_to(did, w_min - lmax));
00111     RETURN(NULL);
00112 }
00113 
00114 Xapian::doccount
00115 AndPostList::get_termfreq_max() const
00116 {
00117     DEBUGCALL(MATCH, Xapian::doccount, "AndPostList::get_termfreq_max", "");
00118     RETURN(std::min(l->get_termfreq_max(), r->get_termfreq_max()));
00119 }
00120 
00121 Xapian::doccount
00122 AndPostList::get_termfreq_min() const
00123 {
00124     DEBUGCALL(MATCH, Xapian::doccount, "AndPostList::get_termfreq_min", "");
00125     // To have minimum matching documents, sets of documents matching both
00126     // components of the AND must be minimal in size, and maximally disjoint.
00127     //
00128     // The overlap is then given by:
00129     //  = lower_bound(left) + lower_bound(right) - dbsize
00130     Xapian::doccount lmin = l->get_termfreq_min();
00131     Xapian::doccount sum = lmin + r->get_termfreq_min();
00132     // If sum < lmin, then the calculation overflowed and the true value of
00133     // sum must be > dbsize.
00134     if (sum < lmin || sum > dbsize) {
00135         RETURN(sum - dbsize);
00136     }
00137     RETURN(0u);
00138 }
00139 
00140 Xapian::doccount
00141 AndPostList::get_termfreq_est() const
00142 {
00143     DEBUGCALL(MATCH, Xapian::doccount, "AndPostList::get_termfreq_est", "");
00144     // Estimate assuming independence:
00145     // P(l and r) = P(l) . P(r)
00146     double lest = static_cast<double>(l->get_termfreq_est());
00147     double rest = static_cast<double>(r->get_termfreq_est());
00148     RETURN(static_cast<Xapian::doccount>(lest * rest / dbsize + 0.5));
00149 }
00150 
00151 Xapian::docid
00152 AndPostList::get_docid() const
00153 {
00154     DEBUGCALL(MATCH, Xapian::docid, "AndPostList::get_docid", "");
00155     RETURN(head);
00156 }
00157 
00158 // only called if we are doing a probabilistic AND
00159 Xapian::weight
00160 AndPostList::get_weight() const
00161 {
00162     DEBUGCALL(MATCH, Xapian::weight, "AndPostList::get_weight", "");
00163     RETURN(l->get_weight() + r->get_weight());
00164 }
00165 
00166 // only called if we are doing a probabilistic operation
00167 Xapian::weight
00168 AndPostList::get_maxweight() const
00169 {
00170     DEBUGCALL(MATCH, Xapian::weight, "AndPostList::get_maxweight", "");
00171     RETURN(lmax + rmax);
00172 }
00173 
00174 Xapian::weight
00175 AndPostList::recalc_maxweight()
00176 {
00177     DEBUGCALL(MATCH, Xapian::weight, "AndPostList::recalc_maxweight", "");
00178     lmax = l->recalc_maxweight();
00179     rmax = r->recalc_maxweight();
00180     RETURN(AndPostList::get_maxweight());
00181 }
00182 
00183 bool
00184 AndPostList::at_end() const
00185 {
00186     DEBUGCALL(MATCH, bool, "AndPostList::at_end", "");
00187     RETURN(head == 0);
00188 }
00189 
00190 std::string
00191 AndPostList::get_description() const
00192 {
00193     return "(" + l->get_description() + " And " + r->get_description() + ")";
00194 }
00195 
00196 Xapian::doclength
00197 AndPostList::get_doclength() const
00198 {
00199     DEBUGCALL(MATCH, Xapian::doclength, "AndPostList::get_doclength", "");
00200     Xapian::doclength doclength = l->get_doclength();
00201     DEBUGLINE(MATCH, "docid=" << head);
00202     AssertEqDouble(l->get_doclength(), r->get_doclength());
00203     RETURN(doclength);
00204 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.