matcher/queryoptimiser.cc

Go to the documentation of this file.
00001 
00004 /* Copyright (C) 2007,2008 Olly Betts
00005  * Copyright (C) 2008 Lemur Consulting Ltd
00006  *
00007  * This program is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU General Public License as
00009  * published by the Free Software Foundation; either version 2 of the
00010  * License, or (at your option) any later version.
00011  *
00012  * This program is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
00020  */
00021 
00022 #include <config.h>
00023 
00024 #include "queryoptimiser.h"
00025 
00026 #include "andmaybepostlist.h"
00027 #include "andnotpostlist.h"
00028 #include "autoptr.h"
00029 #include "emptypostlist.h"
00030 #include "exactphrasepostlist.h"
00031 #include "multiandpostlist.h"
00032 #include "multimatch.h"
00033 #include "omassert.h"
00034 #include "omdebug.h"
00035 #include "omqueryinternal.h"
00036 #include "orpostlist.h"
00037 #include "phrasepostlist.h"
00038 #include "postlist.h"
00039 #include "valuegepostlist.h"
00040 #include "valuerangepostlist.h"
00041 #include "xorpostlist.h"
00042 
00043 #include <algorithm>
00044 #include <list>
00045 #include <map>
00046 #include <string>
00047 #include <vector>
00048 
00049 using namespace std;
00050 
00051 PostList *
00052 QueryOptimiser::do_subquery(const Xapian::Query::Internal * query, double factor)
00053 {
00054     DEBUGCALL(MATCH, PostList *, "QueryOptimiser::do_subquery",
00055               query << ", " << factor);
00056 
00057     // Handle QueryMatchNothing.
00058     if (!query) RETURN(new EmptyPostList());
00059 
00060     switch (query->op) {
00061         case Xapian::Query::Internal::OP_LEAF:
00062             RETURN(do_leaf(query, factor));
00063 
00064         case Xapian::Query::OP_AND:
00065         case Xapian::Query::OP_FILTER:
00066         case Xapian::Query::OP_NEAR:
00067         case Xapian::Query::OP_PHRASE:
00068             RETURN(do_and_like(query, factor));
00069 
00070         case Xapian::Query::OP_OR:
00071         case Xapian::Query::OP_XOR:
00072         case Xapian::Query::OP_ELITE_SET:
00073             RETURN(do_or_like(query, factor));
00074 
00075         case Xapian::Query::OP_AND_NOT: {
00076             AssertEq(query->subqs.size(), 2);
00077             PostList * l = do_subquery(query->subqs[0], factor);
00078             PostList * r = do_subquery(query->subqs[1], 0.0);
00079             RETURN(new AndNotPostList(l, r, matcher, db_size));
00080         }
00081 
00082         case Xapian::Query::OP_AND_MAYBE: {
00083             AssertEq(query->subqs.size(), 2);
00084             PostList * l = do_subquery(query->subqs[0], factor);
00085             PostList * r = do_subquery(query->subqs[1], factor);
00086             RETURN(new AndMaybePostList(l, r, matcher, db_size));
00087         }
00088 
00089         case Xapian::Query::OP_VALUE_RANGE: {
00090             Xapian::valueno valno(query->parameter);
00091             const string & range_begin = query->tname;
00092             const string & range_end = query->str_parameter;
00093             RETURN(new ValueRangePostList(&db, valno, range_begin, range_end));
00094         }
00095 
00096         case Xapian::Query::OP_VALUE_GE: {
00097             Xapian::valueno valno(query->parameter);
00098             const string & range_begin = query->tname;
00099             RETURN(new ValueGePostList(&db, valno, range_begin));
00100         }
00101 
00102         case Xapian::Query::OP_VALUE_LE: {
00103             Xapian::valueno valno(query->parameter);
00104             const string & range_end = query->tname;
00105             RETURN(new ValueRangePostList(&db, valno, "", range_end));
00106         }
00107 
00108         case Xapian::Query::OP_SCALE_WEIGHT: {
00109             AssertEq(query->subqs.size(), 1);
00110             double sub_factor = factor;
00111             if (sub_factor != 0.0) sub_factor *= query->get_dbl_parameter();
00112             RETURN(do_subquery(query->subqs[0], sub_factor));
00113         }
00114 
00115         default:
00116             Assert(false);
00117             RETURN(NULL);
00118     }
00119 }
00120 
00121 struct PosFilter {
00122     PosFilter(Xapian::Query::Internal::op_t op_, size_t begin_, size_t end_,
00123               Xapian::termcount window_)
00124         : op(op_), begin(begin_), end(end_), window(window_) { }
00125 
00126     Xapian::Query::Internal::op_t op;
00127 
00129     size_t begin, end;
00130 
00131     Xapian::termcount window;
00132 };
00133 
00134 PostList *
00135 QueryOptimiser::do_and_like(const Xapian::Query::Internal *query, double factor)
00136 {
00137     DEBUGCALL(MATCH, PostList *, "QueryOptimiser::do_and_like",
00138               query << ", " << factor);
00139 
00140     list<PosFilter> pos_filters;
00141     vector<PostList *> plists;
00142     do_and_like(query, factor, plists, pos_filters);
00143     AssertRel(plists.size(), >=, 2);
00144 
00145     PostList * pl;
00146     pl = new MultiAndPostList(plists.begin(), plists.end(), matcher, db_size);
00147 
00148     // Sort the positional filters to try to apply them in an efficient order.
00149     // FIXME: We need to figure out what that is!  Try applying lowest cf/tf
00150     // first?
00151 
00152     // Apply any positional filters.
00153     list<PosFilter>::iterator i;
00154     for (i = pos_filters.begin(); i != pos_filters.end(); ++i) {
00155         const PosFilter & filter = *i;
00156 
00157         // FIXME: make NearPostList, etc ctors take a pair of itors so we don't
00158         // need to create this temporary vector.
00159         vector<PostList *> terms(plists.begin() + filter.begin,
00160                                  plists.begin() + filter.end);
00161 
00162         Xapian::termcount window = filter.window;
00163         if (filter.op == Xapian::Query::OP_NEAR) {
00164             pl = new NearPostList(pl, window, terms);
00165         } else if (window == filter.end - filter.begin) {
00166             AssertEq(filter.op, Xapian::Query::OP_PHRASE);
00167             pl = new ExactPhrasePostList(pl, terms);
00168         } else {
00169             AssertEq(filter.op, Xapian::Query::OP_PHRASE);
00170             pl = new PhrasePostList(pl, window, terms);
00171         }
00172     }
00173 
00174     RETURN(pl);
00175 }
00176 
00177 inline bool is_and_like(Xapian::Query::Internal::op_t op) {
00178     return op == Xapian::Query::OP_AND || op == Xapian::Query::OP_FILTER ||
00179            op == Xapian::Query::OP_NEAR || op == Xapian::Query::OP_PHRASE;
00180 }
00181 
00182 void
00183 QueryOptimiser::do_and_like(const Xapian::Query::Internal *query, double factor,
00184                             vector<PostList *> & and_plists,
00185                             list<PosFilter> & pos_filters)
00186 {
00187     DEBUGCALL(MATCH, void, "QueryOptimiser::do_and_like",
00188               query << ", " << factor << ", [and_plists], [pos_filters]");
00189 
00190     Xapian::Query::Internal::op_t op = query->op;
00191     Assert(is_and_like(op));
00192 
00193     bool positional = false;
00194     if (op == Xapian::Query::OP_PHRASE || op == Xapian::Query::OP_NEAR) {
00195         // If this sub-database has no positional information, change
00196         // OP_PHRASE/OP_NEAR into OP_AND so that we actually return some
00197         // matches.
00198         if (!db.has_positions()) {
00199             op = Xapian::Query::OP_AND;
00200         } else {
00201             positional = true;
00202         }
00203     }
00204 
00205     const Xapian::Query::Internal::subquery_list &queries = query->subqs;
00206     AssertRel(queries.size(), >=, 2);
00207 
00208     for (size_t i = 0; i != queries.size(); ++i) {
00209         // The second branch of OP_FILTER is always boolean.
00210         if (i == 1 && op == Xapian::Query::OP_FILTER) factor = 0.0;
00211 
00212         const Xapian::Query::Internal * subq = queries[i];
00213         if (is_and_like(subq->op)) {
00214             do_and_like(subq, factor, and_plists, pos_filters);
00215         } else {
00216             PostList * pl = do_subquery(subq, factor);
00217             and_plists.push_back(pl);
00218         }
00219     }
00220 
00221     if (positional) {
00222         // Record the positional filter to apply higher up the tree.
00223         size_t end = and_plists.size();
00224         size_t begin = end - queries.size();
00225         Xapian::termcount window = query->parameter;
00226 
00227         pos_filters.push_back(PosFilter(op, begin, end, window));
00228     }
00229 }
00230 
00236 struct CmpMaxOrTerms {
00245     bool operator()(const PostList *a, const PostList *b) {
00246         if (a->get_termfreq_max() == 0) return false;
00247         if (b->get_termfreq_max() == 0) return true;
00248 
00249 #if defined(__i386__) || defined(__mc68000__)
00250         // On some architectures, most common of which is x86, floating point
00251         // values are calculated and stored in registers with excess precision.
00252         // If the two get_maxweight() calls below return identical values in a
00253         // register, the excess precision may be dropped for one of them but
00254         // not the other (e.g. because the compiler saves the first calculated
00255         // weight to memory while calculating the second, then reloads it to
00256         // compare).  This leads to both a > b and b > a being true, which
00257         // violates the antisymmetry property of the strict weak ordering
00258         // required by nth_element().  This can have serious consequences (e.g.
00259         // segfaults).
00260         //
00261         // To avoid this, we store each result in a volatile double prior to
00262         // comparing them.  This means that the result of this test should
00263         // match that on other architectures with the same double format (which
00264         // is desirable), and actually has less overhead than rounding both
00265         // results to float (which is another approach which works).
00266         volatile double a_max_wt = a->get_maxweight();
00267         volatile double b_max_wt = b->get_maxweight();
00268         return a_max_wt > b_max_wt;
00269 #else
00270         return a->get_maxweight() > b->get_maxweight();
00271 #endif
00272     }
00273 };
00274 
00275 template<class CLASS> struct delete_ptr {
00276     void operator()(CLASS *p) { delete p; }
00277 };
00278 
00280 struct ComparePostListTermFreqAscending {
00282     bool operator()(const PostList *a, const PostList *b) {
00283         return a->get_termfreq_est() > b->get_termfreq_est();
00284     }
00285 };
00286 
00287 PostList *
00288 QueryOptimiser::do_or_like(const Xapian::Query::Internal *query, double factor)
00289 {
00290     DEBUGCALL(MATCH, PostList *, "QueryOptimiser::do_or_like",
00291               query << ", " << factor);
00292 
00293     // FIXME: we could optimise by merging OP_ELITE_SET and OP_OR like we do
00294     // for AND-like operations.
00295     Xapian::Query::Internal::op_t op = query->op;
00296     Assert(op == Xapian::Query::OP_ELITE_SET || op == Xapian::Query::OP_OR ||
00297            op == Xapian::Query::OP_XOR);
00298 
00299     const Xapian::Query::Internal::subquery_list &queries = query->subqs;
00300     AssertRel(queries.size(), >=, 2);
00301 
00302     vector<PostList *> postlists;
00303     postlists.reserve(queries.size());
00304 
00305     Xapian::Query::Internal::subquery_list::const_iterator q;
00306     for (q = queries.begin(); q != queries.end(); ++q) {
00307         postlists.push_back(do_subquery(*q, factor));
00308     }
00309 
00310     if (op == Xapian::Query::OP_ELITE_SET) {
00311         // Select the best elite_set_size terms.
00312         Xapian::termcount elite_set_size = query->parameter;
00313         Assert(elite_set_size > 0);
00314 
00315         if (postlists.size() > elite_set_size) {
00316             // Call recalc_maxweight() as otherwise get_maxweight()
00317             // may not be valid before next() or skip_to()
00318             for_each(postlists.begin(), postlists.end(),
00319                      mem_fun(&PostList::recalc_maxweight));
00320 
00321             nth_element(postlists.begin(),
00322                         postlists.begin() + elite_set_size - 1,
00323                         postlists.end(), CmpMaxOrTerms());
00324 
00325             for_each(postlists.begin() + elite_set_size, postlists.end(),
00326                      delete_ptr<PostList>());
00327 
00328             if (elite_set_size == 1) RETURN(postlists[0]);
00329 
00330             postlists.resize(elite_set_size);
00331         }
00332     }
00333 
00334     // Make postlists into a heap so that the postlist with the greatest term
00335     // frequency is at the top of the heap.
00336     make_heap(postlists.begin(), postlists.end(),
00337               ComparePostListTermFreqAscending());
00338 
00339     // Now build a tree of binary OrPostList or XorPostList objects.  The
00340     // algorithm used to build the tree is like that used to build an
00341     // optimal Huffman coding tree.  If we called next() repeatedly, this
00342     // arrangement would minimise the number of method calls.  Generally we
00343     // don't actually do that, but this arrangement is still likely to be a
00344     // good one, and it does minimise the work in the worst case.
00345     AssertRel(postlists.size(), >=, 2);
00346     while (true) {
00347         // We build the tree such that at each branch:
00348         //
00349         //   l.get_termfreq_est() >= r.get_termfreq_est()
00350         //
00351         // We do this so that the OrPostList and XorPostList classes can be
00352         // optimised assuming that this is the case.
00353         PostList * r = postlists.front();
00354         pop_heap(postlists.begin(), postlists.end(),
00355                  ComparePostListTermFreqAscending());
00356         postlists.pop_back();
00357         PostList * l = postlists.front();
00358 
00359         PostList * pl;
00360         if (op == Xapian::Query::OP_XOR) {
00361             pl = new XorPostList(l, r, matcher, db_size);
00362         } else {
00363             pl = new OrPostList(l, r, matcher, db_size);
00364         }
00365 
00366         if (postlists.size() == 1) RETURN(pl);
00367 
00368         pop_heap(postlists.begin(), postlists.end(),
00369                  ComparePostListTermFreqAscending());
00370         postlists.back() = pl;
00371         push_heap(postlists.begin(), postlists.end(),
00372                   ComparePostListTermFreqAscending());
00373     }
00374 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.