expand/expand.cc

Go to the documentation of this file.
00001 /* expand.cc
00002  *
00003  * Copyright 1999,2000,2001 BrightStation PLC
00004  * Copyright 2002 Ananova Ltd
00005  * Copyright 2002,2003,2007 Olly Betts
00006  *
00007  * This program is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU General Public License as
00009  * published by the Free Software Foundation; either version 2 of the
00010  * License, or (at your option) any later version.
00011  *
00012  * This program is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
00020  * USA
00021  */
00022 
00023 #include <config.h>
00024 
00025 #include <xapian/expanddecider.h>
00026 
00027 #include "expand.h"
00028 #include "expandweight.h"
00029 #include "rset.h"
00030 #include "ortermlist.h"
00031 #include "omdebug.h"
00032 
00033 #include <algorithm>
00034 #include "autoptr.h"
00035 
00036 #include "omenquireinternal.h"
00037 
00038 using std::nth_element;
00039 using std::priority_queue;
00040 using std::sort;
00041 using std::vector;
00042 using namespace Xapian::Internal;
00043 
00044 class OmESetCmp {
00045     public:
00046         bool operator()(const Xapian::Internal::ESetItem &a, const Xapian::Internal::ESetItem &b) {
00047             if (a.wt > b.wt) return true;
00048             if (a.wt != b.wt) return false;
00049             return a.tname > b.tname;
00050         }
00051 };
00052 
00053 class TLPCmpGt {
00054     public:
00055         bool operator()(const TermList *a, const TermList *b) {
00056             return a->get_approx_size() > b->get_approx_size();
00057         }
00058 };
00059 
00060 AutoPtr<TermList>
00061 OmExpand::build_tree(const RSetI *rset)
00062 {
00063     // Put items in priority queue, such that items with greatest size
00064     // are returned first.
00065     // This is the same idea as for a set of postlists ORed together in
00066     // the matcher.
00067     //
00068     // FIXME: try using a heap instead (C++ sect 18.8)?
00069     priority_queue<TermList*, vector<TermList*>, TLPCmpGt> pq;
00070     try {
00071         set<Xapian::docid>::const_iterator i;
00072         for (i = rset->documents.begin(); i != rset->documents.end(); ++i) {
00073             unsigned int multiplier = db.internal.size();
00074             Xapian::docid realdid = (*i - 1) / multiplier + 1;
00075             Xapian::doccount dbnumber = (*i - 1) % multiplier;
00076 
00077             AutoPtr<TermList> tl(db.internal[dbnumber]->open_term_list(realdid));
00078             pq.push(tl.get());
00079             tl.release();
00080         }
00081 
00082         if (pq.empty()) {
00083             return AutoPtr<TermList>(0);
00084         }
00085 
00086         // Build a tree balanced by the term frequencies
00087         // (similar to building a huffman encoding tree).
00088         //
00089         // This scheme reduces the number of objects terms from large docs
00090         // get "pulled" through, reducing the amount of work done which
00091         // speeds things up.
00092         while (true) {
00093             AutoPtr<TermList> tl(pq.top());
00094             pq.pop();
00095 
00096             DEBUGLINE(EXPAND,
00097                       "OmExpand: adding termlist " << tl.get() << " to tree");
00098             if (pq.empty()) {
00099                 return tl;
00100             }
00101 
00102             // NB right is always <= left - we can use this to optimise
00103             AutoPtr<OrTermList> newtl(new OrTermList(pq.top(), tl.get()));
00104             tl.release();
00105             pq.pop();
00106             pq.push(newtl.get());
00107             newtl.release();
00108         }
00109     } catch (...) {
00110         while (!pq.empty()) {
00111             delete pq.top();
00112             pq.pop();
00113         }
00114         throw;
00115     }
00116 }
00117 
00118 void
00119 OmExpand::expand(Xapian::termcount max_esize,
00120                  Xapian::ESet & eset,
00121                  const RSetI * rset,
00122                  const Xapian::ExpandDecider * decider,
00123                  bool use_exact_termfreq,
00124                  double expand_k)
00125 {
00126     DEBUGCALL(EXPAND, void, "OmExpand::expand", max_esize << ", " << eset << ", " << rset << ", " << decider << ", " << use_exact_termfreq << ", " << expand_k);
00127     eset.internal->items.clear();
00128     eset.internal->ebound = 0;
00129 
00130     DEBUGLINE(EXPAND, "OmExpand::expand()");
00131     if (rset->get_rsize() == 0 || max_esize == 0)
00132         return; // No possibility of results
00133     DEBUGLINE(EXPAND, "OmExpand::expand() 2");
00134 
00135     Xapian::weight w_min = 0;
00136 
00137     AutoPtr<TermList> merger(build_tree(rset));
00138     if (merger.get() == 0) return;
00139 
00140     // Start weighting scheme
00141     Xapian::Internal::ExpandWeight ewt(db, rset->get_rsize(), use_exact_termfreq, expand_k);
00142 
00143     while (1) {
00144         {
00145             TermList *ret = merger->next();
00146             if (ret) {
00147                 DEBUGLINE(EXPAND, "*** REPLACING ROOT");
00148                 merger = ret;
00149             }
00150         }
00151 
00152         if (merger->at_end()) break;
00153 
00154         string tname = merger->get_termname();
00155         if (!decider || (*decider)(tname)) {
00156             eset.internal->ebound++;
00157 
00158             Xapian::weight wt = ewt.get_weight(merger.get(), tname);
00159 
00160             if (wt > w_min) {
00161                 eset.internal->items.push_back(Xapian::Internal::ESetItem(wt, tname));
00162 
00163                 // FIXME: find balance between larger size for more efficient
00164                 // nth_element and smaller size for better w_min optimisations
00165                 // Or perhaps better, switch to using minheap like matcher now
00166                 // uses.
00167                 if (eset.internal->items.size() == max_esize * 2) {
00168                     // find last element we care about
00169                     DEBUGLINE(EXPAND, "finding nth");
00170                     nth_element(eset.internal->items.begin(),
00171                                 eset.internal->items.begin() + max_esize - 1,
00172                                 eset.internal->items.end(),
00173                                 OmESetCmp());
00174                     // erase elements which don't make the grade
00175                     eset.internal->items.erase(eset.internal->items.begin() + max_esize,
00176                                      eset.internal->items.end());
00177                     w_min = eset.internal->items.back().wt;
00178                     DEBUGLINE(EXPAND, "eset size = " << eset.internal->items.size());
00179                 }
00180             }
00181         }
00182     }
00183 
00184     if (eset.internal->items.size() > max_esize) {
00185         // find last element we care about
00186         DEBUGLINE(EXPAND, "finding nth");
00187         nth_element(eset.internal->items.begin(),
00188                     eset.internal->items.begin() + max_esize - 1,
00189                     eset.internal->items.end(), OmESetCmp());
00190         // erase elements which don't make the grade
00191         eset.internal->items.erase(eset.internal->items.begin() + max_esize, eset.internal->items.end());
00192     }
00193     DEBUGLINE(EXPAND, "sorting");
00194 
00195     // Need a stable sort, but this is provided by comparison operator
00196     sort(eset.internal->items.begin(), eset.internal->items.end(), OmESetCmp());
00197 
00198     DEBUGLINE(EXPAND, "esize = " << eset.internal->items.size() << ", ebound = " << eset.internal->ebound);
00199     if (eset.internal->items.size()) {
00200         DEBUGLINE(EXPAND, "max weight in eset = " << eset.internal->items.front().wt
00201                  << ", min weight in eset = " << eset.internal->items.back().wt);
00202     }
00203 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.