00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include <config.h>
00024
00025 #include <xapian/expanddecider.h>
00026
00027 #include "expand.h"
00028 #include "expandweight.h"
00029 #include "rset.h"
00030 #include "ortermlist.h"
00031 #include "omdebug.h"
00032
00033 #include <algorithm>
00034 #include "autoptr.h"
00035
00036 #include "omenquireinternal.h"
00037
00038 using std::nth_element;
00039 using std::priority_queue;
00040 using std::sort;
00041 using std::vector;
00042 using namespace Xapian::Internal;
00043
00044 class OmESetCmp {
00045 public:
00046 bool operator()(const Xapian::Internal::ESetItem &a, const Xapian::Internal::ESetItem &b) {
00047 if (a.wt > b.wt) return true;
00048 if (a.wt != b.wt) return false;
00049 return a.tname > b.tname;
00050 }
00051 };
00052
00053 class TLPCmpGt {
00054 public:
00055 bool operator()(const TermList *a, const TermList *b) {
00056 return a->get_approx_size() > b->get_approx_size();
00057 }
00058 };
00059
00060 AutoPtr<TermList>
00061 OmExpand::build_tree(const RSetI *rset)
00062 {
00063
00064
00065
00066
00067
00068
00069 priority_queue<TermList*, vector<TermList*>, TLPCmpGt> pq;
00070 try {
00071 set<Xapian::docid>::const_iterator i;
00072 for (i = rset->documents.begin(); i != rset->documents.end(); ++i) {
00073 unsigned int multiplier = db.internal.size();
00074 Xapian::docid realdid = (*i - 1) / multiplier + 1;
00075 Xapian::doccount dbnumber = (*i - 1) % multiplier;
00076
00077 AutoPtr<TermList> tl(db.internal[dbnumber]->open_term_list(realdid));
00078 pq.push(tl.get());
00079 tl.release();
00080 }
00081
00082 if (pq.empty()) {
00083 return AutoPtr<TermList>(0);
00084 }
00085
00086
00087
00088
00089
00090
00091
00092 while (true) {
00093 AutoPtr<TermList> tl(pq.top());
00094 pq.pop();
00095
00096 DEBUGLINE(EXPAND,
00097 "OmExpand: adding termlist " << tl.get() << " to tree");
00098 if (pq.empty()) {
00099 return tl;
00100 }
00101
00102
00103 AutoPtr<OrTermList> newtl(new OrTermList(pq.top(), tl.get()));
00104 tl.release();
00105 pq.pop();
00106 pq.push(newtl.get());
00107 newtl.release();
00108 }
00109 } catch (...) {
00110 while (!pq.empty()) {
00111 delete pq.top();
00112 pq.pop();
00113 }
00114 throw;
00115 }
00116 }
00117
00118 void
00119 OmExpand::expand(Xapian::termcount max_esize,
00120 Xapian::ESet & eset,
00121 const RSetI * rset,
00122 const Xapian::ExpandDecider * decider,
00123 bool use_exact_termfreq,
00124 double expand_k)
00125 {
00126 DEBUGCALL(EXPAND, void, "OmExpand::expand", max_esize << ", " << eset << ", " << rset << ", " << decider << ", " << use_exact_termfreq << ", " << expand_k);
00127 eset.internal->items.clear();
00128 eset.internal->ebound = 0;
00129
00130 DEBUGLINE(EXPAND, "OmExpand::expand()");
00131 if (rset->get_rsize() == 0 || max_esize == 0)
00132 return;
00133 DEBUGLINE(EXPAND, "OmExpand::expand() 2");
00134
00135 Xapian::weight w_min = 0;
00136
00137 AutoPtr<TermList> merger(build_tree(rset));
00138 if (merger.get() == 0) return;
00139
00140
00141 Xapian::Internal::ExpandWeight ewt(db, rset->get_rsize(), use_exact_termfreq, expand_k);
00142
00143 while (1) {
00144 {
00145 TermList *ret = merger->next();
00146 if (ret) {
00147 DEBUGLINE(EXPAND, "*** REPLACING ROOT");
00148 merger = ret;
00149 }
00150 }
00151
00152 if (merger->at_end()) break;
00153
00154 string tname = merger->get_termname();
00155 if (!decider || (*decider)(tname)) {
00156 eset.internal->ebound++;
00157
00158 Xapian::weight wt = ewt.get_weight(merger.get(), tname);
00159
00160 if (wt > w_min) {
00161 eset.internal->items.push_back(Xapian::Internal::ESetItem(wt, tname));
00162
00163
00164
00165
00166
00167 if (eset.internal->items.size() == max_esize * 2) {
00168
00169 DEBUGLINE(EXPAND, "finding nth");
00170 nth_element(eset.internal->items.begin(),
00171 eset.internal->items.begin() + max_esize - 1,
00172 eset.internal->items.end(),
00173 OmESetCmp());
00174
00175 eset.internal->items.erase(eset.internal->items.begin() + max_esize,
00176 eset.internal->items.end());
00177 w_min = eset.internal->items.back().wt;
00178 DEBUGLINE(EXPAND, "eset size = " << eset.internal->items.size());
00179 }
00180 }
00181 }
00182 }
00183
00184 if (eset.internal->items.size() > max_esize) {
00185
00186 DEBUGLINE(EXPAND, "finding nth");
00187 nth_element(eset.internal->items.begin(),
00188 eset.internal->items.begin() + max_esize - 1,
00189 eset.internal->items.end(), OmESetCmp());
00190
00191 eset.internal->items.erase(eset.internal->items.begin() + max_esize, eset.internal->items.end());
00192 }
00193 DEBUGLINE(EXPAND, "sorting");
00194
00195
00196 sort(eset.internal->items.begin(), eset.internal->items.end(), OmESetCmp());
00197
00198 DEBUGLINE(EXPAND, "esize = " << eset.internal->items.size() << ", ebound = " << eset.internal->ebound);
00199 if (eset.internal->items.size()) {
00200 DEBUGLINE(EXPAND, "max weight in eset = " << eset.internal->items.front().wt
00201 << ", min weight in eset = " << eset.internal->items.back().wt);
00202 }
00203 }