expand/expandweight.cc

Go to the documentation of this file.
00001 /* expandweight.cc: C++ class for weight calculation routines
00002  *
00003  * Copyright 1999,2000,2001 BrightStation PLC
00004  * Copyright 2002 Ananova Ltd
00005  * Copyright 2003,2004,2007 Olly Betts
00006  *
00007  * This program is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU General Public License as
00009  * published by the Free Software Foundation; either version 2 of the
00010  * License, or (at your option) any later version.
00011  *
00012  * This program is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
00020  * USA
00021  */
00022 
00023 #include <config.h>
00024 
00025 #include <math.h>
00026 
00027 #include "expandweight.h"
00028 #include "termlist.h"
00029 
00030 #include "omassert.h"
00031 #include "omdebug.h"
00032 
00033 using namespace std;
00034 
00035 namespace Xapian {
00036 namespace Internal {
00037 
00038 /// Construct an ExpandWeight: each member is initialised from the argument
00039 /// of the same name (minus the trailing underscore); nothing else is done.
00040 ExpandWeight::ExpandWeight(const Xapian::Database &db_,
00041                            Xapian::doccount rsize_,
00042                            bool use_exact_termfreq_,
00043                            double expand_k_)
00044     : db(db_), rsize(rsize_),
00045       use_exact_termfreq(use_exact_termfreq_), expand_k(expand_k_)
00046 {
00047     DEBUGCALL(MATCH, void, "ExpandWeight", db_ << ", " << rsize_ << ", " << use_exact_termfreq_ << ", " << expand_k_);
00048 }
00049 
00050 /// Calculate the expand weight of term tname from the statistics which
00051 /// merger accumulates; returns stats.multiplier times the log term weight.
00052 Xapian::weight
00053 ExpandWeight::get_weight(TermList * merger, const std::string &tname) const
00054 {
00055     DEBUGCALL(MATCH, Xapian::weight, "ExpandWeight::get_weight", "[merger], " << tname);
00056     ExpandStats stats(db.get_avlength(), expand_k);
00057     merger->accumulate_stats(stats);
00058     const double rel_tf = stats.rtermfreq;
00059     double est_tf = double(stats.termfreq);
00060 
00061     // If the size recorded in stats differs from the database's document
00062     // count, termfreq is either looked up exactly or scaled by their ratio.
00063     const Xapian::doccount total_docs = db.get_doccount();
00064     const bool sizes_differ = (stats.dbsize != total_docs);
00065     if (sizes_differ && use_exact_termfreq) {
00066         est_tf = db.get_termfreq(tname);
00067         DEBUGLINE(EXPAND, "Asked database for termfreq of `" << tname <<
00068                   "': " << est_tf);
00069     } else if (sizes_differ) {
00070         est_tf *= double(total_docs) / stats.dbsize;
00071         DEBUGLINE(EXPAND, "Approximating termfreq of `" << tname << "': " <<
00072                   stats.termfreq << " * " << total_docs << " / " <<
00073                   stats.dbsize << " = " << est_tf << " (true value is:" <<
00074                   db.get_termfreq(tname) << ")");
00075         // Clamp the estimate to [rel_tf, total_docs - rsize + rel_tf]: at
00076         // least rel_tf documents are indexed by this term, and the number of
00077         // relevant documents not indexed by it can't exceed the number of
00078         // documents not indexed by it.
00079         const double upper_bound = total_docs - rsize + rel_tf;
00080         if (est_tf < rel_tf) {
00081             est_tf = rel_tf;
00082         } else if (est_tf > upper_bound) {
00083             est_tf = upper_bound;
00084         }
00085     }
00086 
00087     DEBUGMSG(EXPAND, "ExpandWeight::get_weight: "
00088              "N=" << total_docs << ", "
00089              "n=" << est_tf << ", "
00090              "R=" << rsize << ", "
00091              "r=" << rel_tf << ", "
00092              "mult=" << stats.multiplier);
00093 
00094     // Term weight before the logarithm is taken.
00095     Xapian::weight tw =
00096         (rel_tf + 0.5) * (total_docs - rsize - est_tf + rel_tf + 0.5) /
00097         ((rsize - rel_tf + 0.5) * (est_tf - rel_tf + 0.5));
00098     Assert(tw > 0);
00099 
00100     // FIXME: values below 2 are remapped to tw / 2 + 1 (above 1 whenever
00101     // tw > 0) so that the logarithm comes out positive; this is meant to
00102     // keep the weighting function monotonically increasing.  Whether it
00103     // actually helps, or hinders efficiency, still needs checking.
00104     tw = log(tw < 2 ? tw / 2 + 1 : tw);
00105 
00106     DEBUGLINE(EXPAND, " => Term weight = " << tw <<
00107               " Expand weight = " << stats.multiplier * tw);
00108 
00109     // Alternative result (disabled): rel_tf * tw.
00110     RETURN(stats.multiplier * tw);
00111 }
00112 
00113 }
00114 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.