matcher/tradweight.cc

Go to the documentation of this file.
00001 /* tradweight.cc: C++ class for weight calculation routines
00002  *
00003  * Copyright 1999,2000,2001 BrightStation PLC
00004  * Copyright 2002 Ananova Ltd
00005  * Copyright 2002,2003,2004,2006 Olly Betts
00006  * Copyright 2007 Lemur Consulting Ltd
00007  *
00008  * This program is free software; you can redistribute it and/or
00009  * modify it under the terms of the GNU General Public License as
00010  * published by the Free Software Foundation; either version 2 of the
00011  * License, or (at your option) any later version.
00012  *
00013  * This program is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016  * GNU General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU General Public License
00019  * along with this program; if not, write to the Free Software
00020  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
00021  * USA
00022  */
00023 
#include <config.h>

#include <math.h>

#include <xapian/enquire.h>
#include <xapian/error.h>

#include "omassert.h"
#include "omdebug.h"
#include "serialise-double.h"
#include "weightinternal.h"
00034 
00035 using namespace std;
00036 
00037 namespace Xapian {
00038 
00039 TradWeight * TradWeight::clone() const {
00040     return new TradWeight(param_k);
00041 }
00042 
00043 string TradWeight::name() const { return "Trad"; }
00044 
00045 string TradWeight::serialise() const {
00046     return serialise_double(param_k);
00047 }
00048 
00049 TradWeight * TradWeight::unserialise(const string & s) const {
00050     const char *p = s.data();
00051     const char *p_end = p + s.size();
00052     double param_k_ = unserialise_double(&p, p_end);
00053     // FIXME: should check that (p == p_end).
00054     return new TradWeight(param_k_);
00055 }
00056 
00057 // Calculate weights using statistics retrieved from databases
00058 void
00059 TradWeight::calc_termweight() const
00060 {
00061     DEBUGCALL(MATCH, void, "TradWeight::calc_termweight", "");
00062 
00063     lenpart = internal->average_length;
00064     // lenpart == 0 if there are no documents, or only empty documents.
00065     if (lenpart != 0) lenpart = param_k / lenpart;
00066 
00067     Xapian::doccount termfreq = internal->termfreq;
00068 
00069     DEBUGLINE(WTCALC, "Statistics: N=" << internal->collection_size <<
00070               " n_t=" << termfreq << " lenpart=" << lenpart);
00071 
00072     Xapian::weight tw = 0;
00073     if (internal->rset_size != 0) {
00074         Xapian::doccount rtermfreq = internal->reltermfreq;
00075 
00076         DEBUGLINE(WTCALC, " R=" << internal->rset_size << " r_t=" << rtermfreq);
00077 
00078         // termfreq must be at least rtermfreq since there are at least
00079         // rtermfreq documents indexed by this term.  And it can't be more than
00080         // (internal->collection_size - internal->rset_size + rtermfreq) since
00081         // the number of relevant documents not indexed by this term can't be
00082         // more than the number of documents not indexed by this term.
00083         Assert(termfreq >= rtermfreq);
00084         Assert(termfreq <= internal->collection_size - internal->rset_size + rtermfreq);
00085 
00086         tw = ((rtermfreq + 0.5) *
00087               (internal->collection_size - internal->rset_size - termfreq + rtermfreq + 0.5)) /
00088              ((internal->rset_size - rtermfreq + 0.5) *
00089               (termfreq - rtermfreq + 0.5));
00090     } else {
00091         tw = (internal->collection_size - termfreq + 0.5) / (termfreq + 0.5);
00092     }
00093 
00094     Assert(tw > 0);
00095 
00096     // FIXME This is to guarantee nice properties (monotonic increase) of the
00097     // weighting function.  Actually, I think the important point is that
00098     // it ensures that tw is positive.
00099     // Check whether this actually helps / whether it hinders efficiency
00100     if (tw < 2) {
00101         tw = tw / 2 + 1;
00102     }
00103     tw = log(tw);
00104 
00105     DEBUGLINE(WTCALC, " => termweight = " << tw);
00106     termweight = tw;
00107     weight_calculated = true;
00108 }
00109 
00110 Xapian::weight
00111 TradWeight::get_sumpart(Xapian::termcount wdf, Xapian::doclength len) const
00112 {
00113     DEBUGCALL(MATCH, Xapian::weight, "TradWeight::get_sumpart", wdf << ", " << len);
00114     if (!weight_calculated) calc_termweight();
00115 
00116     Xapian::weight wt = double(wdf) / (len * lenpart + wdf);
00117 
00118     wt *= termweight;
00119 
00120     RETURN(wt);
00121 }
00122 
00123 Xapian::weight
00124 TradWeight::get_maxpart() const
00125 {
00126     DEBUGCALL(MATCH, Xapian::weight, "TradWeight::get_maxpart", "");
00127     if (!weight_calculated) calc_termweight();
00128 
00129     RETURN(termweight);
00130 }
00131 
00132 Xapian::weight
00133 TradWeight::get_sumextra(Xapian::doclength /*len*/) const
00134 {
00135     DEBUGCALL(MATCH, Xapian::weight, "TradWeight::get_sumextra", "/*len*/");
00136     RETURN(0);
00137 }
00138 
00139 Xapian::weight
00140 TradWeight::get_maxextra() const
00141 {
00142     DEBUGCALL(MATCH, Xapian::weight, "TradWeight::get_maxextra", "");
00143     RETURN(0);
00144 }
00145 
00146 bool TradWeight::get_sumpart_needs_doclength() const {
00147     if (!weight_calculated) calc_termweight();
00148     return (lenpart != 0);
00149 }
00150 
00151 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.