matcher/bm25weight.cc

Go to the documentation of this file.
00001 /* bm25weight.cc: Class for BM25 weight calculation
00002  *
00003  * Copyright 1999,2000,2001 BrightStation PLC
00004  * Copyright 2002 Ananova Ltd
00005  * Copyright 2002,2003,2004,2005,2006,2007 Olly Betts
00006  * Copyright 2007 Lemur Consulting Ltd
00007  *
00008  * This program is free software; you can redistribute it and/or
00009  * modify it under the terms of the GNU General Public License as
00010  * published by the Free Software Foundation; either version 2 of the
00011  * License, or (at your option) any later version.
00012  *
00013  * This program is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016  * GNU General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU General Public License
00019  * along with this program; if not, write to the Free Software
00020  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
00021  * USA
00022  */
00023 
#include <config.h>

#include <math.h>

#include <xapian/enquire.h>
#include <xapian/error.h>

#include "omassert.h"
#include "omdebug.h"
#include "serialise-double.h"
#include "weightinternal.h"
00034 
00035 using namespace std;
00036 
00037 namespace Xapian {
00038 
00039 BM25Weight * BM25Weight::clone() const {
00040     return new BM25Weight(k1, k2, k3, b, min_normlen);
00041 }
00042 
00043 string BM25Weight::name() const { return "BM25"; }
00044 
00045 string BM25Weight::serialise() const {
00046     string result = serialise_double(k1);
00047     result += serialise_double(k2);
00048     result += serialise_double(k3);
00049     result += serialise_double(b);
00050     result += serialise_double(min_normlen);
00051     return result;
00052 }
00053 
00054 BM25Weight * BM25Weight::unserialise(const string & s) const {
00055     const char *p = s.data();
00056     const char *p_end = p + s.size();
00057     double k1_ = unserialise_double(&p, p_end);
00058     double k2_ = unserialise_double(&p, p_end);
00059     double k3_ = unserialise_double(&p, p_end);
00060     double b_ = unserialise_double(&p, p_end);
00061     double min_normlen_ = unserialise_double(&p, p_end);
00062     // FIXME: should check that (p == p_end).
00063     return new BM25Weight(k1_, k2_, k3_, b_, min_normlen_);
00064 }
00065 
00066 // Calculate weights using statistics retrieved from databases
00067 void
00068 BM25Weight::calc_termweight() const
00069 {
00070     DEBUGCALL(MATCH, void, "BM25Weight::calc_termweight", "");
00071 
00072     lenpart = internal->average_length;
00073     // lenpart == 0 if there are no documents, or only empty documents.
00074     if (lenpart != 0) lenpart = 1 / lenpart;
00075 
00076     Xapian::doccount termfreq = internal->termfreq;
00077 
00078     DEBUGLINE(WTCALC, "Statistics: N=" << internal->collection_size <<
00079               " n_t=" << termfreq << " lenpart=" << lenpart);
00080 
00081     Xapian::weight tw = 0;
00082     if (internal->rset_size != 0) {
00083         Xapian::doccount rtermfreq = internal->reltermfreq;
00084 
00085         DEBUGLINE(WTCALC, " R=" << internal->rset_size << " r_t=" << rtermfreq);
00086 
00087         // termfreq must be at least rtermfreq since there are at least
00088         // rtermfreq documents indexed by this term.  And it can't be more than
00089         // (internal->collection_size - internal->rset_size + rtermfreq) since
00090         // the number of relevant documents not indexed by this term can't be
00091         // more than the number of documents not indexed by this term.
00092         Assert(termfreq >= rtermfreq);
00093         Assert(termfreq <= internal->collection_size - internal->rset_size + rtermfreq);
00094 
00095         tw = ((rtermfreq + 0.5) *
00096               (internal->collection_size - internal->rset_size - termfreq + rtermfreq + 0.5)) /
00097              ((internal->rset_size - rtermfreq + 0.5) *
00098               (termfreq - rtermfreq + 0.5));
00099     } else {
00100         tw = (internal->collection_size - termfreq + 0.5) / (termfreq + 0.5);
00101     }
00102 
00103     Assert(tw > 0);
00104 
00105     if (tw < 2) {
00106         tw = tw / 2 + 1;
00107     }
00108     tw = log(tw);
00109 
00110     tw *= (k3 + 1) * wqf / (k3 + wqf);
00111 
00112     DEBUGLINE(WTCALC, " => termweight = " << tw);
00113     termweight = tw;
00114     weight_calculated = true;
00115 }
00116 
00117 Xapian::weight
00118 BM25Weight::get_sumpart(Xapian::termcount wdf, Xapian::doclength len) const
00119 {
00120     DEBUGCALL(MATCH, Xapian::weight, "BM25Weight::get_sumpart", wdf << ", " << len);
00121     if (!weight_calculated) calc_termweight();
00122 
00123     Xapian::doclength normlen = len * lenpart;
00124     if (normlen < min_normlen) normlen = min_normlen;
00125 
00126     double denom = k1 * (normlen * b + (1 - b)) + wdf;
00127     Xapian::weight wt;
00128     if (denom != 0) {
00129         wt = double(wdf) * (k1 + 1) / denom;
00130     } else {
00131         wt = 0;
00132     }
00133     DEBUGLINE(WTCALC, "(wdf,len,lenpart) = (" << wdf << "," << len << "," <<
00134               lenpart << ") => wtadj = " << wt);
00135 
00136     wt *= termweight;
00137 
00138     DEBUGLINE(WTCALC, " => sumpart = " << wt);
00139 
00140     RETURN(wt);
00141 }
00142 
00143 Xapian::weight
00144 BM25Weight::get_maxpart() const
00145 {
00146     DEBUGCALL(MATCH, Xapian::weight, "BM25Weight::get_maxpart", "");
00147     if (!weight_calculated) calc_termweight();
00148     RETURN((k1 + 1) * termweight);
00149 }
00150 
00151 /* Should return k2 * querysize * (1-len) / (1+len)
00152  * However, want to return a positive value, so add (k2 * querysize) to
00153  * return.  ie: return 2 * k2 * querysize / (1 + len)
00154  */
00155 Xapian::weight
00156 BM25Weight::get_sumextra(Xapian::doclength len) const
00157 {
00158     DEBUGCALL(MATCH, Xapian::weight, "BM25Weight::get_sumextra", len);
00159     if (!weight_calculated) calc_termweight();
00160 
00161     Xapian::doclength normlen = len * lenpart;
00162     if (normlen < min_normlen) normlen = min_normlen;
00163     Xapian::weight extra = 2 * k2 * querysize / (1 + normlen);
00164     DEBUGLINE(WTCALC, "len = " << len << " querysize = " << querysize <<
00165               " => normlen = " << normlen << " => sumextra = " << extra);
00166     RETURN(extra);
00167 }
00168 
00169 Xapian::weight
00170 BM25Weight::get_maxextra() const
00171 {
00172     DEBUGCALL(MATCH, Xapian::weight, "BM25Weight::get_maxextra", "");
00173     Xapian::weight maxextra = 2 * k2 * querysize;
00174     DEBUGLINE(WTCALC, "querysize = " << querysize <<
00175               " => maxextra = " << maxextra);
00176     RETURN(maxextra);
00177 }
00178 
00179 bool BM25Weight::get_sumpart_needs_doclength() const {
00180     if (!weight_calculated) calc_termweight();
00181     return (b != 0 && k1 != 0 && lenpart != 0);
00182 }
00183 
00184 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.