00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include <config.h>
00025
00026 #include <math.h>
00027
00028 #include <xapian/enquire.h>
00029
00030 #include "omassert.h"
00031 #include "omdebug.h"
00032 #include "serialise-double.h"
00033 #include "weightinternal.h"
00034
00035 using namespace std;
00036
00037 namespace Xapian {
00038
00039 BM25Weight * BM25Weight::clone() const {
00040 return new BM25Weight(k1, k2, k3, b, min_normlen);
00041 }
00042
00043 string BM25Weight::name() const { return "BM25"; }
00044
00045 string BM25Weight::serialise() const {
00046 string result = serialise_double(k1);
00047 result += serialise_double(k2);
00048 result += serialise_double(k3);
00049 result += serialise_double(b);
00050 result += serialise_double(min_normlen);
00051 return result;
00052 }
00053
00054 BM25Weight * BM25Weight::unserialise(const string & s) const {
00055 const char *p = s.data();
00056 const char *p_end = p + s.size();
00057 double k1_ = unserialise_double(&p, p_end);
00058 double k2_ = unserialise_double(&p, p_end);
00059 double k3_ = unserialise_double(&p, p_end);
00060 double b_ = unserialise_double(&p, p_end);
00061 double min_normlen_ = unserialise_double(&p, p_end);
00062
00063 return new BM25Weight(k1_, k2_, k3_, b_, min_normlen_);
00064 }
00065
00066
00067 void
00068 BM25Weight::calc_termweight() const
00069 {
00070 DEBUGCALL(MATCH, void, "BM25Weight::calc_termweight", "");
00071
00072 lenpart = internal->average_length;
00073
00074 if (lenpart != 0) lenpart = 1 / lenpart;
00075
00076 Xapian::doccount termfreq = internal->termfreq;
00077
00078 DEBUGLINE(WTCALC, "Statistics: N=" << internal->collection_size <<
00079 " n_t=" << termfreq << " lenpart=" << lenpart);
00080
00081 Xapian::weight tw = 0;
00082 if (internal->rset_size != 0) {
00083 Xapian::doccount rtermfreq = internal->reltermfreq;
00084
00085 DEBUGLINE(WTCALC, " R=" << internal->rset_size << " r_t=" << rtermfreq);
00086
00087
00088
00089
00090
00091
00092 Assert(termfreq >= rtermfreq);
00093 Assert(termfreq <= internal->collection_size - internal->rset_size + rtermfreq);
00094
00095 tw = ((rtermfreq + 0.5) *
00096 (internal->collection_size - internal->rset_size - termfreq + rtermfreq + 0.5)) /
00097 ((internal->rset_size - rtermfreq + 0.5) *
00098 (termfreq - rtermfreq + 0.5));
00099 } else {
00100 tw = (internal->collection_size - termfreq + 0.5) / (termfreq + 0.5);
00101 }
00102
00103 Assert(tw > 0);
00104
00105 if (tw < 2) {
00106 tw = tw / 2 + 1;
00107 }
00108 tw = log(tw);
00109
00110 tw *= (k3 + 1) * wqf / (k3 + wqf);
00111
00112 DEBUGLINE(WTCALC, " => termweight = " << tw);
00113 termweight = tw;
00114 weight_calculated = true;
00115 }
00116
00117 Xapian::weight
00118 BM25Weight::get_sumpart(Xapian::termcount wdf, Xapian::doclength len) const
00119 {
00120 DEBUGCALL(MATCH, Xapian::weight, "BM25Weight::get_sumpart", wdf << ", " << len);
00121 if (!weight_calculated) calc_termweight();
00122
00123 Xapian::doclength normlen = len * lenpart;
00124 if (normlen < min_normlen) normlen = min_normlen;
00125
00126 double denom = k1 * (normlen * b + (1 - b)) + wdf;
00127 Xapian::weight wt;
00128 if (denom != 0) {
00129 wt = double(wdf) * (k1 + 1) / denom;
00130 } else {
00131 wt = 0;
00132 }
00133 DEBUGLINE(WTCALC, "(wdf,len,lenpart) = (" << wdf << "," << len << "," <<
00134 lenpart << ") => wtadj = " << wt);
00135
00136 wt *= termweight;
00137
00138 DEBUGLINE(WTCALC, " => sumpart = " << wt);
00139
00140 RETURN(wt);
00141 }
00142
00143 Xapian::weight
00144 BM25Weight::get_maxpart() const
00145 {
00146 DEBUGCALL(MATCH, Xapian::weight, "BM25Weight::get_maxpart", "");
00147 if (!weight_calculated) calc_termweight();
00148 RETURN((k1 + 1) * termweight);
00149 }
00150
00151
00152
00153
00154
00155 Xapian::weight
00156 BM25Weight::get_sumextra(Xapian::doclength len) const
00157 {
00158 DEBUGCALL(MATCH, Xapian::weight, "BM25Weight::get_sumextra", len);
00159 if (!weight_calculated) calc_termweight();
00160
00161 Xapian::doclength normlen = len * lenpart;
00162 if (normlen < min_normlen) normlen = min_normlen;
00163 Xapian::weight extra = 2 * k2 * querysize / (1 + normlen);
00164 DEBUGLINE(WTCALC, "len = " << len << " querysize = " << querysize <<
00165 " => normlen = " << normlen << " => sumextra = " << extra);
00166 RETURN(extra);
00167 }
00168
00169 Xapian::weight
00170 BM25Weight::get_maxextra() const
00171 {
00172 DEBUGCALL(MATCH, Xapian::weight, "BM25Weight::get_maxextra", "");
00173 Xapian::weight maxextra = 2 * k2 * querysize;
00174 DEBUGLINE(WTCALC, "querysize = " << querysize <<
00175 " => maxextra = " << maxextra);
00176 RETURN(maxextra);
00177 }
00178
00179 bool BM25Weight::get_sumpart_needs_doclength() const {
00180 if (!weight_calculated) calc_termweight();
00181 return (b != 0 && k1 != 0 && lenpart != 0);
00182 }
00183
00184 }