backends/quartz/quartz_termlist.cc

Go to the documentation of this file.
00001 /* quartz_termlist.cc: Termlists in quartz databases
00002  *
00003  * Copyright 1999,2000,2001 BrightStation PLC
00004  * Copyright 2002 Ananova Ltd
00005  * Copyright 2002,2003,2004,2006,2007 Olly Betts
00006  *
00007  * This program is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU General Public License as
00009  * published by the Free Software Foundation; either version 2 of the
00010  * License, or (at your option) any later version.
00011  *
00012  * This program is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
00020  * USA
00021  */
00022 
00023 #include <config.h>
00024 
00025 #include <xapian/error.h>
00026 #include "expandweight.h"
00027 #include "quartz_termlist.h"
00028 #include "quartz_utils.h"
00029 #include "utils.h"
00030 
00031 #include <algorithm>
00032 using namespace std;
00033 
00034 void
00035 QuartzTermListTable::set_entries(Xapian::docid did,
00036                             Xapian::TermIterator t,
00037                             const Xapian::TermIterator &t_end,
00038                             quartz_doclen_t doclen_,
00039                             bool store_termfreqs)
00040 {
00041     DEBUGCALL(DB, void, "QuartzTermList::set_entries", did << ", " << t << ", " << t_end << ", " << doclen_ << ", " << store_termfreqs);
00042     string tag = pack_uint(doclen_);
00043 
00044     string v;
00045     string prev_term;
00046     Xapian::doccount size = 0;
00047     for ( ; t != t_end; ++t) {
00048         bool stored_wdf = false;
00049         // If there was a previous term, work out how much we can reuse.
00050         if (!prev_term.empty()) {
00051             string::size_type len = min(prev_term.length(), (*t).length());
00052             string::size_type i;
00053             for (i = 0; i < len; ++i) {
00054                 if (prev_term[i] != (*t)[i]) break;
00055             }
00056             // See if we can squeeze the wdf into the spare space in a char.
00057             string::size_type x;
00058             x = (t.get_wdf() + 1) * (prev_term.length() + 1) + i;
00059             if (x < 256) {
00060                 // Cool, we can!
00061                 v += char(x);
00062                 stored_wdf = true;
00063             } else {
00064                 v += char(i);
00065             }
00066             v += char((*t).length() - i);
00067             v += (*t).substr(i);
00068         } else {
00069             v += char((*t).length());
00070             v += *t;
00071         }
00072         prev_term = *t;
00073 
00074         if (!stored_wdf) v += pack_uint(t.get_wdf());
00075         if (store_termfreqs) v += pack_uint(t.get_termfreq());
00076         ++size;
00077     }
00078     tag += pack_uint(size);
00079     tag += pack_bool(store_termfreqs);
00080     tag += v;
00081     add(quartz_docid_to_key(did), tag);
00082 
00083     DEBUGLINE(DB, "QuartzTermList::set_entries() - new entry is `" + tag + "'");
00084 }
00085 
00086 void
00087 QuartzTermListTable::delete_termlist(Xapian::docid did)
00088 {
00089     DEBUGCALL_STATIC(DB, void, "QuartzTermList::delete_termlist", did);
00090     del(quartz_docid_to_key(did));
00091 }
00092 
00093 
00094 QuartzTermList::QuartzTermList(Xapian::Internal::RefCntPtr<const Xapian::Database::Internal> this_db_,
00095                                const Btree * table_,
00096                                Xapian::docid did_,
00097                                Xapian::doccount doccount_)
00098         : this_db(this_db_), did(did_), table(table_),
00099           have_finished(false), current_wdf(0), has_termfreqs(false),
00100           current_termfreq(0), doccount(doccount_)
00101 {
00102     DEBUGCALL(DB, void, "QuartzTermList", "[this_db_], " << table_ << ", "
00103               << did << ", " << doccount_);
00104 
00105     string key(quartz_docid_to_key(did));
00106 
00107     if (!table->get_exact_entry(key, termlist_part))
00108         throw Xapian::DocNotFoundError("Can't read termlist for document "
00109                                  + om_tostring(did) + ": Not found");
00110 
00111     DEBUGLINE(DB, "QuartzTermList::QuartzTermList() - data is `" + termlist_part + "'");
00112 
00113     pos = termlist_part.data();
00114     end = pos + termlist_part.size();
00115 
00116     // Read doclen
00117     if (!unpack_uint(&pos, end, &doclen)) {
00118         if (pos != 0) throw Xapian::RangeError("doclen out of range.");
00119         throw Xapian::DatabaseCorruptError("Unexpected end of data when reading doclen.");
00120     }
00121 
00122     // Read termlist_size
00123     if (!unpack_uint(&pos, end, &termlist_size)) {
00124         if (pos != 0) throw Xapian::RangeError("Size of termlist out of range.");
00125         throw Xapian::DatabaseCorruptError("Unexpected end of data when reading termlist.");
00126     }
00127 
00128     // Read has_termfreqs
00129     if (!unpack_bool(&pos, end, &has_termfreqs)) {
00130         Assert(pos == 0);
00131         throw Xapian::DatabaseCorruptError("Unexpected end of data when reading termlist.");
00132     }
00133 }
00134 
00135 Xapian::termcount
00136 QuartzTermList::get_approx_size() const
00137 {
00138     DEBUGCALL(DB, Xapian::termcount, "QuartzTermList::get_approx_size", "");
00139     RETURN(termlist_size);
00140 }
00141 
00142 quartz_doclen_t
00143 QuartzTermList::get_doclength() const
00144 {
00145     DEBUGCALL(DB, quartz_doclen_t, "QuartzTermList::get_doclength", "");
00146     RETURN(doclen);
00147 }
00148 
00149 
00150 TermList *
00151 QuartzTermList::next()
00152 {
00153     DEBUGCALL(DB, TermList *, "QuartzTermList::next", "");
00154     if (pos == end) {
00155         have_finished = true;
00156         RETURN(0);
00157     }
00158     bool got_wdf = false;
00159     // If there was a previous term, how much to reuse.
00160     if (!current_tname.empty()) {
00161         string::size_type len = static_cast<unsigned char>(*pos++);
00162         if (len > current_tname.length()) {
00163             // The wdf was squeezed into the same byte.
00164             current_wdf = len / (current_tname.length() + 1) - 1;
00165             len %= (current_tname.length() + 1);
00166             got_wdf = true;
00167         }
00168         current_tname.resize(len);
00169     }
00170     // What to append (note len must be positive, since just truncating
00171     // always takes us backwards in the sort order)
00172     string::size_type len = static_cast<unsigned char>(*pos++);
00173     current_tname.append(pos, len);
00174     pos += len;
00175 
00176     if (!got_wdf) {
00177         // Read wdf
00178         if (!unpack_uint(&pos, end, &current_wdf)) {
00179             if (pos == 0) throw Xapian::DatabaseCorruptError("Unexpected end of data when reading termlist.");
00180             throw Xapian::RangeError("Size of wdf out of range, in termlist.");
00181         }
00182     }
00183 
00184     // Read termfreq, if stored
00185     if (has_termfreqs) {
00186         if (!unpack_uint(&pos, end, &current_termfreq)) {
00187             if (pos == 0) throw Xapian::DatabaseCorruptError("Unexpected end of data when reading termlist.");
00188             throw Xapian::RangeError("Size of term frequency out of range, in termlist.");
00189         }
00190     } else {
00191         current_termfreq = 0;
00192     }
00193 
00194     DEBUGLINE(DB, "QuartzTermList::next() -" <<
00195                   " current_tname=" << current_tname <<
00196                   " current_wdf=" << current_wdf <<
00197                   " current_termfreq=" << current_termfreq);
00198     RETURN(0);
00199 }
00200 
00201 bool
00202 QuartzTermList::at_end() const
00203 {
00204     DEBUGCALL(DB, bool, "QuartzTermList::at_end", "");
00205     RETURN(have_finished);
00206 }
00207 
00208 string
00209 QuartzTermList::get_termname() const
00210 {
00211     DEBUGCALL(DB, string, "QuartzTermList::get_termname", "");
00212     RETURN(current_tname);
00213 }
00214 
00215 Xapian::termcount
00216 QuartzTermList::get_wdf() const
00217 {
00218     DEBUGCALL(DB, Xapian::termcount, "QuartzTermList::get_wdf", "");
00219     RETURN(current_wdf);
00220 }
00221 
00222 Xapian::doccount
00223 QuartzTermList::get_termfreq() const
00224 {
00225     DEBUGCALL(DB, Xapian::doccount, "QuartzTermList::get_termfreq", "");
00226     if (current_termfreq == 0)
00227         current_termfreq = this_db->get_termfreq(current_tname);
00228     RETURN(current_termfreq);
00229 }
00230 
00231 void
00232 QuartzTermList::accumulate_stats(Xapian::Internal::ExpandStats & stats) const
00233 {
00234     DEBUGCALL(DB, void, "QuartzTermList::accumulate_stats", "[stats&]");
00235     Assert(!have_finished);
00236     stats.accumulate(current_wdf, doclen, get_termfreq(), doccount);
00237 }
00238 
00239 Xapian::termcount
00240 QuartzTermList::positionlist_count() const
00241 {
00242     throw Xapian::UnimplementedError("QuartzTermList::positionlist_count() not implemented");
00243 }
00244 
00245 Xapian::PositionIterator
00246 QuartzTermList::positionlist_begin() const
00247 {
00248     DEBUGCALL(DB, Xapian::PositionIterator, "QuartzTermList::positionlist_begin", "");
00249     return Xapian::PositionIterator(this_db->open_position_list(did, current_tname));
00250 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.