backends/flint/flint_termlisttable.cc

Go to the documentation of this file.
00001 
00004 /* Copyright (C) 2007 Olly Betts
00005  *
00006  * This program is free software; you can redistribute it and/or modify
00007  * it under the terms of the GNU General Public License as published by
00008  * the Free Software Foundation; either version 2 of the License, or
00009  * (at your option) any later version.
00010  *
00011  * This program is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  * GNU General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU General Public License
00017  * along with this program; if not, write to the Free Software
00018  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
00019  */
00020 
00021 #include <config.h>
00022 
00023 #include <xapian/document.h>
00024 #include <xapian/error.h>
00025 #include <xapian/termiterator.h>
00026 
00027 #include "flint_termlisttable.h"
00028 #include "flint_utils.h"
00029 #include "omassert.h"
00030 #include "omdebug.h"
00031 #include "stringutils.h"
00032 #include "utils.h"
00033 
00034 #include <string>
00035 
00036 using namespace std;
00037 
00038 void
00039 FlintTermListTable::set_termlist(Xapian::docid did,
00040                                  const Xapian::Document & doc,
00041                                  flint_doclen_t doclen)
00042 {
00043     DEBUGCALL(DB, void, "FlintTermListTable::set_termlist",
00044               did << ", " << doc << ", " << doclen);
00045 
00046     string tag = pack_uint(doclen);
00047 
00048     Xapian::doccount termlist_size = doc.termlist_count();
00049     if (termlist_size == 0) {
00050         // doclen is sum(wdf) so should be zero if there are no terms.
00051         Assert(doclen == 0);
00052         Assert(doc.termlist_begin() == doc.termlist_end());
00053         add(flint_docid_to_key(did), string());
00054         return;
00055     }
00056 
00057     Xapian::TermIterator t = doc.termlist_begin();
00058     if (t != doc.termlist_end()) {
00059         tag += pack_uint(termlist_size);
00060         string prev_term = *t;
00061 
00062         // Previous database versions encoded a boolean here, which was
00063         // always false (and pack_bool() encodes false as a '0').  We can
00064         // just omit this and successfully read old and new termlists
00065         // except in the case where the next byte is a '0' - in this case
00066         // we need keep the '0' so that the decoder can just skip any '0'
00067         // it sees in this position (this shouldn't be a common case - 48
00068         // character terms aren't very common, and the first term
00069         // alphabetically is likely to be shorter than average).
00070         // FIXME: If we have an incompatible database version bump we should
00071         // drop this completely.
00072         if (prev_term.size() == '0') tag += '0';
00073 
00074         tag += prev_term.size();
00075         tag += prev_term;
00076         tag += pack_uint(t.get_wdf());
00077         --termlist_size;
00078 
00079         while (++t != doc.termlist_end()) {
00080             const string & term = *t;
00081             // If there's a shared prefix with the previous term, we don't
00082             // store it explicitly, but just store the length of the shared
00083             // prefix.  In general, this is a big win.
00084             size_t reuse = common_prefix_length(prev_term, term);
00085 
00086             // reuse must be <= prev_term.size(), and we know that value while
00087             // decoding.  So if the wdf is small enough that we can multiply it
00088             // by (prev_term.size() + 1), add reuse and fit the result in a
00089             // byte, then we can pack reuse and the wdf into a single byte and
00090             // save ourselves a byte.  We actually need to add one to the wdf
00091             // before multiplying so that a wdf of 0 can be detected by the
00092             // decoder.
00093             size_t packed = 0;
00094             Xapian::termcount wdf = t.get_wdf();
00095             // If wdf >= 128, then we aren't going to be able to pack it in so
00096             // don't even try to avoid the calculation overflowing and making
00097             // us think we can.
00098             if (wdf < 127)
00099                 packed = (wdf + 1) * (prev_term.size() + 1) + reuse;
00100 
00101             if (packed && packed < 256) {
00102                 // We can pack the wdf into the same byte.
00103                 tag += char(packed);
00104                 tag += char(term.size() - reuse);
00105                 tag.append(term.data() + reuse, term.size() - reuse);
00106             } else {
00107                 tag += char(reuse);
00108                 tag += char(term.size() - reuse);
00109                 tag.append(term.data() + reuse, term.size() - reuse);
00110                 // FIXME: pack wdf after reuse next time we rejig the format
00111                 // incompatibly.
00112                 tag += pack_uint(wdf);
00113             }
00114 
00115             prev_term = *t;
00116             --termlist_size;
00117         }
00118     }
00119     Assert(termlist_size == 0);
00120     add(flint_docid_to_key(did), tag);
00121 }
00122 
00123 flint_doclen_t
00124 FlintTermListTable::get_doclength(Xapian::docid did) const
00125 {
00126     DEBUGCALL(DB, flint_doclen_t, "FlintTermListTable::get_doclength", did);
00127 
00128     string tag;
00129     if (!get_exact_entry(flint_docid_to_key(did), tag))
00130         throw Xapian::DocNotFoundError("No termlist found for document " +
00131                                        om_tostring(did));
00132 
00133     if (tag.empty()) RETURN(0);
00134 
00135     const char * pos = tag.data();
00136     const char * end = pos + tag.size();
00137 
00138     flint_doclen_t doclen;
00139     if (!unpack_uint(&pos, end, &doclen)) {
00140         const char *msg;
00141         if (pos == 0) {
00142             msg = "Too little data for doclen in termlist";
00143         } else {
00144             msg = "Overflowed value for doclen in termlist";
00145         }
00146         throw Xapian::DatabaseCorruptError(msg);
00147     }
00148 
00149     RETURN(doclen);
00150 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.