Xapian: Internal Source Documentation: xapian-core: backends/flint/flint

00001 /* flint_positionlist.cc: A position list in a flint database.
00002  *
00003  * Copyright (C) 2004,2005,2006 Olly Betts
00004  *
00005  * This program is free software; you can redistribute it and/or
00006  * modify it under the terms of the GNU General Public License as
00007  * published by the Free Software Foundation; either version 2 of the
00008  * License, or (at your option) any later version.
00009  *
00010  * This program is distributed in the hope that it will be useful,
00011  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00012  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013  * GNU General Public License for more details.
00014  *
00015  * You should have received a copy of the GNU General Public License
00016  * along with this program; if not, write to the Free Software
00017  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00018  * USA
00019  */
00020 
00021 #include <config.h>
00022 
00023 #include <xapian/types.h>
00024 
00025 #include "flint_positionlist.h"
00026 #include "flint_utils.h"
00027 #include "omdebug.h"
00028 
00029 #include <vector>
00030 #include <string>
00031 #include <cmath>
00032 
00033 using namespace std;
00034 
00035 static const unsigned char flstab[256] = {
00036     0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
00037     5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
00038     6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
00039     6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
00040     7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
00041     7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
00042     7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
00043     7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
00044     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00045     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00046     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00047     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00048     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00049     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00050     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00051     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
00052 };
00053 
00054 // Highly optimised fls() implementation.
00055 inline int my_fls(unsigned mask)
00056 {
00057     int result = 0;
00058     if (mask >= 0x10000u) {
00059         mask >>= 16;
00060         result = 16;
00061     }
00062     if (mask >= 0x100u) {
00063         mask >>= 8;
00064         result += 8;
00065     }
00066     return result + flstab[mask];
00067 }
00068 
00069 class BitWriter {
00070     private:
00071         string buf;
00072         int n_bits;
00073         unsigned int acc;
00074     public:
00075         BitWriter() : n_bits(0), acc(0) { }
00076         BitWriter(const string & seed) : buf(seed), n_bits(0), acc(0) { }
00077         void encode(size_t value, size_t outof) {
00078             Assert(value < outof);
00079             size_t bits = my_fls(outof - 1);
00080             const size_t spare = (1 << bits) - outof;
00081             if (spare) {
00082                 const size_t mid_start = (outof - spare) / 2;
00083                 if (value < mid_start) {
00084                     write_bits(value, bits);
00085                 } else if (value >= mid_start + spare) {
00086                     write_bits((value - (mid_start + spare)) | (1 << (bits - 1)), bits);
00087                 } else {
00088                     --bits;
00089                     write_bits(value, bits);
00090                 }
00091             } else {
00092                 write_bits(value, bits);
00093             }
00094         }
00095         void write_bits(int data, int count) {
00096             if (count + n_bits > 32) {
00097                 // We need to write more bits than there's empty room for in
00098                 // the accumulator.  So we arrange to shift out 8 bits, then
00099                 // adjust things so we're adding 8 fewer bits.
00100                 Assert(count <= 32);
00101                 acc |= (data << n_bits);
00102                 buf += char(acc);
00103                 acc >>= 8;
00104                 data >>= 8;
00105                 count -= 8;
00106             }
00107             acc |= (data << n_bits);
00108             n_bits += count;
00109             while (n_bits >= 8) {
00110                 buf += char(acc);
00111                 acc >>= 8;
00112                 n_bits -= 8;
00113             }
00114         }
00115         string & freeze() {
00116             if (n_bits) {
00117                 buf += char(acc);
00118                 n_bits = 0;
00119                 acc = 0;
00120             }
00121             return buf;
00122         }
00123 };
00124 
00125 static void
00126 encode_interpolative(BitWriter & wr, const vector<Xapian::termpos> &pos, int j, int k)
00127 {
00128     while (j + 1 < k) {
00129         const size_t mid = (j + k) / 2;
00130         // Encode one out of (pos[k] - pos[j] + 1) values
00131         // (less some at either end because we must be able to fit
00132         // all the intervening pos in)
00133         const size_t outof = pos[k] - pos[j] + j - k + 1;
00134         const size_t lowest = pos[j] + mid - j;
00135         wr.encode(pos[mid] - lowest, outof);
00136         encode_interpolative(wr, pos, j, mid);
00137         j = mid;
00138     }
00139 }
00140 
00141 namespace Xapian {
00142 
00143 class BitReader {
00144     private:
00145         string buf;
00146         size_t idx;
00147         int n_bits;
00148         unsigned int acc;
00149     public:
00150         BitReader(const string &buf_) : buf(buf_), idx(0), n_bits(0), acc(0) { }
00151         Xapian::termpos decode(Xapian::termpos outof) {
00152             size_t bits = my_fls(outof - 1);
00153             const size_t spare = (1 << bits) - outof;
00154             const size_t mid_start = (outof - spare) / 2;
00155             Xapian::termpos p;
00156             if (spare) {
00157                 p = read_bits(bits - 1);
00158                 if (p < mid_start) {
00159                     if (read_bits(1)) p += mid_start + spare;
00160                 }
00161             } else {
00162                 p = read_bits(bits);
00163             }
00164             Assert(p < outof);
00165             return p;
00166         }
00167         unsigned int read_bits(int count) {
00168             unsigned int result;
00169             if (count > 25) {
00170                 // If we need more than 25 bits, read in two goes to ensure
00171                 // that we don't overflow acc.  This is a little more
00172                 // conservative than it needs to be, but such large values will
00173                 // inevitably be rare (because you can't fit very many of them
00174                 // into 2^32!)
00175                 Assert(count <= 32);
00176                 result = read_bits(16);
00177                 return result | (read_bits(count - 16) << 16);
00178             }
00179             while (n_bits < count) {
00180                 Assert(idx < buf.size());
00181                 acc |= static_cast<unsigned char>(buf[idx++]) << n_bits;
00182                 n_bits += 8;
00183             }
00184             result = acc & ((1u << count) - 1);
00185             acc >>= count;
00186             n_bits -= count;
00187             return result;
00188         }
00189         // Check all the data has been read.  Because it'll be zero padded
00190         // to fill a byte, the best we can actually do is check that
00191         // there's less than a byte left and that all remaining bits are
00192         // zero.
00193         bool check_all_gone() const {
00194             return (idx == buf.size() && n_bits < 7 && acc == 0);
00195         }
00196         void decode_interpolative(vector<Xapian::termpos> & pos, int j, int k);
00197 };
00198 
00199 void
00200 BitReader::decode_interpolative(vector<Xapian::termpos> & pos, int j, int k)
00201 {
00202     while (j + 1 < k) {
00203         const size_t mid = (j + k) / 2;
00204         // Decode one out of (pos[k] - pos[j] + 1) values
00205         // (less some at either end because we must be able to fit
00206         // all the intervening pos in)
00207         const size_t outof = pos[k] - pos[j] + j - k + 1;
00208         pos[mid] = decode(outof) + (pos[j] + mid - j);
00209         decode_interpolative(pos, j, mid);
00210         j = mid;
00211     }
00212 }
00213 
00214 }
00215 
00216 using Xapian::BitReader;
00217 
00218 void
00219 FlintPositionListTable::set_positionlist(Xapian::docid did,
00220                                          const string & tname,
00221                                          Xapian::PositionIterator pos,
00222                                          const Xapian::PositionIterator &pos_end)
00223 {
00224     DEBUGCALL(DB, void, "FlintPositionList::set_positionlist",
00225               did << ", " << tname << ", " << pos << ", " << pos_end);
00226     Assert(pos != pos_end);
00227 
00228     // FIXME: avoid the need for this copy!
00229     vector<Xapian::termpos> poscopy(pos, pos_end);
00230 
00231     string key = make_key(did, tname);
00232 
00233     if (poscopy.size() == 1) {
00234         // Special case for single entry position list.
00235         add(key, pack_uint(poscopy[0]));
00236         return;
00237     }
00238 
00239     BitWriter wr(pack_uint(poscopy.back()));
00240 
00241     wr.encode(poscopy[0], poscopy.back());
00242     wr.encode(poscopy.size() - 2, poscopy.back() - poscopy[0]);
00243     encode_interpolative(wr, poscopy, 0, poscopy.size() - 1);
00244     add(key, wr.freeze());
00245 }
00246 
00247 Xapian::termcount
00248 FlintPositionListTable::positionlist_count(Xapian::docid did,
00249                                            const string & term) const
00250 {
00251     DEBUGCALL(DB, void, "FlintPositionListTable::positionlist_count",
00252               did << ", " << term);
00253 
00254     string data;
00255     if (!get_exact_entry(pack_uint_preserving_sort(did) + term, data)) {
00256         // There's no positional information for this term.
00257         return 0;
00258     }
00259 
00260     const char * pos = data.data();
00261     const char * end = pos + data.size();
00262     Xapian::termpos pos_last;
00263     if (!unpack_uint(&pos, end, &pos_last)) {
00264         throw Xapian::DatabaseCorruptError("Position list data corrupt");
00265     }
00266     if (pos == end) {
00267         // Special case for single entry position list.
00268         return 1;
00269     }
00270 
00271     BitReader rd(data);
00272     // Skip the header we just read.
00273     (void)rd.read_bits(8 * (pos - data.data()));
00274     Xapian::termpos pos_first = rd.decode(pos_last);
00275     Xapian::termpos pos_size = rd.decode(pos_last - pos_first) + 2;
00276     return pos_size;
00277 }
00278 
00280 
00281 bool
00282 FlintPositionList::read_data(const FlintTable * table, Xapian::docid did,
00283                              const string & tname)
00284 {
00285     DEBUGCALL(DB, void, "FlintPositionList::read_data",
00286               table << ", " << did << ", " << tname);
00287 
00288     have_started = false;
00289     positions.clear();
00290 
00291     string data;
00292     if (!table->get_exact_entry(pack_uint_preserving_sort(did) + tname, data)) {
00293         // There's no positional information for this term.
00294         current_pos = positions.begin();
00295         return false;
00296     }
00297 
00298     const char * pos = data.data();
00299     const char * end = pos + data.size();
00300     Xapian::termpos pos_last;
00301     if (!unpack_uint(&pos, end, &pos_last)) {
00302         throw Xapian::DatabaseCorruptError("Position list data corrupt");
00303     }
00304     if (pos == end) {
00305         // Special case for single entry position list.
00306         positions.push_back(pos_last);
00307         current_pos = positions.begin();
00308         return true;
00309     }
00310     BitReader rd(data);
00311     // Skip the header we just read.
00312     (void)rd.read_bits(8 * (pos - data.data()));
00313     Xapian::termpos pos_first = rd.decode(pos_last);
00314     Xapian::termpos pos_size = rd.decode(pos_last - pos_first) + 2;
00315     positions.resize(pos_size);
00316     positions[0] = pos_first;
00317     positions.back() = pos_last;
00318     rd.decode_interpolative(positions, 0, pos_size - 1);
00319 
00320     current_pos = positions.begin();
00321     return true;
00322 }
00323 
00324 Xapian::termcount
00325 FlintPositionList::get_size() const
00326 {
00327     DEBUGCALL(DB, Xapian::termcount, "FlintPositionList::get_size", "");
00328     RETURN(positions.size());
00329 }
00330 
00331 Xapian::termpos
00332 FlintPositionList::get_position() const
00333 {
00334     DEBUGCALL(DB, Xapian::termpos, "FlintPositionList::get_position", "");
00335     Assert(have_started);
00336     RETURN(*current_pos);
00337 }
00338 
00339 void
00340 FlintPositionList::next()
00341 {
00342     DEBUGCALL(DB, void, "FlintPositionList::next", "");
00343 
00344     if (!have_started) {
00345         have_started = true;
00346     } else {
00347         Assert(!at_end());
00348         ++current_pos;
00349     }
00350 }
00351 
00352 void
00353 FlintPositionList::skip_to(Xapian::termpos termpos)
00354 {
00355     DEBUGCALL(DB, void, "FlintPositionList::skip_to", termpos);
00356     if (!have_started) {
00357         have_started = true;
00358     }
00359     while (!at_end() && *current_pos < termpos) ++current_pos;
00360 }
00361 
00362 bool
00363 FlintPositionList::at_end() const
00364 {
00365     DEBUGCALL(DB, bool, "FlintPositionList::at_end", "");
00366     RETURN(current_pos == positions.end());
00367 }
backends/flint/flint_positionlist.cc