00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <config.h>
00022
00023 #include <xapian/types.h>
00024
00025 #include "flint_positionlist.h"
00026 #include "flint_utils.h"
00027 #include "omdebug.h"
00028
00029 #include <vector>
00030 #include <string>
00031 #include <cmath>
00032
00033 using namespace std;
00034
00035 static const unsigned char flstab[256] = {
00036 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
00037 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
00038 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
00039 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
00040 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
00041 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
00042 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
00043 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
00044 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00045 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00046 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00047 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00048 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00049 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00050 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
00051 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
00052 };
00053
00054
00055 inline int my_fls(unsigned mask)
00056 {
00057 int result = 0;
00058 if (mask >= 0x10000u) {
00059 mask >>= 16;
00060 result = 16;
00061 }
00062 if (mask >= 0x100u) {
00063 mask >>= 8;
00064 result += 8;
00065 }
00066 return result + flstab[mask];
00067 }
00068
00069 class BitWriter {
00070 private:
00071 string buf;
00072 int n_bits;
00073 unsigned int acc;
00074 public:
00075 BitWriter() : n_bits(0), acc(0) { }
00076 BitWriter(const string & seed) : buf(seed), n_bits(0), acc(0) { }
00077 void encode(size_t value, size_t outof) {
00078 Assert(value < outof);
00079 size_t bits = my_fls(outof - 1);
00080 const size_t spare = (1 << bits) - outof;
00081 if (spare) {
00082 const size_t mid_start = (outof - spare) / 2;
00083 if (value < mid_start) {
00084 write_bits(value, bits);
00085 } else if (value >= mid_start + spare) {
00086 write_bits((value - (mid_start + spare)) | (1 << (bits - 1)), bits);
00087 } else {
00088 --bits;
00089 write_bits(value, bits);
00090 }
00091 } else {
00092 write_bits(value, bits);
00093 }
00094 }
00095 void write_bits(int data, int count) {
00096 if (count + n_bits > 32) {
00097
00098
00099
00100 Assert(count <= 32);
00101 acc |= (data << n_bits);
00102 buf += char(acc);
00103 acc >>= 8;
00104 data >>= 8;
00105 count -= 8;
00106 }
00107 acc |= (data << n_bits);
00108 n_bits += count;
00109 while (n_bits >= 8) {
00110 buf += char(acc);
00111 acc >>= 8;
00112 n_bits -= 8;
00113 }
00114 }
00115 string & freeze() {
00116 if (n_bits) {
00117 buf += char(acc);
00118 n_bits = 0;
00119 acc = 0;
00120 }
00121 return buf;
00122 }
00123 };
00124
00125 static void
00126 encode_interpolative(BitWriter & wr, const vector<Xapian::termpos> &pos, int j, int k)
00127 {
00128 while (j + 1 < k) {
00129 const size_t mid = (j + k) / 2;
00130
00131
00132
00133 const size_t outof = pos[k] - pos[j] + j - k + 1;
00134 const size_t lowest = pos[j] + mid - j;
00135 wr.encode(pos[mid] - lowest, outof);
00136 encode_interpolative(wr, pos, j, mid);
00137 j = mid;
00138 }
00139 }
00140
00141 namespace Xapian {
00142
00143 class BitReader {
00144 private:
00145 string buf;
00146 size_t idx;
00147 int n_bits;
00148 unsigned int acc;
00149 public:
00150 BitReader(const string &buf_) : buf(buf_), idx(0), n_bits(0), acc(0) { }
00151 Xapian::termpos decode(Xapian::termpos outof) {
00152 size_t bits = my_fls(outof - 1);
00153 const size_t spare = (1 << bits) - outof;
00154 const size_t mid_start = (outof - spare) / 2;
00155 Xapian::termpos p;
00156 if (spare) {
00157 p = read_bits(bits - 1);
00158 if (p < mid_start) {
00159 if (read_bits(1)) p += mid_start + spare;
00160 }
00161 } else {
00162 p = read_bits(bits);
00163 }
00164 Assert(p < outof);
00165 return p;
00166 }
00167 unsigned int read_bits(int count) {
00168 unsigned int result;
00169 if (count > 25) {
00170
00171
00172
00173
00174
00175 Assert(count <= 32);
00176 result = read_bits(16);
00177 return result | (read_bits(count - 16) << 16);
00178 }
00179 while (n_bits < count) {
00180 Assert(idx < buf.size());
00181 acc |= static_cast<unsigned char>(buf[idx++]) << n_bits;
00182 n_bits += 8;
00183 }
00184 result = acc & ((1u << count) - 1);
00185 acc >>= count;
00186 n_bits -= count;
00187 return result;
00188 }
00189
00190
00191
00192
00193 bool check_all_gone() const {
00194 return (idx == buf.size() && n_bits < 7 && acc == 0);
00195 }
00196 void decode_interpolative(vector<Xapian::termpos> & pos, int j, int k);
00197 };
00198
00199 void
00200 BitReader::decode_interpolative(vector<Xapian::termpos> & pos, int j, int k)
00201 {
00202 while (j + 1 < k) {
00203 const size_t mid = (j + k) / 2;
00204
00205
00206
00207 const size_t outof = pos[k] - pos[j] + j - k + 1;
00208 pos[mid] = decode(outof) + (pos[j] + mid - j);
00209 decode_interpolative(pos, j, mid);
00210 j = mid;
00211 }
00212 }
00213
00214 }
00215
00216 using Xapian::BitReader;
00217
00218 void
00219 FlintPositionListTable::set_positionlist(Xapian::docid did,
00220 const string & tname,
00221 Xapian::PositionIterator pos,
00222 const Xapian::PositionIterator &pos_end)
00223 {
00224 DEBUGCALL(DB, void, "FlintPositionList::set_positionlist",
00225 did << ", " << tname << ", " << pos << ", " << pos_end);
00226 Assert(pos != pos_end);
00227
00228
00229 vector<Xapian::termpos> poscopy(pos, pos_end);
00230
00231 string key = make_key(did, tname);
00232
00233 if (poscopy.size() == 1) {
00234
00235 add(key, pack_uint(poscopy[0]));
00236 return;
00237 }
00238
00239 BitWriter wr(pack_uint(poscopy.back()));
00240
00241 wr.encode(poscopy[0], poscopy.back());
00242 wr.encode(poscopy.size() - 2, poscopy.back() - poscopy[0]);
00243 encode_interpolative(wr, poscopy, 0, poscopy.size() - 1);
00244 add(key, wr.freeze());
00245 }
00246
00247 Xapian::termcount
00248 FlintPositionListTable::positionlist_count(Xapian::docid did,
00249 const string & term) const
00250 {
00251 DEBUGCALL(DB, void, "FlintPositionListTable::positionlist_count",
00252 did << ", " << term);
00253
00254 string data;
00255 if (!get_exact_entry(pack_uint_preserving_sort(did) + term, data)) {
00256
00257 return 0;
00258 }
00259
00260 const char * pos = data.data();
00261 const char * end = pos + data.size();
00262 Xapian::termpos pos_last;
00263 if (!unpack_uint(&pos, end, &pos_last)) {
00264 throw Xapian::DatabaseCorruptError("Position list data corrupt");
00265 }
00266 if (pos == end) {
00267
00268 return 1;
00269 }
00270
00271 BitReader rd(data);
00272
00273 (void)rd.read_bits(8 * (pos - data.data()));
00274 Xapian::termpos pos_first = rd.decode(pos_last);
00275 Xapian::termpos pos_size = rd.decode(pos_last - pos_first) + 2;
00276 return pos_size;
00277 }
00278
00280
00281 bool
00282 FlintPositionList::read_data(const FlintTable * table, Xapian::docid did,
00283 const string & tname)
00284 {
00285 DEBUGCALL(DB, void, "FlintPositionList::read_data",
00286 table << ", " << did << ", " << tname);
00287
00288 have_started = false;
00289 positions.clear();
00290
00291 string data;
00292 if (!table->get_exact_entry(pack_uint_preserving_sort(did) + tname, data)) {
00293
00294 current_pos = positions.begin();
00295 return false;
00296 }
00297
00298 const char * pos = data.data();
00299 const char * end = pos + data.size();
00300 Xapian::termpos pos_last;
00301 if (!unpack_uint(&pos, end, &pos_last)) {
00302 throw Xapian::DatabaseCorruptError("Position list data corrupt");
00303 }
00304 if (pos == end) {
00305
00306 positions.push_back(pos_last);
00307 current_pos = positions.begin();
00308 return true;
00309 }
00310 BitReader rd(data);
00311
00312 (void)rd.read_bits(8 * (pos - data.data()));
00313 Xapian::termpos pos_first = rd.decode(pos_last);
00314 Xapian::termpos pos_size = rd.decode(pos_last - pos_first) + 2;
00315 positions.resize(pos_size);
00316 positions[0] = pos_first;
00317 positions.back() = pos_last;
00318 rd.decode_interpolative(positions, 0, pos_size - 1);
00319
00320 current_pos = positions.begin();
00321 return true;
00322 }
00323
00324 Xapian::termcount
00325 FlintPositionList::get_size() const
00326 {
00327 DEBUGCALL(DB, Xapian::termcount, "FlintPositionList::get_size", "");
00328 RETURN(positions.size());
00329 }
00330
00331 Xapian::termpos
00332 FlintPositionList::get_position() const
00333 {
00334 DEBUGCALL(DB, Xapian::termpos, "FlintPositionList::get_position", "");
00335 Assert(have_started);
00336 RETURN(*current_pos);
00337 }
00338
00339 void
00340 FlintPositionList::next()
00341 {
00342 DEBUGCALL(DB, void, "FlintPositionList::next", "");
00343
00344 if (!have_started) {
00345 have_started = true;
00346 } else {
00347 Assert(!at_end());
00348 ++current_pos;
00349 }
00350 }
00351
00352 void
00353 FlintPositionList::skip_to(Xapian::termpos termpos)
00354 {
00355 DEBUGCALL(DB, void, "FlintPositionList::skip_to", termpos);
00356 if (!have_started) {
00357 have_started = true;
00358 }
00359 while (!at_end() && *current_pos < termpos) ++current_pos;
00360 }
00361
00362 bool
00363 FlintPositionList::at_end() const
00364 {
00365 DEBUGCALL(DB, bool, "FlintPositionList::at_end", "");
00366 RETURN(current_pos == positions.end());
00367 }