bin/xapian-check.cc

Go to the documentation of this file.
00001 /* xapian-check.cc: use Btree::check to check consistency of a flint database
00002  * or btree.  Also check the structures inside the tables.
00003  *
00004  * Copyright 1999,2000,2001 BrightStation PLC
00005  * Copyright 2002,2003,2004,2005,2006,2007 Olly Betts
00006  *
00007  * This program is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU General Public License as
00009  * published by the Free Software Foundation; either version 2 of the
00010  * License, or (at your option) any later version.
00011  *
00012  * This program is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
00020  * USA
00021  */
00022 
00023 #include <config.h>
00024 #include <iostream>
00025 
00026 #include "autoptr.h"
00027 #include "flint_check.h"
00028 #include "flint_cursor.h"
00029 #include "flint_table.h"
00030 #include "flint_types.h"
00031 #include "flint_utils.h"
00032 #include "stringutils.h"
00033 #include "utils.h"
00034 
00035 #include <xapian.h>
00036 
00037 using namespace std;
00038 
00039 #define PROG_NAME "xapian-check"
00040 #define PROG_DESC "Check the consistency of a flint database or table"
00041 
00042 // FIXME: We don't currently cross-check wdf between postlist and termlist.
00043 // It's hard to see how to do so efficiently.  We do cross-check doclens, but that
00044 // "only" requires (4 * last_docid()) bytes.
00045 
00046 static void show_usage() {
00047     cout << "Usage: "PROG_NAME" <flint directory>|<path to btree and prefix> [[t][f][b][v][+]]\n\n"
00048 "The btree(s) is/are always checked - control the output verbosity with:\n"
00049 " t = short tree printing\n"
00050 " f = full tree printing\n"
00051 " b = show bitmap\n"
00052 " v = show stats about B-tree (default)\n"
00053 " + = same as tbv\n"
00054 " e.g. "PROG_NAME" /var/lib/xapian/data/default\n"
00055 "      "PROG_NAME" /var/lib/xapian/data/default/postlist fbv" << endl;
00056 }
00057 
00058 static size_t check_table(string table, int opts);
00059 
00060 static vector<Xapian::termcount> doclens;
00061 
// flstab[b] is the 1-based index of the most significant set bit of the
// byte value b, with flstab[0] == 0.
static const unsigned char flstab[256] = {
    // 0x00 - 0x0f
    0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
    // 0x10 - 0x1f
    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
    // 0x20 - 0x3f
    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
    // 0x40 - 0x7f
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    // 0x80 - 0xff
    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
};

// Find-last-set: return the 1-based index of the highest set bit in mask, or
// 0 if mask is zero.  Narrows mask down to a single byte with at most two
// shifts, then finishes with a table lookup.
inline int my_fls(unsigned mask)
{
    int shifted_by = 0;
    if (mask > 0xffffu) {
        // Something is set in the top half, so look there instead.
        shifted_by = 16;
        mask >>= 16;
    }
    if (mask > 0xffu) {
        shifted_by += 8;
        mask >>= 8;
    }
    return flstab[mask] + shifted_by;
}
00095 
00096 class BitReader {
00097     private:
00098         string buf;
00099         size_t idx;
00100         int n_bits;
00101         unsigned int acc;
00102     public:
00103         BitReader(const string &buf_) : buf(buf_), idx(0), n_bits(0), acc(0) { }
00104         Xapian::termpos decode(Xapian::termpos outof) {
00105             size_t bits = my_fls(outof - 1);
00106             const size_t spare = (1 << bits) - outof;
00107             const size_t mid_start = (outof - spare) / 2;
00108             Xapian::termpos p;
00109             if (spare) {
00110                 p = read_bits(bits - 1);
00111                 if (p < mid_start) {
00112                     if (read_bits(1)) p += mid_start + spare;
00113                 }
00114             } else {
00115                 p = read_bits(bits);
00116             }
00117             Assert(p < outof);
00118             return p;
00119         }
00120         unsigned int read_bits(int count) {
00121             unsigned int result;
00122             if (count > 25) {
00123                 // If we need more than 25 bits, read in two goes to ensure
00124                 // that we don't overflow acc.  This is a little more
00125                 // conservative than it needs to be, but such large values will
00126                 // inevitably be rare (because you can't fit very many of them
00127                 // into 2^32!)
00128                 Assert(count <= 32);
00129                 result = read_bits(16);
00130                 return result | (read_bits(count - 16) << 16);
00131             }
00132             while (n_bits < count) {
00133                 Assert(idx < buf.size());
00134                 acc |= static_cast<unsigned char>(buf[idx++]) << n_bits;
00135                 n_bits += 8;
00136             }
00137             result = acc & ((1u << count) - 1);
00138             acc >>= count;
00139             n_bits -= count;
00140             return result;
00141         }
00142         // Check all the data has been read.  Because it'll be zero padded
00143         // to fill a byte, the best we can actually do is check that
00144         // there's less than a byte left and that all remaining bits are
00145         // zero.
00146         bool check_all_gone() const {
00147             return (idx == buf.size() && n_bits < 7 && acc == 0);
00148         }
00149         void decode_interpolative(vector<Xapian::termpos> & pos, int j, int k);
00150 };
00151 
00152 void
00153 BitReader::decode_interpolative(vector<Xapian::termpos> & pos, int j, int k)
00154 {
00155     while (j + 1 < k) {
00156         const size_t mid = (j + k) / 2;
00157         // Decode one out of (pos[k] - pos[j] + 1) values
00158         // (less some at either end because we must be able to fit
00159         // all the intervening pos in)
00160         const size_t outof = pos[k] - pos[j] + j - k + 1;
00161         pos[mid] = decode(outof) + (pos[j] + mid - j);
00162         decode_interpolative(pos, j, mid);
00163         j = mid;
00164     }
00165 }
00166 
/// Return true if key is at least two bytes long, begins with a zero byte,
/// and its second byte isn't 0xff.
static inline bool
is_user_metadata_key(const string & key)
{
    if (key.size() < 2) return false;
    if (key[0] != '\0') return false;
    return key[1] != '\xff';
}
00172 
00173 int
00174 main(int argc, char **argv)
00175 {
00176     if (argc > 1 && argv[1][0] == '-') {
00177         if (strcmp(argv[1], "--help") == 0) {
00178             cout << PROG_NAME" - "PROG_DESC"\n\n";
00179             show_usage();
00180             exit(0);
00181         }
00182         if (strcmp(argv[1], "--version") == 0) {
00183             cout << PROG_NAME" - "PACKAGE_STRING << endl;
00184             exit(0);
00185         }
00186     }
00187     if (argc < 2 || argc > 3) {
00188         show_usage();
00189         exit(1);
00190     }
00191 
00192     int opts = 0;
00193     const char * opt_string = argv[2];
00194     if (!opt_string) opt_string = "v";
00195     for (const char *p = opt_string; *p; ++p) {
00196         switch (*p) {
00197             case 't': opts |= OPT_SHORT_TREE; break;
00198             case 'f': opts |= OPT_FULL_TREE; break;
00199             case 'b': opts |= OPT_SHOW_BITMAP; break;
00200             case 'v': opts |= OPT_SHOW_STATS; break;
00201             case '+':
00202                 opts |= OPT_SHORT_TREE | OPT_SHOW_BITMAP | OPT_SHOW_STATS;
00203                 break;
00204             default:
00205                 cerr << "option " << opt_string << " unknown\n";
00206                 cerr << "use t,f,b,v and/or + in the option string\n";
00207                 exit(1);
00208         }
00209     }
00210 
00211     try {
00212         size_t errors = 0;
00213         struct stat sb;
00214         string meta_file(argv[1]);
00215         meta_file += "/iamflint";
00216         if (stat(meta_file.c_str(), &sb) == 0) {
00217             // Check a whole flint database directory.
00218             try {
00219                 Xapian::Database db = Xapian::Flint::open(argv[1]);
00220                 doclens.reserve(db.get_lastdocid());
00221             } catch (const Xapian::Error & e) {
00222                 // Ignore so we can check a database too broken to open.
00223                 cout << "Database couldn't be opened for reading: "
00224                      << e.get_description()
00225                      << "\nContinuing check anyway" << endl;
00226                 ++errors;
00227             }
00228             // Assume it's a flint directory and try to check all the btrees
00229             // Note: it's important to check termlist before postlist so
00230             // that we can cross-check the document lengths.
00231             const char * tables[] = {
00232                 "record", "termlist", "postlist", "position", "value",
00233                 "spelling", "synonym"
00234             };
00235             for (const char **t = tables;
00236                  t != tables + sizeof(tables)/sizeof(tables[0]); ++t) {
00237                 string table(argv[1]);
00238                 table += '/';
00239                 table += *t;
00240                 cout << *t << ":\n";
00241                 if (strcmp(*t, "position") == 0 ||
00242                     strcmp(*t, "value") == 0 ||
00243                     strcmp(*t, "spelling") == 0 ||
00244                     strcmp(*t, "synonym") == 0) {
00245                     // These are created lazily, so may not exist.
00246                     if (!file_exists(table + ".DB")) {
00247                         cout << "Lazily created, and not yet used.\n" << endl;
00248                         continue;
00249                     }
00250                 }
00251                 errors += check_table(table, opts);
00252             }
00253         } else {
00254             // Just check a single Btree.  If it ends with "." or ".DB"
00255             // already, trim that so the user can do xapian-check on
00256             // "foo", "foo.", or "foo.DB".
00257             string table_name = argv[1];
00258             if (endswith(table_name, '.'))
00259                 table_name.resize(table_name.size() - 1);
00260             else if (endswith(table_name, ".DB"))
00261                 table_name.resize(table_name.size() - 3);
00262 
00263             errors = check_table(table_name, opts);
00264         }
00265         if (errors > 0) {
00266             cout << "Total errors found: " << errors << endl;
00267             exit(1);
00268         }
00269         cout << "No errors found" << endl;
00270     } catch (const char *error) {
00271         cerr << argv[0] << ": " << error << endl;
00272         exit(1);
00273     } catch (const Xapian::Error &error) {
00274         cerr << argv[0] << ": " << error.get_description() << endl;
00275         exit(1);
00276     } catch (...) {
00277         cerr << argv[0] << ": Unknown exception" << endl;
00278         exit(1);
00279     }
00280 }
00281 
00282 static size_t
00283 check_table(string filename, int opts)
00284 {
00285     size_t p = filename.find_last_of('/');
00286 #if defined __WIN32__ || defined __EMX__
00287     if (p == string::npos) p = 0;
00288     p = filename.find_last_of('\\', p);
00289 #endif
00290     if (p == string::npos) p = 0; else ++p;
00291     string tablename;
00292     while (p != filename.size()) {
00293         tablename += tolower(static_cast<unsigned char>(filename[p++]));
00294     }
00295 
00296     filename += '.';
00297 
00298     // Check the btree structure.
00299     BtreeCheck::check(filename, opts);
00300 
00301     // Now check the flint structures inside the btree.
00302     FlintTable table(filename, true);
00303     table.open();
00304     AutoPtr<FlintCursor> cursor(table.cursor_get());
00305 
00306     size_t errors = 0;
00307 
00308     cursor->find_entry("");
00309     cursor->next(); // Skip the empty entry.
00310 
00311     if (tablename == "postlist") {
00312         // Now check the structure of each postlist in the table.
00313         string current_term;
00314         Xapian::docid lastdid = 0;
00315         Xapian::termcount termfreq = 0, collfreq = 0;
00316         Xapian::termcount tf = 0, cf = 0;
00317         bool have_metainfo_key = false;
00318 
00319         // The first key/tag pair should be the METAINFO - though this may be
00320         // missing if the table only contains user-metadata.
00321         if (!cursor->after_end()) {
00322             if (cursor->current_key == string("", 1)) {
00323                 have_metainfo_key = true;
00324                 cursor->read_tag();
00325                 // Check format of the METAINFO key.
00326                 Xapian::docid did;
00327                 flint_totlen_t totlen;
00328                 const char * data = cursor->current_tag.data();
00329                 const char * end = data + cursor->current_tag.size();
00330                 if (!unpack_uint(&data, end, &did)) {
00331                     cout << "Tag containing meta information is corrupt." << endl;
00332                     return errors + 1;
00333                 }
00334                 if (!unpack_uint_last(&data, end, &totlen)) {
00335                     cout << "Tag containing meta information is corrupt." << endl;
00336                     return errors + 1;
00337                 }
00338                 if (data != end) {
00339                     cout << "Tag containing meta information is corrupt." << endl;
00340                     return errors + 1;
00341                 }
00342                 cursor->next();
00343             }
00344         }
00345 
00346         while (!cursor->after_end()) {
00347             string & key = cursor->current_key;
00348 
00349             if (is_user_metadata_key(key)) {
00350                 // User metadata can be anything, so we can't do any particular
00351                 // checks on it.
00352                 cursor->next();
00353                 continue;
00354             }
00355 
00356             if (!have_metainfo_key) {
00357                 cout << "METAINFO key missing from postlist table" << endl;
00358                 return errors + 1;
00359             }
00360 
00361             const char * pos, * end;
00362 
00363             // Get term from key.
00364             pos = key.data();
00365             end = pos + key.size();
00366 
00367             string term;
00368             Xapian::docid did;
00369             if (!unpack_string_preserving_sort(&pos, end, term)) {
00370                 cout << "Error unpacking termname from key" << endl;
00371                 ++errors;
00372                 cursor->next();
00373                 continue;
00374             }
00375             if (current_term.empty()) {
00376                 current_term = term;
00377                 tf = cf = 0;
00378                 if (pos != end) {
00379                     cout << "Extra bytes after key for first chunk of "
00380                         "posting list for term `" << term << "'" << endl;
00381                     ++errors;
00382                     cursor->next();
00383                     continue;
00384                 }
00385                 // Unpack extra header from first chunk.
00386                 cursor->read_tag();
00387                 pos = cursor->current_tag.data();
00388                 end = pos + cursor->current_tag.size();
00389                 if (!unpack_uint(&pos, end, &termfreq)) {
00390                     cout << "Failed to unpack termfreq for term `" << term
00391                          << "'" << endl;
00392                     ++errors;
00393                     cursor->next();
00394                     continue;
00395                 }
00396                 if (!unpack_uint(&pos, end, &collfreq)) {
00397                     cout << "Failed to unpack collfreq for term `" << term
00398                          << "'" << endl;
00399                     ++errors;
00400                     cursor->next();
00401                     continue;
00402                 }
00403                 if (!unpack_uint(&pos, end, &did)) {
00404                     cout << "Failed to unpack firstdid for term `" << term
00405                          << "'" << endl;
00406                     ++errors;
00407                     cursor->next();
00408                     continue;
00409                 } else {
00410                     ++did;
00411                 }
00412             } else {
00413                 if (term != current_term) {
00414                     if (pos == end) {
00415                         cout << "No last chunk for term `" << term << "'"
00416                              << endl;
00417                     } else {
00418                         cout << "Mismatch in follow-on chunk in posting "
00419                             "list for term `" << current_term << "' (got `"
00420                             << term << "')" << endl;
00421                     }
00422                     ++errors;
00423                     current_term = term;
00424                 }
00425                 if (pos != end) {
00426                     if (!unpack_uint_preserving_sort(&pos, end, &did)) {
00427                         cout << "Failed to unpack did from key" << endl;
00428                         ++errors;
00429                         cursor->next();
00430                         continue;
00431                     }
00432                     if (did <= lastdid) {
00433                         cout << "First did in this chunk is <= last in "
00434                             "prev chunk" << endl;
00435                         ++errors;
00436                     }
00437                 }
00438                 cursor->read_tag();
00439                 pos = cursor->current_tag.data();
00440                 end = pos + cursor->current_tag.size();
00441             }
00442 
00443             bool is_last_chunk;
00444             if (!unpack_bool(&pos, end, &is_last_chunk)) {
00445                 cout << "Failed to unpack last chunk flag" << endl;
00446                 ++errors;
00447                 cursor->next();
00448                 continue;
00449             }
00450             // Read what the final document ID in this chunk is.
00451             if (!unpack_uint(&pos, end, &lastdid)) {
00452                 cout << "Failed to unpack increase to last" << endl;
00453                 ++errors;
00454                 cursor->next();
00455                 continue;
00456             }
00457             ++lastdid;
00458             lastdid += did;
00459             bool bad = false;
00460             while (true) {
00461                 Xapian::termcount wdf, doclen;
00462                 if (!unpack_uint(&pos, end, &wdf)) {
00463                     cout << "Failed to unpack wdf" << endl;
00464                     ++errors;
00465                     bad = true;
00466                     break;
00467                 }
00468                 if (!unpack_uint(&pos, end, &doclen)) {
00469                     cout << "Failed to unpack doc length" << endl;
00470                     ++errors;
00471                     bad = true;
00472                     break;
00473                 }
00474                 ++tf;
00475                 cf += wdf;
00476 
00477                 if (!doclens.empty()) {
00478                     if (did >= doclens.size()) {
00479                         cout << "document id " << did << " is larger than any in the termlist table!" << endl;
00480                     } else if (doclens[did] != doclen) {
00481                         cout << "doclen " << doclen << " doesn't match " << doclens[did] << " in the termlist table" << endl;
00482                         ++errors;
00483                     }
00484                 }
00485                 if (pos == end) break;
00486 
00487                 Xapian::docid inc;
00488                 if (!unpack_uint(&pos, end, &inc)) {
00489                     cout << "Failed to unpack docid increase" << endl;
00490                     ++errors;
00491                     bad = true;
00492                     break;
00493                 }
00494                 ++inc;
00495                 did += inc;
00496                 if (did > lastdid) {
00497                     cout << "docid " << did << " > last docid " << lastdid
00498                          << endl;
00499                     ++errors;
00500                 }
00501             }
00502             if (bad) {
00503                 cursor->next();
00504                 continue;
00505             }
00506             if (is_last_chunk) {
00507                 if (tf != termfreq) {
00508                     cout << "termfreq " << termfreq << " != # of entries "
00509                          << tf << endl;
00510                     ++errors;
00511                 }
00512                 if (cf != collfreq) {
00513                     cout << "collfreq " << collfreq << " != sum wdf " << cf
00514                          << endl;
00515                     ++errors;
00516                 }
00517                 if (did != lastdid) {
00518                     cout << "lastdid " << lastdid << " != last did " << did
00519                          << endl;
00520                     ++errors;
00521                 }
00522                 current_term = "";
00523             }
00524 
00525             cursor->next();
00526         }
00527         if (!current_term.empty()) {
00528             cout << "Last term `" << current_term << "' has no last chunk"
00529                  << endl;
00530             ++errors;
00531         }
00532     } else if (tablename == "record") {
00533         // Now check the contents of the record table.  Any data is valid as
00534         // the tag so we don't check the tags.
00535         while (!cursor->after_end()) {
00536             string & key = cursor->current_key;
00537 
00538             // Get docid from key.
00539             const char * pos = key.data();
00540             const char * end = pos + key.size();
00541 
00542             Xapian::docid did;
00543             if (!unpack_uint_preserving_sort(&pos, end, &did)) {
00544                 cout << "Error unpacking docid from key" << endl;
00545                 return errors + 1;
00546             } else if (pos != end) {
00547                 cout << "Extra junk in key" << endl;
00548                 return errors + 1;
00549             }
00550 
00551             cursor->next();
00552         }
00553     } else if (tablename == "termlist") {
00554         // Now check the contents of the termlist table.
00555         while (!cursor->after_end()) {
00556             string & key = cursor->current_key;
00557 
00558             // Get docid from key.
00559             const char * pos = key.data();
00560             const char * end = pos + key.size();
00561 
00562             Xapian::docid did;
00563             if (!unpack_uint_preserving_sort(&pos, end, &did)) {
00564                 cout << "Error unpacking docid from key" << endl;
00565                 return errors + 1;
00566             } else if (pos != end) {
00567                 cout << "Extra junk in key" << endl;
00568                 return errors + 1;
00569             }
00570 
00571             cursor->read_tag();
00572 
00573             pos = cursor->current_tag.data();
00574             end = pos + cursor->current_tag.size();
00575 
00576             if (pos == end) {
00577                 // Empty termlist.
00578                 cursor->next();
00579                 continue;
00580             }
00581 
00582             Xapian::termcount doclen, termlist_size;
00583 
00584             // Read doclen
00585             if (!unpack_uint(&pos, end, &doclen)) {
00586                 if (pos != 0) {
00587                     cout << "doclen out of range" << endl;
00588                 } else {
00589                     cout << "Unexpected end of data when reading doclen" << endl;
00590                 }
00591                 ++errors;
00592                 cursor->next();
00593                 continue;
00594             }
00595 
00596             // Read termlist_size
00597             if (!unpack_uint(&pos, end, &termlist_size)) {
00598                 if (pos != 0) {
00599                     cout << "termlist_size out of range" << endl;
00600                 } else {
00601                     cout << "Unexpected end of data when reading termlist_size" << endl;
00602                 }
00603                 ++errors;
00604                 cursor->next();
00605                 continue;
00606             }
00607 
00608             // See comment in FlintTermListTable::set_termlist() in
00609             // flint_termlisttable.cc for an explanation of this!
00610             if (pos != end && *pos == '0') ++pos;
00611 
00612             Xapian::termcount actual_doclen = 0, actual_termlist_size = 0;
00613             string current_tname;
00614 
00615             bool bad = false;
00616             while (pos != end) {
00617                 Xapian::doccount current_wdf;
00618                 bool got_wdf = false;
00619                 // If there was a previous term, how much to reuse.
00620                 if (!current_tname.empty()) {
00621                     string::size_type len = static_cast<unsigned char>(*pos++);
00622                     if (len > current_tname.length()) {
00623                         // The wdf was squeezed into the same byte.
00624                         current_wdf = len / (current_tname.length() + 1) - 1;
00625                         len %= (current_tname.length() + 1);
00626                         got_wdf = true;
00627                     }
00628                     current_tname.resize(len);
00629                 }
00630                 // What to append (note len must be positive, since just truncating
00631                 // always takes us backwards in the sort order)
00632                 string::size_type len = static_cast<unsigned char>(*pos++);
00633                 current_tname.append(pos, len);
00634                 pos += len;
00635 
00636                 if (!got_wdf) {
00637                     // Read wdf
00638                     if (!unpack_uint(&pos, end, &current_wdf)) {
00639                         if (pos == 0) {
00640                             cout << "Unexpected end of data when reading termlist current_wdf" << endl;
00641                         } else {
00642                             cout << "Size of wdf out of range, in termlist" << endl;
00643                         }
00644                         ++errors;
00645                         bad = true;
00646                         break;
00647                     }
00648                 }
00649 
00650                 ++actual_termlist_size;
00651                 actual_doclen += current_wdf;
00652             }
00653             if (bad) {
00654                 cursor->next();
00655                 continue;
00656             }
00657 
00658             if (termlist_size != actual_termlist_size) {
00659                 cout << "termlist_size != # of entries in termlist" << endl;
00660                 ++errors;
00661             }
00662             if (doclen != actual_doclen) {
00663                 cout << "doclen != sum(wdf)" << endl;
00664                 ++errors;
00665             }
00666 
00667             // + 1 so that did is a valid subscript.
00668             if (doclens.size() <= did) doclens.resize(did + 1);
00669             doclens[did] = actual_doclen;
00670 
00671             cursor->next();
00672         }
00673     } else if (tablename == "value") {
00674         // Now check the contents of the value table.
00675         while (!cursor->after_end()) {
00676             string & key = cursor->current_key;
00677 
00678             // Get docid from key.
00679             const char * pos = key.data();
00680             const char * end = pos + key.size();
00681 
00682             Xapian::docid did;
00683             if (!unpack_uint_preserving_sort(&pos, end, &did)) {
00684                 cout << "Error unpacking docid from key" << endl;
00685                 return errors + 1;
00686             } else if (pos != end) {
00687                 cout << "Extra junk in key" << endl;
00688                 return errors + 1;
00689             }
00690 
00691             cursor->read_tag();
00692 
00693             pos = cursor->current_tag.data();
00694             end = pos + cursor->current_tag.size();
00695 
00696             bool first = true;
00697             Xapian::valueno last_value_no = 0;
00698             while (pos && pos != end) {
00699                 Xapian::valueno this_value_no;
00700                 string this_value;
00701 
00702                 if (!unpack_uint(&pos, end, &this_value_no)) {
00703                     if (pos == 0)
00704                         cout << "Incomplete item in value table" << endl;
00705                     else
00706                         cout << "Value number in value table is too large" << endl;
00707                     ++errors;
00708                     break;
00709                 }
00710 
00711                 if (!unpack_string(&pos, end, this_value)) {
00712                     if (pos == 0)
00713                         cout << "Incomplete item in value table" << endl;
00714                     else
00715                         cout << "Item in value table is too large" << endl;
00716                     ++errors;
00717                     break;
00718                 }
00719 
00720                 if (first) {
00721                     first = false;
00722                 } else if (this_value_no <= last_value_no) {
00723                     cout << "Values not in sorted order - valueno " << last_value_no << " comes before valueno " << this_value_no << endl;
00724                     ++errors;
00725                 }
00726                 last_value_no = this_value_no;
00727             }
00728 
00729             cursor->next();
00730         }
00731     } else if (tablename == "position") {
00732         // Now check the contents of the position table.
00733         while (!cursor->after_end()) {
00734             string & key = cursor->current_key;
00735 
00736             // Get docid from key.
00737             const char * pos = key.data();
00738             const char * end = pos + key.size();
00739 
00740             Xapian::docid did;
00741             if (!unpack_uint_preserving_sort(&pos, end, &did)) {
00742                 cout << "Error unpacking docid from key" << endl;
00743                 return errors + 1;
00744             }
00745             if (pos == end) {
00746                 cout << "No termname in key" << endl;
00747                 return errors + 1;
00748             }
00749 
00750             cursor->read_tag();
00751 
00752             const string & data = cursor->current_tag;
00753             pos = data.data();
00754             end = pos + data.size();
00755 
00756             Xapian::termpos pos_last;
00757             if (!unpack_uint(&pos, end, &pos_last)) {
00758                 cout << tablename << " table: Position list data corrupt" << endl;
00759                 ++errors;
00760                 cursor->next();
00761                 continue;
00762             }
00763             if (pos == end) {
00764                 // Special case for single entry position list.
00765             } else {
00766                 BitReader rd(data);
00767                 // Skip the header we just read.
00768                 (void)rd.read_bits(8 * (pos - data.data()));
00769                 Xapian::termpos pos_first = rd.decode(pos_last);
00770                 Xapian::termpos pos_size = rd.decode(pos_last - pos_first) + 2;
00771                 vector<Xapian::termpos> positions;
00772                 positions.resize(pos_size);
00773                 positions[0] = pos_first;
00774                 positions.back() = pos_last;
00775                 rd.decode_interpolative(positions, 0, pos_size - 1);
00776                 vector<Xapian::termpos>::const_iterator current_pos = positions.begin();
00777                 Xapian::termpos lastpos = *current_pos++;
00778                 while (current_pos != positions.end()) {
00779                     Xapian::termpos termpos = *current_pos++;
00780                     if (termpos <= lastpos) {
00781                         cout << tablename << " table: Positions not strictly monotonically increasing" << endl;
00782                         ++errors;
00783                         cursor->next();
00784                         continue;
00785                     }
00786                     lastpos = termpos;
00787                 }
00788             }
00789 
00790             cursor->next();
00791         }
00792     } else {
00793         cout << tablename << " table: Don't know how to check structure\n" << endl;
00794         return errors;
00795     }
00796 
00797     if (!errors)
00798         cout << tablename << " table structure checked OK\n" << endl;
00799     else
00800         cout << tablename << " table errors found: " << errors << "\n" << endl;
00801 
00802     return errors;
00803 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.