bin/quartzcheck.cc

Go to the documentation of this file.
00001 /* quartzcheck.cc: use Btree::check to check consistency of a quartz database
00002  * or btree.  Also check the structures inside the tables.
00003  *
00004  * Copyright 1999,2000,2001 BrightStation PLC
00005  * Copyright 2002,2003,2004,2005,2006,2007 Olly Betts
00006  *
00007  * This program is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU General Public License as
00009  * published by the Free Software Foundation; either version 2 of the
00010  * License, or (at your option) any later version.
00011  *
00012  * This program is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
00020  * USA
00021  */
00022 
00023 #include <config.h>
00024 
00025 // We have to use the deprecated Quartz::open() method.
00026 #define XAPIAN_DEPRECATED(D) D
00027 #include <xapian.h>
00028 
00029 #include <iostream>
00030 
00031 #include "autoptr.h"
00032 #include "btreecheck.h"
00033 #include "bcursor.h"
00034 #include "quartz_types.h"
00035 #include "quartz_utils.h"
00036 
00037 #include "safesysstat.h"
00038 
00039 using namespace std;
00040 
00041 #define PROG_NAME "quartzcheck"
00042 #define PROG_DESC "Check the consistency of quartz database or table"
00043 
00044 // FIXME: We don't currently cross-check wdf between postlist and termlist.
00045 // It's hard to see how to do that efficiently.  We do cross-check doclens, but that
00046 // "only" requires (4 * last_docid()) bytes.
00047 
00048 static void show_usage() {
00049     cout << "Usage: "PROG_NAME" <path to btree and prefix>|<quartz directory> [[t][f][b][v][+]]\n\n"
00050 "The btree(s) is/are always checked - control the output verbosity with:\n"
00051 " t = short tree printing\n"
00052 " f = full tree printing\n"
00053 " b = show bitmap\n"
00054 " v = show stats about B-tree (default)\n"
00055 " + = same as tbv\n"
00056 " e.g. "PROG_NAME" /var/lib/xapian/data/default\n"
00057 "      "PROG_NAME" /var/lib/xapian/data/default/postlist_ fbv" << endl;
00058 }
00059 
// Forward declaration - check_table() is defined after main().  It checks
// one table and returns the number of errors it found.
static size_t check_table(const char *table, int opts);

// Document lengths indexed by docid.  Filled in while checking the termlist
// table and then compared against the doclen stored with each posting while
// checking the postlist table (so termlist must be checked first).  If it is
// empty, the postlist check skips the cross-check.
static vector<Xapian::termcount> doclens;
00063 
00064 int
00065 main(int argc, char **argv)
00066 {
00067     if (argc > 1 && argv[1][0] == '-') {
00068         if (strcmp(argv[1], "--help") == 0) {
00069             cout << PROG_NAME" - "PROG_DESC"\n\n";
00070             show_usage();
00071             exit(0);
00072         }
00073         if (strcmp(argv[1], "--version") == 0) {
00074             cout << PROG_NAME" - "PACKAGE_STRING << endl;
00075             exit(0);
00076         }
00077     }
00078     if (argc < 2 || argc > 3) {
00079         show_usage();
00080         exit(1);
00081     }
00082 
00083     int opts = 0;
00084     const char * opt_string = argv[2];
00085     if (!opt_string) opt_string = "v";
00086     for (const char *p = opt_string; *p; ++p) {
00087         switch (*p) {
00088             case 't': opts |= OPT_SHORT_TREE; break;
00089             case 'f': opts |= OPT_FULL_TREE; break;
00090             case 'b': opts |= OPT_SHOW_BITMAP; break;
00091             case 'v': opts |= OPT_SHOW_STATS; break;
00092             case '+':
00093                 opts |= OPT_SHORT_TREE | OPT_SHOW_BITMAP | OPT_SHOW_STATS;
00094                 break;
00095             default:
00096                 cerr << "option " << opt_string << " unknown\n";
00097                 cerr << "use t,f,b,v and/or + in the option string\n";
00098                 exit(1);
00099         }
00100     }
00101 
00102     try {
00103         size_t errors = 0;
00104         struct stat sb;
00105         string meta_file(argv[1]);
00106         meta_file += "/meta";
00107         if (stat(meta_file.c_str(), &sb) == 0) {
00108             // Check a whole quartz database directory.
00109             try {
00110                 Xapian::Database db = Xapian::Quartz::open(argv[1]);
00111                 doclens.reserve(db.get_lastdocid());
00112             } catch (const Xapian::Error & e) {
00113                 // Ignore so we can check a database too broken to open.
00114                 cout << "Database couldn't be opened for reading: "
00115                      << e.get_description()
00116                      << "\nContinuing check anyway" << endl;
00117                 ++errors;
00118             }
00119             // Assume it's a quartz directory and try to check all the btrees
00120             // Note: it's important to check termlist before postlist so
00121             // that we can cross-check the document lengths.
00122             const char * tables[] = {
00123                 "record", "termlist", "postlist", "position", "value"
00124             };
00125             for (const char **t = tables;
00126                  t != tables + sizeof(tables)/sizeof(tables[0]); ++t) {
00127                 string table(argv[1]);
00128                 table += '/';
00129                 table += *t;
00130                 table += '_';
00131                 cout << *t << ":\n";
00132                 errors += check_table(table.c_str(), opts);
00133             }
00134         } else {
00135             // Just check a single Btree.
00136             errors = check_table(argv[1], opts);
00137         }
00138         if (errors > 0) {
00139             cout << "Total errors found: " << errors << endl;
00140             exit(1);
00141         }
00142         cout << "No errors found" << endl;
00143     } catch (const char *error) {
00144         cerr << argv[0] << ": " << error << endl;
00145         exit(1);
00146     } catch (const Xapian::Error &error) {
00147         cerr << argv[0] << ": " << error.get_description() << endl;
00148         exit(1);
00149     } catch (...) {
00150         cerr << argv[0] << ": Unknown exception" << endl;
00151         exit(1);
00152     }
00153 }
00154 
00155 static size_t
00156 check_table(const char *filename, int opts)
00157 {
00158     // Check the btree structure.
00159     BtreeCheck::check(filename, opts);
00160 
00161     // Now check the quartz structures inside the btree.
00162     Btree table(filename, true);
00163     table.open();
00164     AutoPtr<Bcursor> cursor(table.cursor_get());
00165 
00166     size_t errors = 0;
00167 
00168     cursor->find_entry("");
00169     cursor->next(); // Skip the empty entry.
00170 
00171     const char *p = strrchr(filename, '/');
00172     if (!p) p = strrchr(filename, '\\');
00173     if (p) ++p; else p = filename;
00174     string tablename;
00175     while (unsigned char ch = *p++) {
00176         if (ch == '_' && *p == '\0') break;
00177         tablename += tolower(ch);
00178     }
00179 
00180     if (tablename == "postlist") {
00181         // Now check the structure of each postlist in the table.
00182         string current_term;
00183         Xapian::docid lastdid = 0;
00184         Xapian::termcount termfreq = 0, collfreq = 0;
00185         Xapian::termcount tf = 0, cf = 0;
00186         while (!cursor->after_end()) {
00187             string & key = cursor->current_key;
00188 
00189             const char * pos, * end;
00190 
00191             // Get term from key.
00192             pos = key.data();
00193             end = pos + key.size();
00194 
00195             string term;
00196             Xapian::docid did;
00197             if (!unpack_string_preserving_sort(&pos, end, term)) {
00198                 cout << "Error unpacking termname from key" << endl;
00199                 ++errors;
00200                 continue;
00201             }
00202             if (current_term.empty()) {
00203                 current_term = term;
00204                 tf = cf = 0;
00205                 if (pos != end) {
00206                     cout << "Extra bytes after key for first chunk of "
00207                         "posting list for term `" << term << "'" << endl;
00208                     ++errors;
00209                     continue;
00210                 }
00211                 // Unpack extra header from first chunk.
00212                 cursor->read_tag();
00213                 pos = cursor->current_tag.data();
00214                 end = pos + cursor->current_tag.size();
00215                 if (!unpack_uint(&pos, end, &termfreq)) {
00216                     cout << "Failed to unpack termfreq for term `" << term
00217                          << "'" << endl;
00218                     ++errors;
00219                     continue;
00220                 }
00221                 if (!unpack_uint(&pos, end, &collfreq)) {
00222                     cout << "Failed to unpack collfreq for term `" << term
00223                          << "'" << endl;
00224                     ++errors;
00225                     continue;
00226                 }
00227                 if (!unpack_uint(&pos, end, &did)) {
00228                     cout << "Failed to unpack firstdid for term `" << term
00229                          << "'" << endl;
00230                     ++errors;
00231                     continue;
00232                 } else {
00233                     ++did;
00234                 }
00235             } else {
00236                 if (term != current_term) {
00237                     if (pos == end) {
00238                         cout << "No last chunk for term `" << term << "'"
00239                              << endl;
00240                     } else {
00241                         cout << "Mismatch in follow-on chunk in posting "
00242                             "list for term `" << current_term << "' (got `"
00243                             << term << "')" << endl;
00244                     }
00245                     ++errors;
00246                     current_term = term;
00247                 }
00248                 if (pos != end) {
00249                     if (!unpack_uint_preserving_sort(&pos, end, &did)) {
00250                         cout << "Failed to unpack did from key" << endl;
00251                         ++errors;
00252                         continue;
00253                     }
00254                     if (did <= lastdid) {
00255                         cout << "First did in this chunk is <= last in "
00256                             "prev chunk" << endl;
00257                         ++errors;
00258                     }
00259                 }
00260                 cursor->read_tag();
00261                 pos = cursor->current_tag.data();
00262                 end = pos + cursor->current_tag.size();
00263             }
00264 
00265             bool is_last_chunk;
00266             if (!unpack_bool(&pos, end, &is_last_chunk)) {
00267                 cout << "Failed to unpack last chunk flag" << endl;
00268                 ++errors;
00269                 continue;
00270             }
00271             // Read what the final document ID in this chunk is.
00272             if (!unpack_uint(&pos, end, &lastdid)) {
00273                 cout << "Failed to unpack increase to last" << endl;
00274                 ++errors;
00275                 continue;
00276             }
00277             ++lastdid;
00278             lastdid += did;
00279             bool bad = false;
00280             while (true) {
00281                 Xapian::termcount wdf, doclen;
00282                 if (!unpack_uint(&pos, end, &wdf)) {
00283                     cout << "Failed to unpack wdf" << endl;
00284                     ++errors;
00285                     bad = true;
00286                     break;
00287                 }
00288                 if (!unpack_uint(&pos, end, &doclen)) {
00289                     cout << "Failed to unpack doc length" << endl;
00290                     ++errors;
00291                     bad = true;
00292                     break;
00293                 }
00294                 ++tf;
00295                 cf += wdf;
00296 
00297                 if (!doclens.empty()) {
00298                     if (did >= doclens.size()) {
00299                         cout << "document id " << did << " is larger than any in the termlist table!" << endl;
00300                     } else if (doclens[did] != doclen) {
00301                         cout << "doclen " << doclen << " doesn't match " << doclens[did] << " in the termlist table" << endl;
00302                         ++errors;
00303                     }
00304                 }
00305                 if (pos == end) break;
00306 
00307                 Xapian::docid inc;
00308                 if (!unpack_uint(&pos, end, &inc)) {
00309                     cout << "Failed to unpack docid increase" << endl;
00310                     ++errors;
00311                     bad = true;
00312                     break;
00313                 }
00314                 ++inc;
00315                 did += inc;
00316                 if (did > lastdid) {
00317                     cout << "docid " << did << " > last docid " << lastdid
00318                          << endl;
00319                     ++errors;
00320                 }
00321             }
00322             if (bad) continue;
00323             if (is_last_chunk) {
00324                 if (tf != termfreq) {
00325                     cout << "termfreq " << termfreq << " != # of entries "
00326                          << tf << endl;
00327                     ++errors;
00328                 }
00329                 if (cf != collfreq) {
00330                     cout << "collfreq " << collfreq << " != sum wdf " << cf
00331                          << endl;
00332                     ++errors;
00333                 }
00334                 if (did != lastdid) {
00335                     cout << "lastdid " << lastdid << " != last did " << did
00336                          << endl;
00337                     ++errors;
00338                 }
00339                 current_term = "";
00340             }
00341 
00342             cursor->next();
00343         }
00344         if (!current_term.empty()) {
00345             cout << "Last term `" << current_term << "' has no last chunk"
00346                  << endl;
00347             ++errors;
00348         }
00349     } else if (tablename == "record") {
00350         // Now check the contents of the record table.  Apart from the
00351         // METAINFO key, any data is valid as the tag so we don't check
00352         // those.
00353         if (!cursor->after_end()) {
00354             if (cursor->current_key != string("", 1)) {
00355                 cout << "METAINFO key missing from record table" << endl;
00356                 return errors + 1;
00357             } else {
00358                 cursor->read_tag();
00359                 // Check format of the METAINFO key.
00360                 Xapian::docid did;
00361                 quartz_totlen_t totlen;
00362                 const char * data = cursor->current_tag.data();
00363                 const char * end = data + cursor->current_tag.size();
00364                 if (!unpack_uint(&data, end, &did)) {
00365                     cout << "Record containing meta information is corrupt." << endl;
00366                     return errors + 1;
00367                 }
00368                 if (!unpack_uint_last(&data, end, &totlen)) {
00369                     cout << "Record containing meta information is corrupt." << endl;
00370                     return errors + 1;
00371                 }
00372                 if (data != end) {
00373                     cout << "Record containing meta information is corrupt." << endl;
00374                     return errors + 1;
00375                 }
00376                 cursor->next();
00377             }
00378         }
00379         while (!cursor->after_end()) {
00380             string & key = cursor->current_key;
00381 
00382             // Get docid from key.
00383             const char * pos = key.data();
00384             const char * end = pos + key.size();
00385 
00386             Xapian::docid did;
00387             if (!unpack_uint_last(&pos, end, &did)) {
00388                 cout << "Error unpacking docid from key" << endl;
00389                 return errors + 1;
00390             } else if (pos != end) {
00391                 cout << "Extra junk in key" << endl;
00392                 return errors + 1;
00393             }
00394 
00395             cursor->next();
00396         }
00397     } else if (tablename == "termlist") {
00398         // Now check the contents of the termlist table.
00399         while (!cursor->after_end()) {
00400             string & key = cursor->current_key;
00401 
00402             // Get docid from key.
00403             const char * pos = key.data();
00404             const char * end = pos + key.size();
00405 
00406             Xapian::docid did;
00407             if (!unpack_uint_last(&pos, end, &did)) {
00408                 cout << "Error unpacking docid from key" << endl;
00409                 return errors + 1;
00410             } else if (pos != end) {
00411                 cout << "Extra junk in key" << endl;
00412                 return errors + 1;
00413             }
00414 
00415             cursor->read_tag();
00416 
00417             pos = cursor->current_tag.data();
00418             end = pos + cursor->current_tag.size();
00419 
00420             Xapian::termcount doclen, termlist_size;
00421             bool has_termfreqs;
00422 
00423             // Read doclen
00424             if (!unpack_uint(&pos, end, &doclen)) {
00425                 if (pos != 0) {
00426                     cout << "doclen out of range" << endl;
00427                 } else {
00428                     cout << "Unexpected end of data when reading doclen" << endl;
00429                 }
00430                 ++errors;
00431                 continue;
00432             }
00433 
00434             // Read termlist_size
00435             if (!unpack_uint(&pos, end, &termlist_size)) {
00436                 if (pos != 0) {
00437                     cout << "termlist_size out of range" << endl;
00438                 } else {
00439                     cout << "Unexpected end of data when reading termlist_size" << endl;
00440                 }
00441                 ++errors;
00442                 continue;
00443             }
00444 
00445             // Read has_termfreqs
00446             if (!unpack_bool(&pos, end, &has_termfreqs)) {
00447                 cout << "Unexpected end of data when reading termlist" << endl;
00448                 ++errors;
00449                 continue;
00450             }
00451             if (has_termfreqs) {
00452                 cout << "has_termfreqs is true, but Xapian never sets it!" << endl;
00453                 ++errors;
00454                 continue;
00455             }
00456 
00457             Xapian::termcount actual_doclen = 0, actual_termlist_size = 0;
00458             string current_tname;
00459 
00460             bool bad = false;
00461             while (pos != end) {
00462                 Xapian::doccount current_wdf;
00463                 bool got_wdf = false;
00464                 // If there was a previous term, how much to reuse.
00465                 if (!current_tname.empty()) {
00466                     string::size_type len = static_cast<unsigned char>(*pos++);
00467                     if (len > current_tname.length()) {
00468                         // The wdf was squeezed into the same byte.
00469                         current_wdf = len / (current_tname.length() + 1) - 1;
00470                         len %= (current_tname.length() + 1);
00471                         got_wdf = true;
00472                     }
00473                     current_tname.resize(len);
00474                 }
00475                 // What to append (note len must be positive, since just truncating
00476                 // always takes us backwards in the sort order)
00477                 string::size_type len = static_cast<unsigned char>(*pos++);
00478                 current_tname.append(pos, len);
00479                 pos += len;
00480 
00481                 if (!got_wdf) {
00482                     // Read wdf
00483                     if (!unpack_uint(&pos, end, &current_wdf)) {
00484                         if (pos == 0) {
00485                             cout << "Unexpected end of data when reading termlist" << endl;
00486                         } else {
00487                             cout << "Size of wdf out of range, in termlist" << endl;
00488                         }
00489                         ++errors;
00490                         bad = true;
00491                         break;
00492                     }
00493                 }
00494 
00495                 // Don't bother with the (has_termfreqs == true) case since
00496                 // we never generate that.
00497 
00498                 ++actual_termlist_size;
00499                 actual_doclen += current_wdf;
00500             }
00501             if (bad) continue;
00502 
00503             if (termlist_size != actual_termlist_size) {
00504                 cout << "termlist_size != # of entries in termlist" << endl;
00505                 ++errors;
00506             }
00507             if (doclen != actual_doclen) {
00508                 cout << "doclen != sum(wdf)" << endl;
00509                 ++errors;
00510             }
00511 
00512             // + 1 so that did is a valid subscript.
00513             if (doclens.size() <= did) doclens.resize(did + 1);
00514             doclens[did] = actual_doclen;
00515 
00516             cursor->next();
00517         }
00518     } else if (tablename == "value") {
00519         // Now check the contents of the value table.
00520         while (!cursor->after_end()) {
00521             string & key = cursor->current_key;
00522 
00523             // Get docid from key.
00524             const char * pos = key.data();
00525             const char * end = pos + key.size();
00526 
00527             Xapian::docid did;
00528             if (!unpack_uint_last(&pos, end, &did)) {
00529                 cout << "Error unpacking docid from key" << endl;
00530                 return errors + 1;
00531             } else if (pos != end) {
00532                 cout << "Extra junk in key" << endl;
00533                 return errors + 1;
00534             }
00535 
00536             cursor->read_tag();
00537 
00538             pos = cursor->current_tag.data();
00539             end = pos + cursor->current_tag.size();
00540 
00541             bool first = true;
00542             Xapian::valueno last_value_no = 0;
00543             while (pos && pos != end) {
00544                 Xapian::valueno this_value_no;
00545                 string this_value;
00546 
00547                 if (!unpack_uint(&pos, end, &this_value_no)) {
00548                     if (pos == 0)
00549                         cout << "Incomplete item in value table" << endl;
00550                     else
00551                         cout << "Value number in value table is too large" << endl;
00552                     ++errors;
00553                     break;
00554                 }
00555 
00556                 if (!unpack_string(&pos, end, this_value)) {
00557                     if (pos == 0)
00558                         cout << "Incomplete item in value table" << endl;
00559                     else
00560                         cout << "Item in value table is too large" << endl;
00561                     ++errors;
00562                     break;
00563                 }
00564 
00565                 if (first) {
00566                     first = false;
00567                 } else if (this_value_no <= last_value_no) {
00568                     cout << "Values not in sorted order - valueno " << last_value_no << " comes before valueno " << this_value_no << endl;
00569                     ++errors;
00570                 }
00571                 last_value_no = this_value_no;
00572             }
00573 
00574             cursor->next();
00575         }
00576     } else if (tablename == "position") {
00577         // Now check the contents of the position table.
00578         while (!cursor->after_end()) {
00579             string & key = cursor->current_key;
00580 
00581             // Get docid from key.
00582             const char * pos = key.data();
00583             const char * end = pos + key.size();
00584 
00585             Xapian::docid did;
00586             if (!unpack_uint(&pos, end, &did)) {
00587                 cout << "Error unpacking docid from key" << endl;
00588                 return errors + 1;
00589             }
00590             if (pos == end) {
00591                 cout << "No termname in key" << endl;
00592                 return errors + 1;
00593             }
00594 
00595             cursor->read_tag();
00596 
00597             pos = cursor->current_tag.data();
00598             end = pos + cursor->current_tag.size();
00599 
00600             Xapian::termcount number_of_entries;
00601 
00602             // Read list length.
00603             if (!unpack_uint(&pos, end, &number_of_entries)) {
00604                 if (pos != 0) {
00605                     cout << "number_of_entries out of range" << endl;
00606                 } else {
00607                     cout << "Unexpected end of data when reading number_of_entries" << endl;
00608                 }
00609                 ++errors;
00610                 continue;
00611             }
00612 
00613             Xapian::termcount actual_number_of_entries = 0;
00614 
00615             bool bad = false;
00616             while (pos != end) {
00617                 Xapian::termpos pos_increment;
00618                 if (!unpack_uint(&pos, end, &pos_increment)) {
00619                     if (pos != 0) {
00620                         cout << "value out of range" << endl;
00621                     } else {
00622                         cout << "Unexpected end of data when reading position increment" << endl;
00623                     }
00624                     ++errors;
00625                     bad = true;
00626                     break;
00627                 }
00628 
00629                 ++actual_number_of_entries;
00630             }
00631             if (bad) continue;
00632 
00633             if (number_of_entries != actual_number_of_entries) {
00634                 cout << "number_of_entries != # of entries in positionlist" << endl;
00635                 ++errors;
00636             }
00637 
00638             cursor->next();
00639         }
00640     } else {
00641         cout << tablename << " table: Don't know how to check structure\n" << endl;
00642         return errors;
00643     }
00644 
00645     if (!errors)
00646         cout << tablename << " table structure checked OK\n" << endl;
00647     else
00648         cout << tablename << " table errors found: " << errors << "\n" << endl;
00649 
00650     return errors;
00651 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.