00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include <config.h>
00024
00025
00026 #define XAPIAN_DEPRECATED(D) D
00027 #include <xapian.h>
00028
00029 #include <iostream>
00030
00031 #include "autoptr.h"
00032 #include "btreecheck.h"
00033 #include "bcursor.h"
00034 #include "quartz_types.h"
00035 #include "quartz_utils.h"
00036
00037 #include "safesysstat.h"
00038
00039 using namespace std;
00040
#define PROG_NAME "quartzcheck"
#define PROG_DESC "Check the consistency of quartz database or table"

/** Print the command-line synopsis and the meaning of each option letter
 *  to standard output.
 *
 *  Takes no arguments and returns nothing; the stream is flushed before
 *  returning.
 */
static void show_usage() {
    // NB: each string literal is separated from the adjacent macro name by a
    // space.  Without it, C++11 and later lex `"..."PROG_NAME` as a literal
    // followed by a user-defined-literal suffix.  The emitted text is the same
    // either way because adjacent literals are still concatenated.
    std::cout << "Usage: " PROG_NAME " <path to btree and prefix>|<quartz directory> [[t][f][b][v][+]]\n\n"
        "The btree(s) is/are always checked - control the output verbosity with:\n"
        " t = short tree printing\n"
        " f = full tree printing\n"
        " b = show bitmap\n"
        " v = show stats about B-tree (default)\n"
        " + = same as tbv\n"
        " e.g. " PROG_NAME " /var/lib/xapian/data/default\n"
        " " PROG_NAME " /var/lib/xapian/data/default/postlist_ fbv" << std::endl;
}
00059
00060 static size_t check_table(const char *table, int opts);
00061
00062 static vector<Xapian::termcount> doclens;
00063
00064 int
00065 main(int argc, char **argv)
00066 {
00067 if (argc > 1 && argv[1][0] == '-') {
00068 if (strcmp(argv[1], "--help") == 0) {
00069 cout << PROG_NAME" - "PROG_DESC"\n\n";
00070 show_usage();
00071 exit(0);
00072 }
00073 if (strcmp(argv[1], "--version") == 0) {
00074 cout << PROG_NAME" - "PACKAGE_STRING << endl;
00075 exit(0);
00076 }
00077 }
00078 if (argc < 2 || argc > 3) {
00079 show_usage();
00080 exit(1);
00081 }
00082
00083 int opts = 0;
00084 const char * opt_string = argv[2];
00085 if (!opt_string) opt_string = "v";
00086 for (const char *p = opt_string; *p; ++p) {
00087 switch (*p) {
00088 case 't': opts |= OPT_SHORT_TREE; break;
00089 case 'f': opts |= OPT_FULL_TREE; break;
00090 case 'b': opts |= OPT_SHOW_BITMAP; break;
00091 case 'v': opts |= OPT_SHOW_STATS; break;
00092 case '+':
00093 opts |= OPT_SHORT_TREE | OPT_SHOW_BITMAP | OPT_SHOW_STATS;
00094 break;
00095 default:
00096 cerr << "option " << opt_string << " unknown\n";
00097 cerr << "use t,f,b,v and/or + in the option string\n";
00098 exit(1);
00099 }
00100 }
00101
00102 try {
00103 size_t errors = 0;
00104 struct stat sb;
00105 string meta_file(argv[1]);
00106 meta_file += "/meta";
00107 if (stat(meta_file.c_str(), &sb) == 0) {
00108
00109 try {
00110 Xapian::Database db = Xapian::Quartz::open(argv[1]);
00111 doclens.reserve(db.get_lastdocid());
00112 } catch (const Xapian::Error & e) {
00113
00114 cout << "Database couldn't be opened for reading: "
00115 << e.get_description()
00116 << "\nContinuing check anyway" << endl;
00117 ++errors;
00118 }
00119
00120
00121
00122 const char * tables[] = {
00123 "record", "termlist", "postlist", "position", "value"
00124 };
00125 for (const char **t = tables;
00126 t != tables + sizeof(tables)/sizeof(tables[0]); ++t) {
00127 string table(argv[1]);
00128 table += '/';
00129 table += *t;
00130 table += '_';
00131 cout << *t << ":\n";
00132 errors += check_table(table.c_str(), opts);
00133 }
00134 } else {
00135
00136 errors = check_table(argv[1], opts);
00137 }
00138 if (errors > 0) {
00139 cout << "Total errors found: " << errors << endl;
00140 exit(1);
00141 }
00142 cout << "No errors found" << endl;
00143 } catch (const char *error) {
00144 cerr << argv[0] << ": " << error << endl;
00145 exit(1);
00146 } catch (const Xapian::Error &error) {
00147 cerr << argv[0] << ": " << error.get_description() << endl;
00148 exit(1);
00149 } catch (...) {
00150 cerr << argv[0] << ": Unknown exception" << endl;
00151 exit(1);
00152 }
00153 }
00154
00155 static size_t
00156 check_table(const char *filename, int opts)
00157 {
00158
00159 BtreeCheck::check(filename, opts);
00160
00161
00162 Btree table(filename, true);
00163 table.open();
00164 AutoPtr<Bcursor> cursor(table.cursor_get());
00165
00166 size_t errors = 0;
00167
00168 cursor->find_entry("");
00169 cursor->next();
00170
00171 const char *p = strrchr(filename, '/');
00172 if (!p) p = strrchr(filename, '\\');
00173 if (p) ++p; else p = filename;
00174 string tablename;
00175 while (unsigned char ch = *p++) {
00176 if (ch == '_' && *p == '\0') break;
00177 tablename += tolower(ch);
00178 }
00179
00180 if (tablename == "postlist") {
00181
00182 string current_term;
00183 Xapian::docid lastdid = 0;
00184 Xapian::termcount termfreq = 0, collfreq = 0;
00185 Xapian::termcount tf = 0, cf = 0;
00186 while (!cursor->after_end()) {
00187 string & key = cursor->current_key;
00188
00189 const char * pos, * end;
00190
00191
00192 pos = key.data();
00193 end = pos + key.size();
00194
00195 string term;
00196 Xapian::docid did;
00197 if (!unpack_string_preserving_sort(&pos, end, term)) {
00198 cout << "Error unpacking termname from key" << endl;
00199 ++errors;
00200 continue;
00201 }
00202 if (current_term.empty()) {
00203 current_term = term;
00204 tf = cf = 0;
00205 if (pos != end) {
00206 cout << "Extra bytes after key for first chunk of "
00207 "posting list for term `" << term << "'" << endl;
00208 ++errors;
00209 continue;
00210 }
00211
00212 cursor->read_tag();
00213 pos = cursor->current_tag.data();
00214 end = pos + cursor->current_tag.size();
00215 if (!unpack_uint(&pos, end, &termfreq)) {
00216 cout << "Failed to unpack termfreq for term `" << term
00217 << "'" << endl;
00218 ++errors;
00219 continue;
00220 }
00221 if (!unpack_uint(&pos, end, &collfreq)) {
00222 cout << "Failed to unpack collfreq for term `" << term
00223 << "'" << endl;
00224 ++errors;
00225 continue;
00226 }
00227 if (!unpack_uint(&pos, end, &did)) {
00228 cout << "Failed to unpack firstdid for term `" << term
00229 << "'" << endl;
00230 ++errors;
00231 continue;
00232 } else {
00233 ++did;
00234 }
00235 } else {
00236 if (term != current_term) {
00237 if (pos == end) {
00238 cout << "No last chunk for term `" << term << "'"
00239 << endl;
00240 } else {
00241 cout << "Mismatch in follow-on chunk in posting "
00242 "list for term `" << current_term << "' (got `"
00243 << term << "')" << endl;
00244 }
00245 ++errors;
00246 current_term = term;
00247 }
00248 if (pos != end) {
00249 if (!unpack_uint_preserving_sort(&pos, end, &did)) {
00250 cout << "Failed to unpack did from key" << endl;
00251 ++errors;
00252 continue;
00253 }
00254 if (did <= lastdid) {
00255 cout << "First did in this chunk is <= last in "
00256 "prev chunk" << endl;
00257 ++errors;
00258 }
00259 }
00260 cursor->read_tag();
00261 pos = cursor->current_tag.data();
00262 end = pos + cursor->current_tag.size();
00263 }
00264
00265 bool is_last_chunk;
00266 if (!unpack_bool(&pos, end, &is_last_chunk)) {
00267 cout << "Failed to unpack last chunk flag" << endl;
00268 ++errors;
00269 continue;
00270 }
00271
00272 if (!unpack_uint(&pos, end, &lastdid)) {
00273 cout << "Failed to unpack increase to last" << endl;
00274 ++errors;
00275 continue;
00276 }
00277 ++lastdid;
00278 lastdid += did;
00279 bool bad = false;
00280 while (true) {
00281 Xapian::termcount wdf, doclen;
00282 if (!unpack_uint(&pos, end, &wdf)) {
00283 cout << "Failed to unpack wdf" << endl;
00284 ++errors;
00285 bad = true;
00286 break;
00287 }
00288 if (!unpack_uint(&pos, end, &doclen)) {
00289 cout << "Failed to unpack doc length" << endl;
00290 ++errors;
00291 bad = true;
00292 break;
00293 }
00294 ++tf;
00295 cf += wdf;
00296
00297 if (!doclens.empty()) {
00298 if (did >= doclens.size()) {
00299 cout << "document id " << did << " is larger than any in the termlist table!" << endl;
00300 } else if (doclens[did] != doclen) {
00301 cout << "doclen " << doclen << " doesn't match " << doclens[did] << " in the termlist table" << endl;
00302 ++errors;
00303 }
00304 }
00305 if (pos == end) break;
00306
00307 Xapian::docid inc;
00308 if (!unpack_uint(&pos, end, &inc)) {
00309 cout << "Failed to unpack docid increase" << endl;
00310 ++errors;
00311 bad = true;
00312 break;
00313 }
00314 ++inc;
00315 did += inc;
00316 if (did > lastdid) {
00317 cout << "docid " << did << " > last docid " << lastdid
00318 << endl;
00319 ++errors;
00320 }
00321 }
00322 if (bad) continue;
00323 if (is_last_chunk) {
00324 if (tf != termfreq) {
00325 cout << "termfreq " << termfreq << " != # of entries "
00326 << tf << endl;
00327 ++errors;
00328 }
00329 if (cf != collfreq) {
00330 cout << "collfreq " << collfreq << " != sum wdf " << cf
00331 << endl;
00332 ++errors;
00333 }
00334 if (did != lastdid) {
00335 cout << "lastdid " << lastdid << " != last did " << did
00336 << endl;
00337 ++errors;
00338 }
00339 current_term = "";
00340 }
00341
00342 cursor->next();
00343 }
00344 if (!current_term.empty()) {
00345 cout << "Last term `" << current_term << "' has no last chunk"
00346 << endl;
00347 ++errors;
00348 }
00349 } else if (tablename == "record") {
00350
00351
00352
00353 if (!cursor->after_end()) {
00354 if (cursor->current_key != string("", 1)) {
00355 cout << "METAINFO key missing from record table" << endl;
00356 return errors + 1;
00357 } else {
00358 cursor->read_tag();
00359
00360 Xapian::docid did;
00361 quartz_totlen_t totlen;
00362 const char * data = cursor->current_tag.data();
00363 const char * end = data + cursor->current_tag.size();
00364 if (!unpack_uint(&data, end, &did)) {
00365 cout << "Record containing meta information is corrupt." << endl;
00366 return errors + 1;
00367 }
00368 if (!unpack_uint_last(&data, end, &totlen)) {
00369 cout << "Record containing meta information is corrupt." << endl;
00370 return errors + 1;
00371 }
00372 if (data != end) {
00373 cout << "Record containing meta information is corrupt." << endl;
00374 return errors + 1;
00375 }
00376 cursor->next();
00377 }
00378 }
00379 while (!cursor->after_end()) {
00380 string & key = cursor->current_key;
00381
00382
00383 const char * pos = key.data();
00384 const char * end = pos + key.size();
00385
00386 Xapian::docid did;
00387 if (!unpack_uint_last(&pos, end, &did)) {
00388 cout << "Error unpacking docid from key" << endl;
00389 return errors + 1;
00390 } else if (pos != end) {
00391 cout << "Extra junk in key" << endl;
00392 return errors + 1;
00393 }
00394
00395 cursor->next();
00396 }
00397 } else if (tablename == "termlist") {
00398
00399 while (!cursor->after_end()) {
00400 string & key = cursor->current_key;
00401
00402
00403 const char * pos = key.data();
00404 const char * end = pos + key.size();
00405
00406 Xapian::docid did;
00407 if (!unpack_uint_last(&pos, end, &did)) {
00408 cout << "Error unpacking docid from key" << endl;
00409 return errors + 1;
00410 } else if (pos != end) {
00411 cout << "Extra junk in key" << endl;
00412 return errors + 1;
00413 }
00414
00415 cursor->read_tag();
00416
00417 pos = cursor->current_tag.data();
00418 end = pos + cursor->current_tag.size();
00419
00420 Xapian::termcount doclen, termlist_size;
00421 bool has_termfreqs;
00422
00423
00424 if (!unpack_uint(&pos, end, &doclen)) {
00425 if (pos != 0) {
00426 cout << "doclen out of range" << endl;
00427 } else {
00428 cout << "Unexpected end of data when reading doclen" << endl;
00429 }
00430 ++errors;
00431 continue;
00432 }
00433
00434
00435 if (!unpack_uint(&pos, end, &termlist_size)) {
00436 if (pos != 0) {
00437 cout << "termlist_size out of range" << endl;
00438 } else {
00439 cout << "Unexpected end of data when reading termlist_size" << endl;
00440 }
00441 ++errors;
00442 continue;
00443 }
00444
00445
00446 if (!unpack_bool(&pos, end, &has_termfreqs)) {
00447 cout << "Unexpected end of data when reading termlist" << endl;
00448 ++errors;
00449 continue;
00450 }
00451 if (has_termfreqs) {
00452 cout << "has_termfreqs is true, but Xapian never sets it!" << endl;
00453 ++errors;
00454 continue;
00455 }
00456
00457 Xapian::termcount actual_doclen = 0, actual_termlist_size = 0;
00458 string current_tname;
00459
00460 bool bad = false;
00461 while (pos != end) {
00462 Xapian::doccount current_wdf;
00463 bool got_wdf = false;
00464
00465 if (!current_tname.empty()) {
00466 string::size_type len = static_cast<unsigned char>(*pos++);
00467 if (len > current_tname.length()) {
00468
00469 current_wdf = len / (current_tname.length() + 1) - 1;
00470 len %= (current_tname.length() + 1);
00471 got_wdf = true;
00472 }
00473 current_tname.resize(len);
00474 }
00475
00476
00477 string::size_type len = static_cast<unsigned char>(*pos++);
00478 current_tname.append(pos, len);
00479 pos += len;
00480
00481 if (!got_wdf) {
00482
00483 if (!unpack_uint(&pos, end, ¤t_wdf)) {
00484 if (pos == 0) {
00485 cout << "Unexpected end of data when reading termlist" << endl;
00486 } else {
00487 cout << "Size of wdf out of range, in termlist" << endl;
00488 }
00489 ++errors;
00490 bad = true;
00491 break;
00492 }
00493 }
00494
00495
00496
00497
00498 ++actual_termlist_size;
00499 actual_doclen += current_wdf;
00500 }
00501 if (bad) continue;
00502
00503 if (termlist_size != actual_termlist_size) {
00504 cout << "termlist_size != # of entries in termlist" << endl;
00505 ++errors;
00506 }
00507 if (doclen != actual_doclen) {
00508 cout << "doclen != sum(wdf)" << endl;
00509 ++errors;
00510 }
00511
00512
00513 if (doclens.size() <= did) doclens.resize(did + 1);
00514 doclens[did] = actual_doclen;
00515
00516 cursor->next();
00517 }
00518 } else if (tablename == "value") {
00519
00520 while (!cursor->after_end()) {
00521 string & key = cursor->current_key;
00522
00523
00524 const char * pos = key.data();
00525 const char * end = pos + key.size();
00526
00527 Xapian::docid did;
00528 if (!unpack_uint_last(&pos, end, &did)) {
00529 cout << "Error unpacking docid from key" << endl;
00530 return errors + 1;
00531 } else if (pos != end) {
00532 cout << "Extra junk in key" << endl;
00533 return errors + 1;
00534 }
00535
00536 cursor->read_tag();
00537
00538 pos = cursor->current_tag.data();
00539 end = pos + cursor->current_tag.size();
00540
00541 bool first = true;
00542 Xapian::valueno last_value_no = 0;
00543 while (pos && pos != end) {
00544 Xapian::valueno this_value_no;
00545 string this_value;
00546
00547 if (!unpack_uint(&pos, end, &this_value_no)) {
00548 if (pos == 0)
00549 cout << "Incomplete item in value table" << endl;
00550 else
00551 cout << "Value number in value table is too large" << endl;
00552 ++errors;
00553 break;
00554 }
00555
00556 if (!unpack_string(&pos, end, this_value)) {
00557 if (pos == 0)
00558 cout << "Incomplete item in value table" << endl;
00559 else
00560 cout << "Item in value table is too large" << endl;
00561 ++errors;
00562 break;
00563 }
00564
00565 if (first) {
00566 first = false;
00567 } else if (this_value_no <= last_value_no) {
00568 cout << "Values not in sorted order - valueno " << last_value_no << " comes before valueno " << this_value_no << endl;
00569 ++errors;
00570 }
00571 last_value_no = this_value_no;
00572 }
00573
00574 cursor->next();
00575 }
00576 } else if (tablename == "position") {
00577
00578 while (!cursor->after_end()) {
00579 string & key = cursor->current_key;
00580
00581
00582 const char * pos = key.data();
00583 const char * end = pos + key.size();
00584
00585 Xapian::docid did;
00586 if (!unpack_uint(&pos, end, &did)) {
00587 cout << "Error unpacking docid from key" << endl;
00588 return errors + 1;
00589 }
00590 if (pos == end) {
00591 cout << "No termname in key" << endl;
00592 return errors + 1;
00593 }
00594
00595 cursor->read_tag();
00596
00597 pos = cursor->current_tag.data();
00598 end = pos + cursor->current_tag.size();
00599
00600 Xapian::termcount number_of_entries;
00601
00602
00603 if (!unpack_uint(&pos, end, &number_of_entries)) {
00604 if (pos != 0) {
00605 cout << "number_of_entries out of range" << endl;
00606 } else {
00607 cout << "Unexpected end of data when reading number_of_entries" << endl;
00608 }
00609 ++errors;
00610 continue;
00611 }
00612
00613 Xapian::termcount actual_number_of_entries = 0;
00614
00615 bool bad = false;
00616 while (pos != end) {
00617 Xapian::termpos pos_increment;
00618 if (!unpack_uint(&pos, end, &pos_increment)) {
00619 if (pos != 0) {
00620 cout << "value out of range" << endl;
00621 } else {
00622 cout << "Unexpected end of data when reading position increment" << endl;
00623 }
00624 ++errors;
00625 bad = true;
00626 break;
00627 }
00628
00629 ++actual_number_of_entries;
00630 }
00631 if (bad) continue;
00632
00633 if (number_of_entries != actual_number_of_entries) {
00634 cout << "number_of_entries != # of entries in positionlist" << endl;
00635 ++errors;
00636 }
00637
00638 cursor->next();
00639 }
00640 } else {
00641 cout << tablename << " table: Don't know how to check structure\n" << endl;
00642 return errors;
00643 }
00644
00645 if (!errors)
00646 cout << tablename << " table structure checked OK\n" << endl;
00647 else
00648 cout << tablename << " table errors found: " << errors << "\n" << endl;
00649
00650 return errors;
00651 }