00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include <config.h>
00025
00026 #include "safeerrno.h"
00027
00028 #include <fstream>
00029 #include <iostream>
00030 #include <queue>
00031
00032 #include <stdio.h>
00033 #include <string.h>
00034 #include <sys/types.h>
00035 #include "safesysstat.h"
00036
00037 #include "btree.h"
00038 #include "bcursor.h"
00039 #include "quartz_utils.h"
00040 #include "utils.h"
00041
00042 #include <xapian.h>
00043
00044 #include "gnu_getopt.h"
00045
00046 using namespace std;
00047
00048 #define PROG_NAME "quartzcompact"
00049 #define PROG_DESC "Compact a quartz database, or merge and compact several"
00050
00051 static void show_usage() {
00052 cout << "Usage: "PROG_NAME" [OPTION] SOURCE_DATABASE... DESTINATION_DATABASE\n\n"
00053 "Options:\n"
00054 " -b, --blocksize Set the blocksize in bytes (e.g. 4096) or K (e.g. 4K)\n"
00055 " (must be between 2K and 64K and a power of 2, default 8K)\n"
00056 " -n, --no-full Disable full compaction\n"
00057 " -F, --fuller Enable fuller compaction (not recommended if you plan to\n"
00058 " update the compacted database)\n"
00059 " --help display this help and exit\n"
00060 " --version output version information and exit" << endl;
00061 }
00062
00063 static inline bool
00064 is_metainfo_key(const string & key)
00065 {
00066 return key.size() == 1 && key[0] == '\0';
00067 }
00068
00069 class PostlistCursor : private Bcursor {
00070 Xapian::docid offset;
00071 public:
00072 string key, tag;
00073 Xapian::docid firstdid;
00074 Xapian::termcount tf, cf;
00075
00076 PostlistCursor(Btree *in, Xapian::docid offset_)
00077 : Bcursor(in), offset(offset_)
00078 {
00079 find_entry("");
00080 next();
00081 }
00082
00083 bool next() {
00084 if (!Bcursor::next()) return false;
00085
00086
00087
00088 read_tag();
00089 key = current_key;
00090 tag = current_tag;
00091 tf = cf = 0;
00092
00093
00094
00095 const char * d = key.data();
00096 string tname;
00097 if (!unpack_string_preserving_sort(&d, d + key.size(), tname))
00098 abort();
00099 bool first_chunk = (d == key.data() + key.size());
00100 if (!first_chunk) {
00101 size_t tmp = d - key.data();
00102 if (!unpack_uint_preserving_sort(&d, d + key.size(), &firstdid))
00103 abort();
00104 firstdid += offset;
00105 key.erase(tmp);
00106 }
00107
00108 if (first_chunk) {
00109 d = tag.data();
00110 const char *e = d + tag.size();
00111 if (!unpack_uint(&d, e, &tf))
00112 abort();
00113 if (!unpack_uint(&d, e, &cf))
00114 abort();
00115 if (!unpack_uint(&d, e, &firstdid))
00116 abort();
00117 firstdid += 1;
00118 firstdid += offset;
00119 tag.erase(0, d - tag.data());
00120 }
00121 return true;
00122 }
00123 };
00124
00125 class DocIDKeyedCursor : private Bcursor {
00126 Xapian::docid offset;
00127 public:
00128 string key, tag;
00129
00130 DocIDKeyedCursor(Btree *in, Xapian::docid offset_)
00131 : Bcursor(in), offset(offset_)
00132 {
00133 find_entry("");
00134 next();
00135 }
00136
00137 bool next() {
00138 if (!Bcursor::next()) return false;
00139 read_tag();
00140 tag = current_tag;
00141
00142
00143 if (offset && !is_metainfo_key(current_key)) {
00144 Xapian::docid did;
00145 const char * d = current_key.data();
00146 if (!unpack_uint_last(&d, d + current_key.size(), &did))
00147 abort();
00148 did += offset;
00149 key = pack_uint_last(did);
00150 } else {
00151 key = current_key;
00152 }
00153 return true;
00154 }
00155 };
00156
00157 class PositionCursor : private Bcursor {
00158 Xapian::docid offset;
00159 public:
00160 string key, tag;
00161
00162 PositionCursor(Btree *in, Xapian::docid offset_)
00163 : Bcursor(in), offset(offset_)
00164 {
00165 find_entry("");
00166 next();
00167 }
00168
00169 bool next() {
00170 if (!Bcursor::next()) return false;
00171 read_tag();
00172 tag = current_tag;
00173
00174 if (offset) {
00175
00176 Xapian::docid did;
00177 const char * d = current_key.data();
00178 if (!unpack_uint(&d, d + current_key.size(), &did))
00179 abort();
00180 did += offset;
00181 key = pack_uint(did);
00182 size_t tnameidx = d - current_key.data();
00183 key += current_key.substr(tnameidx);
00184 } else {
00185 key = current_key;
00186 }
00187 return true;
00188 }
00189 };
00190
00191 class CursorGt {
00192 public:
00195 bool operator()(const PostlistCursor *a, const PostlistCursor *b) {
00196 if (a->key > b->key) return true;
00197 if (a->key != b->key) return false;
00198 return (a->firstdid > b->firstdid);
00199 }
00200 bool operator()(const DocIDKeyedCursor *a, const DocIDKeyedCursor *b) {
00201 return (a->key > b->key);
00202 }
00203 bool operator()(const PositionCursor *a, const PositionCursor *b) {
00204 return (a->key > b->key);
00205 }
00206 };
00207
00208 int
00209 main(int argc, char **argv)
00210 {
00211 const struct option long_opts[] = {
00212 {"no-full", no_argument, 0, 'n'},
00213 {"fuller", no_argument, 0, 'F'},
00214 {"blocksize", required_argument, 0, 'b'},
00215 {"help", no_argument, 0, 'h'},
00216 {"version", no_argument, 0, 'v'},
00217 {NULL, 0, 0, 0}
00218 };
00219
00220 bool full_compaction = true;
00221 size_t block_capacity = 0;
00222 size_t block_size = 8192;
00223
00224 int c;
00225 while ((c = gnu_getopt_long(argc, argv, "b:nFhv", long_opts, 0)) != EOF) {
00226 switch (c) {
00227 case 'b': {
00228 char *p;
00229 block_size = strtoul(optarg, &p, 10);
00230 if (block_size <= 64 && (*p == 'K' || *p == 'k')) {
00231 ++p;
00232 block_size *= 1024;
00233 }
00234 if (*p || block_size < 2048 || block_size > 65536 ||
00235 (block_size & (block_size - 1)) != 0) {
00236 cerr << PROG_NAME": Bad value '" << optarg
00237 << "' passed for blocksize, must be a power of 2 between 2K and 64K"
00238 << endl;
00239 exit(1);
00240 }
00241 break;
00242 }
00243 case 'n':
00244 full_compaction = false;
00245 break;
00246 case 'F':
00247 block_capacity = 1;
00248 break;
00249 case 'h':
00250 cout << PROG_NAME" - "PROG_DESC"\n\n";
00251 show_usage();
00252 exit(0);
00253 case 'v':
00254 cout << PROG_NAME" - "PACKAGE_STRING << endl;
00255 exit(0);
00256 default:
00257 show_usage();
00258 exit(1);
00259 }
00260 }
00261
00262 if (argc - optind < 2) {
00263 show_usage();
00264 exit(1);
00265 }
00266
00267
00268 const char *destdir = argv[argc - 1];
00269
00270
00271 for (int i = optind; i < argc - 1; ++i) {
00272 const char *srcdir = argv[i];
00273 if (strcmp(srcdir, destdir) == 0) {
00274 cout << argv[0] << ": destination may not be the same as any source directory" << endl;
00275 exit(1);
00276 }
00277 }
00278
00279
00280 if (mkdir(destdir, 0755) == -1) {
00281
00282
00283
00284 if (errno == EEXIST) {
00285 struct stat sb;
00286 if (stat(destdir, &sb) == 0 && S_ISDIR(sb.st_mode))
00287 errno = 0;
00288 else
00289 errno = EEXIST;
00290 }
00291 if (errno) {
00292 cerr << argv[0] << ": couldn't create directory `"
00293 << destdir << "': " << strerror(errno) << endl;
00294 exit(1);
00295 }
00296 }
00297
00298 quartz_totlen_t tot_totlen = 0;
00299 try {
00300 const char * tables[] = {
00301 "postlist", "record", "termlist", "position", "value"
00302 };
00303
00304 const size_t out_of = argc - 1 - optind;
00305 vector<Xapian::docid> offset(out_of);
00306 Xapian::docid tot_off = 0;
00307 for (size_t i = 0; i < out_of; ++i) {
00308 Xapian::Database db(argv[i + optind]);
00309 Xapian::docid last = db.get_lastdocid();
00310 offset[i] = tot_off;
00311 tot_off += last;
00312
00313 }
00314
00315 for (const char **t = tables;
00316 t != tables + sizeof(tables)/sizeof(tables[0]); ++t) {
00317
00318
00319
00320
00321 cout << *t << " ..." << flush;
00322
00323 string dest = destdir;
00324 dest += '/';
00325 dest += *t;
00326 dest += '_';
00327
00328 Btree out(dest, false);
00329 out.create(block_size);
00330 out.open();
00331 out.set_full_compaction(full_compaction);
00332 if (block_capacity) out.set_max_item_size(block_capacity);
00333
00334
00335
00336 bool bad_stat = false;
00337
00338 off_t in_size = 0;
00339
00340 vector<Btree *> btrees;
00341
00342 if (strcmp(*t, "postlist") == 0) {
00343 priority_queue<PostlistCursor *, vector<PostlistCursor *>,
00344 CursorGt> pq;
00345 for (size_t i = 0; i < out_of; ++i) {
00346 Xapian::docid off = offset[i];
00347 const char *srcdir = argv[i + optind];
00348 string src(srcdir);
00349 src += '/';
00350 src += *t;
00351 src += '_';
00352
00353 Btree *in = new Btree(src, true);
00354 in->open();
00355 if (in->get_entry_count()) {
00356 btrees.push_back(in);
00357 pq.push(new PostlistCursor(in, off));
00358 } else {
00359 delete in;
00360 }
00361
00362 struct stat sb;
00363 if (stat(src + "DB", &sb) == 0)
00364 in_size += sb.st_size / 1024;
00365 else
00366 bad_stat = true;
00367 }
00368
00369 string last_key;
00370
00371 Xapian::termcount tf = 0, cf = 0;
00372 vector<pair<Xapian::docid, string> > tags;
00373 while (true) {
00374 PostlistCursor * bc = NULL;
00375 if (!pq.empty()) {
00376 bc = pq.top();
00377 pq.pop();
00378 }
00379 if (bc == NULL || bc->key != last_key) {
00380 if (!tags.empty()) {
00381 string first_tag = pack_uint(tf);
00382 first_tag += pack_uint(cf);
00383 first_tag += pack_uint(tags[0].first - 1);
00384 string tag = tags[0].second;
00385 tag[0] = (tags.size() == 1) ? '1' : '0';
00386 first_tag += tag;
00387 out.add(last_key, first_tag);
00388 vector<pair<Xapian::docid, string> >::const_iterator i;
00389 i = tags.begin();
00390 while (++i != tags.end()) {
00391 string key = last_key;
00392 key += pack_uint_preserving_sort(i->first);
00393 tag = i->second;
00394 tag[0] = (i + 1 == tags.end()) ? '1' : '0';
00395 out.add(key, tag);
00396 }
00397 }
00398 tags.clear();
00399 if (bc == NULL) break;
00400 tf = cf = 0;
00401 last_key = bc->key;
00402 }
00403 tf += bc->tf;
00404 cf += bc->cf;
00405 tags.push_back(make_pair(bc->firstdid, bc->tag));
00406 if (bc->next()) {
00407 pq.push(bc);
00408 } else {
00409 delete bc;
00410 }
00411 }
00412 } else if (strcmp(*t, "position") == 0) {
00413 priority_queue<PositionCursor *, vector<PositionCursor *>,
00414 CursorGt> pq;
00415 for (size_t i = 0; i < out_of; ++i) {
00416 Xapian::docid off = offset[i];
00417 const char *srcdir = argv[i + optind];
00418 string src(srcdir);
00419 src += '/';
00420 src += *t;
00421 src += '_';
00422
00423 Btree *in = new Btree(src, true);
00424 in->open();
00425 if (in->get_entry_count()) {
00426 btrees.push_back(in);
00427 pq.push(new PositionCursor(in, off));
00428 } else {
00429 delete in;
00430 }
00431
00432 struct stat sb;
00433 if (stat(src + "DB", &sb) == 0)
00434 in_size += sb.st_size / 1024;
00435 else
00436 bad_stat = true;
00437 }
00438
00439 while (!pq.empty()) {
00440 PositionCursor * bc = pq.top();
00441 pq.pop();
00442 out.add(bc->key, bc->tag);
00443 if (bc->next()) {
00444 pq.push(bc);
00445 } else {
00446 delete bc;
00447 }
00448 }
00449 } else {
00450
00451 priority_queue<DocIDKeyedCursor *, vector<DocIDKeyedCursor *>,
00452 CursorGt> pq;
00453 for (size_t i = 0; i < out_of; ++i) {
00454 Xapian::docid off = offset[i];
00455 const char *srcdir = argv[i + optind];
00456 string src(srcdir);
00457 src += '/';
00458 src += *t;
00459 src += '_';
00460
00461 Btree *in = new Btree(src, true);
00462 in->open();
00463 if (in->get_entry_count()) {
00464 btrees.push_back(in);
00465 pq.push(new DocIDKeyedCursor(in, off));
00466 } else {
00467 delete in;
00468 }
00469
00470 struct stat sb;
00471 if (stat(src + "DB", &sb) == 0)
00472 in_size += sb.st_size / 1024;
00473 else
00474 bad_stat = true;
00475 }
00476
00477 if (strcmp(*t, "record") == 0) {
00478
00479
00480
00481 Xapian::docid did;
00482 quartz_totlen_t totlen = 0;
00483 for (int i = pq.size(); i > 0; --i) {
00484 DocIDKeyedCursor * bc = pq.top();
00485 pq.pop();
00486 if (!is_metainfo_key(bc->key)) {
00487 throw Xapian::DatabaseCorruptError("No METAINFO item in record table.");
00488 }
00489 const char * data = bc->tag.data();
00490 const char * end = data + bc->tag.size();
00491 if (!unpack_uint(&data, end, &did)) {
00492 throw Xapian::DatabaseCorruptError("Tag containing meta information is corrupt.");
00493 }
00494 if (!unpack_uint_last(&data, end, &totlen)) {
00495 throw Xapian::DatabaseCorruptError("Tag containing meta information is corrupt.");
00496 }
00497 tot_totlen += totlen;
00498 if (tot_totlen < tot_totlen) {
00499 throw "totlen wrapped!";
00500 }
00501 if (bc->next()) {
00502 pq.push(bc);
00503 } else {
00504 delete bc;
00505 }
00506 }
00507 string tag = pack_uint(tot_off);
00508 tag += pack_uint_last(tot_totlen);
00509 out.add(string("", 1), tag);
00510 }
00511
00512 while (!pq.empty()) {
00513 DocIDKeyedCursor * bc = pq.top();
00514 pq.pop();
00515 out.add(bc->key, bc->tag);
00516 if (bc->next()) {
00517 pq.push(bc);
00518 } else {
00519 delete bc;
00520 }
00521 }
00522 }
00523
00524 for (vector<Btree *>::const_iterator b = btrees.begin();
00525 b != btrees.end(); ++b) {
00526 delete *b;
00527 }
00528 btrees.clear();
00529
00530 out.commit(1);
00531
00532 cout << '\r' << *t << ": ";
00533 struct stat sb;
00534 if (!bad_stat && stat(dest + "DB", &sb) == 0) {
00535 off_t out_size = sb.st_size / 1024;
00536 if (out_size == in_size) {
00537 cout << "Size unchanged (";
00538 } else if (out_size < in_size) {
00539 cout << "Reduced by "
00540 << 100 * double(in_size - out_size) / in_size << "% "
00541 << in_size - out_size << "K (" << in_size << "K -> ";
00542 } else {
00543 cout << "INCREASED by "
00544 << 100 * double(out_size - in_size) / in_size << "% "
00545 << out_size - in_size << "K (" << in_size << "K -> ";
00546 }
00547 cout << out_size << "K)";
00548 } else {
00549 cout << "Done (couldn't stat all the DB files)";
00550 }
00551 cout << endl;
00552 }
00553
00554
00555
00556
00557 string src(argv[optind]);
00558 src += "/meta";
00559 string dest = destdir;
00560 dest += "/meta.tmp";
00561 {
00562 ifstream metain(src.c_str());
00563 ofstream metaout(dest.c_str());
00564 char buf[2048];
00565 while (!metain.eof()) {
00566
00567 metain.read(buf, sizeof(buf));
00568 metaout.write(buf, metain.gcount());
00569 }
00570 }
00571 string meta = destdir;
00572 meta += "/meta";
00573 if (rename(dest.c_str(), meta.c_str()) == -1) {
00574 cerr << argv[0] << ": couldn't rename `" << dest << "' to `"
00575 << meta << "': " << strerror(errno) << endl;
00576 exit(1);
00577 }
00578 } catch (const Xapian::Error &error) {
00579 cerr << argv[0] << ": " << error.get_description() << endl;
00580 exit(1);
00581 }
00582 }