00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include <config.h>
00022
00023 #include "safeerrno.h"
00024
00025 #include <fstream>
00026 #include <iostream>
00027 #include <queue>
00028
00029 #include <stdio.h>
00030 #include <string.h>
00031 #include <sys/types.h>
00032 #include "utils.h"
00033
00034 #include "flint_table.h"
00035 #include "flint_cursor.h"
00036 #include "flint_utils.h"
00037
00038 #include <xapian.h>
00039
00040 #include "gnu_getopt.h"
00041
00042 using namespace std;
00043
00044 #define PROG_NAME "xapian-compact"
00045 #define PROG_DESC "Compact a flint database, or merge and compact several"
00046
00047 #define OPT_HELP 1
00048 #define OPT_VERSION 2
00049 #define OPT_NO_RENUMBER 3
00050
00051 static void show_usage() {
00052 cout << "Usage: "PROG_NAME" [OPTIONS] SOURCE_DATABASE... DESTINATION_DATABASE\n\n"
00053 "Options:\n"
00054 " -b, --blocksize Set the blocksize in bytes (e.g. 4096) or K (e.g. 4K)\n"
00055 " (must be between 2K and 64K and a power of 2, default 8K)\n"
00056 " -n, --no-full Disable full compaction\n"
00057 " -F, --fuller Enable fuller compaction (not recommended if you plan to\n"
00058 " update the compacted database)\n"
00059 " -m, --multipass If merging more than 3 databases, merge the postlists in\n"
00060 " multiple passes (which is generally faster but requires\n"
00061 " more disk space for temporary files)\n"
00062 " --no-renumber Preserve the numbering of document ids (useful if you\n"
00063 " external references to them, or have set the to match\n"
00064 " unique ids from an external source). Currently this\n"
00065 " option isn't supported when merging databases.\n"
00066 " --help display this help and exit\n"
00067 " --version output version information and exit" << endl;
00068 }
00069
/** Test whether @a key is the postlist table's metainfo key.
 *
 *  That key is exactly one byte long and the byte is NUL.
 */
static inline bool
is_metainfo_key(const std::string & key)
{
    if (key.size() != 1) return false;
    return key[0] == '\0';
}
00075
/** Test whether @a key is a user metadata key.
 *
 *  Such keys are at least two bytes long and start with the byte pair
 *  NUL, 0xC0.
 */
static inline bool
is_user_metadata_key(const std::string & key)
{
    if (key.size() <= 1) return false;
    return key[0] == '\0' && key[1] == '\xc0';
}
00081
00082 class PostlistCursor : private FlintCursor {
00083 Xapian::docid offset;
00084
00085 public:
00086 string key, tag;
00087 Xapian::docid firstdid;
00088 Xapian::termcount tf, cf;
00089
00090 PostlistCursor(FlintTable *in, Xapian::docid offset_)
00091 : FlintCursor(in), offset(offset_), firstdid(0)
00092 {
00093 find_entry("");
00094 next();
00095 }
00096
00097 ~PostlistCursor()
00098 {
00099 delete FlintCursor::get_table();
00100 }
00101
00102 bool next() {
00103 if (!FlintCursor::next()) return false;
00104
00105
00106 read_tag();
00107 key = current_key;
00108 tag = current_tag;
00109 tf = cf = 0;
00110 if (is_metainfo_key(key)) return true;
00111 if (is_user_metadata_key(key)) return true;
00112
00113
00114
00115 const char * d = key.data();
00116 const char * e = d + key.size();
00117 string tname;
00118 if (!unpack_string_preserving_sort(&d, e, tname))
00119 throw Xapian::DatabaseCorruptError("Bad postlist key");
00120 if (d == e) {
00121
00122 d = tag.data();
00123 e = d + tag.size();
00124 if (!unpack_uint(&d, e, &tf) ||
00125 !unpack_uint(&d, e, &cf) ||
00126 !unpack_uint(&d, e, &firstdid)) {
00127 throw Xapian::DatabaseCorruptError("Bad postlist tag");
00128 }
00129 ++firstdid;
00130 tag.erase(0, d - tag.data());
00131 } else {
00132
00133 size_t tmp = d - key.data();
00134 if (!unpack_uint_preserving_sort(&d, e, &firstdid) || d != e)
00135 throw Xapian::DatabaseCorruptError("Bad postlist key");
00136 key.erase(tmp);
00137 }
00138 firstdid += offset;
00139 return true;
00140 }
00141 };
00142
00143 class PostlistCursorGt {
00144 public:
00147 bool operator()(const PostlistCursor *a, const PostlistCursor *b) {
00148 if (a->key > b->key) return true;
00149 if (a->key != b->key) return false;
00150 return (a->firstdid > b->firstdid);
00151 }
00152 };
00153
00154 static void
00155 merge_postlists(FlintTable * out, vector<Xapian::docid>::const_iterator offset,
00156 vector<string>::const_iterator b, vector<string>::const_iterator e,
00157 Xapian::docid tot_off)
00158 {
00159 flint_totlen_t tot_totlen = 0;
00160 priority_queue<PostlistCursor *, vector<PostlistCursor *>, PostlistCursorGt> pq;
00161 for ( ; b != e; ++b, ++offset) {
00162 FlintTable *in = new FlintTable(*b, true);
00163 in->open();
00164 if (in->get_entry_count()) {
00165
00166
00167 PostlistCursor * cur = new PostlistCursor(in, *offset);
00168
00169
00170
00171 if (!is_metainfo_key(cur->key)) {
00172 throw Xapian::DatabaseCorruptError("No METAINFO item in postlist table.");
00173 }
00174 const char * data = cur->tag.data();
00175 const char * end = data + cur->tag.size();
00176 Xapian::docid dummy_did = 0;
00177 if (!unpack_uint(&data, end, &dummy_did)) {
00178 throw Xapian::DatabaseCorruptError("Tag containing meta information is corrupt.");
00179 }
00180 flint_totlen_t totlen = 0;
00181 if (!unpack_uint_last(&data, end, &totlen)) {
00182 throw Xapian::DatabaseCorruptError("Tag containing meta information is corrupt.");
00183 }
00184 tot_totlen += totlen;
00185 if (tot_totlen < tot_totlen) {
00186 throw "totlen wrapped!";
00187 }
00188 if (cur->next()) {
00189 pq.push(cur);
00190 } else {
00191 delete cur;
00192 }
00193 } else {
00194 delete in;
00195 }
00196 }
00197
00198 {
00199 string tag = pack_uint(tot_off);
00200 tag += pack_uint_last(tot_totlen);
00201 out->add(string("", 1), tag);
00202 }
00203
00204 string last_key;
00205 {
00206
00207 string last_tag;
00208 while (!pq.empty()) {
00209 PostlistCursor * cur = pq.top();
00210 const string& key = cur->key;
00211 if (!is_user_metadata_key(key)) break;
00212
00213 const string & tag = cur->tag;
00214 if (key == last_key) {
00215 if (tag != last_tag)
00216 cerr << "Warning: duplicate user metadata key with different tag value - picking arbitrary tag value" << endl;
00217 } else {
00218 out->add(key, tag);
00219 last_key = key;
00220 last_tag = tag;
00221 }
00222
00223 pq.pop();
00224 if (cur->next()) {
00225 pq.push(cur);
00226 } else {
00227 delete cur;
00228 }
00229 }
00230 }
00231
00232 Xapian::termcount tf = 0, cf = 0;
00233 vector<pair<Xapian::docid, string> > tags;
00234 while (true) {
00235 PostlistCursor * cur = NULL;
00236 if (!pq.empty()) {
00237 cur = pq.top();
00238 pq.pop();
00239 }
00240 Assert(cur == NULL || !is_user_metadata_key(cur->key));
00241 if (cur == NULL || cur->key != last_key) {
00242 if (!tags.empty()) {
00243 string first_tag = pack_uint(tf);
00244 first_tag += pack_uint(cf);
00245 first_tag += pack_uint(tags[0].first - 1);
00246 string tag = tags[0].second;
00247 tag[0] = (tags.size() == 1) ? '1' : '0';
00248 first_tag += tag;
00249 out->add(last_key, first_tag);
00250 vector<pair<Xapian::docid, string> >::const_iterator i;
00251 i = tags.begin();
00252 while (++i != tags.end()) {
00253 string key = last_key;
00254 key += pack_uint_preserving_sort(i->first);
00255 tag = i->second;
00256 tag[0] = (i + 1 == tags.end()) ? '1' : '0';
00257 out->add(key, tag);
00258 }
00259 }
00260 tags.clear();
00261 if (cur == NULL) break;
00262 tf = cf = 0;
00263 last_key = cur->key;
00264 }
00265 tf += cur->tf;
00266 cf += cur->cf;
00267 tags.push_back(make_pair(cur->firstdid, cur->tag));
00268 if (cur->next()) {
00269 pq.push(cur);
00270 } else {
00271 delete cur;
00272 }
00273 }
00274 }
00275
00276 struct MergeCursor : public FlintCursor {
00277 MergeCursor(FlintTable *in) : FlintCursor(in) {
00278 find_entry("");
00279 next();
00280 }
00281
00282 ~MergeCursor() {
00283 delete FlintCursor::get_table();
00284 }
00285 };
00286
00287 struct CursorGt {
00289 bool operator()(const FlintCursor *a, const FlintCursor *b) {
00290 if (b->after_end()) return false;
00291 if (a->after_end()) return true;
00292 return (a->current_key > b->current_key);
00293 }
00294 };
00295
00296 #define MAGIC_XOR_VALUE 96
00297
00298
00299 class PrefixCompressedStringItor {
00300 const unsigned char * p;
00301 size_t left;
00302 string current;
00303
00304 PrefixCompressedStringItor(const unsigned char * p_, size_t left_,
00305 const string ¤t_)
00306 : p(p_), left(left_), current(current_) { }
00307
00308 public:
00309 PrefixCompressedStringItor(const std::string & s)
00310 : p(reinterpret_cast<const unsigned char *>(s.data())),
00311 left(s.size()) {
00312 if (left) {
00313 operator++();
00314 } else {
00315 p = NULL;
00316 }
00317 }
00318
00319 const string & operator*() const {
00320 return current;
00321 }
00322
00323 PrefixCompressedStringItor operator++(int) {
00324 const unsigned char * old_p = p;
00325 size_t old_left = left;
00326 string old_current = current;
00327 operator++();
00328 return PrefixCompressedStringItor(old_p, old_left, old_current);
00329 }
00330
00331 PrefixCompressedStringItor & operator++() {
00332 if (left == 0) {
00333 p = NULL;
00334 } else {
00335 if (!current.empty()) {
00336 current.resize(*p++ ^ MAGIC_XOR_VALUE);
00337 --left;
00338 }
00339 size_t add;
00340 if (left == 0 || (add = *p ^ MAGIC_XOR_VALUE) >= left)
00341 throw Xapian::DatabaseCorruptError("Bad spelling data (too little left)");
00342 current.append(reinterpret_cast<const char *>(p + 1), add);
00343 p += add + 1;
00344 left -= add + 1;
00345 }
00346 return *this;
00347 }
00348
00349 bool at_end() const {
00350 return p == NULL;
00351 }
00352 };
00353
00354
00355 class PrefixCompressedStringWriter {
00356 string current;
00357 string & out;
00358
00359 public:
00360 PrefixCompressedStringWriter(string & out_) : out(out_) { }
00361
00362 void append(const string & word) {
00363
00364
00365 if (!current.empty()) {
00366 size_t len = min(current.size(), word.size());
00367 size_t i;
00368 for (i = 0; i < len; ++i) {
00369 if (current[i] != word[i]) break;
00370 }
00371 out += char(i ^ MAGIC_XOR_VALUE);
00372 out += char((word.size() - i) ^ MAGIC_XOR_VALUE);
00373 out.append(word.data() + i, word.size() - i);
00374 } else {
00375 out += char(word.size() ^ MAGIC_XOR_VALUE);
00376 out += word;
00377 }
00378 current = word;
00379 }
00380 };
00381
00382 struct PrefixCompressedStringItorGt {
00384 bool operator()(const PrefixCompressedStringItor *a,
00385 const PrefixCompressedStringItor *b) {
00386 return (**a > **b);
00387 }
00388 };
00389
00390 static void
00391 merge_spellings(FlintTable * out,
00392 vector<string>::const_iterator b,
00393 vector<string>::const_iterator e)
00394 {
00395 priority_queue<MergeCursor *, vector<MergeCursor *>, CursorGt> pq;
00396 for ( ; b != e; ++b) {
00397 FlintTable *in = new FlintTable(*b, true, DONT_COMPRESS, true);
00398 in->open();
00399 if (in->get_entry_count()) {
00400
00401
00402 pq.push(new MergeCursor(in));
00403 } else {
00404 delete in;
00405 }
00406 }
00407
00408 while (!pq.empty()) {
00409 MergeCursor * cur = pq.top();
00410 pq.pop();
00411
00412 string key = cur->current_key;
00413 if (pq.empty() || pq.top()->current_key > key) {
00414
00415
00416 bool compressed = cur->read_tag(true);
00417 out->add(key, cur->current_tag, compressed);
00418 if (cur->next()) {
00419 pq.push(cur);
00420 } else {
00421 delete cur;
00422 }
00423 continue;
00424 }
00425
00426
00427 string tag;
00428 if (key[0] != 'W') {
00429
00430
00431 priority_queue<PrefixCompressedStringItor *,
00432 vector<PrefixCompressedStringItor *>,
00433 PrefixCompressedStringItorGt> pqtag;
00434
00435
00436
00437 vector<MergeCursor *> vec;
00438 vec.reserve(pq.size());
00439
00440 while (true) {
00441 cur->read_tag();
00442 pqtag.push(new PrefixCompressedStringItor(cur->current_tag));
00443 vec.push_back(cur);
00444 if (pq.empty() || pq.top()->current_key != key) break;
00445 cur = pq.top();
00446 pq.pop();
00447 }
00448
00449 PrefixCompressedStringWriter wr(tag);
00450 string lastword;
00451 while (!pqtag.empty()) {
00452 PrefixCompressedStringItor * it = pqtag.top();
00453 string word = **it;
00454 if (word != lastword) {
00455 lastword = word;
00456 wr.append(lastword);
00457 }
00458 ++*it;
00459 pqtag.pop();
00460 if (!it->at_end()) {
00461 pqtag.push(it);
00462 } else {
00463 delete it;
00464 }
00465 }
00466
00467 vector<MergeCursor *>::const_iterator i;
00468 for (i = vec.begin(); i != vec.end(); ++i) {
00469 cur = *i;
00470 if (cur->next()) {
00471 pq.push(cur);
00472 } else {
00473 delete cur;
00474 }
00475 }
00476 } else {
00477
00478 Xapian::termcount tot_freq = 0;
00479 while (true) {
00480 cur->read_tag();
00481 Xapian::termcount freq;
00482 const char * p = cur->current_tag.data();
00483 const char * end = p + cur->current_tag.size();
00484 if (!unpack_uint_last(&p, end, &freq) || freq == 0) {
00485 throw Xapian::DatabaseCorruptError("Bad spelling word freq");
00486 }
00487 tot_freq += freq;
00488 if (cur->next()) {
00489 pq.push(cur);
00490 } else {
00491 delete cur;
00492 }
00493 if (pq.empty() || pq.top()->current_key != key) break;
00494 cur = pq.top();
00495 pq.pop();
00496 }
00497 tag = pack_uint_last(tot_freq);
00498 }
00499 out->add(key, tag);
00500 }
00501 }
00502
00503 class ByteLengthPrefixedStringItor {
00504 const unsigned char * p;
00505 size_t left;
00506
00507 ByteLengthPrefixedStringItor(const unsigned char * p_, size_t left_)
00508 : p(p_), left(left_) { }
00509
00510 public:
00511 ByteLengthPrefixedStringItor(const std::string & s)
00512 : p(reinterpret_cast<const unsigned char *>(s.data())),
00513 left(s.size()) { }
00514
00515 string operator*() const {
00516 size_t len = *p ^ MAGIC_XOR_VALUE;
00517 return string(reinterpret_cast<const char *>(p + 1), len);
00518 }
00519
00520 ByteLengthPrefixedStringItor operator++(int) {
00521 const unsigned char * old_p = p;
00522 size_t old_left = left;
00523 operator++();
00524 return ByteLengthPrefixedStringItor(old_p, old_left);
00525 }
00526
00527 ByteLengthPrefixedStringItor & operator++() {
00528 if (!left) {
00529 throw Xapian::DatabaseCorruptError("Bad synonym data (none left)");
00530 }
00531 size_t add = (*p ^ MAGIC_XOR_VALUE) + 1;
00532 if (left < add) {
00533 throw Xapian::DatabaseCorruptError("Bad synonym data (too little left)");
00534 }
00535 p += add;
00536 left -= add;
00537 return *this;
00538 }
00539
00540 bool at_end() const {
00541 return left == 0;
00542 }
00543 };
00544
00545 struct ByteLengthPrefixedStringItorGt {
00547 bool operator()(const ByteLengthPrefixedStringItor *a,
00548 const ByteLengthPrefixedStringItor *b) {
00549 return (**a > **b);
00550 }
00551 };
00552
00553 static void
00554 merge_synonyms(FlintTable * out,
00555 vector<string>::const_iterator b,
00556 vector<string>::const_iterator e)
00557 {
00558 priority_queue<MergeCursor *, vector<MergeCursor *>, CursorGt> pq;
00559 for ( ; b != e; ++b) {
00560 FlintTable *in = new FlintTable(*b, true, DONT_COMPRESS, true);
00561 in->open();
00562 if (in->get_entry_count()) {
00563
00564
00565 pq.push(new MergeCursor(in));
00566 } else {
00567 delete in;
00568 }
00569 }
00570
00571 while (!pq.empty()) {
00572 MergeCursor * cur = pq.top();
00573 pq.pop();
00574
00575 string key = cur->current_key;
00576 if (pq.top()->current_key > key) {
00577
00578
00579 bool compressed = cur->read_tag(true);
00580 out->add(key, cur->current_tag, compressed);
00581 if (cur->next()) {
00582 pq.push(cur);
00583 } else {
00584 delete cur;
00585 }
00586 continue;
00587 }
00588
00589
00590 string tag;
00591
00592
00593
00594 priority_queue<ByteLengthPrefixedStringItor *,
00595 vector<ByteLengthPrefixedStringItor *>,
00596 ByteLengthPrefixedStringItorGt> pqtag;
00597 vector<MergeCursor *> vec;
00598
00599 while (true) {
00600 cur->read_tag();
00601 pqtag.push(new ByteLengthPrefixedStringItor(cur->current_tag));
00602 vec.push_back(cur);
00603 if (pq.empty() || pq.top()->current_key != key) break;
00604 cur = pq.top();
00605 pq.pop();
00606 }
00607
00608 string lastword;
00609 while (!pqtag.empty()) {
00610 ByteLengthPrefixedStringItor * it = pqtag.top();
00611 if (**it != lastword) {
00612 lastword = **it;
00613 tag += byte(lastword.size() ^ MAGIC_XOR_VALUE);
00614 tag += lastword;
00615 }
00616 ++*it;
00617 pqtag.pop();
00618 if (!it->at_end()) {
00619 pqtag.push(it);
00620 } else {
00621 delete it;
00622 }
00623 }
00624
00625 vector<MergeCursor *>::const_iterator i;
00626 for (i = vec.begin(); i != vec.end(); ++i) {
00627 cur = *i;
00628 if (cur->next()) {
00629 pq.push(cur);
00630 } else {
00631 delete cur;
00632 }
00633 }
00634
00635 out->add(key, tag);
00636 }
00637 }
00638
00639 static void
00640 multimerge_postlists(FlintTable * out, const char * tmpdir,
00641 Xapian::docid tot_off,
00642 vector<string> tmp, vector<Xapian::docid> off)
00643 {
00644 unsigned int c = 0;
00645 while (tmp.size() > 3) {
00646 vector<string> tmpout;
00647 tmpout.reserve(tmp.size() / 2);
00648 vector<Xapian::docid> newoff;
00649 newoff.resize(tmp.size() / 2);
00650 for (unsigned int i = 0, j; i < tmp.size(); i = j) {
00651 j = i + 2;
00652 if (j == tmp.size() - 1) ++j;
00653
00654 string dest = tmpdir;
00655 char buf[64];
00656 sprintf(buf, "/tmp%u_%u.", c, i / 2);
00657 dest += buf;
00658
00659
00660
00661 FlintTable tmptab(dest, false);
00662
00663 tmptab.create_and_open(65536);
00664
00665 merge_postlists(&tmptab, off.begin() + i, tmp.begin() + i, tmp.begin() + j, 0);
00666 if (c > 0) {
00667 for (unsigned int k = i; k < j; ++k) {
00668 unlink((tmp[k] + "DB").c_str());
00669 unlink((tmp[k] + "baseA").c_str());
00670 unlink((tmp[k] + "baseB").c_str());
00671 }
00672 }
00673 tmpout.push_back(dest);
00674 tmptab.commit(1);
00675 }
00676 swap(tmp, tmpout);
00677 swap(off, newoff);
00678 ++c;
00679 }
00680 merge_postlists(out, off.begin(), tmp.begin(), tmp.end(), tot_off);
00681 if (c > 0) {
00682 for (size_t k = 0; k < tmp.size(); ++k) {
00683 unlink((tmp[k] + "DB").c_str());
00684 unlink((tmp[k] + "baseA").c_str());
00685 unlink((tmp[k] + "baseB").c_str());
00686 }
00687 }
00688 }
00689
00690 static void
00691 merge_docid_keyed(FlintTable *out, const vector<string> & inputs,
00692 const vector<Xapian::docid> & offset, bool lazy)
00693 {
00694 for (size_t i = 0; i < inputs.size(); ++i) {
00695 Xapian::docid off = offset[i];
00696
00697 FlintTable in(inputs[i], true, DONT_COMPRESS, lazy);
00698 in.open();
00699 if (in.get_entry_count() == 0) continue;
00700
00701 FlintCursor cur(&in);
00702 cur.find_entry("");
00703
00704 string key;
00705 while (cur.next()) {
00706
00707 if (off) {
00708 Xapian::docid did;
00709 const char * d = cur.current_key.data();
00710 const char * e = d + cur.current_key.size();
00711 if (!unpack_uint_preserving_sort(&d, e, &did)) {
00712 string msg = "Bad key in ";
00713 msg += inputs[i];
00714 throw Xapian::DatabaseCorruptError(msg);
00715 }
00716 did += off;
00717 key = pack_uint_preserving_sort(did);
00718 if (d != e) {
00719
00720 key.append(d, e - d);
00721 }
00722 } else {
00723 key = cur.current_key;
00724 }
00725 bool compressed = cur.read_tag(true);
00726 out->add(key, cur.current_tag, compressed);
00727 }
00728 }
00729 }
00730
00731 int
00732 main(int argc, char **argv)
00733 {
00734 const char * opts = "b:nFm";
00735 const struct option long_opts[] = {
00736 {"fuller", no_argument, 0, 'F'},
00737 {"no-full", no_argument, 0, 'n'},
00738 {"multipass", no_argument, 0, 'm'},
00739 {"blocksize", required_argument, 0, 'b'},
00740 {"no-renumber", no_argument, 0, OPT_NO_RENUMBER},
00741 {"help", no_argument, 0, OPT_HELP},
00742 {"version", no_argument, 0, OPT_VERSION},
00743 {NULL, 0, 0, 0}
00744 };
00745
00746 enum { STANDARD, FULL, FULLER } compaction = FULL;
00747 size_t block_size = 8192;
00748 bool multipass = false;
00749 bool renumber = true;
00750
00751 int c;
00752 while ((c = gnu_getopt_long(argc, argv, opts, long_opts, 0)) != -1) {
00753 switch (c) {
00754 case 'b': {
00755 char *p;
00756 block_size = strtoul(optarg, &p, 10);
00757 if (block_size <= 64 && (*p == 'K' || *p == 'k')) {
00758 ++p;
00759 block_size *= 1024;
00760 }
00761 if (*p || block_size < 2048 || block_size > 65536 ||
00762 (block_size & (block_size - 1)) != 0) {
00763 cerr << PROG_NAME": Bad value '" << optarg
00764 << "' passed for blocksize, must be a power of 2 between 2K and 64K"
00765 << endl;
00766 exit(1);
00767 }
00768 break;
00769 }
00770 case 'n':
00771 compaction = STANDARD;
00772 break;
00773 case 'F':
00774 compaction = FULLER;
00775 break;
00776 case 'm':
00777 multipass = true;
00778 break;
00779 case OPT_NO_RENUMBER:
00780 renumber = false;
00781 break;
00782 case OPT_HELP:
00783 cout << PROG_NAME" - "PROG_DESC"\n\n";
00784 show_usage();
00785 exit(0);
00786 case OPT_VERSION:
00787 cout << PROG_NAME" - "PACKAGE_STRING << endl;
00788 exit(0);
00789 default:
00790 show_usage();
00791 exit(1);
00792 }
00793 }
00794
00795 if (argc - optind < 2) {
00796 show_usage();
00797 exit(1);
00798 }
00799
00800 if (!renumber && argc - optind > 2) {
00801 cout << argv[0]
00802 << ": --no-renumber isn't currently supported when merging databases."
00803 << endl;
00804 exit(1);
00805 }
00806
00807
00808 const char *destdir = argv[argc - 1];
00809
00810 try {
00811 vector<string> sources;
00812 vector<Xapian::docid> offset;
00813 sources.reserve(argc - 1 - optind);
00814 offset.reserve(argc - 1 - optind);
00815 Xapian::docid tot_off = 0;
00816 for (int i = optind; i < argc - 1; ++i) {
00817 const char *srcdir = argv[i];
00818
00819 if (strcmp(srcdir, destdir) == 0) {
00820 cout << argv[0]
00821 << ": destination may not be the same as any source directory."
00822 << endl;
00823 exit(1);
00824 }
00825
00826 struct stat sb;
00827 if (stat(string(srcdir) + "/iamflint", &sb) != 0) {
00828 cout << argv[0] << ": '" << srcdir
00829 << "' is not a flint database directory" << endl;
00830 exit(1);
00831 }
00832
00833 Xapian::Database db(srcdir);
00834 Xapian::docid last = 0;
00835
00836
00837
00838 if (db.get_doccount() != 0) {
00839 last = db.get_lastdocid();
00840
00841 if (renumber) {
00842
00843
00844 Xapian::PostingIterator it = db.postlist_begin("");
00845
00846
00847 if (it != db.postlist_end("")) {
00848
00849
00850 tot_off -= (*it - 1);
00851 }
00852
00853
00854
00855
00856 }
00857 }
00858 offset.push_back(tot_off);
00859 tot_off += last;
00860
00861 sources.push_back(string(srcdir) + '/');
00862 }
00863
00864
00865 if (mkdir(destdir, 0755) < 0) {
00866
00867
00868
00869 if (errno == EEXIST) {
00870 struct stat sb;
00871 if (stat(destdir, &sb) == 0 && S_ISDIR(sb.st_mode))
00872 errno = 0;
00873 else
00874 errno = EEXIST;
00875 }
00876 if (errno) {
00877 cerr << argv[0] << ": cannot create directory '"
00878 << destdir << "': " << strerror(errno) << endl;
00879 exit(1);
00880 }
00881 }
00882
00883 enum table_type {
00884 POSTLIST, RECORD, TERMLIST, POSITION, VALUE, SPELLING, SYNONYM
00885 };
00886 struct table_list {
00887
00888 const char * name;
00889
00890 table_type type;
00891
00892 int compress_strategy;
00893
00894 bool lazy;
00895 };
00896
00897 static const table_list tables[] = {
00898
00899 { "postlist", POSTLIST, DONT_COMPRESS, false },
00900 { "record", RECORD, Z_DEFAULT_STRATEGY, false },
00901 { "termlist", TERMLIST, Z_DEFAULT_STRATEGY, false },
00902 { "position", POSITION, DONT_COMPRESS, true },
00903 { "value", VALUE, DONT_COMPRESS, true },
00904 { "spelling", SPELLING, Z_DEFAULT_STRATEGY, true },
00905 { "synonym", SYNONYM, Z_DEFAULT_STRATEGY, true }
00906 };
00907 const table_list * tables_end = tables +
00908 (sizeof(tables) / sizeof(tables[0]));
00909
00910 for (const table_list * t = tables; t < tables_end; ++t) {
00911
00912
00913
00914
00915 cout << t->name << " ..." << flush;
00916
00917 string dest = destdir;
00918 dest += '/';
00919 dest += t->name;
00920 dest += '.';
00921
00922 FlintTable out(dest, false, t->compress_strategy, t->lazy);
00923 if (!t->lazy) {
00924 out.create_and_open(block_size);
00925 } else {
00926 out.erase();
00927 out.set_block_size(block_size);
00928 }
00929
00930 out.set_full_compaction(compaction != STANDARD);
00931 if (compaction == FULLER) out.set_max_item_size(1);
00932
00933
00934
00935 bool bad_stat = false;
00936
00937 off_t in_size = 0;
00938
00939 vector<string> inputs;
00940 inputs.reserve(sources.size());
00941 for (vector<string>::const_iterator src = sources.begin();
00942 src != sources.end(); ++src) {
00943 string s(*src);
00944 s += t->name;
00945 s += '.';
00946
00947 struct stat sb;
00948 if (stat(s + "DB", &sb) == 0) {
00949 in_size += sb.st_size / 1024;
00950 } else {
00951
00952 bad_stat = (errno != ENOENT);
00953 }
00954 inputs.push_back(s);
00955 }
00956
00957 if (inputs.empty()) continue;
00958
00959 switch (t->type) {
00960 case POSTLIST:
00961 if (multipass && inputs.size() > 3) {
00962 multimerge_postlists(&out, destdir, tot_off,
00963 inputs, offset);
00964 } else {
00965 merge_postlists(&out, offset.begin(),
00966 inputs.begin(), inputs.end(),
00967 tot_off);
00968 }
00969 break;
00970 case SPELLING:
00971 merge_spellings(&out, inputs.begin(), inputs.end());
00972 break;
00973 case SYNONYM:
00974 merge_synonyms(&out, inputs.begin(), inputs.end());
00975 break;
00976 default:
00977
00978 merge_docid_keyed(&out, inputs, offset, t->lazy);
00979 break;
00980 }
00981
00982
00983 out.commit(1);
00984
00985 cout << '\r' << t->name << ": ";
00986 off_t out_size = 0;
00987 if (!bad_stat) {
00988 struct stat sb;
00989 if (stat(dest + "DB", &sb) == 0) {
00990 out_size = sb.st_size / 1024;
00991 } else {
00992 bad_stat = (errno != ENOENT);
00993 }
00994 }
00995 if (bad_stat) {
00996 cout << "Done (couldn't stat all the DB files)";
00997 } else {
00998 if (out_size == in_size) {
00999 cout << "Size unchanged (";
01000 } else if (out_size < in_size) {
01001 cout << "Reduced by "
01002 << 100 * double(in_size - out_size) / in_size << "% "
01003 << in_size - out_size << "K (" << in_size << "K -> ";
01004 } else {
01005 cout << "INCREASED by "
01006 << 100 * double(out_size - in_size) / in_size << "% "
01007 << out_size - in_size << "K (" << in_size << "K -> ";
01008 }
01009 cout << out_size << "K)";
01010 }
01011 cout << endl;
01012 }
01013
01014
01015
01016
01017 string dest = destdir;
01018 dest += "/iamflint.tmp";
01019
01020 string src(argv[optind]);
01021 src += "/iamflint";
01022
01023 ifstream input(src.c_str());
01024 char buf[1024];
01025 input.read(buf, sizeof(buf));
01026 if (!input.eof()) {
01027 if (!input) {
01028 cerr << argv[0] << ": error reading '" << src << "': "
01029 << strerror(errno) << endl;
01030 exit(1);
01031 }
01032
01033 cerr << argv[0] << ": version file '" << src << "' too large!"
01034 << endl;
01035 exit(1);
01036 }
01037 ofstream output(dest.c_str());
01038 if (!output.write(buf, input.gcount())) {
01039 cerr << argv[0] << ": error writing '" << dest << "': "
01040 << strerror(errno) << endl;
01041 exit(1);
01042 }
01043 output.close();
01044
01045 string version = destdir;
01046 version += "/iamflint";
01047 if (rename(dest.c_str(), version.c_str()) == -1) {
01048 cerr << argv[0] << ": cannot rename '" << dest << "' to '"
01049 << version << "': " << strerror(errno) << endl;
01050 exit(1);
01051 }
01052 } catch (const Xapian::Error &error) {
01053 cerr << argv[0] << ": " << error.get_description() << endl;
01054 exit(1);
01055 }
01056 }