Xapian: Internal Source Documentation: xapian-core: bin/xapian-compact.cc Source File

00001 /* xapian-compact.cc: Compact a flint database, or merge and compact several.
00002  *
00003  * Copyright (C) 2004,2005,2006,2007,2008 Olly Betts
00004  *
00005  * This program is free software; you can redistribute it and/or
00006  * modify it under the terms of the GNU General Public License as
00007  * published by the Free Software Foundation; either version 2 of the
00008  * License, or (at your option) any later version.
00009  *
00010  * This program is distributed in the hope that it will be useful,
00011  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00012  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013  * GNU General Public License for more details.
00014  *
00015  * You should have received a copy of the GNU General Public License
00016  * along with this program; if not, write to the Free Software
00017  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
00018  * USA
00019  */
00020 
00021 #include <config.h>
00022 
00023 #include "safeerrno.h"
00024 
00025 #include <fstream>
00026 #include <iostream>
00027 #include <queue>
00028 
00029 #include <stdio.h> // for rename()
00030 #include <string.h>
00031 #include <sys/types.h>
00032 #include "utils.h"
00033 
00034 #include "flint_table.h"
00035 #include "flint_cursor.h"
00036 #include "flint_utils.h"
00037 
00038 #include <xapian.h>
00039 
00040 #include "gnu_getopt.h"
00041 
00042 using namespace std;
00043 
00044 #define PROG_NAME "xapian-compact"
00045 #define PROG_DESC "Compact a flint database, or merge and compact several"
00046 
00047 #define OPT_HELP 1
00048 #define OPT_VERSION 2
00049 #define OPT_NO_RENUMBER 3
00050 
00051 static void show_usage() {
00052     cout << "Usage: "PROG_NAME" [OPTIONS] SOURCE_DATABASE... DESTINATION_DATABASE\n\n"
00053 "Options:\n"
00054 "  -b, --blocksize   Set the blocksize in bytes (e.g. 4096) or K (e.g. 4K)\n"
00055 "                    (must be between 2K and 64K and a power of 2, default 8K)\n"
00056 "  -n, --no-full     Disable full compaction\n"
00057 "  -F, --fuller      Enable fuller compaction (not recommended if you plan to\n"
00058 "                    update the compacted database)\n"
00059 "  -m, --multipass   If merging more than 3 databases, merge the postlists in\n"
00060 "                    multiple passes (which is generally faster but requires\n"
00061 "                    more disk space for temporary files)\n"
00062 "      --no-renumber Preserve the numbering of document ids (useful if you\n"
00063 "                    external references to them, or have set the to match\n"
00064 "                    unique ids from an external source).  Currently this\n"
00065 "                    option isn't supported when merging databases.\n"
00066 "  --help            display this help and exit\n"
00067 "  --version         output version information and exit" << endl;
00068 }
00069 
/// Return true if and only if key consists of exactly one zero byte.
static inline bool
is_metainfo_key(const std::string & key)
{
    if (key.size() != 1) return false;
    return key[0] == '\0';
}
00075 
/// Return true if and only if key has at least two bytes and starts with
/// a zero byte followed by the byte 0xc0.
static inline bool
is_user_metadata_key(const std::string & key)
{
    if (key.size() < 2) return false;
    if (key[0] != '\0') return false;
    return key[1] == '\xc0';
}
00081 
00082 class PostlistCursor : private FlintCursor {
00083     Xapian::docid offset;
00084 
00085   public:
00086     string key, tag;
00087     Xapian::docid firstdid;
00088     Xapian::termcount tf, cf;
00089 
00090     PostlistCursor(FlintTable *in, Xapian::docid offset_)
00091         : FlintCursor(in), offset(offset_), firstdid(0)
00092     {
00093         find_entry("");
00094         next();
00095     }
00096 
00097     ~PostlistCursor()
00098     {
00099         delete FlintCursor::get_table();
00100     }
00101 
00102     bool next() {
00103         if (!FlintCursor::next()) return false;
00104         // We put all chunks into the non-initial chunk form here, then fix up
00105         // the first chunk for each term in the merged database as we merge.
00106         read_tag();
00107         key = current_key;
00108         tag = current_tag;
00109         tf = cf = 0;
00110         if (is_metainfo_key(key)) return true;
00111         if (is_user_metadata_key(key)) return true;
00112         // Adjust key if this is *NOT* an initial chunk.
00113         // key is: pack_string_preserving_sort(tname)
00114         // plus optionally: pack_uint_preserving_sort(did)
00115         const char * d = key.data();
00116         const char * e = d + key.size();
00117         string tname;
00118         if (!unpack_string_preserving_sort(&d, e, tname))
00119             throw Xapian::DatabaseCorruptError("Bad postlist key");
00120         if (d == e) {
00121             // This is an initial chunk for a term, so adjust tag header.
00122             d = tag.data();
00123             e = d + tag.size();
00124             if (!unpack_uint(&d, e, &tf) ||
00125                 !unpack_uint(&d, e, &cf) ||
00126                 !unpack_uint(&d, e, &firstdid)) {
00127                 throw Xapian::DatabaseCorruptError("Bad postlist tag");
00128             }
00129             ++firstdid;
00130             tag.erase(0, d - tag.data());
00131         } else {
00132             // Not an initial chunk, so adjust key.
00133             size_t tmp = d - key.data();
00134             if (!unpack_uint_preserving_sort(&d, e, &firstdid) || d != e)
00135                 throw Xapian::DatabaseCorruptError("Bad postlist key");
00136             key.erase(tmp);
00137         }
00138         firstdid += offset;
00139         return true;
00140     }
00141 };
00142 
00143 class PostlistCursorGt {
00144   public:
00147     bool operator()(const PostlistCursor *a, const PostlistCursor *b) {
00148         if (a->key > b->key) return true;
00149         if (a->key != b->key) return false;
00150         return (a->firstdid > b->firstdid);
00151     }
00152 };
00153 
00154 static void
00155 merge_postlists(FlintTable * out, vector<Xapian::docid>::const_iterator offset,
00156                 vector<string>::const_iterator b, vector<string>::const_iterator e,
00157                 Xapian::docid tot_off)
00158 {
00159     flint_totlen_t tot_totlen = 0;
00160     priority_queue<PostlistCursor *, vector<PostlistCursor *>, PostlistCursorGt> pq;
00161     for ( ; b != e; ++b, ++offset) {
00162         FlintTable *in = new FlintTable(*b, true);
00163         in->open();
00164         if (in->get_entry_count()) {
00165             // PostlistCursor takes ownership of FlintTable in and is
00166             // responsible for deleting it.
00167             PostlistCursor * cur = new PostlistCursor(in, *offset);
00168             // Merge the METAINFO tags from each database into one.
00169             // They have a key consisting of a single zero byte, which will
00170             // always be the first key.
00171             if (!is_metainfo_key(cur->key)) {
00172                 throw Xapian::DatabaseCorruptError("No METAINFO item in postlist table.");
00173             }
00174             const char * data = cur->tag.data();
00175             const char * end = data + cur->tag.size();
00176             Xapian::docid dummy_did = 0;
00177             if (!unpack_uint(&data, end, &dummy_did)) {
00178                 throw Xapian::DatabaseCorruptError("Tag containing meta information is corrupt.");
00179             }
00180             flint_totlen_t totlen = 0;
00181             if (!unpack_uint_last(&data, end, &totlen)) {
00182                 throw Xapian::DatabaseCorruptError("Tag containing meta information is corrupt.");
00183             }
00184             tot_totlen += totlen;
00185             if (tot_totlen < tot_totlen) {
00186                 throw "totlen wrapped!";
00187             }
00188             if (cur->next()) {
00189                 pq.push(cur);
00190             } else {
00191                 delete cur;
00192             }
00193         } else {
00194             delete in;
00195         }
00196     }
00197 
00198     {
00199         string tag = pack_uint(tot_off);
00200         tag += pack_uint_last(tot_totlen);
00201         out->add(string("", 1), tag);
00202     }
00203 
00204     string last_key;
00205     {
00206         // Merge user metadata.
00207         string last_tag;
00208         while (!pq.empty()) {
00209             PostlistCursor * cur = pq.top();
00210             const string& key = cur->key;
00211             if (!is_user_metadata_key(key)) break;
00212 
00213             const string & tag = cur->tag;
00214             if (key == last_key) {
00215                 if (tag != last_tag)
00216                     cerr << "Warning: duplicate user metadata key with different tag value - picking arbitrary tag value" << endl;
00217             } else {
00218                 out->add(key, tag);
00219                 last_key = key;
00220                 last_tag = tag;
00221             }
00222 
00223             pq.pop();
00224             if (cur->next()) {
00225                 pq.push(cur);
00226             } else {
00227                 delete cur;
00228             }
00229         }
00230     }
00231 
00232     Xapian::termcount tf = 0, cf = 0; // Initialise to avoid warnings.
00233     vector<pair<Xapian::docid, string> > tags;
00234     while (true) {
00235         PostlistCursor * cur = NULL;
00236         if (!pq.empty()) {
00237             cur = pq.top();
00238             pq.pop();
00239         }
00240         Assert(cur == NULL || !is_user_metadata_key(cur->key));
00241         if (cur == NULL || cur->key != last_key) {
00242             if (!tags.empty()) {
00243                 string first_tag = pack_uint(tf);
00244                 first_tag += pack_uint(cf);
00245                 first_tag += pack_uint(tags[0].first - 1);
00246                 string tag = tags[0].second;
00247                 tag[0] = (tags.size() == 1) ? '1' : '0';
00248                 first_tag += tag;
00249                 out->add(last_key, first_tag);
00250                 vector<pair<Xapian::docid, string> >::const_iterator i;
00251                 i = tags.begin();
00252                 while (++i != tags.end()) {
00253                     string key = last_key;
00254                     key += pack_uint_preserving_sort(i->first);
00255                     tag = i->second;
00256                     tag[0] = (i + 1 == tags.end()) ? '1' : '0';
00257                     out->add(key, tag);
00258                 }
00259             }
00260             tags.clear();
00261             if (cur == NULL) break;
00262             tf = cf = 0;
00263             last_key = cur->key;
00264         }
00265         tf += cur->tf;
00266         cf += cur->cf;
00267         tags.push_back(make_pair(cur->firstdid, cur->tag));
00268         if (cur->next()) {
00269             pq.push(cur);
00270         } else {
00271             delete cur;
00272         }
00273     }
00274 }
00275 
00276 struct MergeCursor : public FlintCursor {
00277     MergeCursor(FlintTable *in) : FlintCursor(in) {
00278         find_entry("");
00279         next();
00280     }
00281 
00282     ~MergeCursor() {
00283         delete FlintCursor::get_table();
00284     }
00285 };
00286 
00287 struct CursorGt {
00289     bool operator()(const FlintCursor *a, const FlintCursor *b) {
00290         if (b->after_end()) return false;
00291         if (a->after_end()) return true;
00292         return (a->current_key > b->current_key);
00293     }
00294 };
00295 
00296 #define MAGIC_XOR_VALUE 96
00297 
00298 // FIXME: copied from backends/flint/flint_spelling.cc.
00299 class PrefixCompressedStringItor {
00300     const unsigned char * p;
00301     size_t left;
00302     string current;
00303 
00304     PrefixCompressedStringItor(const unsigned char * p_, size_t left_,
00305                                const string &current_)
00306         : p(p_), left(left_), current(current_) { }
00307 
00308   public:
00309     PrefixCompressedStringItor(const std::string & s)
00310         : p(reinterpret_cast<const unsigned char *>(s.data())),
00311           left(s.size()) {
00312         if (left) {
00313             operator++();
00314         } else {
00315             p = NULL;
00316         }
00317     }
00318 
00319     const string & operator*() const {
00320         return current;
00321     }
00322 
00323     PrefixCompressedStringItor operator++(int) {
00324         const unsigned char * old_p = p;
00325         size_t old_left = left;
00326         string old_current = current;
00327         operator++();
00328         return PrefixCompressedStringItor(old_p, old_left, old_current);
00329     }
00330 
00331     PrefixCompressedStringItor & operator++() {
00332         if (left == 0) {
00333             p = NULL;
00334         } else {
00335             if (!current.empty()) {
00336                 current.resize(*p++ ^ MAGIC_XOR_VALUE);
00337                 --left;
00338             }
00339             size_t add;
00340             if (left == 0 || (add = *p ^ MAGIC_XOR_VALUE) >= left)
00341                 throw Xapian::DatabaseCorruptError("Bad spelling data (too little left)");
00342             current.append(reinterpret_cast<const char *>(p + 1), add);
00343             p += add + 1;
00344             left -= add + 1;
00345         }
00346         return *this;
00347     }
00348 
00349     bool at_end() const {
00350         return p == NULL;
00351     }
00352 };
00353 
00354 // FIXME: copied from backends/flint/flint_spelling.cc.
00355 class PrefixCompressedStringWriter {
00356     string current;
00357     string & out;
00358 
00359   public:
00360     PrefixCompressedStringWriter(string & out_) : out(out_) { }
00361 
00362     void append(const string & word) {
00363         // If this isn't the first entry, see how much of the previous one
00364         // we can reuse.
00365         if (!current.empty()) {
00366             size_t len = min(current.size(), word.size());
00367             size_t i;
00368             for (i = 0; i < len; ++i) {
00369                 if (current[i] != word[i]) break;
00370             }
00371             out += char(i ^ MAGIC_XOR_VALUE);
00372             out += char((word.size() - i) ^ MAGIC_XOR_VALUE);
00373             out.append(word.data() + i, word.size() - i);
00374         } else {
00375             out += char(word.size() ^ MAGIC_XOR_VALUE);
00376             out += word;
00377         }
00378         current = word;
00379     }
00380 };
00381 
00382 struct PrefixCompressedStringItorGt {
00384     bool operator()(const PrefixCompressedStringItor *a,
00385                     const PrefixCompressedStringItor *b) {
00386         return (**a > **b);
00387     }
00388 };
00389 
00390 static void
00391 merge_spellings(FlintTable * out,
00392                 vector<string>::const_iterator b,
00393                 vector<string>::const_iterator e)
00394 {
00395     priority_queue<MergeCursor *, vector<MergeCursor *>, CursorGt> pq;
00396     for ( ; b != e; ++b) {
00397         FlintTable *in = new FlintTable(*b, true, DONT_COMPRESS, true);
00398         in->open();
00399         if (in->get_entry_count()) {
00400             // The MergeCursor takes ownership of FlintTable in and is
00401             // responsible for deleting it.
00402             pq.push(new MergeCursor(in));
00403         } else {
00404             delete in;
00405         }
00406     }
00407 
00408     while (!pq.empty()) {
00409         MergeCursor * cur = pq.top();
00410         pq.pop();
00411 
00412         string key = cur->current_key;
00413         if (pq.empty() || pq.top()->current_key > key) {
00414             // No need to merge the tags, just copy the (possibly compressed)
00415             // tag value.
00416             bool compressed = cur->read_tag(true);
00417             out->add(key, cur->current_tag, compressed);
00418             if (cur->next()) {
00419                 pq.push(cur);
00420             } else {
00421                 delete cur;
00422             }
00423             continue;
00424         }
00425 
00426         // Merge tag values with the same key:
00427         string tag;
00428         if (key[0] != 'W') {
00429             // We just want the union of words, so copy over the first instance
00430             // and skip any identical ones.
00431             priority_queue<PrefixCompressedStringItor *,
00432                            vector<PrefixCompressedStringItor *>,
00433                            PrefixCompressedStringItorGt> pqtag;
00434             // Stick all the MergeCursor pointers in a vector because their
00435             // current_tag members must remain valid while we're merging their
00436             // tags, but we need to call next() on them all afterwards.
00437             vector<MergeCursor *> vec;
00438             vec.reserve(pq.size());
00439 
00440             while (true) {
00441                 cur->read_tag();
00442                 pqtag.push(new PrefixCompressedStringItor(cur->current_tag));
00443                 vec.push_back(cur);
00444                 if (pq.empty() || pq.top()->current_key != key) break;
00445                 cur = pq.top();
00446                 pq.pop();
00447             }
00448 
00449             PrefixCompressedStringWriter wr(tag);
00450             string lastword;
00451             while (!pqtag.empty()) {
00452                 PrefixCompressedStringItor * it = pqtag.top();
00453                 string word = **it;
00454                 if (word != lastword) {
00455                     lastword = word;
00456                     wr.append(lastword);
00457                 }
00458                 ++*it;
00459                 pqtag.pop();
00460                 if (!it->at_end()) {
00461                     pqtag.push(it);
00462                 } else {
00463                     delete it;
00464                 }
00465             }
00466 
00467             vector<MergeCursor *>::const_iterator i;
00468             for (i = vec.begin(); i != vec.end(); ++i) {
00469                 cur = *i;
00470                 if (cur->next()) {
00471                     pq.push(cur);
00472                 } else {
00473                     delete cur;
00474                 }
00475             }
00476         } else {
00477             // We want to sum the frequencies from tags for the same key.
00478             Xapian::termcount tot_freq = 0;
00479             while (true) {
00480                 cur->read_tag();
00481                 Xapian::termcount freq;
00482                 const char * p = cur->current_tag.data();
00483                 const char * end = p + cur->current_tag.size();
00484                 if (!unpack_uint_last(&p, end, &freq) || freq == 0) {
00485                     throw Xapian::DatabaseCorruptError("Bad spelling word freq");
00486                 }
00487                 tot_freq += freq;
00488                 if (cur->next()) {
00489                     pq.push(cur);
00490                 } else {
00491                     delete cur;
00492                 }
00493                 if (pq.empty() || pq.top()->current_key != key) break;
00494                 cur = pq.top();
00495                 pq.pop();
00496             }
00497             tag = pack_uint_last(tot_freq);
00498         }
00499         out->add(key, tag);
00500     }
00501 }
00502 
00503 class ByteLengthPrefixedStringItor {
00504     const unsigned char * p;
00505     size_t left;
00506 
00507     ByteLengthPrefixedStringItor(const unsigned char * p_, size_t left_)
00508         : p(p_), left(left_) { }
00509 
00510   public:
00511     ByteLengthPrefixedStringItor(const std::string & s)
00512         : p(reinterpret_cast<const unsigned char *>(s.data())),
00513           left(s.size()) { }
00514 
00515     string operator*() const {
00516         size_t len = *p ^ MAGIC_XOR_VALUE;
00517         return string(reinterpret_cast<const char *>(p + 1), len);
00518     }
00519 
00520     ByteLengthPrefixedStringItor operator++(int) {
00521         const unsigned char * old_p = p;
00522         size_t old_left = left;
00523         operator++();
00524         return ByteLengthPrefixedStringItor(old_p, old_left);
00525     }
00526 
00527     ByteLengthPrefixedStringItor & operator++() {
00528         if (!left) {
00529             throw Xapian::DatabaseCorruptError("Bad synonym data (none left)");
00530         }
00531         size_t add = (*p ^ MAGIC_XOR_VALUE) + 1;
00532         if (left < add) {
00533             throw Xapian::DatabaseCorruptError("Bad synonym data (too little left)");
00534         }
00535         p += add;
00536         left -= add;
00537         return *this;
00538     }
00539 
00540     bool at_end() const {
00541         return left == 0;
00542     }
00543 };
00544 
00545 struct ByteLengthPrefixedStringItorGt {
00547     bool operator()(const ByteLengthPrefixedStringItor *a,
00548                     const ByteLengthPrefixedStringItor *b) {
00549         return (**a > **b);
00550     }
00551 };
00552 
00553 static void
00554 merge_synonyms(FlintTable * out,
00555                vector<string>::const_iterator b,
00556                vector<string>::const_iterator e)
00557 {
00558     priority_queue<MergeCursor *, vector<MergeCursor *>, CursorGt> pq;
00559     for ( ; b != e; ++b) {
00560         FlintTable *in = new FlintTable(*b, true, DONT_COMPRESS, true);
00561         in->open();
00562         if (in->get_entry_count()) {
00563             // The MergeCursor takes ownership of FlintTable in and is
00564             // responsible for deleting it.
00565             pq.push(new MergeCursor(in));
00566         } else {
00567             delete in;
00568         }
00569     }
00570 
00571     while (!pq.empty()) {
00572         MergeCursor * cur = pq.top();
00573         pq.pop();
00574 
00575         string key = cur->current_key;
00576         if (pq.top()->current_key > key) {
00577             // No need to merge the tags, just copy the (possibly compressed)
00578             // tag value.
00579             bool compressed = cur->read_tag(true);
00580             out->add(key, cur->current_tag, compressed);
00581             if (cur->next()) {
00582                 pq.push(cur);
00583             } else {
00584                 delete cur;
00585             }
00586             continue;
00587         }
00588 
00589         // Merge tag values with the same key:
00590         string tag;
00591 
00592         // We just want the union of words, so copy over the first instance
00593         // and skip any identical ones.
00594         priority_queue<ByteLengthPrefixedStringItor *,
00595                        vector<ByteLengthPrefixedStringItor *>,
00596                        ByteLengthPrefixedStringItorGt> pqtag;
00597         vector<MergeCursor *> vec;
00598 
00599         while (true) {
00600             cur->read_tag();
00601             pqtag.push(new ByteLengthPrefixedStringItor(cur->current_tag));
00602             vec.push_back(cur);
00603             if (pq.empty() || pq.top()->current_key != key) break;
00604             cur = pq.top();
00605             pq.pop();
00606         }
00607 
00608         string lastword;
00609         while (!pqtag.empty()) {
00610             ByteLengthPrefixedStringItor * it = pqtag.top();
00611             if (**it != lastword) {
00612                 lastword = **it;
00613                 tag += byte(lastword.size() ^ MAGIC_XOR_VALUE);
00614                 tag += lastword;
00615             }
00616             ++*it;
00617             pqtag.pop();
00618             if (!it->at_end()) {
00619                 pqtag.push(it);
00620             } else {
00621                 delete it;
00622             }
00623         }
00624 
00625         vector<MergeCursor *>::const_iterator i;
00626         for (i = vec.begin(); i != vec.end(); ++i) {
00627             cur = *i;
00628             if (cur->next()) {
00629                 pq.push(cur);
00630             } else {
00631                 delete cur;
00632             }
00633         }
00634 
00635         out->add(key, tag);
00636     }
00637 }
00638 
00639 static void
00640 multimerge_postlists(FlintTable * out, const char * tmpdir,
00641                      Xapian::docid tot_off,
00642                      vector<string> tmp, vector<Xapian::docid> off)
00643 {
00644     unsigned int c = 0;
00645     while (tmp.size() > 3) {
00646         vector<string> tmpout;
00647         tmpout.reserve(tmp.size() / 2);
00648         vector<Xapian::docid> newoff;
00649         newoff.resize(tmp.size() / 2);
00650         for (unsigned int i = 0, j; i < tmp.size(); i = j) {
00651             j = i + 2;
00652             if (j == tmp.size() - 1) ++j;
00653 
00654             string dest = tmpdir;
00655             char buf[64];
00656             sprintf(buf, "/tmp%u_%u.", c, i / 2);
00657             dest += buf;
00658 
00659             // Don't compress temporary tables, even if the final table would
00660             // be.
00661             FlintTable tmptab(dest, false);
00662             // Use maximum blocksize for temporary tables.
00663             tmptab.create_and_open(65536);
00664 
00665             merge_postlists(&tmptab, off.begin() + i, tmp.begin() + i, tmp.begin() + j, 0);
00666             if (c > 0) {
00667                 for (unsigned int k = i; k < j; ++k) {
00668                     unlink((tmp[k] + "DB").c_str());
00669                     unlink((tmp[k] + "baseA").c_str());
00670                     unlink((tmp[k] + "baseB").c_str());
00671                 }
00672             }
00673             tmpout.push_back(dest);
00674             tmptab.commit(1);
00675         }
00676         swap(tmp, tmpout);
00677         swap(off, newoff);
00678         ++c;
00679     }
00680     merge_postlists(out, off.begin(), tmp.begin(), tmp.end(), tot_off);
00681     if (c > 0) {
00682         for (size_t k = 0; k < tmp.size(); ++k) {
00683             unlink((tmp[k] + "DB").c_str());
00684             unlink((tmp[k] + "baseA").c_str());
00685             unlink((tmp[k] + "baseB").c_str());
00686         }
00687     }
00688 }
00689 
00690 static void
00691 merge_docid_keyed(FlintTable *out, const vector<string> & inputs,
00692                   const vector<Xapian::docid> & offset, bool lazy)
00693 {
00694     for (size_t i = 0; i < inputs.size(); ++i) {
00695         Xapian::docid off = offset[i];
00696 
00697         FlintTable in(inputs[i], true, DONT_COMPRESS, lazy);
00698         in.open();
00699         if (in.get_entry_count() == 0) continue;
00700 
00701         FlintCursor cur(&in);
00702         cur.find_entry("");
00703 
00704         string key;
00705         while (cur.next()) {
00706             // Adjust the key if this isn't the first database.
00707             if (off) {
00708                 Xapian::docid did;
00709                 const char * d = cur.current_key.data();
00710                 const char * e = d + cur.current_key.size();
00711                 if (!unpack_uint_preserving_sort(&d, e, &did)) {
00712                     string msg = "Bad key in ";
00713                     msg += inputs[i];
00714                     throw Xapian::DatabaseCorruptError(msg);
00715                 }
00716                 did += off;
00717                 key = pack_uint_preserving_sort(did);
00718                 if (d != e) {
00719                     // Copy over the termname for the position table.
00720                     key.append(d, e - d);
00721                 }
00722             } else {
00723                 key = cur.current_key;
00724             }
00725             bool compressed = cur.read_tag(true);
00726             out->add(key, cur.current_tag, compressed);
00727         }
00728     }
00729 }
00730 
00731 int
00732 main(int argc, char **argv)
00733 {
00734     const char * opts = "b:nFm";
00735     const struct option long_opts[] = {
00736         {"fuller",      no_argument, 0, 'F'},
00737         {"no-full",     no_argument, 0, 'n'},
00738         {"multipass",   no_argument, 0, 'm'},
00739         {"blocksize",   required_argument, 0, 'b'},
00740         {"no-renumber", no_argument, 0, OPT_NO_RENUMBER},
00741         {"help",        no_argument, 0, OPT_HELP},
00742         {"version",     no_argument, 0, OPT_VERSION},
00743         {NULL,          0, 0, 0}
00744     };
00745 
00746     enum { STANDARD, FULL, FULLER } compaction = FULL;
00747     size_t block_size = 8192;
00748     bool multipass = false;
00749     bool renumber = true;
00750 
00751     int c;
00752     while ((c = gnu_getopt_long(argc, argv, opts, long_opts, 0)) != -1) {
00753         switch (c) {
00754             case 'b': {
00755                 char *p;
00756                 block_size = strtoul(optarg, &p, 10);
00757                 if (block_size <= 64 && (*p == 'K' || *p == 'k')) {
00758                     ++p;
00759                     block_size *= 1024;
00760                 }
00761                 if (*p || block_size < 2048 || block_size > 65536 ||
00762                     (block_size & (block_size - 1)) != 0) {
00763                     cerr << PROG_NAME": Bad value '" << optarg
00764                          << "' passed for blocksize, must be a power of 2 between 2K and 64K"
00765                          << endl;
00766                     exit(1);
00767                 }
00768                 break;
00769             }
00770             case 'n':
00771                 compaction = STANDARD;
00772                 break;
00773             case 'F':
00774                 compaction = FULLER;
00775                 break;
00776             case 'm':
00777                 multipass = true;
00778                 break;
00779             case OPT_NO_RENUMBER:
00780                 renumber = false;
00781                 break;
00782             case OPT_HELP:
00783                 cout << PROG_NAME" - "PROG_DESC"\n\n";
00784                 show_usage();
00785                 exit(0);
00786             case OPT_VERSION:
00787                 cout << PROG_NAME" - "PACKAGE_STRING << endl;
00788                 exit(0);
00789             default:
00790                 show_usage();
00791                 exit(1);
00792         }
00793     }
00794 
00795     if (argc - optind < 2) {
00796         show_usage();
00797         exit(1);
00798     }
00799 
00800     if (!renumber && argc - optind > 2) {
00801         cout << argv[0]
00802              << ": --no-renumber isn't currently supported when merging databases."
00803              << endl;
00804         exit(1);
00805     }
00806 
00807     // Path to the database to create.
00808     const char *destdir = argv[argc - 1];
00809 
00810     try {
00811         vector<string> sources;
00812         vector<Xapian::docid> offset;
00813         sources.reserve(argc - 1 - optind);
00814         offset.reserve(argc - 1 - optind);
00815         Xapian::docid tot_off = 0;
00816         for (int i = optind; i < argc - 1; ++i) {
00817             const char *srcdir = argv[i];
00818             // Check destdir isn't the same as any source directory...
00819             if (strcmp(srcdir, destdir) == 0) {
00820                 cout << argv[0]
00821                      << ": destination may not be the same as any source directory."
00822                      << endl;
00823                 exit(1);
00824             }
00825 
00826             struct stat sb;
00827             if (stat(string(srcdir) + "/iamflint", &sb) != 0) {
00828                 cout << argv[0] << ": '" << srcdir
00829                      << "' is not a flint database directory" << endl;
00830                 exit(1);
00831             }
00832 
00833             Xapian::Database db(srcdir);
00834             Xapian::docid last = 0;
00835 
00836             // "Empty" databases might have spelling or synonym data so can't
00837             // just be completely ignored.
00838             if (db.get_doccount() != 0) {
00839                 last = db.get_lastdocid();
00840 
00841                 if (renumber) {
00842                     // Prune any unused docids off the start of this source
00843                     // database.
00844                     Xapian::PostingIterator it = db.postlist_begin("");
00845                     // This test should never fail, since db.get_doccount() is
00846                     // non-zero!
00847                     if (it != db.postlist_end("")) {
00848                         // tot_off could wrap here, but it's unsigned, so
00849                         // that's OK.
00850                         tot_off -= (*it - 1);
00851                     }
00852 
00853                     // FIXME: get_lastdocid() returns a "high water mark" - we
00854                     // should prune unused docids off the end of each source
00855                     // database as well as off the start.
00856                 }
00857             }
00858             offset.push_back(tot_off);
00859             tot_off += last;
00860 
00861             sources.push_back(string(srcdir) + '/');
00862         }
00863 
00864         // If the destination database directory doesn't exist, create it.
00865         if (mkdir(destdir, 0755) < 0) {
00866             // Check why mkdir failed.  It's ok if the directory already
00867             // exists, but we also get EEXIST if there's an existing file with
00868             // that name.
00869             if (errno == EEXIST) {
00870                 struct stat sb;
00871                 if (stat(destdir, &sb) == 0 && S_ISDIR(sb.st_mode))
00872                     errno = 0;
00873                 else
00874                     errno = EEXIST; // stat might have changed it
00875             }
00876             if (errno) {
00877                 cerr << argv[0] << ": cannot create directory '"
00878                      << destdir << "': " << strerror(errno) << endl;
00879                 exit(1);
00880             }
00881         }
00882 
00883         enum table_type {
00884             POSTLIST, RECORD, TERMLIST, POSITION, VALUE, SPELLING, SYNONYM
00885         };
00886         struct table_list {
00887             // The "base name" of the table.
00888             const char * name;
00889             // The type.
00890             table_type type;
00891             // zlib compression strategy to use on tags.
00892             int compress_strategy;
00893             // Create tables after position lazily.
00894             bool lazy;
00895         };
00896 
00897         static const table_list tables[] = {
00898             // name         type        compress_strategy       lazy
00899             { "postlist",   POSTLIST,   DONT_COMPRESS,          false },
00900             { "record",     RECORD,     Z_DEFAULT_STRATEGY,     false },
00901             { "termlist",   TERMLIST,   Z_DEFAULT_STRATEGY,     false },
00902             { "position",   POSITION,   DONT_COMPRESS,          true },
00903             { "value",      VALUE,      DONT_COMPRESS,          true },
00904             { "spelling",   SPELLING,   Z_DEFAULT_STRATEGY,     true },
00905             { "synonym",    SYNONYM,    Z_DEFAULT_STRATEGY,     true }
00906         };
00907         const table_list * tables_end = tables +
00908             (sizeof(tables) / sizeof(tables[0]));
00909 
00910         for (const table_list * t = tables; t < tables_end; ++t) {
00911             // The postlist requires an N-way merge, adjusting the headers of
00912             // various blocks.  The other tables have keys sorted in docid
00913             // order, so we can merge them by simply copying all the keys from
00914             // each source table in turn.
00915             cout << t->name << " ..." << flush;
00916 
00917             string dest = destdir;
00918             dest += '/';
00919             dest += t->name;
00920             dest += '.';
00921 
00922             FlintTable out(dest, false, t->compress_strategy, t->lazy);
00923             if (!t->lazy) {
00924                 out.create_and_open(block_size);
00925             } else {
00926                 out.erase();
00927                 out.set_block_size(block_size);
00928             }
00929 
00930             out.set_full_compaction(compaction != STANDARD);
00931             if (compaction == FULLER) out.set_max_item_size(1);
00932 
00933             // Sometimes stat can fail for benign reasons (e.g. >= 2GB file
00934             // on certain systems).
00935             bool bad_stat = false;
00936 
00937             off_t in_size = 0;
00938 
00939             vector<string> inputs;
00940             inputs.reserve(sources.size());
00941             for (vector<string>::const_iterator src = sources.begin();
00942                  src != sources.end(); ++src) {
00943                 string s(*src);
00944                 s += t->name;
00945                 s += '.';
00946 
00947                 struct stat sb;
00948                 if (stat(s + "DB", &sb) == 0) {
00949                     in_size += sb.st_size / 1024;
00950                 } else {
00951                     // We get ENOENT for an optional table.
00952                     bad_stat = (errno != ENOENT);
00953                 }
00954                 inputs.push_back(s);
00955             }
00956 
00957             if (inputs.empty()) continue;
00958 
00959             switch (t->type) {
00960                 case POSTLIST:
00961                     if (multipass && inputs.size() > 3) {
00962                         multimerge_postlists(&out, destdir, tot_off,
00963                                              inputs, offset);
00964                     } else {
00965                         merge_postlists(&out, offset.begin(),
00966                                         inputs.begin(), inputs.end(),
00967                                         tot_off);
00968                     }
00969                     break;
00970                 case SPELLING:
00971                     merge_spellings(&out, inputs.begin(), inputs.end());
00972                     break;
00973                 case SYNONYM:
00974                     merge_synonyms(&out, inputs.begin(), inputs.end());
00975                     break;
00976                 default:
00977                     // Position, Record, Termlist, Value
00978                     merge_docid_keyed(&out, inputs, offset, t->lazy);
00979                     break;
00980             }
00981 
00982             // Commit as revision 1.
00983             out.commit(1);
00984 
00985             cout << '\r' << t->name << ": ";
00986             off_t out_size = 0;
00987             if (!bad_stat) {
00988                 struct stat sb;
00989                 if (stat(dest + "DB", &sb) == 0) {
00990                     out_size = sb.st_size / 1024;
00991                 } else {
00992                     bad_stat = (errno != ENOENT);
00993                 }
00994             }
00995             if (bad_stat) {
00996                 cout << "Done (couldn't stat all the DB files)";
00997             } else {
00998                 if (out_size == in_size) {
00999                     cout << "Size unchanged (";
01000                 } else if (out_size < in_size) {
01001                     cout << "Reduced by "
01002                          << 100 * double(in_size - out_size) / in_size << "% "
01003                          << in_size - out_size << "K (" << in_size << "K -> ";
01004                 } else {
01005                     cout << "INCREASED by "
01006                          << 100 * double(out_size - in_size) / in_size << "% "
01007                          << out_size - in_size << "K (" << in_size << "K -> ";
01008                 }
01009                 cout << out_size << "K)";
01010             }
01011             cout << endl;
01012         }
01013 
01014         // Copy over the version file ("iamflint").
01015         // FIXME: We may need to do something smarter than just copying an
01016         // arbitrary version file if the version file format changes...
01017         string dest = destdir;
01018         dest += "/iamflint.tmp";
01019 
01020         string src(argv[optind]);
01021         src += "/iamflint";
01022 
01023         ifstream input(src.c_str());
01024         char buf[1024];
01025         input.read(buf, sizeof(buf));
01026         if (!input.eof()) {
01027             if (!input) {
01028                 cerr << argv[0] << ": error reading '" << src << "': "
01029                      << strerror(errno) << endl;
01030                 exit(1);
01031             }
01032             // Version file should be about 12 bytes, not > 1024!
01033             cerr << argv[0] << ": version file '" << src << "' too large!"
01034                  << endl;
01035             exit(1);
01036         }
01037         ofstream output(dest.c_str());
01038         if (!output.write(buf, input.gcount())) {
01039             cerr << argv[0] << ": error writing '" << dest << "': "
01040                  << strerror(errno) << endl;
01041             exit(1);
01042         }
01043         output.close();
01044 
01045         string version = destdir;
01046         version += "/iamflint";
01047         if (rename(dest.c_str(), version.c_str()) == -1) {
01048             cerr << argv[0] << ": cannot rename '" << dest << "' to '"
01049                  << version << "': " << strerror(errno) << endl;
01050             exit(1);
01051         }
01052     } catch (const Xapian::Error &error) {
01053         cerr << argv[0] << ": " << error.get_description() << endl;
01054         exit(1);
01055     }
01056 }