bin/quartzcompact.cc

Go to the documentation of this file.
00001 /* quartzcompact.cc: Compact one or more quartz databases to produce a new
00002  * quartzdatabase.  By default the resultant database has full compaction
00003  * with revision 1, which makes it especially fast to search.
00004  *
00005  * Copyright 1999,2000,2001 BrightStation PLC
00006  * Copyright 2002,2003,2004,2005,2006,2007 Olly Betts
00007  *
00008  * This program is free software; you can redistribute it and/or
00009  * modify it under the terms of the GNU General Public License as
00010  * published by the Free Software Foundation; either version 2 of the
00011  * License, or (at your option) any later version.
00012  *
00013  * This program is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016  * GNU General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU General Public License
00019  * along with this program; if not, write to the Free Software
00020  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
00021  * USA
00022  */
00023 
00024 #include <config.h>
00025 
00026 #include "safeerrno.h"
00027 
00028 #include <fstream>
00029 #include <iostream>
00030 #include <queue>
00031 
00032 #include <stdio.h> // for rename()
00033 #include <string.h>
00034 #include <sys/types.h>
00035 #include "safesysstat.h"
00036 
00037 #include "btree.h"
00038 #include "bcursor.h"
00039 #include "quartz_utils.h"
00040 #include "utils.h" // for mkdir for MSVC
00041 
00042 #include <xapian.h>
00043 
00044 #include "gnu_getopt.h"
00045 
00046 using namespace std;
00047 
00048 #define PROG_NAME "quartzcompact"
00049 #define PROG_DESC "Compact a quartz database, or merge and compact several"
00050 
00051 static void show_usage() {
00052     cout << "Usage: "PROG_NAME" [OPTION] SOURCE_DATABASE... DESTINATION_DATABASE\n\n"
00053 "Options:\n"
00054 "  -b, --blocksize  Set the blocksize in bytes (e.g. 4096) or K (e.g. 4K)\n"
00055 "                   (must be between 2K and 64K and a power of 2, default 8K)\n"
00056 "  -n, --no-full    Disable full compaction\n"
00057 "  -F, --fuller     Enable fuller compaction (not recommended if you plan to\n"
00058 "                   update the compacted database)\n"
00059 "  --help           display this help and exit\n"
00060 "  --version        output version information and exit" << endl;
00061 }
00062 
00063 static inline bool
00064 is_metainfo_key(const string & key)
00065 {
00066     return key.size() == 1 && key[0] == '\0';
00067 }
00068 
00069 class PostlistCursor : private Bcursor {
00070         Xapian::docid offset;
00071     public:
00072         string key, tag;
00073         Xapian::docid firstdid;
00074         Xapian::termcount tf, cf;
00075 
00076         PostlistCursor(Btree *in, Xapian::docid offset_)
00077             : Bcursor(in), offset(offset_)
00078         {
00079             find_entry("");
00080             next();
00081         }
00082 
00083         bool next() {
00084             if (!Bcursor::next()) return false;
00085             // We put all chunks into the non-initial chunk form here,
00086             // then fix up the first chunks in the merged database as we
00087             // merge.
00088             read_tag();
00089             key = current_key;
00090             tag = current_tag;
00091             tf = cf = 0;
00092             // Adjust key if this is *NOT* an initial chunk.
00093             // key is: pack_string_preserving_sort(tname)
00094             // plus optionally: pack_uint_preserving_sort(did)
00095             const char * d = key.data();
00096             string tname;
00097             if (!unpack_string_preserving_sort(&d, d + key.size(), tname))
00098                 abort();
00099             bool first_chunk = (d == key.data() + key.size());
00100             if (!first_chunk) {
00101                 size_t tmp = d - key.data();
00102                 if (!unpack_uint_preserving_sort(&d, d + key.size(), &firstdid))
00103                     abort();
00104                 firstdid += offset;
00105                 key.erase(tmp);
00106             }
00107             // Adjust key and header of tag if this *IS* an initial chunk.
00108             if (first_chunk) {
00109                 d = tag.data();
00110                 const char *e = d + tag.size();
00111                 if (!unpack_uint(&d, e, &tf))
00112                     abort();
00113                 if (!unpack_uint(&d, e, &cf))
00114                     abort();
00115                 if (!unpack_uint(&d, e, &firstdid))
00116                     abort();
00117                 firstdid += 1;
00118                 firstdid += offset;
00119                 tag.erase(0, d - tag.data());
00120             }
00121             return true;
00122         }
00123 };
00124 
00125 class DocIDKeyedCursor : private Bcursor {
00126         Xapian::docid offset;
00127     public:
00128         string key, tag;
00129 
00130         DocIDKeyedCursor(Btree *in, Xapian::docid offset_)
00131             : Bcursor(in), offset(offset_)
00132         {
00133             find_entry("");
00134             next();
00135         }
00136 
00137         bool next() {
00138             if (!Bcursor::next()) return false;
00139             read_tag();
00140             tag = current_tag;
00141             // Adjust the key if this isn't the first database and
00142             // this isn't the METAINFO key.
00143             if (offset && !is_metainfo_key(current_key)) {
00144                 Xapian::docid did;
00145                 const char * d = current_key.data();
00146                 if (!unpack_uint_last(&d, d + current_key.size(), &did))
00147                     abort();
00148                 did += offset;
00149                 key = pack_uint_last(did);
00150             } else {
00151                 key = current_key;
00152             }
00153             return true;
00154         }
00155 };
00156 
00157 class PositionCursor : private Bcursor {
00158         Xapian::docid offset;
00159     public:
00160         string key, tag;
00161 
00162         PositionCursor(Btree *in, Xapian::docid offset_)
00163             : Bcursor(in), offset(offset_)
00164         {
00165             find_entry("");
00166             next();
00167         }
00168 
00169         bool next() {
00170             if (!Bcursor::next()) return false;
00171             read_tag();
00172             tag = current_tag;
00173             // Adjust the key if this isn't the first database.
00174             if (offset) {
00175                 // key is: pack_uint(did) + tname
00176                 Xapian::docid did;
00177                 const char * d = current_key.data();
00178                 if (!unpack_uint(&d, d + current_key.size(), &did))
00179                     abort();
00180                 did += offset;
00181                 key = pack_uint(did);
00182                 size_t tnameidx = d - current_key.data();
00183                 key += current_key.substr(tnameidx);
00184             } else {
00185                 key = current_key;
00186             }
00187             return true;
00188         }
00189 };
00190 
00191 class CursorGt {
00192   public:
00195     bool operator()(const PostlistCursor *a, const PostlistCursor *b) {
00196         if (a->key > b->key) return true;
00197         if (a->key != b->key) return false;
00198         return (a->firstdid > b->firstdid);
00199     }
00200     bool operator()(const DocIDKeyedCursor *a, const DocIDKeyedCursor *b) {
00201         return (a->key > b->key);
00202     }
00203     bool operator()(const PositionCursor *a, const PositionCursor *b) {
00204         return (a->key > b->key);
00205     }
00206 };
00207 
00208 int
00209 main(int argc, char **argv)
00210 {
00211     const struct option long_opts[] = {
00212         {"no-full",     no_argument, 0, 'n'},
00213         {"fuller",      no_argument, 0, 'F'},
00214         {"blocksize",   required_argument, 0, 'b'},
00215         {"help",        no_argument, 0, 'h'},
00216         {"version",     no_argument, 0, 'v'},
00217         {NULL,          0, 0, 0}
00218     };
00219 
00220     bool full_compaction = true;
00221     size_t block_capacity = 0;
00222     size_t block_size = 8192;
00223 
00224     int c;
00225     while ((c = gnu_getopt_long(argc, argv, "b:nFhv", long_opts, 0)) != EOF) {
00226         switch (c) {
00227             case 'b': {
00228                 char *p;
00229                 block_size = strtoul(optarg, &p, 10);   
00230                 if (block_size <= 64 && (*p == 'K' || *p == 'k')) {
00231                     ++p;
00232                     block_size *= 1024;
00233                 }
00234                 if (*p || block_size < 2048 || block_size > 65536 ||
00235                     (block_size & (block_size - 1)) != 0) {
00236                     cerr << PROG_NAME": Bad value '" << optarg
00237                          << "' passed for blocksize, must be a power of 2 between 2K and 64K"
00238                          << endl;
00239                     exit(1);
00240                 }
00241                 break;
00242             }
00243             case 'n':
00244                 full_compaction = false;
00245                 break;
00246             case 'F':
00247                 block_capacity = 1;
00248                 break;
00249             case 'h':
00250                 cout << PROG_NAME" - "PROG_DESC"\n\n";
00251                 show_usage();
00252                 exit(0);
00253             case 'v':
00254                 cout << PROG_NAME" - "PACKAGE_STRING << endl;
00255                 exit(0);
00256             default:
00257                 show_usage();
00258                 exit(1);
00259         }
00260     }
00261 
00262     if (argc - optind < 2) {
00263         show_usage();
00264         exit(1);
00265     }
00266 
00267     // Path to the database to create
00268     const char *destdir = argv[argc - 1];
00269 
00270     // Check destdir isn't the same as any source directory...
00271     for (int i = optind; i < argc - 1; ++i) {
00272         const char *srcdir = argv[i];
00273         if (strcmp(srcdir, destdir) == 0) {
00274             cout << argv[0] << ": destination may not be the same as any source directory" << endl;
00275             exit(1);
00276         }
00277     }
00278 
00279     // Create the directory for the database, if it doesn't exist already
00280     if (mkdir(destdir, 0755) == -1) {
00281         // Check if mkdir failed because there's already a directory there
00282         // or for some other reason - we also get EEXIST if there's a file
00283         // with that name.
00284         if (errno == EEXIST) {
00285             struct stat sb;
00286             if (stat(destdir, &sb) == 0 && S_ISDIR(sb.st_mode))
00287                 errno = 0;
00288             else
00289                 errno = EEXIST; // stat might have changed it
00290         }
00291         if (errno) {
00292             cerr << argv[0] << ": couldn't create directory `"
00293                  << destdir << "': " << strerror(errno) << endl;
00294             exit(1);
00295         }
00296     }
00297 
00298     quartz_totlen_t tot_totlen = 0;
00299     try {
00300         const char * tables[] = {
00301             "postlist", "record", "termlist", "position", "value"
00302         };
00303 
00304         const size_t out_of = argc - 1 - optind;
00305         vector<Xapian::docid> offset(out_of);
00306         Xapian::docid tot_off = 0;
00307         for (size_t i = 0; i < out_of; ++i) {
00308             Xapian::Database db(argv[i + optind]);
00309             Xapian::docid last = db.get_lastdocid();
00310             offset[i] = tot_off;
00311             tot_off += last;
00312             // FIXME: prune unused docids off the start and end of each range...
00313         }
00314 
00315         for (const char **t = tables;
00316              t != tables + sizeof(tables)/sizeof(tables[0]); ++t) {
00317             // The postlist requires an N-way merge, adjusting the headers
00318             // of various blocks.  The other tables also require an N-way
00319             // merge, because the keys we use in quartz don't sort in
00320             // docid order.
00321             cout << *t << " ..." << flush;
00322 
00323             string dest = destdir;
00324             dest += '/';
00325             dest += *t;
00326             dest += '_';
00327 
00328             Btree out(dest, false);
00329             out.create(block_size);
00330             out.open();
00331             out.set_full_compaction(full_compaction);
00332             if (block_capacity) out.set_max_item_size(block_capacity);
00333 
00334             // Sometimes stat can fail for benign reasons (e.g. >= 2GB file
00335             // on certain systems).
00336             bool bad_stat = false;
00337 
00338             off_t in_size = 0;
00339 
00340             vector<Btree *> btrees;
00341 
00342             if (strcmp(*t, "postlist") == 0) {
00343                 priority_queue<PostlistCursor *, vector<PostlistCursor *>,
00344                                CursorGt> pq;
00345                 for (size_t i = 0; i < out_of; ++i) {
00346                     Xapian::docid off = offset[i];
00347                     const char *srcdir = argv[i + optind];
00348                     string src(srcdir);
00349                     src += '/';
00350                     src += *t;
00351                     src += '_';
00352 
00353                     Btree *in = new Btree(src, true);
00354                     in->open();
00355                     if (in->get_entry_count()) {
00356                         btrees.push_back(in);
00357                         pq.push(new PostlistCursor(in, off));
00358                     } else {
00359                         delete in;
00360                     }
00361 
00362                     struct stat sb;
00363                     if (stat(src + "DB", &sb) == 0)
00364                         in_size += sb.st_size / 1024;
00365                     else
00366                         bad_stat = true;
00367                 }
00368 
00369                 string last_key;
00370                 // We only need to initialise tf and cf to shut up GCC warnings.
00371                 Xapian::termcount tf = 0, cf = 0;
00372                 vector<pair<Xapian::docid, string> > tags;
00373                 while (true) {
00374                     PostlistCursor * bc = NULL;
00375                     if (!pq.empty()) {
00376                         bc = pq.top();
00377                         pq.pop();
00378                     }
00379                     if (bc == NULL || bc->key != last_key) {
00380                         if (!tags.empty()) {
00381                             string first_tag = pack_uint(tf);
00382                             first_tag += pack_uint(cf);
00383                             first_tag += pack_uint(tags[0].first - 1);
00384                             string tag = tags[0].second;
00385                             tag[0] = (tags.size() == 1) ? '1' : '0';
00386                             first_tag += tag;
00387                             out.add(last_key, first_tag);
00388                             vector<pair<Xapian::docid, string> >::const_iterator i;
00389                             i = tags.begin();
00390                             while (++i != tags.end()) {
00391                                 string key = last_key;
00392                                 key += pack_uint_preserving_sort(i->first);
00393                                 tag = i->second;
00394                                 tag[0] = (i + 1 == tags.end()) ? '1' : '0';
00395                                 out.add(key, tag);
00396                             }
00397                         }
00398                         tags.clear();
00399                         if (bc == NULL) break;
00400                         tf = cf = 0;
00401                         last_key = bc->key;
00402                     }
00403                     tf += bc->tf;
00404                     cf += bc->cf;
00405                     tags.push_back(make_pair(bc->firstdid, bc->tag));
00406                     if (bc->next()) {
00407                         pq.push(bc);
00408                     } else {
00409                         delete bc;
00410                     }
00411                 }
00412             } else if (strcmp(*t, "position") == 0) {
00413                 priority_queue<PositionCursor *, vector<PositionCursor *>,
00414                                CursorGt> pq;
00415                 for (size_t i = 0; i < out_of; ++i) {
00416                     Xapian::docid off = offset[i];
00417                     const char *srcdir = argv[i + optind];
00418                     string src(srcdir);
00419                     src += '/';
00420                     src += *t;
00421                     src += '_';
00422 
00423                     Btree *in = new Btree(src, true);
00424                     in->open();
00425                     if (in->get_entry_count()) {
00426                         btrees.push_back(in);
00427                         pq.push(new PositionCursor(in, off));
00428                     } else {
00429                         delete in;
00430                     }
00431 
00432                     struct stat sb;
00433                     if (stat(src + "DB", &sb) == 0)
00434                         in_size += sb.st_size / 1024;
00435                     else
00436                         bad_stat = true;
00437                 }
00438 
00439                 while (!pq.empty()) {
00440                     PositionCursor * bc = pq.top();
00441                     pq.pop();
00442                     out.add(bc->key, bc->tag);
00443                     if (bc->next()) {
00444                         pq.push(bc);
00445                     } else {
00446                         delete bc;
00447                     }
00448                 }
00449             } else {
00450                 // Record, Termlist, Value
00451                 priority_queue<DocIDKeyedCursor *, vector<DocIDKeyedCursor *>,
00452                                CursorGt> pq;
00453                 for (size_t i = 0; i < out_of; ++i) {
00454                     Xapian::docid off = offset[i];
00455                     const char *srcdir = argv[i + optind];
00456                     string src(srcdir);
00457                     src += '/';
00458                     src += *t;
00459                     src += '_';
00460 
00461                     Btree *in = new Btree(src, true);
00462                     in->open();
00463                     if (in->get_entry_count()) {
00464                         btrees.push_back(in);
00465                         pq.push(new DocIDKeyedCursor(in, off));
00466                     } else {
00467                         delete in;
00468                     }
00469 
00470                     struct stat sb;
00471                     if (stat(src + "DB", &sb) == 0)
00472                         in_size += sb.st_size / 1024;
00473                     else
00474                         bad_stat = true;
00475                 }
00476 
00477                 if (strcmp(*t, "record") == 0) {
00478                     // Merge the METAINFO tags from each database into one.
00479                     // They have a key with a single zero byte, which will
00480                     // always be the first key.
00481                     Xapian::docid did;
00482                     quartz_totlen_t totlen = 0;
00483                     for (int i = pq.size(); i > 0; --i) {
00484                         DocIDKeyedCursor * bc = pq.top();
00485                         pq.pop();
00486                         if (!is_metainfo_key(bc->key)) {
00487                             throw Xapian::DatabaseCorruptError("No METAINFO item in record table.");
00488                         }
00489                         const char * data = bc->tag.data();
00490                         const char * end = data + bc->tag.size();
00491                         if (!unpack_uint(&data, end, &did)) {
00492                             throw Xapian::DatabaseCorruptError("Tag containing meta information is corrupt.");
00493                         }
00494                         if (!unpack_uint_last(&data, end, &totlen)) {
00495                             throw Xapian::DatabaseCorruptError("Tag containing meta information is corrupt.");
00496                         }
00497                         tot_totlen += totlen;
00498                         if (tot_totlen < tot_totlen) {
00499                             throw "totlen wrapped!";
00500                         }
00501                         if (bc->next()) {
00502                             pq.push(bc);
00503                         } else {
00504                             delete bc;
00505                         }
00506                     }
00507                     string tag = pack_uint(tot_off);
00508                     tag += pack_uint_last(tot_totlen);
00509                     out.add(string("", 1), tag);
00510                 }
00511 
00512                 while (!pq.empty()) {
00513                     DocIDKeyedCursor * bc = pq.top();
00514                     pq.pop();
00515                     out.add(bc->key, bc->tag);
00516                     if (bc->next()) {
00517                         pq.push(bc);
00518                     } else {
00519                         delete bc;
00520                     }
00521                 }
00522             }
00523 
00524             for (vector<Btree *>::const_iterator b = btrees.begin();
00525                  b != btrees.end(); ++b) {
00526                 delete *b;
00527             }
00528             btrees.clear();
00529 
00530             out.commit(1);
00531 
00532             cout << '\r' << *t << ": ";
00533             struct stat sb;
00534             if (!bad_stat && stat(dest + "DB", &sb) == 0) {
00535                 off_t out_size = sb.st_size / 1024;
00536                 if (out_size == in_size) {
00537                     cout << "Size unchanged (";
00538                 } else if (out_size < in_size) {
00539                     cout << "Reduced by "
00540                          << 100 * double(in_size - out_size) / in_size << "% "
00541                          << in_size - out_size << "K (" << in_size << "K -> ";
00542                 } else {
00543                     cout << "INCREASED by "
00544                          << 100 * double(out_size - in_size) / in_size << "% "
00545                          << out_size - in_size << "K (" << in_size << "K -> ";
00546                 }
00547                 cout << out_size << "K)";
00548             } else {
00549                 cout << "Done (couldn't stat all the DB files)";
00550             }
00551             cout << endl;
00552         }
00553 
00554         // Copy meta file
00555         // FIXME: may need to do something smarter that just copying an
00556         // arbitrary meta file if the meta file format changes...
00557         string src(argv[optind]);
00558         src += "/meta";
00559         string dest = destdir;
00560         dest += "/meta.tmp";
00561         {
00562             ifstream metain(src.c_str());
00563             ofstream metaout(dest.c_str());
00564             char buf[2048];
00565             while (!metain.eof()) {
00566                 // FIXME check for errors
00567                 metain.read(buf, sizeof(buf));
00568                 metaout.write(buf, metain.gcount());
00569             }
00570         }
00571         string meta = destdir;
00572         meta += "/meta";
00573         if (rename(dest.c_str(), meta.c_str()) == -1) {
00574             cerr << argv[0] << ": couldn't rename `" << dest << "' to `"
00575                  << meta << "': " << strerror(errno) << endl;
00576             exit(1);
00577         }
00578     } catch (const Xapian::Error &error) {
00579         cerr << argv[0] << ": " << error.get_description() << endl;
00580         exit(1);
00581     }
00582 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.