examples/delve.cc

Go to the documentation of this file.
00001 /* delve.cc: Allow inspection of the contents of a Xapian database
00002  *
00003  * Copyright 1999,2000,2001 BrightStation PLC
00004  * Copyright 2002 Ananova Ltd
00005  * Copyright 2002,2003,2004,2006,2007,2008 Olly Betts
00006  *
00007  * This program is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU General Public License as
00009  * published by the Free Software Foundation; either version 2 of the
00010  * License, or (at your option) any later version.
00011  *
00012  * This program is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
00020  * USA
00021  */
00022 
00023 #include <config.h>
00024 
00025 #include <xapian.h>
00026 
00027 #include <algorithm>
00028 #include <iostream>
00029 #include <vector>
00030 
00031 #include "gnu_getopt.h"
00032 
00033 #include <string.h>
00034 
00035 using namespace Xapian;
00036 using namespace std;
00037 
// Output switches.  Each one starts at the default given here and is only
// changed by the command line parsing in main().

// -v: also print the extra per-entry figures (and count the terms for the
// database summary).
static bool verbose = false;
// -V (or -k): print document values.
static bool showvalues = false;
// -d: print document data.
static bool showdocdata = false;

// Character written in front of every list entry; -1 changes it to '\n'.
static char separator = ' ';
00043 
// Program name and one-line description, spliced into the usage, --help
// and --version output by string literal concatenation.
#define PROG_NAME "delve"
#define PROG_DESC "Inspect the contents of a Xapian database"

/** Write the command line synopsis and the option summary to stdout.
 *
 *  The text ends with a newline and the stream is flushed.
 */
static void show_usage() {
    // There must be whitespace between a string literal and an adjacent
    // macro name: since C++11 `"text"PROG_NAME` is lexed as a literal with
    // a user-defined suffix instead of two literals to concatenate.
    std::cout << "Usage: " PROG_NAME " [OPTIONS] DATABASE...\n\n"
"Options:\n"
"  -r <recno>            for term list(s)\n"
"  -t <term>             for posting list(s)\n"
"  -t <term> -r <recno>  for position list(s)\n"
"  -s, --stemmer=LANG    set the stemming language, the default is 'none'\n"
"  -1                    output one list entry per line\n"
"  -V                    output values for each document referred to\n"
"  -V<valueno>           output value valueno for each document in the database\n"
"  -d                    output document data for each document referred to\n"
"  -v                    extra info (wdf and len for postlist;\n"
"                        wdf and termfreq for termlist; number of terms for db)\n"
"      --help            display this help and exit\n"
"      --version         output version information and exit" << std::endl;
}
00063 
00064 static void
00065 show_db_stats(Database &db)
00066 {
00067     // Display a few database stats.
00068     cout << "number of documents = " << db.get_doccount() << endl;
00069     cout << "average document length = " << db.get_avlength() << endl;
00070     if (verbose) {
00071         // To find the number of terms, we have to count them!
00072         // This will take a few seconds or minutes, so only do it if -v
00073         // was specified.
00074         termcount terms = 0;
00075         TermIterator t = db.allterms_begin();
00076         const TermIterator end = db.allterms_end();
00077         while (t != end) {
00078             ++terms;
00079             ++t;
00080         }
00081         cout << "number of unique terms = " << terms << endl;
00082     }
00083 }
00084 
00085 static void
00086 show_values(Database &db, docid docid, char sep)
00087 {
00088     Document doc = db.get_document(docid);
00089     ValueIterator v = doc.values_begin();
00090     ValueIterator vend = doc.values_end();
00091     while (v != vend) {
00092         cout << sep << v.get_valueno() << ':' << *v;
00093         ++v;
00094     }
00095 }
00096 
00097 static void
00098 show_values(Database &db,
00099             vector<docid>::const_iterator i,
00100             vector<docid>::const_iterator end)
00101 {
00102     while (i != end) {
00103         cout << "Values for record #" << *i << ':';
00104         show_values(db, *i, separator);
00105         cout << endl;
00106         ++i;
00107     }
00108 }
00109 
00110 static void
00111 show_docdata(Database &db, docid docid, char sep)
00112 {
00113     cout << sep << "[" << db.get_document(docid).get_data() << ']';
00114 }
00115 
00116 static void
00117 show_docdata(Database &db,
00118              vector<docid>::const_iterator i,
00119              vector<docid>::const_iterator end)
00120 {
00121     while (i != end) {
00122         cout << "Data for record #" << *i << ':' << endl;
00123         cout << db.get_document(*i).get_data() << endl;
00124         ++i;
00125     }
00126 }
00127 
00128 static void
00129 show_termlists(Database &db,
00130                vector<docid>::const_iterator i,
00131                vector<docid>::const_iterator end)
00132 {
00133     // Display termlists
00134     while (i != end) {
00135         TermIterator t = db.termlist_begin(*i);
00136         TermIterator tend = db.termlist_end(*i);
00137         cout << "Term List for record #" << *i << ':';
00138         while (t != tend) {
00139             cout << separator << *t;
00140             if (verbose) {
00141                 cout << ' ' << t.get_wdf() << ' ' << t.get_termfreq();
00142             }
00143             ++t;
00144         }
00145         cout << endl;
00146         ++i;
00147     }
00148 }
00149 
00150 static Stem stemmer;
00151 
00152 int
00153 main(int argc, char **argv)
00154 {
00155     if (argc > 1 && argv[1][0] == '-') {
00156         if (strcmp(argv[1], "--help") == 0) {
00157             cout << PROG_NAME" - "PROG_DESC"\n\n";
00158             show_usage();
00159             exit(0);
00160         }
00161         if (strcmp(argv[1], "--version") == 0) {
00162             cout << PROG_NAME" - "PACKAGE_STRING << endl;
00163             exit(0);
00164         }
00165     }
00166 
00167     vector<docid> recnos;
00168     vector<string> terms;
00169     vector<string> dbs;
00170 
00171     valueno valno = 0; // Avoid "may be used uninitialised" warnings.
00172     bool valno_set = false;
00173 
00174     int c;
00175     while ((c = gnu_getopt(argc, argv, "r:t:s:1vkV::d")) != -1) {
00176         switch (c) {
00177             case 'r':
00178                 recnos.push_back(atoi(optarg));
00179                 break;
00180             case 't':
00181                 terms.push_back(optarg);
00182                 break;
00183             case 's':
00184                 stemmer = Stem(optarg);
00185                 break;
00186             case '1':
00187                 separator = '\n';
00188                 break;
00189             case 'V': case 'k': /* -k for backward compatibility */
00190                 showvalues = true;
00191                 if (optarg) {
00192                     valno = atoi(optarg);
00193                     valno_set = true;
00194                 }
00195                 break;
00196             case 'd':
00197                 showdocdata = true;
00198                 break;
00199             case 'v':
00200                 verbose = true;
00201                 break;
00202             default:
00203                 show_usage();
00204                 exit(1);
00205         }
00206     }
00207 
00208     while (argv[optind]) dbs.push_back(argv[optind++]);
00209 
00210     if (dbs.empty()) {
00211         show_usage();
00212         exit(1);
00213     }
00214 
00215     std::sort(recnos.begin(), recnos.end());
00216 
00217     Database db;
00218     {
00219         vector<string>::const_iterator i;
00220         for (i = dbs.begin(); i != dbs.end(); i++) {
00221             try {
00222                 db.add_database(Database(*i));
00223             } catch (const Error &e) {
00224                 cout << "Error opening database `" << *i << "': ";
00225                 cout << e.get_description() << endl;
00226                 return 1;
00227             }
00228         }
00229     }
00230 
00231     try {
00232         if (terms.empty() && recnos.empty() && !valno_set) {
00233             show_db_stats(db);
00234             return 0;
00235         }
00236 
00237         if (!recnos.empty()) {
00238             if (showvalues) {
00239                 show_values(db, recnos.begin(), recnos.end());
00240             }
00241 
00242             if (showdocdata) {
00243                 show_docdata(db, recnos.begin(), recnos.end());
00244             }
00245         }
00246 
00247         if (valno_set) {
00248             doccount n = db.get_doccount();
00249             docid did = 0;
00250             docid hwm = db.get_lastdocid();
00251             cout << "Value " << valno << " for each document:";
00252             while (n && did != hwm) {
00253                 try {
00254                     Document doc = db.get_document(++did);
00255                     string val = doc.get_value(valno);
00256                     if (!val.empty())
00257                         cout << separator << did << ':' << doc.get_value(valno);
00258                     --n;
00259                 } catch (DocNotFoundError &) {
00260                 }
00261             }
00262             cout << endl;
00263         }
00264 
00265         if (terms.empty()) {
00266             show_termlists(db, recnos.begin(), recnos.end());
00267             return 0;
00268         }
00269 
00270         vector<string>::const_iterator i;
00271         for (i = terms.begin(); i != terms.end(); i++) {
00272             string term = stemmer(*i);
00273             PostingIterator p = db.postlist_begin(term);
00274             PostingIterator pend = db.postlist_end(term);
00275             if (p == pend) {
00276                 cout << "term `" << term << "' not in database\n";
00277                 continue;
00278             }
00279             if (recnos.empty()) {
00280                 // Display posting list
00281                 cout << "Posting List for term `" << term << "' (termfreq "
00282                      << db.get_termfreq(term) << ", collfreq "
00283                      << db.get_collection_freq(term) << "):";
00284                 while (p != pend) {
00285                     cout << separator << *p;
00286                     if (verbose) {
00287                         cout << ' ' << p.get_wdf()
00288                             << ' ' << p.get_doclength();
00289                     }
00290                     if (showvalues) show_values(db, *p, ' ');
00291                     if (showdocdata) show_docdata(db, *p, ' ');
00292                     p++;
00293                 }
00294                 cout << endl;
00295             } else {
00296                 // Display position lists
00297                 vector<docid>::const_iterator j;
00298                 for (j = recnos.begin(); j != recnos.end(); j++) {
00299                     p.skip_to(*j);
00300                     if (p == pend || *p != *j) {
00301                         cout << "term `" << term <<
00302                             "' doesn't index document #" << *j << endl;
00303                     } else {
00304                         cout << "Position List for term `" << term
00305                             << "', record #" << *j << ':';
00306                         try {
00307                             PositionIterator pos = p.positionlist_begin();
00308                             PositionIterator posend = p.positionlist_end();
00309                             while (pos != posend) {
00310                                 cout << separator << *pos;
00311                                 ++pos;
00312                             }
00313                             cout << endl;
00314                         } catch (const Error &e) {
00315                             cout << "Error: " << e.get_description() << endl;
00316                         }
00317                     }
00318                 }
00319             }
00320         }
00321     } catch (const Error &e) {
00322         cout << "\nError: " << e.get_description() << endl;
00323         return 1;
00324     }
00325 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.