00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include <config.h>
00024
00025 #include <xapian.h>
00026
00027 #include <algorithm>
00028 #include <iostream>
00029 #include <vector>
00030
00031 #include "gnu_getopt.h"
00032
00033 #include <string.h>
00034
00035 using namespace Xapian;
00036 using namespace std;
00037
// Output-shaping flags.  All start switched off and are only ever turned on
// by the option parsing in main().
static bool showdocdata = false;	// -d: print document data
static bool showvalues = false;		// -V / -k: print document values
static bool verbose = false;		// -v: print extra statistics

// Character written before each list entry; main() changes it to '\n'
// when -1 is given.
static char separator = ' ';
00043
#define PROG_NAME "delve"
#define PROG_DESC "Inspect the contents of a Xapian database"

/** Print the command-line synopsis and option summary to standard output.
 *
 *  The stream is flushed before returning.
 *
 *  NOTE(review): the text advertises "--stemmer=LANG", but the option
 *  parsing visible in main() passes only a short-option string to
 *  gnu_getopt() - confirm whether the long form is really accepted.
 */
static void show_usage() {
    // A string literal must be separated from a following macro name by
    // whitespace: since C++11 `"text"PROG_NAME` is lexed as a literal with a
    // user-defined suffix instead of as two adjacent literals.
    std::cout << "Usage: " PROG_NAME " [OPTIONS] DATABASE...\n\n"
"Options:\n"
" -r <recno> for term list(s)\n"
" -t <term> for posting list(s)\n"
" -t <term> -r <recno> for position list(s)\n"
" -s, --stemmer=LANG set the stemming language, the default is 'none'\n"
" -1 output one list entry per line\n"
" -V output values for each document referred to\n"
" -V<valueno> output value valueno for each document in the database\n"
" -d output document data for each document referred to\n"
" -v extra info (wdf and len for postlist;\n"
" wdf and termfreq for termlist; number of terms for db)\n"
" --help display this help and exit\n"
" --version output version information and exit" << std::endl;
}
00063
00064 static void
00065 show_db_stats(Database &db)
00066 {
00067
00068 cout << "number of documents = " << db.get_doccount() << endl;
00069 cout << "average document length = " << db.get_avlength() << endl;
00070 if (verbose) {
00071
00072
00073
00074 termcount terms = 0;
00075 TermIterator t = db.allterms_begin();
00076 const TermIterator end = db.allterms_end();
00077 while (t != end) {
00078 ++terms;
00079 ++t;
00080 }
00081 cout << "number of unique terms = " << terms << endl;
00082 }
00083 }
00084
00085 static void
00086 show_values(Database &db, docid docid, char sep)
00087 {
00088 Document doc = db.get_document(docid);
00089 ValueIterator v = doc.values_begin();
00090 ValueIterator vend = doc.values_end();
00091 while (v != vend) {
00092 cout << sep << v.get_valueno() << ':' << *v;
00093 ++v;
00094 }
00095 }
00096
00097 static void
00098 show_values(Database &db,
00099 vector<docid>::const_iterator i,
00100 vector<docid>::const_iterator end)
00101 {
00102 while (i != end) {
00103 cout << "Values for record #" << *i << ':';
00104 show_values(db, *i, separator);
00105 cout << endl;
00106 ++i;
00107 }
00108 }
00109
00110 static void
00111 show_docdata(Database &db, docid docid, char sep)
00112 {
00113 cout << sep << "[" << db.get_document(docid).get_data() << ']';
00114 }
00115
00116 static void
00117 show_docdata(Database &db,
00118 vector<docid>::const_iterator i,
00119 vector<docid>::const_iterator end)
00120 {
00121 while (i != end) {
00122 cout << "Data for record #" << *i << ':' << endl;
00123 cout << db.get_document(*i).get_data() << endl;
00124 ++i;
00125 }
00126 }
00127
00128 static void
00129 show_termlists(Database &db,
00130 vector<docid>::const_iterator i,
00131 vector<docid>::const_iterator end)
00132 {
00133
00134 while (i != end) {
00135 TermIterator t = db.termlist_begin(*i);
00136 TermIterator tend = db.termlist_end(*i);
00137 cout << "Term List for record #" << *i << ':';
00138 while (t != tend) {
00139 cout << separator << *t;
00140 if (verbose) {
00141 cout << ' ' << t.get_wdf() << ' ' << t.get_termfreq();
00142 }
00143 ++t;
00144 }
00145 cout << endl;
00146 ++i;
00147 }
00148 }
00149
00150 static Stem stemmer;
00151
00152 int
00153 main(int argc, char **argv)
00154 {
00155 if (argc > 1 && argv[1][0] == '-') {
00156 if (strcmp(argv[1], "--help") == 0) {
00157 cout << PROG_NAME" - "PROG_DESC"\n\n";
00158 show_usage();
00159 exit(0);
00160 }
00161 if (strcmp(argv[1], "--version") == 0) {
00162 cout << PROG_NAME" - "PACKAGE_STRING << endl;
00163 exit(0);
00164 }
00165 }
00166
00167 vector<docid> recnos;
00168 vector<string> terms;
00169 vector<string> dbs;
00170
00171 valueno valno = 0;
00172 bool valno_set = false;
00173
00174 int c;
00175 while ((c = gnu_getopt(argc, argv, "r:t:s:1vkV::d")) != -1) {
00176 switch (c) {
00177 case 'r':
00178 recnos.push_back(atoi(optarg));
00179 break;
00180 case 't':
00181 terms.push_back(optarg);
00182 break;
00183 case 's':
00184 stemmer = Stem(optarg);
00185 break;
00186 case '1':
00187 separator = '\n';
00188 break;
00189 case 'V': case 'k':
00190 showvalues = true;
00191 if (optarg) {
00192 valno = atoi(optarg);
00193 valno_set = true;
00194 }
00195 break;
00196 case 'd':
00197 showdocdata = true;
00198 break;
00199 case 'v':
00200 verbose = true;
00201 break;
00202 default:
00203 show_usage();
00204 exit(1);
00205 }
00206 }
00207
00208 while (argv[optind]) dbs.push_back(argv[optind++]);
00209
00210 if (dbs.empty()) {
00211 show_usage();
00212 exit(1);
00213 }
00214
00215 std::sort(recnos.begin(), recnos.end());
00216
00217 Database db;
00218 {
00219 vector<string>::const_iterator i;
00220 for (i = dbs.begin(); i != dbs.end(); i++) {
00221 try {
00222 db.add_database(Database(*i));
00223 } catch (const Error &e) {
00224 cout << "Error opening database `" << *i << "': ";
00225 cout << e.get_description() << endl;
00226 return 1;
00227 }
00228 }
00229 }
00230
00231 try {
00232 if (terms.empty() && recnos.empty() && !valno_set) {
00233 show_db_stats(db);
00234 return 0;
00235 }
00236
00237 if (!recnos.empty()) {
00238 if (showvalues) {
00239 show_values(db, recnos.begin(), recnos.end());
00240 }
00241
00242 if (showdocdata) {
00243 show_docdata(db, recnos.begin(), recnos.end());
00244 }
00245 }
00246
00247 if (valno_set) {
00248 doccount n = db.get_doccount();
00249 docid did = 0;
00250 docid hwm = db.get_lastdocid();
00251 cout << "Value " << valno << " for each document:";
00252 while (n && did != hwm) {
00253 try {
00254 Document doc = db.get_document(++did);
00255 string val = doc.get_value(valno);
00256 if (!val.empty())
00257 cout << separator << did << ':' << doc.get_value(valno);
00258 --n;
00259 } catch (DocNotFoundError &) {
00260 }
00261 }
00262 cout << endl;
00263 }
00264
00265 if (terms.empty()) {
00266 show_termlists(db, recnos.begin(), recnos.end());
00267 return 0;
00268 }
00269
00270 vector<string>::const_iterator i;
00271 for (i = terms.begin(); i != terms.end(); i++) {
00272 string term = stemmer(*i);
00273 PostingIterator p = db.postlist_begin(term);
00274 PostingIterator pend = db.postlist_end(term);
00275 if (p == pend) {
00276 cout << "term `" << term << "' not in database\n";
00277 continue;
00278 }
00279 if (recnos.empty()) {
00280
00281 cout << "Posting List for term `" << term << "' (termfreq "
00282 << db.get_termfreq(term) << ", collfreq "
00283 << db.get_collection_freq(term) << "):";
00284 while (p != pend) {
00285 cout << separator << *p;
00286 if (verbose) {
00287 cout << ' ' << p.get_wdf()
00288 << ' ' << p.get_doclength();
00289 }
00290 if (showvalues) show_values(db, *p, ' ');
00291 if (showdocdata) show_docdata(db, *p, ' ');
00292 p++;
00293 }
00294 cout << endl;
00295 } else {
00296
00297 vector<docid>::const_iterator j;
00298 for (j = recnos.begin(); j != recnos.end(); j++) {
00299 p.skip_to(*j);
00300 if (p == pend || *p != *j) {
00301 cout << "term `" << term <<
00302 "' doesn't index document #" << *j << endl;
00303 } else {
00304 cout << "Position List for term `" << term
00305 << "', record #" << *j << ':';
00306 try {
00307 PositionIterator pos = p.positionlist_begin();
00308 PositionIterator posend = p.positionlist_end();
00309 while (pos != posend) {
00310 cout << separator << *pos;
00311 ++pos;
00312 }
00313 cout << endl;
00314 } catch (const Error &e) {
00315 cout << "Error: " << e.get_description() << endl;
00316 }
00317 }
00318 }
00319 }
00320 }
00321 } catch (const Error &e) {
00322 cout << "\nError: " << e.get_description() << endl;
00323 return 1;
00324 }
00325 }