backends/inmemory/inmemory_database.h

Go to the documentation of this file.
00001 /* inmemory_database.h: C++ class definition for inmemory database access
00002  *
00003  * Copyright 1999,2000,2001 BrightStation PLC
00004  * Copyright 2002 Ananova Ltd
00005  * Copyright 2002,2003,2004,2005,2006,2007 Olly Betts
00006  * Copyright 2006 Richard Boulton
00007  *
00008  * This program is free software; you can redistribute it and/or
00009  * modify it under the terms of the GNU General Public License as
00010  * published by the Free Software Foundation; either version 2 of the
00011  * License, or (at your option) any later version.
00012  *
00013  * This program is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016  * GNU General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU General Public License
00019  * along with this program; if not, write to the Free Software
00020  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
00021  * USA
00022  */
00023 
00024 #ifndef OM_HGUARD_INMEMORY_DATABASE_H
00025 #define OM_HGUARD_INMEMORY_DATABASE_H
00026 
00027 #include "leafpostlist.h"
00028 #include "termlist.h"
00029 #include "database.h"
00030 #include <map>
00031 #include <vector>
00032 #include <algorithm>
00033 #include <xapian/document.h>
00034 #include "inmemory_positionlist.h"
00035 #include <omassert.h>
00036 
00037 using namespace std;
00038 
00039 // Class representing a posting (a term/doc pair, and
00040 // all the relevant positional information, is a single posting)
00041 class InMemoryPosting {
00042     public:
00043         Xapian::docid did;
00044         bool valid;
00045         vector<Xapian::termpos> positions; // Sorted vector of positions
00046         Xapian::termcount wdf;
00047 
00048         // Merge two postings (same term/doc pair, new positional info)
00049         void merge(const InMemoryPosting & post) {
00050             Assert(did == post.did);
00051 
00052             positions.insert(positions.end(),
00053                              post.positions.begin(),
00054                              post.positions.end());
00055             // FIXME - inefficient - use merge (and list<>)?
00056             sort(positions.begin(), positions.end());
00057         }
00058 };
00059 
00060 class InMemoryTermEntry {
00061     public:
00062         string tname;
00063         vector<Xapian::termpos> positions; // Sorted vector of positions
00064         Xapian::termcount wdf;
00065 
00066         // Merge two postings (same term/doc pair, new positional info)
00067         void merge(const InMemoryTermEntry & post) {
00068             Assert(tname == post.tname);
00069 
00070             positions.insert(positions.end(),
00071                              post.positions.begin(),
00072                              post.positions.end());
00073             // FIXME - inefficient - use merge (and list<>)?
00074             sort(positions.begin(), positions.end());
00075         }
00076 };
00077 
00078 // Compare by document ID
00079 class InMemoryPostingLessThan {
00080     public:
00081         int operator() (const InMemoryPosting &p1, const InMemoryPosting &p2)
00082         {
00083             return p1.did < p2.did;
00084         }
00085 };
00086 
00087 // Compare by termname
00088 class InMemoryTermEntryLessThan {
00089     public:
00090         int operator() (const InMemoryTermEntry&p1, const InMemoryTermEntry&p2)
00091         {
00092             return p1.tname < p2.tname;
00093         }
00094 };
00095 
00096 // Class representing a term and the documents indexing it
00097 class InMemoryTerm {
00098     public:
00099         // Sorted list of documents indexing this term.
00100         vector<InMemoryPosting> docs;
00101 
00102         Xapian::termcount term_freq;
00103         Xapian::termcount collection_freq;
00104 
00105         InMemoryTerm() : term_freq(0), collection_freq(0) {}
00106 
00107         void add_posting(const InMemoryPosting & post);
00108 };
00109 
00111 class InMemoryDoc {
00112     public:
00113         bool is_valid;
00114         // Sorted list of terms indexing this document.
00115         vector<InMemoryTermEntry> terms;
00116 
00117         /* Initialise invalid by default, so that resizing the termlist array
00118          * doesn't create valid documents. */
00119         InMemoryDoc() : is_valid(false) {}
00120 
00121         // Initialise specifying validity.
00122         InMemoryDoc(bool is_valid_) : is_valid(is_valid_) {}
00123 
00124         void add_posting(const InMemoryTermEntry & post);
00125 };
00126 
00127 class InMemoryDatabase;
00128 
00131 class InMemoryPostList : public LeafPostList {
00132     friend class InMemoryDatabase;
00133     private:
00134         vector<InMemoryPosting>::const_iterator pos;
00135         vector<InMemoryPosting>::const_iterator end;
00136         Xapian::doccount termfreq;
00137         bool started;
00138 
00142         InMemoryPositionList mypositions;
00143 
00144         Xapian::Internal::RefCntPtr<const InMemoryDatabase> db;
00145 
00146         InMemoryPostList(Xapian::Internal::RefCntPtr<const InMemoryDatabase> db,
00147                          const InMemoryTerm & term);
00148     public:
00149         Xapian::doccount get_termfreq() const;
00150 
00151         Xapian::docid       get_docid() const;     // Gets current docid
00152         Xapian::doclength   get_doclength() const; // Length of current document
00153         Xapian::termcount   get_wdf() const;          // Within Document Frequency
00154         PositionList * read_position_list();
00155         PositionList * open_position_list() const;
00156 
00157         PostList *next(Xapian::weight w_min); // Moves to next docid
00158 
00159         PostList *skip_to(Xapian::docid did, Xapian::weight w_min); // Moves to next docid >= specified docid
00160 
00161         // True if we're off the end of the list.
00162         bool at_end() const;
00163 
00164         string get_description() const;
00165 };
00166 
00169 class InMemoryAllDocsPostList : public LeafPostList {
00170     friend class InMemoryDatabase;
00171     private:
00172         Xapian::docid did;
00173 
00174         Xapian::Internal::RefCntPtr<const InMemoryDatabase> db;
00175 
00176         InMemoryAllDocsPostList(Xapian::Internal::RefCntPtr<const InMemoryDatabase> db);
00177     public:
00178         Xapian::doccount get_termfreq() const;
00179 
00180         Xapian::docid       get_docid() const;     // Gets current docid
00181         Xapian::doclength   get_doclength() const; // Length of current document
00182         Xapian::termcount   get_wdf() const;       // Within Document Frequency
00183         PositionList * read_position_list();
00184         PositionList * open_position_list() const;
00185 
00186         PostList *next(Xapian::weight w_min);      // Moves to next docid
00187 
00188         PostList *skip_to(Xapian::docid did, Xapian::weight w_min); // Moves to next docid >= specified docid
00189 
00190         // True if we're off the end of the list
00191         bool at_end() const;
00192 
00193         string get_description() const;
00194 };
00195 
00196 // Term List
00197 class InMemoryTermList : public TermList {
00198     friend class InMemoryDatabase;
00199     private:
00200         vector<InMemoryTermEntry>::const_iterator pos;
00201         vector<InMemoryTermEntry>::const_iterator end;
00202         Xapian::termcount terms;
00203         bool started;
00204 
00205         Xapian::Internal::RefCntPtr<const InMemoryDatabase> db;
00206         Xapian::docid did;
00207         Xapian::doclength document_length;
00208 
00209         InMemoryTermList(Xapian::Internal::RefCntPtr<const InMemoryDatabase> db, Xapian::docid did,
00210                          const InMemoryDoc & doc,
00211                          Xapian::doclength len);
00212     public:
00213         Xapian::termcount get_approx_size() const;
00214 
00216         void accumulate_stats(Xapian::Internal::ExpandStats & stats) const;
00217 
00218         string get_termname() const;
00219         Xapian::termcount get_wdf() const; // Number of occurrences of term in current doc
00220         Xapian::doccount get_termfreq() const;  // Number of docs indexed by term
00221         TermList * next();
00222         bool at_end() const;
00223         Xapian::termcount positionlist_count() const;
00224         Xapian::PositionIterator positionlist_begin() const;
00225 };
00226 
00231 class InMemoryDatabase : public Xapian::Database::Internal {
00232     friend class InMemoryAllDocsPostList;
00233 
00234     map<string, InMemoryTerm> postlists;
00235     vector<InMemoryDoc> termlists;
00236     vector<std::string> doclists;
00237     vector<std::map<Xapian::valueno, string> > valuelists;
00238 
00239     vector<Xapian::doclength> doclengths;
00240 
00241     std::map<string, string> metadata;
00242 
00243     Xapian::doccount totdocs;
00244 
00245     Xapian::doclength totlen;
00246 
00247     bool positions_present;
00248 
00249     // Stop copy / assignment being allowed
00250     InMemoryDatabase& operator=(const InMemoryDatabase &);
00251     InMemoryDatabase(const InMemoryDatabase &);
00252 
00253     void make_term(const string & tname);
00254 
00255     bool doc_exists(Xapian::docid did) const;
00256     Xapian::docid make_doc(const string & docdata);
00257 
00258     /* The common parts of add_doc and replace_doc */
00259     void finish_add_doc(Xapian::docid did, const Xapian::Document &document);
00260     void add_values(Xapian::docid did, const map<Xapian::valueno, string> &values_);
00261 
00262     void make_posting(InMemoryDoc * doc,
00263                       const string & tname,
00264                       Xapian::docid did,
00265                       Xapian::termpos position,
00266                       Xapian::termcount wdf,
00267                       bool use_position = true);
00268 
00270 
00272     void flush();
00273     void cancel();
00274 
00275     Xapian::docid add_document(const Xapian::Document & document);
00276     // Stop the default implementation of delete_document(term) and
00277     // replace_document(term) from being hidden.  This isn't really
00278     // a problem as we only try to call them through the base class
00279     // (where they aren't hidden) but some compilers generate a warning
00280     // about the hiding.
00281 #if (!defined __GNUC__ && !defined _MSC_VER) || __GNUC__ > 2
00282     using Xapian::Database::Internal::delete_document;
00283     using Xapian::Database::Internal::replace_document;
00284 #endif
00285     void delete_document(Xapian::docid did);
00286     void replace_document(Xapian::docid did, const Xapian::Document & document);
00288 
00289   public:
00294     InMemoryDatabase();
00295 
00296     ~InMemoryDatabase();
00297 
00298     Xapian::doccount get_doccount() const;
00299 
00300     Xapian::docid get_lastdocid() const;
00301 
00302     Xapian::doclength get_avlength() const;
00303     Xapian::doclength get_doclength(Xapian::docid did) const;
00304 
00305     Xapian::doccount get_termfreq(const string & tname) const;
00306     Xapian::termcount get_collection_freq(const string & tname) const;
00307     bool term_exists(const string & tname) const;
00308     bool has_positions() const;
00309 
00310     LeafPostList * open_post_list(const string & tname) const;
00311     TermList * open_term_list(Xapian::docid did) const;
00312     Xapian::Document::Internal * open_document(Xapian::docid did, bool lazy = false) const;
00313 
00314     std::string get_metadata(const std::string & key) const;
00315     void set_metadata(const std::string & key, const std::string & value);
00316 
00317     Xapian::termcount positionlist_count(Xapian::docid did,
00318                                          const string & tname) const;
00319     PositionList * open_position_list(Xapian::docid did,
00320                                       const string & tname) const;
00321     TermList * open_allterms(const string & prefix) const;
00322 };
00323 
00324 #endif /* OM_HGUARD_INMEMORY_DATABASE_H */

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.