backends/flint/flint_postlist.cc

Go to the documentation of this file.
00001 /* flint_postlist.cc: Postlists in a flint database
00002  *
00003  * Copyright 1999,2000,2001 BrightStation PLC
00004  * Copyright 2002,2003,2004,2005,2007 Olly Betts
00005  * Copyright 2007 Lemur Consulting Ltd
00006  *
00007  * This program is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU General Public License as
00009  * published by the Free Software Foundation; either version 2 of the
00010  * License, or (at your option) any later version.
00011  *
00012  * This program is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
00020  * USA
00021  */
00022 
00023 #include <config.h>
00024 #include "flint_postlist.h"
00025 
00026 #include "flint_cursor.h"
00027 #include "flint_database.h"
00028 #include "flint_utils.h"
00029 #include "noreturn.h"
00030 #include "omdebug.h"
00031 #include "utils.h"
00032 
00033 Xapian::doccount
00034 FlintPostListTable::get_termfreq(const string & term) const
00035 {
00036     string key = make_key(term);
00037     string tag;
00038     if (!get_exact_entry(key, tag)) return 0;
00039 
00040     Xapian::doccount termfreq;
00041     const char * p = tag.data();
00042     FlintPostList::read_number_of_entries(&p, p + tag.size(), &termfreq, NULL);
00043     return termfreq;
00044 }
00045 
00046 Xapian::termcount
00047 FlintPostListTable::get_collection_freq(const string & term) const
00048 {
00049     string key = make_key(term);
00050     string tag;
00051     if (!get_exact_entry(key, tag)) return 0;
00052 
00053     Xapian::termcount collfreq;
00054     const char * p = tag.data();
00055     FlintPostList::read_number_of_entries(&p, p + tag.size(), NULL, &collfreq);
00056     return collfreq;
00057 }
00058 
00059 // How big should chunks in the posting list be?  (They
00060 // will grow slightly bigger than this, but not more than a
00061 // few bytes extra) - FIXME: tune this value to try to
00062 // maximise how well blocks are used.  Or performance.
00063 // Or indexing speed.  Or something...
00064 const unsigned int CHUNKSIZE = 2000;
00065 
00072 class PostlistChunkWriter {
00073     public:
00074         PostlistChunkWriter(const string &orig_key_,
00075                             bool is_first_chunk_,
00076                             const string &tname_,
00077                             bool is_last_chunk_);
00078 
00080         void append(FlintTable * table, Xapian::docid did,
00081                     Xapian::termcount wdf, flint_doclen_t doclen);
00082 
00084         void raw_append(Xapian::docid first_did_, Xapian::docid current_did_,
00085                         const string & s) {
00086             Assert(!started);
00087             first_did = first_did_;
00088             current_did = current_did_;
00089             if (!s.empty()) {
00090                 chunk.append(s);
00091                 started = true;
00092             }
00093         }
00094 
00099         void flush(FlintTable *table);
00100 
00101     private:
00102         string orig_key;
00103         string tname;
00104         bool is_first_chunk;
00105         bool is_last_chunk;
00106         bool started;
00107 
00108         Xapian::docid first_did;
00109         Xapian::docid current_did;
00110 
00111         string chunk;
00112 };
00113 
00114 // Static functions
00115 
00117 XAPIAN_NORETURN(static void report_read_error(const char * position));
00118 static void report_read_error(const char * position)
00119 {
00120     if (position == 0) {
00121         // data ran out
00122         throw Xapian::DatabaseCorruptError("Data ran out unexpectedly when reading posting list.");
00123     }
00124     // overflow
00125     throw Xapian::RangeError("Value in posting list too large.");
00126 }
00127 
00128 static inline bool get_tname_from_key(const char **src, const char *end,
00129                                string &tname)
00130 {
00131     return unpack_string_preserving_sort(src, end, tname);
00132 }
00133 
00134 static inline bool
00135 check_tname_in_key_lite(const char **keypos, const char *keyend, const string &tname)
00136 {
00137     string tname_in_key;
00138 
00139     // Read the termname.
00140     if (!get_tname_from_key(keypos, keyend, tname_in_key)) {
00141         report_read_error(*keypos);
00142     }
00143 
00144     // This should only fail if the postlist doesn't exist at all.
00145     return tname_in_key == tname;
00146 }
00147 
00148 static inline bool
00149 check_tname_in_key(const char **keypos, const char *keyend, const string &tname)
00150 {
00151     if (*keypos == keyend) return false;
00152 
00153     return check_tname_in_key_lite(keypos, keyend, tname);
00154 }
00155 
00157 static Xapian::docid
00158 read_start_of_first_chunk(const char ** posptr,
00159                           const char * end,
00160                           Xapian::doccount * number_of_entries_ptr,
00161                           Xapian::termcount * collection_freq_ptr)
00162 {
00163     DEBUGCALL_STATIC(DB, Xapian::docid, "read_start_of_first_chunk",
00164                      (const void *)posptr << ", " <<
00165                      (const void *)end << ", " <<
00166                      (void *)number_of_entries_ptr << ", " <<
00167                      (void *)collection_freq_ptr);
00168 
00169     FlintPostList::read_number_of_entries(posptr, end,
00170                            number_of_entries_ptr, collection_freq_ptr);
00171     if (number_of_entries_ptr)
00172         DEBUGLINE(DB, "number_of_entries = " << *number_of_entries_ptr);
00173     if (collection_freq_ptr)
00174         DEBUGLINE(DB, "collection_freq = " << *collection_freq_ptr);
00175 
00176     Xapian::docid did;
00177     // Read the docid of the first entry in the posting list.
00178     if (!unpack_uint(posptr, end, &did))
00179         report_read_error(*posptr);
00180     ++did;
00181     DEBUGLINE(DB, "doc_id = " << did);
00182     RETURN(did);
00183 }
00184 
00185 static inline void read_did_increase(const char ** posptr,
00186                               const char * end,
00187                               Xapian::docid * did_ptr)
00188 {
00189     Xapian::docid did_increase;
00190     if (!unpack_uint(posptr, end, &did_increase)) report_read_error(*posptr);
00191     *did_ptr += did_increase + 1;
00192 }
00193 
00195 static inline void read_wdf_and_length(const char ** posptr,
00196                                 const char * end,
00197                                 Xapian::termcount * wdf_ptr,
00198                                 flint_doclen_t * doclength_ptr)
00199 {
00200     if (!unpack_uint(posptr, end, wdf_ptr)) report_read_error(*posptr);
00201     if (!unpack_uint(posptr, end, doclength_ptr)) report_read_error(*posptr);
00202 }
00203 
00205 static Xapian::docid
00206 read_start_of_chunk(const char ** posptr,
00207                     const char * end,
00208                     Xapian::docid first_did_in_chunk,
00209                     bool * is_last_chunk_ptr)
00210 {
00211     DEBUGCALL_STATIC(DB, Xapian::docid, "read_start_of_chunk",
00212                      reinterpret_cast<const void*>(posptr) << ", " <<
00213                      reinterpret_cast<const void*>(end) << ", " <<
00214                      first_did_in_chunk << ", " <<
00215                      reinterpret_cast<const void*>(is_last_chunk_ptr));
00216 
00217     // Read whether this is the last chunk
00218     if (!unpack_bool(posptr, end, is_last_chunk_ptr))
00219         report_read_error(*posptr);
00220     if (is_last_chunk_ptr)
00221         DEBUGLINE(DB, "is_last_chunk = " << *is_last_chunk_ptr);
00222 
00223     // Read what the final document ID in this chunk is.
00224     Xapian::docid increase_to_last;
00225     if (!unpack_uint(posptr, end, &increase_to_last))
00226         report_read_error(*posptr);
00227     ++increase_to_last;
00228     Xapian::docid last_did_in_chunk = first_did_in_chunk + increase_to_last;
00229     DEBUGLINE(DB, "last_did_in_chunk = " << last_did_in_chunk);
00230     RETURN(last_did_in_chunk);
00231 }
00232 
00233 static string make_wdf_and_length(Xapian::termcount wdf, flint_doclen_t doclength)
00234 {
00235     return pack_uint(wdf) + pack_uint(doclength);
00236 }
00237 
00238 static void write_start_of_chunk(string & chunk,
00239                                  unsigned int start_of_chunk_header,
00240                                  unsigned int end_of_chunk_header,
00241                                  bool is_last_chunk,
00242                                  Xapian::docid first_did_in_chunk,
00243                                  Xapian::docid last_did_in_chunk)
00244 {
00245     Assert((size_t)(end_of_chunk_header - start_of_chunk_header) <= chunk.size());
00246     Assert(last_did_in_chunk >= first_did_in_chunk);
00247     Xapian::docid increase_to_last = last_did_in_chunk - first_did_in_chunk;
00248 
00249     chunk.replace(start_of_chunk_header,
00250                   end_of_chunk_header - start_of_chunk_header,
00251                   pack_bool(is_last_chunk) + pack_uint(increase_to_last - 1));
00252     // FIXME - storing increase_to_last - 1 is bogus as this value is
00253     // -1 when a postlist chunk has a single entry!  Luckily the code
00254     // works despite this, but it's ugly.
00255 }
00256 
00261 class PostlistChunkReader {
00262     public:
00268         PostlistChunkReader(Xapian::docid first_did, const string & data_)
00269             : data(data_), pos(data.data()), end(pos + data.length()), at_end(data.empty()), did(first_did)
00270         {
00271             if (!at_end) read_wdf_and_length(&pos, end, &wdf, &doclength);
00272         }
00273 
00274         Xapian::docid get_docid() const {
00275             return did;
00276         }
00277         Xapian::termcount get_wdf() const {
00278             return wdf;
00279         }
00280         flint_doclen_t get_doclength() const {
00281             DEBUGCALL(DB, flint_doclen_t, "PostlistChunkReader::get_doclength", "");
00282             RETURN(doclength);
00283         }
00284 
00285         bool is_at_end() const {
00286             return at_end;
00287         }
00288 
00291         void next();
00292 
00293     private:
00294         string data;
00295 
00296         const char *pos;
00297         const char *end;
00298 
00299         bool at_end;
00300 
00301         Xapian::docid did;
00302         Xapian::termcount wdf;
00303         flint_doclen_t doclength;
00304 };
00305 
00306 void
00307 PostlistChunkReader::next()
00308 {
00309     if (pos == end) {
00310         at_end = true;
00311     } else {
00312         read_did_increase(&pos, end, &did);
00313         read_wdf_and_length(&pos, end, &wdf, &doclength);
00314     }
00315 }
00316 
00317 PostlistChunkWriter::PostlistChunkWriter(const string &orig_key_,
00318                                          bool is_first_chunk_,
00319                                          const string &tname_,
00320                                          bool is_last_chunk_)
00321         : orig_key(orig_key_),
00322           tname(tname_), is_first_chunk(is_first_chunk_),
00323           is_last_chunk(is_last_chunk_),
00324           started(false)
00325 {
00326     DEBUGCALL(DB, void, "PostlistChunkWriter::PostlistChunkWriter",
00327               orig_key_ << ", " << is_first_chunk_ << ", " << tname_ << ", " <<
00328               is_last_chunk_);
00329 }
00330 
00331 void
00332 PostlistChunkWriter::append(FlintTable * table, Xapian::docid did,
00333                             Xapian::termcount wdf, flint_doclen_t doclen)
00334 {
00335     if (!started) {
00336         started = true;
00337         first_did = did;
00338     } else {
00339         Assert(did > current_did);
00340         // Start a new chunk if this one has grown to the threshold.
00341         if (chunk.size() >= CHUNKSIZE) {
00342             bool save_is_last_chunk = is_last_chunk;
00343             is_last_chunk = false;
00344             flush(table);
00345             is_last_chunk = save_is_last_chunk;
00346             is_first_chunk = false;
00347             first_did = did;
00348             chunk = "";
00349             orig_key = FlintPostListTable::make_key(tname, first_did);
00350         } else {
00351             chunk.append(pack_uint(did - current_did - 1));
00352         }
00353     }
00354     current_did = did;
00355     chunk.append(make_wdf_and_length(wdf, doclen));
00356 }
00357 
00360 static inline string
00361 make_start_of_first_chunk(Xapian::termcount entries,
00362                           Xapian::termcount collectionfreq,
00363                           Xapian::docid new_did)
00364 {
00365     return pack_uint(entries) + pack_uint(collectionfreq) + pack_uint(new_did - 1);
00366 }
00367 
00370 static inline string
00371 make_start_of_chunk(bool new_is_last_chunk,
00372                     Xapian::docid new_first_did,
00373                     Xapian::docid new_final_did)
00374 {
00375     Assert(new_final_did >= new_first_did);
00376     return pack_bool(new_is_last_chunk) +
00377             pack_uint(new_final_did - new_first_did - 1);
00378 }
00379 
00380 void
00381 PostlistChunkWriter::flush(FlintTable *table)
00382 {
00383     DEBUGCALL(DB, void, "PostlistChunkWriter::flush", table);
00384 
00385     /* This is one of the more messy parts involved with updating posting
00386      * list chunks.
00387      * 
00388      * Depending on circumstances, we may have to delete an entire chunk
00389      * or file it under a different key, as well as possibly modifying both
00390      * the previous and next chunk of the postlist.
00391      */
00392 
00393     if (!started) {
00394         /* This chunk is now empty so disappears entirely.
00395          *
00396          * If this was the last chunk, then the previous chunk
00397          * must have its "is_last_chunk" flag updated.
00398          *
00399          * If this was the first chunk, then the next chunk must
00400          * be transformed into the first chunk.  Messy!
00401          */
00402         DEBUGLINE(DB, "PostlistChunkWriter::flush(): deleting chunk");
00403         Assert(!orig_key.empty());
00404         if (is_first_chunk) {
00405             DEBUGLINE(DB, "PostlistChunkWriter::flush(): deleting first chunk");
00406             if (is_last_chunk) {
00407                 /* This is the first and the last chunk, ie the only
00408                  * chunk, so just delete the tag.
00409                  */
00410                 table->del(orig_key);
00411                 return;
00412             }
00413 
00414             /* This is the messiest case.  The first chunk is to
00415              * be removed, and there is at least one chunk after
00416              * it.  Need to rewrite the next chunk as the first
00417              * chunk.
00418              */
00419             AutoPtr<FlintCursor> cursor(table->cursor_get());
00420 
00421             if (!cursor->find_entry(orig_key)) {
00422                 throw Xapian::DatabaseCorruptError("The key we're working on has disappeared");
00423             }
00424 
00425             // Extract existing counts from the first chunk so we can reinsert
00426             // them into the block we're renaming.
00427             Xapian::termcount num_ent, coll_freq;
00428             {
00429                 cursor->read_tag();
00430                 const char *tagpos = cursor->current_tag.data();
00431                 const char *tagend = tagpos + cursor->current_tag.size();
00432 
00433                 (void)read_start_of_first_chunk(&tagpos, tagend,
00434                                                 &num_ent, &coll_freq);
00435             }
00436 
00437             // Seek to the next chunk.
00438             cursor->next();
00439             if (cursor->after_end()) {
00440                 throw Xapian::DatabaseCorruptError("Expected another key but found none");
00441             }
00442             const char *kpos = cursor->current_key.data();
00443             const char *kend = kpos + cursor->current_key.size();
00444             if (!check_tname_in_key(&kpos, kend, tname)) {
00445                 throw Xapian::DatabaseCorruptError("Expected another key with the same term name but found a different one");
00446             }
00447 
00448             // Read the new first docid
00449             Xapian::docid new_first_did;
00450             if (!unpack_uint_preserving_sort(&kpos, kend,
00451                                              &new_first_did)) {
00452                 report_read_error(kpos);
00453             }
00454 
00455             cursor->read_tag();
00456             const char *tagpos = cursor->current_tag.data();
00457             const char *tagend = tagpos + cursor->current_tag.size();
00458 
00459             // Read the chunk header
00460             bool new_is_last_chunk;
00461             Xapian::docid new_last_did_in_chunk =
00462                 read_start_of_chunk(&tagpos, tagend, new_first_did,
00463                                     &new_is_last_chunk);
00464 
00465             string chunk_data(tagpos, tagend);
00466 
00467             // First remove the renamed tag
00468             table->del(cursor->current_key);
00469 
00470             // And now write it as the first chunk
00471             string tag;
00472             tag = make_start_of_first_chunk(num_ent, coll_freq, new_first_did);
00473             tag += make_start_of_chunk(new_is_last_chunk,
00474                                               new_first_did,
00475                                               new_last_did_in_chunk);
00476             tag += chunk_data;
00477             table->add(orig_key, tag);
00478             return;
00479         }
00480 
00481         DEBUGLINE(DB, "PostlistChunkWriter::flush(): deleting secondary chunk");
00482         /* This isn't the first chunk.  Check whether we're the last
00483          * chunk.
00484          */
00485 
00486         // Delete this chunk
00487         table->del(orig_key);
00488 
00489         if (is_last_chunk) {
00490             DEBUGLINE(DB, "PostlistChunkWriter::flush(): deleting secondary last chunk");
00491             // Update the previous chunk's is_last_chunk flag.
00492             AutoPtr<FlintCursor> cursor(table->cursor_get());
00493 
00494             /* Should not find the key we just deleted, but should
00495              * find the previous chunk. */
00496             if (cursor->find_entry(orig_key)) {
00497                 throw Xapian::DatabaseCorruptError("Flint key not deleted as we expected");
00498             }
00499             // Make sure this is a chunk with the right term attached.
00500             const char * keypos = cursor->current_key.data();
00501             const char * keyend = keypos + cursor->current_key.size();
00502             if (!check_tname_in_key(&keypos, keyend, tname)) {
00503                 throw Xapian::DatabaseCorruptError("Couldn't find chunk before delete chunk");
00504             }
00505 
00506             bool is_prev_first_chunk = (keypos == keyend);
00507 
00508             // Now update the last_chunk
00509             cursor->read_tag();
00510             string tag = cursor->current_tag;
00511 
00512             const char *tagpos = tag.data();
00513             const char *tagend = tagpos + tag.size();
00514 
00515             // Skip first chunk header
00516             Xapian::docid first_did_in_chunk;
00517             if (is_prev_first_chunk) {
00518                 first_did_in_chunk = read_start_of_first_chunk(&tagpos, tagend,
00519                                           0, 0);
00520             } else {
00521                 if (!unpack_uint_preserving_sort(&keypos, keyend,
00522                                                  &first_did_in_chunk))
00523                     report_read_error(keypos);
00524             }
00525             bool wrong_is_last_chunk;
00526             string::size_type start_of_chunk_header = tagpos - tag.data();
00527             Xapian::docid last_did_in_chunk =
00528                 read_start_of_chunk(&tagpos, tagend, first_did_in_chunk,
00529                                     &wrong_is_last_chunk);
00530             string::size_type end_of_chunk_header = tagpos - tag.data();
00531 
00532             // write new is_last flag
00533             write_start_of_chunk(tag,
00534                                  start_of_chunk_header,
00535                                  end_of_chunk_header,
00536                                  true, // is_last_chunk
00537                                  first_did_in_chunk,
00538                                  last_did_in_chunk);
00539             table->add(cursor->current_key, tag);
00540         }
00541     } else {
00542         DEBUGLINE(DB, "PostlistChunkWriter::flush(): updating chunk which still has items in it");
00543         /* The chunk still has some items in it.  Two major subcases:
00544          * a) This is the first chunk.
00545          * b) This isn't the first chunk.
00546          *
00547          * The subcases just affect the chunk header.
00548          */
00549         string tag;
00550 
00551         /* First write the header, which depends on whether this is the
00552          * first chunk.
00553          */
00554         if (is_first_chunk) {
00555             /* The first chunk.  This is the relatively easy case,
00556              * and we just have to write this one back to disk.
00557              */
00558             DEBUGLINE(DB, "PostlistChunkWriter::flush(): rewriting the first chunk, which still has items in it");
00559             string key = FlintPostListTable::make_key(tname);
00560             bool ok = table->get_exact_entry(key, tag);
00561             (void)ok;
00562             Assert(ok);
00563             Assert(!tag.empty());
00564 
00565             Xapian::termcount num_ent, coll_freq;
00566             {
00567                 const char * tagpos = tag.data();
00568                 const char * tagend = tagpos + tag.size();
00569                 (void)read_start_of_first_chunk(&tagpos, tagend,
00570                                                 &num_ent, &coll_freq);
00571             }
00572 
00573             tag = make_start_of_first_chunk(num_ent, coll_freq, first_did);
00574 
00575             tag += make_start_of_chunk(is_last_chunk, first_did, current_did);
00576             tag += chunk;
00577             table->add(key, tag);
00578             return;
00579         }
00580 
00581         DEBUGLINE(DB, "PostlistChunkWriter::flush(): updating secondary chunk which still has items in it");
00582         /* Not the first chunk.
00583          *
00584          * This has the easy sub-sub-case:
00585          *   The first entry in the chunk hasn't changed
00586          * ...and the hard sub-sub-case:
00587          *   The first entry in the chunk has changed.  This is
00588          *   harder because the key for the chunk changes, so
00589          *   we've got to do a switch.
00590          */
00591 
00592         // First find out the initial docid
00593         const char *keypos = orig_key.data();
00594         const char *keyend = keypos + orig_key.size();
00595         if (!check_tname_in_key(&keypos, keyend, tname)) {
00596             throw Xapian::DatabaseCorruptError("Have invalid key writing to postlist");
00597         }
00598         Xapian::docid initial_did;
00599         if (!unpack_uint_preserving_sort(&keypos, keyend, &initial_did)) {
00600             report_read_error(keypos);
00601         }
00602         string new_key;
00603         if (initial_did != first_did) {
00604             /* The fiddlier case:
00605              * Create a new tag with the correct key, and replace
00606              * the old one.
00607              */
00608             new_key = FlintPostListTable::make_key(tname, first_did);
00609             table->del(orig_key);
00610         } else {
00611             new_key = orig_key;
00612         }
00613 
00614         // ...and write the start of this chunk.
00615         tag = make_start_of_chunk(is_last_chunk, first_did, current_did);
00616 
00617         tag += chunk;
00618         table->add(new_key, tag);
00619     }
00620 }
00621 
00626 void FlintPostList::read_number_of_entries(const char ** posptr,
00627                                    const char * end,
00628                                    Xapian::doccount * number_of_entries_ptr,
00629                                    Xapian::termcount * collection_freq_ptr)
00630 {
00631     if (!unpack_uint(posptr, end, number_of_entries_ptr))
00632         report_read_error(*posptr);
00633     if (!unpack_uint(posptr, end, collection_freq_ptr))
00634         report_read_error(*posptr);
00635 }
00636 
00656 FlintPostList::FlintPostList(Xapian::Internal::RefCntPtr<const FlintDatabase> this_db_,
00657                              const string & tname_)
00658         : this_db(this_db_),
00659           tname(tname_),
00660           have_started(false),
00661           cursor(this_db->postlist_table.cursor_get()),
00662           is_at_end(false)
00663 {
00664     DEBUGCALL(DB, void, "FlintPostList::ostList",
00665               this_db_.get() << ", " << tname_);
00666     string key = FlintPostListTable::make_key(tname);
00667     int found = cursor->find_entry(key);
00668     if (!found) {
00669         number_of_entries = 0;
00670         is_at_end = true;
00671         pos = 0;
00672         end = 0;
00673         first_did_in_chunk = 0;
00674         last_did_in_chunk = 0;
00675         return;
00676     }
00677     cursor->read_tag();
00678     pos = cursor->current_tag.data();
00679     end = pos + cursor->current_tag.size();
00680 
00681     did = read_start_of_first_chunk(&pos, end, &number_of_entries, NULL);
00682     first_did_in_chunk = did;
00683     last_did_in_chunk = read_start_of_chunk(&pos, end, first_did_in_chunk,
00684                                             &is_last_chunk);
00685     read_wdf_and_length(&pos, end, &wdf, &doclength);
00686 }
00687 
00688 FlintPostList::~FlintPostList()
00689 {
00690     DEBUGCALL(DB, void, "FlintPostList::~FlintPostList", "");
00691 }
00692 
00693 bool
00694 FlintPostList::next_in_chunk()
00695 {
00696     DEBUGCALL(DB, bool, "FlintPostList::next_in_chunk", "");
00697     if (pos == end) RETURN(false);
00698 
00699     read_did_increase(&pos, end, &did);
00700     read_wdf_and_length(&pos, end, &wdf, &doclength);
00701 
00702     // Either not at last doc in chunk, or pos == end, but not both.
00703     Assert(did <= last_did_in_chunk);
00704     Assert(did < last_did_in_chunk || pos == end);
00705     Assert(pos != end || did == last_did_in_chunk);
00706 
00707     RETURN(true);
00708 }
00709 
00710 void
00711 FlintPostList::next_chunk()
00712 {
00713     DEBUGCALL(DB, void, "FlintPostList::next_chunk", "");
00714     if (is_last_chunk) {
00715         is_at_end = true;
00716         return;
00717     }
00718 
00719     cursor->next();
00720     if (cursor->after_end()) {
00721         is_at_end = true;
00722         throw Xapian::DatabaseCorruptError("Unexpected end of posting list for `" +
00723                                      tname + "'");
00724     }
00725     const char * keypos = cursor->current_key.data();
00726     const char * keyend = keypos + cursor->current_key.size();
00727     // Check we're still in same postlist
00728     if (!check_tname_in_key_lite(&keypos, keyend, tname)) {
00729         is_at_end = true;
00730         throw Xapian::DatabaseCorruptError("Unexpected end of posting list for `" +
00731                                      tname + "'");
00732     }
00733 
00734     Xapian::docid newdid;
00735     if (!unpack_uint_preserving_sort(&keypos, keyend, &newdid)) {
00736         report_read_error(keypos);
00737     }
00738     if (newdid <= did) {
00739         throw Xapian::DatabaseCorruptError("Document ID in new chunk of postlist (" +
00740                 om_tostring(newdid) +
00741                 ") is not greater than final document ID in previous chunk (" +
00742                 om_tostring(did) + ")");
00743     }
00744     did = newdid;
00745 
00746     cursor->read_tag();
00747     pos = cursor->current_tag.data();
00748     end = pos + cursor->current_tag.size();
00749 
00750     first_did_in_chunk = did;
00751     last_did_in_chunk = read_start_of_chunk(&pos, end, first_did_in_chunk,
00752                                             &is_last_chunk);
00753     read_wdf_and_length(&pos, end, &wdf, &doclength);
00754 }
00755 
00756 PositionList *
00757 FlintPostList::read_position_list()
00758 {
00759     DEBUGCALL(DB, PositionList *, "FlintPostList::read_position_list", "");
00760     positionlist.read_data(&this_db->position_table, did, tname);
00761     RETURN(&positionlist);
00762 }
00763 
00764 PositionList *
00765 FlintPostList::open_position_list() const
00766 {
00767     DEBUGCALL(DB, PositionList *, "FlintPostList::open_position_list", "");
00768     RETURN(new FlintPositionList(&this_db->position_table, did, tname));
00769 }
00770 
00771 PostList *
00772 FlintPostList::next(Xapian::weight w_min)
00773 {
00774     DEBUGCALL(DB, PostList *, "FlintPostList::next", w_min);
00775     (void)w_min; // no warning
00776 
00777     if (!have_started) {
00778         have_started = true;
00779     } else {
00780         if (!next_in_chunk()) next_chunk();
00781     }
00782 
00783     DEBUGLINE(DB, string("Moved to ") <<
00784               (is_at_end ? string("end.") : string("docid, wdf, doclength = ") +
00785                om_tostring(did) + ", " + om_tostring(wdf) + ", " +
00786                om_tostring(doclength)));
00787 
00788     RETURN(NULL);
00789 }
00790 
00791 bool
00792 FlintPostList::current_chunk_contains(Xapian::docid desired_did)
00793 {
00794     DEBUGCALL(DB, bool, "FlintPostList::current_chunk_contains", desired_did);
00795     if (desired_did >= first_did_in_chunk &&
00796         desired_did <= last_did_in_chunk) {
00797         RETURN(true);
00798     }
00799     RETURN(false);
00800 }
00801 
00802 void
00803 FlintPostList::move_to_chunk_containing(Xapian::docid desired_did)
00804 {
00805     DEBUGCALL(DB, void,
00806               "FlintPostList::move_to_chunk_containing", desired_did);
00807     (void)cursor->find_entry(FlintPostListTable::make_key(tname, desired_did));
00808     Assert(!cursor->after_end());
00809 
00810     const char * keypos = cursor->current_key.data();
00811     const char * keyend = keypos + cursor->current_key.size();
00812     // Check we're still in same postlist
00813     if (!check_tname_in_key_lite(&keypos, keyend, tname)) {
00814         // This should only happen if the postlist doesn't exist at all.
00815         is_at_end = true;
00816         is_last_chunk = true;
00817         return;
00818     }
00819     is_at_end = false;
00820 
00821     cursor->read_tag();
00822     pos = cursor->current_tag.data();
00823     end = pos + cursor->current_tag.size();
00824 
00825     if (keypos == keyend) {
00826         // In first chunk
00827 #ifdef XAPIAN_DEBUG
00828         Xapian::doccount old_number_of_entries = number_of_entries;
00829         did = read_start_of_first_chunk(&pos, end, &number_of_entries, NULL);
00830         Assert(old_number_of_entries == number_of_entries);
00831 #else
00832         did = read_start_of_first_chunk(&pos, end, NULL, NULL);
00833 #endif
00834     } else {
00835         // In normal chunk
00836         if (!unpack_uint_preserving_sort(&keypos, keyend, &did)) {
00837             report_read_error(keypos);
00838         }
00839     }
00840 
00841     first_did_in_chunk = did;
00842     last_did_in_chunk = read_start_of_chunk(&pos, end, first_did_in_chunk,
00843                                             &is_last_chunk);
00844     read_wdf_and_length(&pos, end, &wdf, &doclength);
00845 
00846     // Possible, since desired_did might be after end of this chunk and before
00847     // the next.
00848     if (desired_did > last_did_in_chunk) next_chunk();
00849 }
00850 
00851 bool
00852 FlintPostList::move_forward_in_chunk_to_at_least(Xapian::docid desired_did)
00853 {
00854     DEBUGCALL(DB, bool,
00855               "FlintPostList::move_forward_in_chunk_to_at_least", desired_did);
00856     if (desired_did > last_did_in_chunk) {
00857         pos = end;
00858         RETURN(false);
00859     }
00860     while (did < desired_did) {
00861         // FIXME: perhaps we don't need to decode the wdf and document length
00862         // for documents we're skipping past.
00863         bool at_end_of_chunk = !next_in_chunk();
00864         if (at_end_of_chunk) RETURN(false);
00865     }
00866     RETURN(true);
00867 }
00868 
00869 PostList *
00870 FlintPostList::skip_to(Xapian::docid desired_did, Xapian::weight w_min)
00871 {
00872     DEBUGCALL(DB, PostList *,
00873               "FlintPostList::skip_to", desired_did << ", " << w_min);
00874     (void)w_min; // no warning
00875     // We've started now - if we hadn't already, we're already positioned
00876     // at start so there's no need to actually do anything.
00877     have_started = true;
00878 
00879     // Don't skip back, and don't need to do anything if already there.
00880     if (is_at_end || desired_did <= did) RETURN(NULL);
00881 
00882     // Move to correct chunk
00883     if (!current_chunk_contains(desired_did)) {
00884         move_to_chunk_containing(desired_did);
00885         // Might be at_end now, so we need to check before trying to move
00886         // forward in chunk.
00887         if (is_at_end) RETURN(NULL);
00888     }
00889 
00890     // Move to correct position in chunk
00891     bool have_document = move_forward_in_chunk_to_at_least(desired_did);
00892 #ifdef XAPIAN_DEBUG
00893     Assert(have_document);
00894 #else
00895     (void)have_document;
00896 #endif
00897 
00898     DEBUGLINE(DB, string("Skipped to ") <<
00899               (is_at_end ? string("end.") : string("docid, wdf, doclength = ") +
00900                om_tostring(did) + ", " + om_tostring(wdf) + ", " +
00901                om_tostring(doclength) + "."));
00902 
00903     RETURN(NULL);
00904 }
00905 
00906 string
00907 FlintPostList::get_description() const
00908 {
00909     return tname + ":" + om_tostring(number_of_entries);
00910 }
00911 
00912 // Returns the last did to allow in this chunk.
00913 Xapian::docid
00914 FlintPostListTable::get_chunk(const string &tname,
00915           Xapian::docid did, bool adding,
00916           PostlistChunkReader ** from, PostlistChunkWriter **to)
00917 {
00918     // Get chunk containing entry
00919     string key = make_key(tname, did);
00920 
00921     // Find the right chunk
00922     AutoPtr<FlintCursor> cursor(cursor_get());
00923 
00924     cursor->find_entry(key);
00925     Assert(!cursor->after_end());
00926 
00927     const char * keypos = cursor->current_key.data();
00928     const char * keyend = keypos + cursor->current_key.size();
00929 
00930     if (!check_tname_in_key(&keypos, keyend, tname)) {
00931         // Postlist for this termname doesn't exist.
00932         if (!adding)
00933             throw Xapian::DatabaseCorruptError("Attempted to delete or modify an entry in a non-existent posting list for " + tname);
00934 
00935         *from = NULL;
00936         *to = new PostlistChunkWriter("", true, tname, true);
00937         return Xapian::docid(-1);
00938     }
00939 
00940     // See if we're appending - if so we can shortcut by just copying
00941     // the data part of the chunk wholesale.
00942     bool is_first_chunk = (keypos == keyend);
00943 
00944     cursor->read_tag();
00945     const char * pos = cursor->current_tag.data();
00946     const char * end = pos + cursor->current_tag.size();
00947     Xapian::docid first_did_in_chunk;
00948     if (is_first_chunk) {
00949         first_did_in_chunk = read_start_of_first_chunk(&pos, end, NULL, NULL);
00950     } else {
00951         if (!unpack_uint_preserving_sort(&keypos, keyend,
00952                                          &first_did_in_chunk)) {
00953             report_read_error(keypos);
00954         }
00955     }
00956 
00957     bool is_last_chunk;
00958     Xapian::docid last_did_in_chunk;
00959     last_did_in_chunk = read_start_of_chunk(&pos, end, first_did_in_chunk, &is_last_chunk);
00960     *to = new PostlistChunkWriter(cursor->current_key, is_first_chunk, tname,
00961                                   is_last_chunk);
00962     if (did > last_did_in_chunk) {
00963         // This is the shortcut.  Not very pretty, but I'll leave refactoring
00964         // until I've a clearer picture of everything which needs to be done.
00965         // (FIXME)
00966         *from = NULL;
00967         (*to)->raw_append(first_did_in_chunk, last_did_in_chunk,
00968                           string(pos, end)); 
00969     } else {
00970         *from = new PostlistChunkReader(first_did_in_chunk, string(pos, end));
00971     }
00972     if (is_last_chunk) return Xapian::docid(-1);
00973 
00974     // Find first did of next tag.
00975     cursor->next();
00976     if (cursor->after_end()) {
00977         throw Xapian::DatabaseCorruptError("Expected another key but found none");
00978     }
00979     const char *kpos = cursor->current_key.data();
00980     const char *kend = kpos + cursor->current_key.size();
00981     if (!check_tname_in_key(&kpos, kend, tname)) {
00982         throw Xapian::DatabaseCorruptError("Expected another key with the same term name but found a different one");
00983     }
00984 
00985     // Read the new first docid
00986     Xapian::docid first_did_of_next_chunk;
00987     if (!unpack_uint_preserving_sort(&kpos, kend, &first_did_of_next_chunk)) {
00988         report_read_error(kpos);
00989     }
00990     return first_did_of_next_chunk - 1;
00991 }
00992 
00993 void
00994 FlintPostListTable::merge_changes(
00995     const map<string, map<Xapian::docid, pair<char, Xapian::termcount> > > & mod_plists,
00996     const map<Xapian::docid, Xapian::termcount> & doclens,
00997     const map<string, pair<Xapian::termcount_diff, Xapian::termcount_diff> > & freq_deltas)
00998 {
00999     DEBUGCALL(DB, void, "FlintPostListTable::merge_changes", "mod_plists, doclens, freq_deltas");
01000     map<string, map<Xapian::docid, pair<char, Xapian::termcount> > >::const_iterator i;
01001     for (i = mod_plists.begin(); i != mod_plists.end(); ++i) {
01002         if (i->second.empty()) continue;
01003         string tname = i->first;
01004         {
01005             // Rewrite the first chunk of this posting list with the updated
01006             // termfreq and collfreq.
01007             map<string, pair<Xapian::termcount_diff, Xapian::termcount_diff> >::const_iterator deltas = freq_deltas.find(tname);
01008             Assert(deltas != freq_deltas.end());
01009 
01010             string current_key = make_key(tname);
01011             string tag;
01012             (void)get_exact_entry(current_key, tag);
01013 
01014             // Read start of first chunk to get termfreq and collfreq.
01015             const char *pos = tag.data();
01016             const char *end = pos + tag.size();
01017             Xapian::termcount termfreq, collfreq;
01018             Xapian::docid firstdid, lastdid;
01019             bool islast;
01020             if (pos == end) {
01021                 termfreq = 0;
01022                 collfreq = 0;
01023                 firstdid = 0;
01024                 lastdid = 0;
01025                 islast = true;
01026             } else {
01027                 firstdid = read_start_of_first_chunk(&pos, end,
01028                                                      &termfreq, &collfreq);
01029                 // Handle the generic start of chunk header.
01030                 lastdid = read_start_of_chunk(&pos, end, firstdid, &islast);
01031             }
01032 
01033             termfreq += deltas->second.first;
01034             if (termfreq == 0) {
01035                 // All postings deleted!  So we can shortcut by zapping the
01036                 // posting list.
01037                 if (islast) {
01038                     // Only one entry for this posting list.
01039                     del(current_key);
01040                     continue;
01041                 }
01042                 AutoPtr<FlintCursor> cursor(cursor_get());
01043                 bool found = cursor->find_entry(current_key);
01044                 Assert(found);
01045                 if (!found) continue; // Reduce damage!
01046                 while (cursor->del()) {
01047                     const char *kpos = cursor->current_key.data();
01048                     const char *kend = kpos + cursor->current_key.size();
01049                     if (!check_tname_in_key_lite(&kpos, kend, tname)) break;
01050                 }
01051                 continue;
01052             }
01053             collfreq += deltas->second.second;
01054 
01055             // Rewrite start of first chunk to update termfreq and collfreq.
01056             string newhdr = make_start_of_first_chunk(termfreq, collfreq, firstdid);
01057             newhdr += make_start_of_chunk(islast, firstdid, lastdid);
01058             if (pos == end) {
01059                 add(current_key, newhdr);
01060             } else {
01061                 Assert((size_t)(pos - tag.data()) <= tag.size());
01062                 tag.replace(0, pos - tag.data(), newhdr);
01063                 add(current_key, tag);
01064             }
01065         }
01066         map<Xapian::docid, pair<char, Xapian::termcount> >::const_iterator j;
01067         j = i->second.begin();
01068         Assert(j != i->second.end()); // This case is caught above.
01069 
01070         Xapian::docid max_did;
01071         PostlistChunkReader *from;
01072         PostlistChunkWriter *to;
01073         max_did = get_chunk(tname, j->first, j->second.first == 'A',
01074                             &from, &to);
01075         for ( ; j != i->second.end(); ++j) {
01076             Xapian::docid did = j->first;
01077 
01078 next_chunk:
01079             DEBUGLINE(DB, "Updating tname=" << tname << ", did=" << did);
01080             if (from) while (!from->is_at_end()) {
01081                 Xapian::docid copy_did = from->get_docid();
01082                 if (copy_did >= did) {
01083                     if (copy_did == did) {
01084                         Assert(j->second.first != 'A');
01085                         from->next();
01086                     }
01087                     break;
01088                 }
01089                 to->append(this, copy_did,
01090                            from->get_wdf(), from->get_doclength());
01091                 from->next();
01092             }
01093             if ((!from || from->is_at_end()) && did > max_did) {
01094                 delete from;
01095                 to->flush(this);
01096                 delete to;
01097                 max_did = get_chunk(tname, did, false, &from, &to);
01098                 goto next_chunk;
01099             }
01100 
01101             if (j->second.first != 'D') {
01102                 map<Xapian::docid, Xapian::termcount>::const_iterator k = doclens.find(did);
01103                 Assert(k != doclens.end());
01104                 Xapian::termcount new_doclen = k->second;
01105                 Xapian::termcount new_wdf = j->second.second;
01106 
01107                 to->append(this, did, new_wdf, new_doclen);
01108             }
01109         }
01110 
01111         if (from) {
01112             while (!from->is_at_end()) {
01113                 to->append(this, from->get_docid(),
01114                            from->get_wdf(), from->get_doclength());
01115                 from->next();
01116             }
01117             delete from;
01118         }
01119         to->flush(this);
01120         delete to;
01121     }
01122 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.