backends/quartz/quartz_postlist.cc

Go to the documentation of this file.
00001 /* quartz_postlist.cc: Postlists in quartz databases
00002  *
00003  * Copyright 1999,2000,2001 BrightStation PLC
00004  * Copyright 2002,2003,2004,2005,2007 Olly Betts
00005  * Copyright 2007 Lemur Consulting Ltd
00006  *
00007  * This program is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU General Public License as
00009  * published by the Free Software Foundation; either version 2 of the
00010  * License, or (at your option) any later version.
00011  *
00012  * This program is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
00020  * USA
00021  */
00022 
00023 #include <config.h>
00024 #include "omdebug.h"
00025 #include "quartz_postlist.h"
00026 #include "quartz_utils.h"
00027 #include "bcursor.h"
00028 #include "database.h"
00029 #include "noreturn.h"
00030 #include "utils.h"
00031 
00033 static void
00034 make_key(const string & tname, Xapian::docid did, string & key)
00035 {
00036     key = pack_string_preserving_sort(tname);
00037     key += pack_uint_preserving_sort(did);
00038 }
00039 
00040 static void
00041 make_key(const string & tname, string & key)
00042 {
00043     key = pack_string_preserving_sort(tname);
00044 }
00045 
00046 // How big should chunks in the posting list be?  (They
00047 // will grow slightly bigger than this, but not more than a
00048 // few bytes extra) - FIXME: tune this value to try to
00049 // maximise how well blocks are used.  Or performance.
00050 // Or indexing speed.  Or something...
00051 const unsigned int CHUNKSIZE = 2000;
00052 
00059 class QPostlistChunkWriter {
00060     public:
00061         QPostlistChunkWriter(const string &orig_key_,
00062                             bool is_first_chunk_,
00063                             const string &tname_,
00064                             bool is_last_chunk_);
00065 
00067         void append(Btree * table, Xapian::docid did,
00068                     Xapian::termcount wdf, quartz_doclen_t doclen);
00069 
00071         void raw_append(Xapian::docid first_did_, Xapian::docid current_did_,
00072                         const string & s) {
00073             Assert(!started);
00074             first_did = first_did_;
00075             current_did = current_did_;
00076             if (!s.empty()) {
00077                 chunk.append(s);
00078                 started = true;
00079             }
00080         }
00081 
00086         void flush(Btree *table);
00087 
00088     private:
00089         string orig_key;
00090         string tname;
00091         bool is_first_chunk;
00092         bool is_last_chunk;
00093         bool started;
00094 
00095         Xapian::docid first_did;
00096         Xapian::docid current_did;
00097 
00098         string chunk;
00099 };
00100 
00101 // Static functions
00102 
00104 XAPIAN_NORETURN(static void report_read_error(const char * position));
00105 static void report_read_error(const char * position)
00106 {
00107     if (position == 0) {
00108         // data ran out
00109         throw Xapian::DatabaseCorruptError("Data ran out unexpectedly when reading posting list.");
00110     }
00111     // overflow
00112     throw Xapian::RangeError("Value in posting list too large.");
00113 }
00114 
00115 static inline bool get_tname_from_key(const char **src, const char *end,
00116                                string &tname)
00117 {
00118     return unpack_string_preserving_sort(src, end, tname);
00119 }
00120 
00121 static inline bool
00122 check_tname_in_key_lite(const char **keypos, const char *keyend, const string &tname)
00123 {
00124     string tname_in_key;
00125 
00126     // Read the termname.
00127     if (!get_tname_from_key(keypos, keyend, tname_in_key)) {
00128         report_read_error(*keypos);
00129     }
00130 
00131     // This should only fail if the postlist doesn't exist at all.
00132     return tname_in_key == tname;
00133 }
00134 
00135 static inline bool
00136 check_tname_in_key(const char **keypos, const char *keyend, const string &tname)
00137 {
00138     if (*keypos == keyend) return false;
00139 
00140     return check_tname_in_key_lite(keypos, keyend, tname);
00141 }
00142 
00144 static Xapian::docid
00145 read_start_of_first_chunk(const char ** posptr,
00146                           const char * end,
00147                           Xapian::doccount * number_of_entries_ptr,
00148                           Xapian::termcount * collection_freq_ptr)
00149 {
00150     DEBUGCALL_STATIC(DB, Xapian::docid, "read_start_of_first_chunk",
00151                      (const void *)posptr << ", " <<
00152                      (const void *)end << ", " <<
00153                      (void *)number_of_entries_ptr << ", " <<
00154                      (void *)collection_freq_ptr);
00155 
00156     QuartzPostList::read_number_of_entries(posptr, end,
00157                            number_of_entries_ptr, collection_freq_ptr);
00158     if (number_of_entries_ptr)
00159         DEBUGLINE(DB, "number_of_entries = " << *number_of_entries_ptr);
00160     if (collection_freq_ptr)
00161         DEBUGLINE(DB, "collection_freq = " << *collection_freq_ptr);
00162 
00163     Xapian::docid did;
00164     // Read the docid of the first entry in the posting list.
00165     if (!unpack_uint(posptr, end, &did))
00166         report_read_error(*posptr);
00167     ++did;
00168     DEBUGLINE(DB, "doc_id = " << did);
00169     RETURN(did);
00170 }
00171 
00172 Xapian::doccount
00173 QuartzPostListTable::get_termfreq(const string & term) const
00174 {
00175     string key, tag;
00176     make_key(term, key);
00177     if (!get_exact_entry(key, tag)) return 0;
00178 
00179     Xapian::doccount termfreq;
00180     const char * p = tag.data();
00181     QuartzPostList::read_number_of_entries(&p, p + tag.size(), &termfreq, NULL);
00182     return termfreq;
00183 }
00184 
00185 Xapian::termcount
00186 QuartzPostListTable::get_collection_freq(const string & term) const
00187 {
00188     string key, tag;
00189     make_key(term, key);
00190     if (!get_exact_entry(key, tag)) return 0;
00191 
00192     Xapian::termcount collfreq;
00193     const char * p = tag.data();
00194     QuartzPostList::read_number_of_entries(&p, p + tag.size(), NULL, &collfreq);
00195     return collfreq;
00196 }
00197 
00198 static inline void read_did_increase(const char ** posptr,
00199                               const char * end,
00200                               Xapian::docid * did_ptr)
00201 {
00202     Xapian::docid did_increase;
00203     if (!unpack_uint(posptr, end, &did_increase)) report_read_error(*posptr);
00204     *did_ptr += did_increase + 1;
00205 }
00206 
00208 static inline void read_wdf_and_length(const char ** posptr,
00209                                 const char * end,
00210                                 Xapian::termcount * wdf_ptr,
00211                                 quartz_doclen_t * doclength_ptr)
00212 {
00213     if (!unpack_uint(posptr, end, wdf_ptr)) report_read_error(*posptr);
00214     if (!unpack_uint(posptr, end, doclength_ptr)) report_read_error(*posptr);
00215 }
00216 
00218 static Xapian::docid
00219 read_start_of_chunk(const char ** posptr,
00220                     const char * end,
00221                     Xapian::docid first_did_in_chunk,
00222                     bool * is_last_chunk_ptr)
00223 {
00224     DEBUGCALL_STATIC(DB, Xapian::docid, "read_start_of_chunk",
00225                      reinterpret_cast<const void*>(posptr) << ", " <<
00226                      reinterpret_cast<const void*>(end) << ", " <<
00227                      first_did_in_chunk << ", " <<
00228                      reinterpret_cast<const void*>(is_last_chunk_ptr));
00229 
00230     // Read whether this is the last chunk
00231     if (!unpack_bool(posptr, end, is_last_chunk_ptr))
00232         report_read_error(*posptr);
00233     if (is_last_chunk_ptr)
00234         DEBUGLINE(DB, "is_last_chunk = " << *is_last_chunk_ptr);
00235 
00236     // Read what the final document ID in this chunk is.
00237     Xapian::docid increase_to_last;
00238     if (!unpack_uint(posptr, end, &increase_to_last))
00239         report_read_error(*posptr);
00240     ++increase_to_last;
00241     Xapian::docid last_did_in_chunk = first_did_in_chunk + increase_to_last;
00242     DEBUGLINE(DB, "last_did_in_chunk = " << last_did_in_chunk);
00243     RETURN(last_did_in_chunk);
00244 }
00245 
00246 static string make_wdf_and_length(Xapian::termcount wdf, quartz_doclen_t doclength)
00247 {
00248     return pack_uint(wdf) + pack_uint(doclength);
00249 }
00250 
00251 static void write_start_of_chunk(string & chunk,
00252                                  unsigned int start_of_chunk_header,
00253                                  unsigned int end_of_chunk_header,
00254                                  bool is_last_chunk,
00255                                  Xapian::docid first_did_in_chunk,
00256                                  Xapian::docid last_did_in_chunk)
00257 {
00258     Assert((size_t)(end_of_chunk_header - start_of_chunk_header) <= chunk.size());
00259     Assert(last_did_in_chunk >= first_did_in_chunk);
00260     Xapian::docid increase_to_last = last_did_in_chunk - first_did_in_chunk;
00261 
00262     chunk.replace(start_of_chunk_header,
00263                   end_of_chunk_header - start_of_chunk_header,
00264                   pack_bool(is_last_chunk) + pack_uint(increase_to_last - 1));
00265     // FIXME - storing increase_to_last - 1 is bogus as this value is
00266     // -1 when a postlist chunk has a single entry!  Luckily the code
00267     // works despite this, but it's ugly.
00268 }
00269 
00274 class QPostlistChunkReader {
00275     public:
00281         QPostlistChunkReader(Xapian::docid first_did, const string & data_)
00282             : data(data_), pos(data.data()), end(pos + data.length()), at_end(data.empty()), did(first_did)
00283         {
00284             if (!at_end) read_wdf_and_length(&pos, end, &wdf, &doclength);
00285         }
00286 
00287         Xapian::docid get_docid() const {
00288             return did;
00289         }
00290         Xapian::termcount get_wdf() const {
00291             return wdf;
00292         }
00293         quartz_doclen_t get_doclength() const {
00294             return doclength;
00295         }
00296 
00297         bool is_at_end() const {
00298             return at_end;
00299         }
00300 
00303         void next();
00304 
00305     private:
00306         string data;
00307 
00308         const char *pos;
00309         const char *end;
00310 
00311         bool at_end;
00312 
00313         Xapian::docid did;
00314         Xapian::termcount wdf;
00315         quartz_doclen_t doclength;
00316 };
00317 
00318 void
00319 QPostlistChunkReader::next()
00320 {
00321     if (pos == end) {
00322         at_end = true;
00323     } else {
00324         read_did_increase(&pos, end, &did);
00325         read_wdf_and_length(&pos, end, &wdf, &doclength);
00326     }
00327 }
00328 
00329 QPostlistChunkWriter::QPostlistChunkWriter(const string &orig_key_,
00330                                          bool is_first_chunk_,
00331                                          const string &tname_,
00332                                          bool is_last_chunk_)
00333         : orig_key(orig_key_),
00334           tname(tname_), is_first_chunk(is_first_chunk_),
00335           is_last_chunk(is_last_chunk_),
00336           started(false)
00337 {
00338     DEBUGCALL(DB, void, "QPostlistChunkWriter::QPostlistChunkWriter",
00339               orig_key_ << ", " << is_first_chunk_ << ", " << tname_ << ", " <<
00340               is_last_chunk_);
00341 }
00342 
00343 void
00344 QPostlistChunkWriter::append(Btree * table, Xapian::docid did,
00345                             Xapian::termcount wdf, quartz_doclen_t doclen)
00346 {
00347     if (!started) {
00348         started = true;
00349         first_did = did;
00350     } else {
00351         Assert(did > current_did);
00352         // Start a new chunk if this one has grown to the threshold.
00353         if (chunk.size() >= CHUNKSIZE) {
00354             bool save_is_last_chunk = is_last_chunk;
00355             is_last_chunk = false;
00356             flush(table);
00357             is_last_chunk = save_is_last_chunk;
00358             is_first_chunk = false;
00359             first_did = did;
00360             chunk = "";
00361             make_key(tname, first_did, orig_key);
00362         } else {
00363             chunk.append(pack_uint(did - current_did - 1));
00364         }
00365     }
00366     current_did = did;
00367     chunk.append(make_wdf_and_length(wdf, doclen));
00368 }
00369 
00372 static inline string
00373 make_start_of_first_chunk(Xapian::termcount entries,
00374                           Xapian::termcount collectionfreq,
00375                           Xapian::docid new_did)
00376 {
00377     return pack_uint(entries) + pack_uint(collectionfreq) + pack_uint(new_did - 1);
00378 }
00379 
00382 static inline string
00383 make_start_of_chunk(bool new_is_last_chunk,
00384                     Xapian::docid new_first_did,
00385                     Xapian::docid new_final_did)
00386 {
00387     Assert(new_final_did >= new_first_did);
00388     return pack_bool(new_is_last_chunk) +
00389             pack_uint(new_final_did - new_first_did - 1);
00390 }
00391 
00392 void
00393 QPostlistChunkWriter::flush(Btree *table)
00394 {
00395     DEBUGCALL(DB, void, "QPostlistChunkWriter::flush", table);
00396 
00397     /* This is one of the more messy parts involved with updating posting
00398      * list chunks.
00399      * 
00400      * Depending on circumstances, we may have to delete an entire chunk
00401      * or file it under a different key, as well as possibly modifying both
00402      * the previous and next chunk of the postlist.
00403      */
00404 
00405     if (!started) {
00406         /* This chunk is now empty so disappears entirely.
00407          *
00408          * If this was the last chunk, then the previous chunk
00409          * must have its "is_last_chunk" flag updated.
00410          *
00411          * If this was the first chunk, then the next chunk must
00412          * be transformed into the first chunk.  Messy!
00413          */
00414         DEBUGLINE(DB, "QPostlistChunkWriter::flush(): deleting chunk");
00415         Assert(!orig_key.empty());
00416         if (is_first_chunk) {
00417             DEBUGLINE(DB, "QPostlistChunkWriter::flush(): deleting first chunk");
00418             if (is_last_chunk) {
00419                 /* This is the first and the last chunk, ie the only
00420                  * chunk, so just delete the tag.
00421                  */
00422                 table->del(orig_key);
00423                 return;
00424             }
00425 
00426             /* This is the messiest case.  The first chunk is to
00427              * be removed, and there is at least one chunk after
00428              * it.  Need to rewrite the next chunk as the first
00429              * chunk.
00430              */
00431             AutoPtr<Bcursor> cursor(table->cursor_get());
00432 
00433             if (!cursor->find_entry(orig_key)) {
00434                 throw Xapian::DatabaseCorruptError("The key we're working on has disappeared");
00435             }
00436 
00437             // Extract existing counts from the first chunk so we can reinsert
00438             // them into the block we're renaming.
00439             Xapian::termcount num_ent, coll_freq;
00440             {
00441                 cursor->read_tag();
00442                 const char *tagpos = cursor->current_tag.data();
00443                 const char *tagend = tagpos + cursor->current_tag.size();
00444 
00445                 (void)read_start_of_first_chunk(&tagpos, tagend,
00446                                                 &num_ent, &coll_freq);
00447             }
00448 
00449             // Seek to the next chunk.
00450             cursor->next();
00451             if (cursor->after_end()) {
00452                 throw Xapian::DatabaseCorruptError("Expected another key but found none");
00453             }
00454             const char *kpos = cursor->current_key.data();
00455             const char *kend = kpos + cursor->current_key.size();
00456             if (!check_tname_in_key(&kpos, kend, tname)) {
00457                 throw Xapian::DatabaseCorruptError("Expected another key with the same term name but found a different one");
00458             }
00459 
00460             // Read the new first docid
00461             Xapian::docid new_first_did;
00462             if (!unpack_uint_preserving_sort(&kpos, kend,
00463                                              &new_first_did)) {
00464                 report_read_error(kpos);
00465             }
00466 
00467             cursor->read_tag();
00468             const char *tagpos = cursor->current_tag.data();
00469             const char *tagend = tagpos + cursor->current_tag.size();
00470 
00471             // Read the chunk header
00472             bool new_is_last_chunk;
00473             Xapian::docid new_last_did_in_chunk =
00474                 read_start_of_chunk(&tagpos, tagend, new_first_did,
00475                                     &new_is_last_chunk);
00476 
00477             string chunk_data(tagpos, tagend);
00478 
00479             // First remove the renamed tag
00480             table->del(cursor->current_key);
00481 
00482             // And now write it as the first chunk
00483             string tag;
00484             tag = make_start_of_first_chunk(num_ent, coll_freq, new_first_did);
00485             tag += make_start_of_chunk(new_is_last_chunk,
00486                                               new_first_did,
00487                                               new_last_did_in_chunk);
00488             tag += chunk_data;
00489             table->add(orig_key, tag);
00490             return;
00491         }
00492 
00493         DEBUGLINE(DB, "QPostlistChunkWriter::flush(): deleting secondary chunk");
00494         /* This isn't the first chunk.  Check whether we're the last
00495          * chunk.
00496          */
00497 
00498         // Delete this chunk
00499         table->del(orig_key);
00500 
00501         if (is_last_chunk) {
00502             DEBUGLINE(DB, "QPostlistChunkWriter::flush(): deleting secondary last chunk");
00503             // Update the previous chunk's is_last_chunk flag.
00504             AutoPtr<Bcursor> cursor(table->cursor_get());
00505 
00506             /* Should not find the key we just deleted, but should
00507              * find the previous chunk. */
00508             if (cursor->find_entry(orig_key)) {
00509                 throw Xapian::DatabaseCorruptError("Quartz key not deleted as we expected");
00510             }
00511             // Make sure this is a chunk with the right term attached.
00512             const char * keypos = cursor->current_key.data();
00513             const char * keyend = keypos + cursor->current_key.size();
00514             if (!check_tname_in_key(&keypos, keyend, tname)) {
00515                 throw Xapian::DatabaseCorruptError("Couldn't find chunk before delete chunk");
00516             }
00517 
00518             bool is_prev_first_chunk = (keypos == keyend);
00519 
00520             // Now update the last_chunk
00521             cursor->read_tag();
00522             string tag = cursor->current_tag;
00523 
00524             const char *tagpos = tag.data();
00525             const char *tagend = tagpos + tag.size();
00526 
00527             // Skip first chunk header
00528             Xapian::docid first_did_in_chunk;
00529             if (is_prev_first_chunk) {
00530                 first_did_in_chunk = read_start_of_first_chunk(&tagpos, tagend,
00531                                           0, 0);
00532             } else {
00533                 if (!unpack_uint_preserving_sort(&keypos, keyend,
00534                                                  &first_did_in_chunk))
00535                     report_read_error(keypos);
00536             }
00537             bool wrong_is_last_chunk;
00538             string::size_type start_of_chunk_header = tagpos - tag.data();
00539             Xapian::docid last_did_in_chunk =
00540                 read_start_of_chunk(&tagpos, tagend, first_did_in_chunk,
00541                                     &wrong_is_last_chunk);
00542             string::size_type end_of_chunk_header = tagpos - tag.data();
00543 
00544             // write new is_last flag
00545             write_start_of_chunk(tag,
00546                                  start_of_chunk_header,
00547                                  end_of_chunk_header,
00548                                  true, // is_last_chunk
00549                                  first_did_in_chunk,
00550                                  last_did_in_chunk);
00551             table->add(cursor->current_key, tag);
00552         }
00553     } else {
00554         DEBUGLINE(DB, "QPostlistChunkWriter::flush(): updating chunk which still has items in it");
00555         /* The chunk still has some items in it.  Two major subcases:
00556          * a) This is the first chunk.
00557          * b) This isn't the first chunk.
00558          *
00559          * The subcases just affect the chunk header.
00560          */
00561         string tag;
00562 
00563         /* First write the header, which depends on whether this is the
00564          * first chunk.
00565          */
00566         if (is_first_chunk) {
00567             /* The first chunk.  This is the relatively easy case,
00568              * and we just have to write this one back to disk.
00569              */
00570             DEBUGLINE(DB, "QPostlistChunkWriter::flush(): rewriting the first chunk, which still has items in it");
00571             string key;
00572             make_key(tname, key);
00573             bool ok = table->get_exact_entry(key, tag);
00574             (void)ok;
00575             Assert(ok);
00576             Assert(!tag.empty());
00577 
00578             Xapian::termcount num_ent, coll_freq;
00579             {
00580                 const char * tagpos = tag.data();
00581                 const char * tagend = tagpos + tag.size();
00582                 (void)read_start_of_first_chunk(&tagpos, tagend,
00583                                                 &num_ent, &coll_freq);
00584             }
00585 
00586             tag = make_start_of_first_chunk(num_ent, coll_freq, first_did);
00587 
00588             tag += make_start_of_chunk(is_last_chunk, first_did, current_did);
00589             tag += chunk;
00590             table->add(key, tag);
00591             return;
00592         }
00593 
00594         DEBUGLINE(DB, "QPostlistChunkWriter::flush(): updating secondary chunk which still has items in it");
00595         /* Not the first chunk.
00596          *
00597          * This has the easy sub-sub-case:
00598          *   The first entry in the chunk hasn't changed
00599          * ...and the hard sub-sub-case:
00600          *   The first entry in the chunk has changed.  This is
00601          *   harder because the key for the chunk changes, so
00602          *   we've got to do a switch.
00603          */
00604 
00605         // First find out the initial docid
00606         const char *keypos = orig_key.data();
00607         const char *keyend = keypos + orig_key.size();
00608         if (!check_tname_in_key(&keypos, keyend, tname)) {
00609             throw Xapian::DatabaseCorruptError("Have invalid key writing to postlist");
00610         }
00611         Xapian::docid initial_did;
00612         if (!unpack_uint_preserving_sort(&keypos, keyend, &initial_did)) {
00613             report_read_error(keypos);
00614         }
00615         string new_key;
00616         if (initial_did != first_did) {
00617             /* The fiddlier case:
00618              * Create a new tag with the correct key, and replace
00619              * the old one.
00620              */
00621             make_key(tname, first_did, new_key);
00622             table->del(orig_key);
00623         } else {
00624             new_key = orig_key;
00625         }
00626 
00627         // ...and write the start of this chunk.
00628         tag = make_start_of_chunk(is_last_chunk, first_did, current_did);
00629 
00630         tag += chunk;
00631         table->add(new_key, tag);
00632     }
00633 }
00634 
00639 void QuartzPostList::read_number_of_entries(const char ** posptr,
00640                                    const char * end,
00641                                    Xapian::doccount * number_of_entries_ptr,
00642                                    Xapian::termcount * collection_freq_ptr)
00643 {
00644     if (!unpack_uint(posptr, end, number_of_entries_ptr))
00645         report_read_error(*posptr);
00646     if (!unpack_uint(posptr, end, collection_freq_ptr))
00647         report_read_error(*posptr);
00648 }
00649 
00668 QuartzPostList::QuartzPostList(Xapian::Internal::RefCntPtr<const Xapian::Database::Internal> this_db_,
00669                                const Btree * table_,
00670                                const Btree * positiontable_,
00671                                const string & tname_)
00672         : this_db(this_db_),
00673           table(table_),
00674           positiontable(positiontable_),
00675           tname(tname_),
00676           cursor(table->cursor_get()),
00677           is_at_end(false),
00678           have_started(false)
00679 {
00680     DEBUGCALL(DB, void, "QuartzPostList::QuartzPostList",
00681               this_db_.get() << ", " << table_ << ", " <<
00682               positiontable_ << ", " << tname_);
00683     string key;
00684     make_key(tname, key);
00685     int found = cursor->find_entry(key);
00686     if (!found) {
00687         number_of_entries = 0;
00688         is_at_end = true;
00689         pos = 0;
00690         end = 0;
00691         first_did_in_chunk = 0;
00692         last_did_in_chunk = 0;
00693         return;
00694     }
00695     cursor->read_tag();
00696     pos = cursor->current_tag.data();
00697     end = pos + cursor->current_tag.size();
00698 
00699     did = read_start_of_first_chunk(&pos, end, &number_of_entries, NULL);
00700     first_did_in_chunk = did;
00701     last_did_in_chunk = read_start_of_chunk(&pos, end, first_did_in_chunk,
00702                                             &is_last_chunk);
00703     read_wdf_and_length(&pos, end, &wdf, &doclength);
00704 }
00705 
00706 QuartzPostList::~QuartzPostList()
00707 {
00708     DEBUGCALL(DB, void, "QuartzPostList::~QuartzPostList", "");
00709 }
00710 
00711 bool
00712 QuartzPostList::next_in_chunk()
00713 {
00714     DEBUGCALL(DB, bool, "QuartzPostList::next_in_chunk", "");
00715     if (pos == end) RETURN(false);
00716 
00717     read_did_increase(&pos, end, &did);
00718     read_wdf_and_length(&pos, end, &wdf, &doclength);
00719 
00720     // Either not at last doc in chunk, or pos == end, but not both.
00721     Assert(did <= last_did_in_chunk);
00722     Assert(did < last_did_in_chunk || pos == end);
00723     Assert(pos != end || did == last_did_in_chunk);
00724 
00725     RETURN(true);
00726 }
00727 
00728 void
00729 QuartzPostList::next_chunk()
00730 {
00731     DEBUGCALL(DB, void, "QuartzPostList::next_chunk", "");
00732     if (is_last_chunk) {
00733         is_at_end = true;
00734         return;
00735     }
00736 
00737     cursor->next();
00738     if (cursor->after_end()) {
00739         is_at_end = true;
00740         throw Xapian::DatabaseCorruptError("Unexpected end of posting list for `" +
00741                                      tname + "'");
00742     }
00743     const char * keypos = cursor->current_key.data();
00744     const char * keyend = keypos + cursor->current_key.size();
00745     // Check we're still in same postlist
00746     if (!check_tname_in_key_lite(&keypos, keyend, tname)) {
00747         is_at_end = true;
00748         throw Xapian::DatabaseCorruptError("Unexpected end of posting list for `" +
00749                                      tname + "'");
00750     }
00751 
00752     Xapian::docid newdid;
00753     if (!unpack_uint_preserving_sort(&keypos, keyend, &newdid)) {
00754         report_read_error(keypos);
00755     }
00756     if (newdid <= did) {
00757         throw Xapian::DatabaseCorruptError("Document ID in new chunk of postlist (" +
00758                 om_tostring(newdid) +
00759                 ") is not greater than final document ID in previous chunk (" +
00760                 om_tostring(did) + ")");
00761     }
00762     did = newdid;
00763 
00764     cursor->read_tag();
00765     pos = cursor->current_tag.data();
00766     end = pos + cursor->current_tag.size();
00767 
00768     first_did_in_chunk = did;
00769     last_did_in_chunk = read_start_of_chunk(&pos, end, first_did_in_chunk,
00770                                             &is_last_chunk);
00771     read_wdf_and_length(&pos, end, &wdf, &doclength);
00772 }
00773 
00774 PositionList *
00775 QuartzPostList::read_position_list()
00776 {
00777     DEBUGCALL(DB, PositionList *, "QuartzPostList::read_position_list", "");
00778 
00779     positionlist.read_data(positiontable, did, tname);
00780 
00781     RETURN(&positionlist);
00782 }
00783 
00784 PositionList *
00785 QuartzPostList::open_position_list() const
00786 {
00787     DEBUGCALL(DB, PositionList *, "QuartzPostList::open_position_list", "");
00788 
00789     AutoPtr<QuartzPositionList> poslist(new QuartzPositionList());
00790     poslist->read_data(positiontable, did, tname);
00791 
00792     RETURN(poslist.release());
00793 }
00794 
00795 PostList *
00796 QuartzPostList::next(Xapian::weight w_min)
00797 {
00798     DEBUGCALL(DB, PostList *, "QuartzPostList::next", w_min);
00799     (void)w_min; // no warning
00800 
00801     if (!have_started) {
00802         have_started = true;
00803     } else {
00804         if (!next_in_chunk()) next_chunk();
00805     }
00806 
00807     DEBUGLINE(DB, string("Moved to ") <<
00808               (is_at_end ? string("end.") : string("docid, wdf, doclength = ") +
00809                om_tostring(did) + ", " + om_tostring(wdf) + ", " +
00810                om_tostring(doclength)));
00811 
00812     RETURN(NULL);
00813 }
00814 
00815 bool
00816 QuartzPostList::current_chunk_contains(Xapian::docid desired_did)
00817 {
00818     DEBUGCALL(DB, bool, "QuartzPostList::current_chunk_contains", desired_did);
00819     if (desired_did >= first_did_in_chunk &&
00820         desired_did <= last_did_in_chunk) {
00821         RETURN(true);
00822     }
00823     RETURN(false);
00824 }
00825 
00826 void
00827 QuartzPostList::move_to_chunk_containing(Xapian::docid desired_did)
00828 {
00829     DEBUGCALL(DB, void,
00830               "QuartzPostList::move_to_chunk_containing", desired_did);
00831     string key;
00832     make_key(tname, desired_did, key);
00833     (void) cursor->find_entry(key);
00834     Assert(!cursor->after_end());
00835 
00836     const char * keypos = cursor->current_key.data();
00837     const char * keyend = keypos + cursor->current_key.size();
00838     // Check we're still in same postlist
00839     if (!check_tname_in_key_lite(&keypos, keyend, tname)) {
00840         // This should only happen if the postlist doesn't exist at all.
00841         is_at_end = true;
00842         is_last_chunk = true;
00843         return;
00844     }
00845     is_at_end = false;
00846 
00847     cursor->read_tag();
00848     pos = cursor->current_tag.data();
00849     end = pos + cursor->current_tag.size();
00850 
00851     if (keypos == keyend) {
00852         // In first chunk
00853 #ifdef XAPIAN_DEBUG
00854         Xapian::doccount old_number_of_entries = number_of_entries;
00855         did = read_start_of_first_chunk(&pos, end, &number_of_entries, NULL);
00856         Assert(old_number_of_entries == number_of_entries);
00857 #else
00858         did = read_start_of_first_chunk(&pos, end, 0, 0);
00859 #endif
00860     } else {
00861         // In normal chunk
00862         if (!unpack_uint_preserving_sort(&keypos, keyend, &did)) {
00863             report_read_error(keypos);
00864         }
00865     }
00866 
00867     first_did_in_chunk = did;
00868     last_did_in_chunk = read_start_of_chunk(&pos, end, first_did_in_chunk,
00869                                             &is_last_chunk);
00870     read_wdf_and_length(&pos, end, &wdf, &doclength);
00871 
00872     // Possible, since desired_did might be after end of this chunk and before
00873     // the next.
00874     if (desired_did > last_did_in_chunk) next_chunk();
00875 }
00876 
00877 bool
00878 QuartzPostList::move_forward_in_chunk_to_at_least(Xapian::docid desired_did)
00879 {
00880     DEBUGCALL(DB, bool,
00881               "QuartzPostList::move_forward_in_chunk_to_at_least", desired_did);
00882     if (desired_did > last_did_in_chunk) {
00883         pos = end;
00884         RETURN(false);
00885     }
00886     while (did < desired_did) {
00887         // FIXME: perhaps we don't need to decode the wdf and document length
00888         // for documents we're skipping past.
00889         bool at_end_of_chunk = !next_in_chunk();
00890         if (at_end_of_chunk) RETURN(false);
00891     }
00892     RETURN(true);
00893 }
00894 
00895 PostList *
00896 QuartzPostList::skip_to(Xapian::docid desired_did, Xapian::weight w_min)
00897 {
00898     DEBUGCALL(DB, PostList *,
00899               "QuartzPostList::skip_to", desired_did << ", " << w_min);
00900     (void)w_min; // no warning
00901     // We've started now - if we hadn't already, we're already positioned
00902     // at start so there's no need to actually do anything.
00903     have_started = true;
00904 
00905     // Don't skip back, and don't need to do anything if already there.
00906     if (is_at_end || desired_did <= did) RETURN(NULL);
00907 
00908     // Move to correct chunk
00909     if (!current_chunk_contains(desired_did)) {
00910         move_to_chunk_containing(desired_did);
00911         // Might be at_end now, so we need to check before trying to move
00912         // forward in chunk.
00913         if (is_at_end) RETURN(NULL);
00914     }
00915 
00916     // Move to correct position in chunk
00917     bool have_document = move_forward_in_chunk_to_at_least(desired_did);
00918 #ifdef XAPIAN_DEBUG
00919     Assert(have_document);
00920 #else
00921     (void)have_document;
00922 #endif
00923 
00924     DEBUGLINE(DB, string("Skipped to ") <<
00925               (is_at_end ? string("end.") : string("docid, wdf, doclength = ") +
00926                om_tostring(did) + ", " + om_tostring(wdf) + ", " +
00927                om_tostring(doclength) + "."));
00928 
00929     RETURN(NULL);
00930 }
00931 
00932 string
00933 QuartzPostList::get_description() const
00934 {
00935     return tname + ":" + om_tostring(number_of_entries);
00936 }
00937 
00938 // Returns the last did to allow in this chunk.
00939 Xapian::docid
00940 QuartzPostListTable::get_chunk(const string &tname,
00941           Xapian::docid did, bool adding,
00942           QPostlistChunkReader ** from, QPostlistChunkWriter **to)
00943 {
00944     // Get chunk containing entry
00945     string key;
00946     make_key(tname, did, key);
00947 
00948     // Find the right chunk
00949     AutoPtr<Bcursor> cursor(cursor_get());
00950 
00951     cursor->find_entry(key);
00952     Assert(!cursor->after_end());
00953 
00954     const char * keypos = cursor->current_key.data();
00955     const char * keyend = keypos + cursor->current_key.size();
00956 
00957     if (!check_tname_in_key(&keypos, keyend, tname)) {
00958         // Postlist for this termname doesn't exist.
00959         if (!adding)
00960             throw Xapian::DatabaseCorruptError("Attempted to delete or modify an entry in a non-existent posting list for " + tname);
00961 
00962         *from = NULL;
00963         *to = new QPostlistChunkWriter("", true, tname, true);
00964         return Xapian::docid(-1);
00965     }
00966  
00967     // See if we're appending - if so we can shortcut by just copying
00968     // the data part of the chunk wholesale.
00969     bool is_first_chunk = (keypos == keyend);
00970 
00971     cursor->read_tag();
00972     const char * pos = cursor->current_tag.data();
00973     const char * end = pos + cursor->current_tag.size();
00974     Xapian::docid first_did_in_chunk;
00975     if (is_first_chunk) {
00976         first_did_in_chunk = read_start_of_first_chunk(&pos, end, NULL, NULL);
00977     } else {
00978         if (!unpack_uint_preserving_sort(&keypos, keyend,
00979                                          &first_did_in_chunk)) {
00980             report_read_error(keypos);
00981         }
00982     }
00983 
00984     bool is_last_chunk;
00985     Xapian::docid last_did_in_chunk;
00986     last_did_in_chunk = read_start_of_chunk(&pos, end, first_did_in_chunk, &is_last_chunk);
00987     *to = new QPostlistChunkWriter(cursor->current_key, is_first_chunk, tname,
00988                                   is_last_chunk);
00989     if (did > last_did_in_chunk) {
00990         // This is the shortcut.  Not very pretty, but I'll leave refactoring
00991         // until I've a clearer picture of everything which needs to be done.
00992         // (FIXME)
00993         *from = NULL;
00994         (*to)->raw_append(first_did_in_chunk, last_did_in_chunk,
00995                           string(pos, end)); 
00996     } else {
00997         *from = new QPostlistChunkReader(first_did_in_chunk, string(pos, end));
00998     }
00999     if (is_last_chunk) return Xapian::docid(-1);
01000 
01001     // Find first did of next tag.
01002     cursor->next();
01003     if (cursor->after_end()) {
01004         throw Xapian::DatabaseCorruptError("Expected another key but found none");
01005     }
01006     const char *kpos = cursor->current_key.data();
01007     const char *kend = kpos + cursor->current_key.size();
01008     if (!check_tname_in_key(&kpos, kend, tname)) {
01009         throw Xapian::DatabaseCorruptError("Expected another key with the same term name but found a different one");
01010     }
01011 
01012     // Read the new first docid
01013     Xapian::docid first_did_of_next_chunk;
01014     if (!unpack_uint_preserving_sort(&kpos, kend, &first_did_of_next_chunk)) {
01015         report_read_error(kpos);
01016     }
01017     return first_did_of_next_chunk - 1;
01018 }
01019 
01020 void
01021 QuartzPostListTable::merge_changes(
01022     const map<string, map<Xapian::docid, pair<char, Xapian::termcount> > > & mod_plists,
01023     const map<Xapian::docid, Xapian::termcount> & doclens,
01024     const map<string, pair<Xapian::termcount_diff, Xapian::termcount_diff> > & freq_deltas)
01025 {
01026     map<string, map<Xapian::docid, pair<char, Xapian::termcount> > >::const_iterator i;
01027     for (i = mod_plists.begin(); i != mod_plists.end(); ++i) {
01028         if (i->second.empty()) continue;
01029         string tname = i->first;
01030         {
01031             // Rewrite the first chunk of this posting list with the updated
01032             // termfreq and collfreq.
01033             map<string, pair<Xapian::termcount_diff, Xapian::termcount_diff> >::const_iterator deltas = freq_deltas.find(tname);
01034             Assert(deltas != freq_deltas.end());
01035 
01036             string current_key;
01037             make_key(tname, current_key);
01038             string tag;
01039             (void)get_exact_entry(current_key, tag);
01040 
01041             // Read start of first chunk to get termfreq and collfreq.
01042             const char *pos = tag.data();
01043             const char *end = pos + tag.size();
01044             Xapian::termcount termfreq, collfreq;
01045             Xapian::docid firstdid, lastdid;
01046             bool islast;
01047             if (pos == end) {
01048                 termfreq = 0;
01049                 collfreq = 0;
01050                 firstdid = 0;
01051                 lastdid = 0;
01052                 islast = true;
01053             } else {
01054                 firstdid = read_start_of_first_chunk(&pos, end,
01055                                                      &termfreq, &collfreq);
01056                 // Handle the generic start of chunk header.
01057                 lastdid = read_start_of_chunk(&pos, end, firstdid, &islast);
01058             }
01059 
01060             termfreq += deltas->second.first;
01061             if (termfreq == 0) {
01062                 // All postings deleted!  So we can shortcut by zapping the
01063                 // posting list.
01064                 del(current_key);
01065                 if (islast) continue;
01066                 AutoPtr<Bcursor> cursor(cursor_get());
01067                 (void)cursor->find_entry(current_key);
01068                 // find_entry() returns the entry <= that asked for.
01069                 cursor->next();
01070                 while (!cursor->after_end()) {
01071                     const char *kpos = cursor->current_key.data();
01072                     const char *kend = kpos + cursor->current_key.size();
01073                     if (!check_tname_in_key_lite(&kpos, kend, tname)) break;
01074                     cursor->del();
01075                 }
01076                 continue;
01077             }
01078             collfreq += deltas->second.second;
01079 
01080             // Rewrite start of first chunk to update termfreq and collfreq.
01081             string newhdr = make_start_of_first_chunk(termfreq, collfreq, firstdid);
01082             newhdr += make_start_of_chunk(islast, firstdid, lastdid);
01083             if (pos == end) {
01084                 add(current_key, newhdr);
01085             } else {
01086                 Assert((size_t)(pos - tag.data()) <= tag.size());
01087                 tag.replace(0, pos - tag.data(), newhdr);
01088                 add(current_key, tag);
01089             }
01090         }
01091         map<Xapian::docid, pair<char, Xapian::termcount> >::const_iterator j;
01092         j = i->second.begin();
01093         Assert(j != i->second.end()); // This case is caught above.
01094 
01095         Xapian::docid max_did;
01096         QPostlistChunkReader *from;
01097         QPostlistChunkWriter *to;
01098         max_did = get_chunk(tname, j->first, j->second.first == 'A',
01099                             &from, &to);
01100         for ( ; j != i->second.end(); ++j) {
01101             Xapian::docid did = j->first;
01102 
01103 next_chunk:
01104             if (from) while (!from->is_at_end()) {
01105                 Xapian::docid copy_did = from->get_docid();
01106                 if (copy_did >= did) {
01107                     if (copy_did == did) {
01108                         Assert(j->second.first != 'A');
01109                         from->next();
01110                     }
01111                     break;
01112                 }
01113                 to->append(this, copy_did,
01114                            from->get_wdf(), from->get_doclength());
01115                 from->next();
01116             }
01117             if ((!from || from->is_at_end()) && did > max_did) {
01118                 delete from;
01119                 to->flush(this);
01120                 delete to;
01121                 max_did = get_chunk(tname, did, false, &from, &to);
01122                 goto next_chunk;
01123             }
01124 
01125             if (j->second.first != 'D') {
01126                 map<Xapian::docid, Xapian::termcount>::const_iterator k = doclens.find(did);
01127                 Assert(k != doclens.end());
01128                 Xapian::termcount new_doclen = k->second;
01129                 Xapian::termcount new_wdf = j->second.second;
01130 
01131                 to->append(this, did, new_wdf, new_doclen);
01132             }
01133         }
01134 
01135         if (from) {
01136             while (!from->is_at_end()) {
01137                 to->append(this, from->get_docid(),
01138                            from->get_wdf(), from->get_doclength());
01139                 from->next();
01140             }
01141             delete from;
01142         }
01143         to->flush(this);
01144         delete to;
01145     }
01146 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.