backends/quartz/btree.cc

Go to the documentation of this file.
00001 /* btree.cc: Btree implementation
00002  *
00003  * Copyright 1999,2000,2001 BrightStation PLC
00004  * Copyright 2002 Ananova Ltd
00005  * Copyright 2002,2003,2004,2005,2006,2007,2008 Olly Betts
00006  *
00007  * This program is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU General Public License as
00009  * published by the Free Software Foundation; either version 2 of the
00010  * License, or (at your option) any later version.
00011  *
00012  * This program is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
00020  * USA
00021  */
00022 
00023 #include <config.h>
00024 
00025 #include <xapian/error.h>
00026 
00027 #include "safeerrno.h"
00028 #include "safefcntl.h"
00029 #ifdef __WIN32__
00030 # include "msvc_posix_wrapper.h"
00031 #endif
00032 
00033 #include "stringutils.h" // For STRINGIZE().
00034 
00035 // Define to use "dangerous" mode - in this mode we write modified btree
00036 // blocks back in place.  This is somewhat faster (especially once we're
00037 // I/O bound) but the database can't be safely searched during indexing
00038 // and if indexing is terminated uncleanly, the database may be corrupt.
00039 //
00040 // Despite the risks, it's useful for speeding up a full rebuild.
00041 //
00042 // FIXME: make this mode run-time selectable, and record that it is currently
00043 // in use somewhere on disk, so readers can check and refuse to open the
00044 // database.
00045 //
00046 // #define DANGEROUS
00047 
00048 #include <sys/types.h>
00049 
00050 // Trying to include the correct headers with the correct defines set to
00051 // get pread() and pwrite() prototyped on every platform without breaking any
00052 // other platform is a real can of worms.  So instead we probe for what
00053 // prototypes (if any) are required in configure and put them into
00054 // PREAD_PROTOTYPE and PWRITE_PROTOTYPE.
00055 #if defined HAVE_PREAD && defined PREAD_PROTOTYPE
00056 PREAD_PROTOTYPE
00057 #endif
00058 #if defined HAVE_PWRITE && defined PWRITE_PROTOTYPE
00059 PWRITE_PROTOTYPE
00060 #endif
00061 
00062 #include "safeunistd.h"
00063 
00064 #include <stdio.h>
00065 #include <string.h>   /* for memmove */
00066 #include <limits.h>   /* for CHAR_BIT */
00067 
00068 #include "btree.h"
00069 #include "btree_util.h"
00070 #include "btree_base.h"
00071 #include "bcursor.h"
00072 #include "quartz_utils.h"
00073 
00074 #include "omassert.h"
00075 #include "omdebug.h"
00076 #include <xapian/error.h>
00077 #include "utils.h"
00078 
00079 #include <algorithm>  // for std::min()
00080 #include <string>
00081 #include <vector>
00082 
00083 using std::min;
00084 using std::string;
00085 using std::vector;
00086 
00087 //#define BTREE_DEBUG_FULL 1
00088 #undef BTREE_DEBUG_FULL
00089 
00090 #ifdef BTREE_DEBUG_FULL
00091 /*------debugging aids from here--------*/
00092 
00093 static void print_key(const byte * p, int c, int j);
00094 static void print_tag(const byte * p, int c, int j);
00095 
00096 /*
00097 static void report_cursor(int N, Btree * B, Cursor * C)
00098 {
00099     int i;
00100     printf("%d)\n", N);
00101     for (i = 0; i <= B->level; i++)
00102         printf("p=%d, c=%d, n=[%d], rewrite=%d\n",
00103                 C[i].p, C[i].c, C[i].n, C[i].rewrite);
00104 }
00105 */
00106 
00107 /*------to here--------*/
00108 #endif /* BTREE_DEBUG_FULL */
00109 
00110 /* Input/output is defined with calls to the basic Unix system interface: */
00111 
00112 int sys_open_to_read_no_except(const string & name)
00113 {
00114 #ifdef __WIN32__
00115     int fd = msvc_posix_open(name.c_str(), O_RDONLY | O_BINARY);
00116 #else
00117     int fd = open(name.c_str(), O_RDONLY | O_BINARY);
00118 #endif
00119     return fd;
00120 }
00121 
00122 int sys_open_to_read(const string & name)
00123 {
00124     int fd = sys_open_to_read_no_except(name);
00125     if (fd < 0) {
00126         string message = string("Couldn't open ")
00127                 + name + " to read: " + strerror(errno);
00128         throw Xapian::DatabaseOpeningError(message);
00129     }
00130     return fd;
00131 }
00132 
00133 static int sys_open_to_write_no_except(const string & name)
00134 {
00135 #ifdef __WIN32__
00136     int fd = msvc_posix_open(name.c_str(), O_WRONLY | O_CREAT | O_TRUNC | O_BINARY);
00137 #else
00138     int fd = open(name.c_str(), O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0666);
00139 #endif
00140     return fd;
00141 }
00142 
00143 int sys_open_to_write(const string & name)
00144 {
00145     int fd = sys_open_to_write_no_except(name);
00146     if (fd < 0) {
00147         string message = string("Couldn't open ")
00148                 + name + " to write: " + strerror(errno);
00149         throw Xapian::DatabaseOpeningError(message);
00150     }
00151     return fd;
00152 }
00153 
00154 static int sys_open_for_readwrite(const string & name)
00155 {
00156     int fd = open(name.c_str(), O_RDWR | O_BINARY);
00157     if (fd < 0) {
00158         string message = string("Couldn't open ")
00159                 + name + " read/write: " + strerror(errno);
00160         throw Xapian::DatabaseOpeningError(message);
00161     }
00162     return fd;
00163 }
00164 
00165 static void sys_write_bytes(int h, int n, const char * p)
00166 {
00167     while (true) {
00168         ssize_t bytes_written = write(h, p, n);
00169         if (bytes_written == n) {
00170             // normal case - write succeeded, so return.
00171             return;
00172         } else if (bytes_written == -1) {
00173             if (errno == EINTR) continue;
00174             string message = "Error writing block: ";
00175             message += strerror(errno);
00176             throw Xapian::DatabaseError(message);
00177         } else if (bytes_written == 0) {
00178             string message = "Error writing block: wrote no data";
00179             throw Xapian::DatabaseError(message);
00180         } else if (bytes_written < n) {
00181             /* Wrote part of the block, which is not an error.  We should
00182              * continue writing the rest of the block.
00183              */
00184             n -= bytes_written;
00185             p += bytes_written;
00186         }
00187     }
00188 }
00189 
00190 string sys_read_all_bytes(int h, size_t bytes_to_read)
00191 {
00192     ssize_t bytes_read;
00193     string retval;
00194     while (true) {
00195         char buf[1024];
00196         bytes_read = read(h, buf, min(sizeof(buf), bytes_to_read));
00197         if (bytes_read > 0) {
00198             // add byte to string, continue unless we reached max
00199             retval.append(buf, bytes_read);
00200             bytes_to_read -= bytes_read;
00201             if (bytes_to_read == 0) {
00202                 break;
00203             }
00204         } else if (bytes_read == 0) {
00205             // end of file, we're finished
00206             break;
00207         } else if (bytes_read == -1) {
00208             if (errno == EINTR) continue;
00209             string message = "Error reading all bytes: ";
00210             message += strerror(errno);
00211             throw Xapian::DatabaseError(message);
00212         }
00213     }
00214     return retval;
00215 }
00216 
00217 void
00218 sys_write_string(int h, const string &s)
00219 {
00220     sys_write_bytes(h, s.length(), s.data());
00221 }
00222 
00223 int sys_flush(int h) {
00224 #if defined HAVE_FDATASYNC
00225     return (fdatasync(h) != -1);
00226 #elif defined HAVE_FSYNC
00227     return (fsync(h) != -1);
00228 #elif defined __WIN32__
00229     return (_commit(h) != -1);
00230 #else
00231 #error "Have neither fsync() nor fdatasync() nor _commit() - cannot sync."
00232 #endif
00233 }
00234 
00235 static void sys_unlink(const string &filename)
00236 {
00237 #ifdef __WIN32__
00238     if (msvc_posix_unlink(filename.c_str()) == -1) {
00239 #else
00240     if (unlink(filename) == -1) {
00241 #endif
00242         string message = "Failed to unlink ";
00243         message += filename;
00244         message += ": ";
00245         message += strerror(errno);
00246         throw Xapian::DatabaseCorruptError(message);
00247     }
00248 }
00249 
00250 /* There are two bit maps in bit_map0 and bit_map. The nth bit of bitmap is 0
00251    if the nth block is free, otherwise 1. bit_map0 is the initial state of
00252    bitmap at the start of the current transaction.
00253 
00254    Note use of the limits.h values:
00255    UCHAR_MAX = 255, an unsigned with all bits set, and
00256    CHAR_BIT = 8, the number of bits per byte
00257 
00258    BYTE_PAIR_RANGE below is the smallest +ve number that can't be held in two
00259    bytes -- 64K effectively.
00260 */
00261 
00262 #define BYTE_PAIR_RANGE (1 << 2 * CHAR_BIT)
00263 
00265 void
00266 Btree::read_block(uint4 n, byte * p) const
00267 {
00268     // Log the value of p, not the contents of the block it points to...
00269     DEBUGCALL(DB, void, "Btree::read_block", n << ", " << (void*)p);
00270     /* Use the base bit_map_size not the bitmap's size, because
00271      * the latter is uninitialised in readonly mode.
00272      */
00273     Assert(n / CHAR_BIT < base.get_bit_map_size());
00274 
00275 #ifdef HAVE_PREAD
00276     off_t offset = off_t(block_size) * n;
00277     int m = block_size;
00278     while (true) {
00279         ssize_t bytes_read = pread(handle, reinterpret_cast<char *>(p), m,
00280                                    offset);
00281         // normal case - read succeeded, so return.
00282         if (bytes_read == m) return;
00283         if (bytes_read == -1) {
00284             if (errno == EINTR) continue;
00285             string message = "Error reading block " + om_tostring(n) + ": ";
00286             message += strerror(errno);
00287             throw Xapian::DatabaseError(message);
00288         } else if (bytes_read == 0) {
00289             string message = "Error reading block " + om_tostring(n) + ": got end of file";
00290             throw Xapian::DatabaseError(message);
00291         } else if (bytes_read < m) {
00292             /* Read part of the block, which is not an error.  We should
00293              * continue reading the rest of the block.
00294              */
00295             m -= bytes_read;
00296             p += bytes_read;
00297             offset += bytes_read;
00298         }
00299     }
00300 #else
00301     if (lseek(handle, off_t(block_size) * n, SEEK_SET) == -1) {
00302         string message = "Error seeking to block: ";
00303         message += strerror(errno);
00304         throw Xapian::DatabaseError(message);
00305     }
00306 
00307     int m = block_size;
00308     while (true) {
00309         ssize_t bytes_read = read(handle, reinterpret_cast<char *>(p), m);
00310         // normal case - read succeeded, so return.
00311         if (bytes_read == m) return;
00312         if (bytes_read == -1) {
00313             if (errno == EINTR) continue;
00314             string message = "Error reading block " + om_tostring(n) + ": ";
00315             message += strerror(errno);
00316             throw Xapian::DatabaseError(message);
00317         } else if (bytes_read == 0) {
00318             string message = "Error reading block " + om_tostring(n) + ": got end of file";
00319             throw Xapian::DatabaseError(message);
00320         } else if (bytes_read < m) {
00321             /* Read part of the block, which is not an error.  We should
00322              * continue reading the rest of the block.
00323              */
00324             m -= bytes_read;
00325             p += bytes_read;
00326         }
00327     }
00328 #endif
00329 }
00330 
00337 void
00338 Btree::write_block(uint4 n, const byte * p) const
00339 {
00340     DEBUGCALL(DB, void, "Btree::write_block", n << ", " << p);
00341     Assert(writable);
00342     /* Check that n is in range. */
00343     Assert(n / CHAR_BIT < base.get_bit_map_size());
00344 
00345     /* don't write to non-free */;
00346     AssertParanoid(base.block_free_at_start(n));
00347 
00348     /* write revision is okay */
00349     AssertEqParanoid(REVISION(p), latest_revision_number + 1);
00350 
00351     if (both_bases) {
00352         // Delete the old base before modifying the database.
00353         sys_unlink(name + "base" + other_base_letter);
00354         both_bases = false;
00355         latest_revision_number = revision_number;
00356     }
00357 
00358 #ifdef HAVE_PWRITE
00359     off_t offset = off_t(block_size) * n;
00360     int m = block_size;
00361     while (true) {
00362         ssize_t bytes_written = pwrite(handle, p, m, offset);
00363         if (bytes_written == m) {
00364             // normal case - write succeeded, so return.
00365             return;
00366         } else if (bytes_written == -1) {
00367             if (errno == EINTR) continue;
00368             string message = "Error writing block: ";
00369             message += strerror(errno);
00370             throw Xapian::DatabaseError(message);
00371         } else if (bytes_written == 0) {
00372             string message = "Error writing block: wrote no data";
00373             throw Xapian::DatabaseError(message);
00374         } else if (bytes_written < m) {
00375             /* Wrote part of the block, which is not an error.  We should
00376              * continue writing the rest of the block.
00377              */
00378             m -= bytes_written;
00379             p += bytes_written;
00380             offset += bytes_written;
00381         }
00382     }
00383 #else
00384     if (lseek(handle, (off_t)block_size * n, SEEK_SET) == -1) {
00385         string message = "Error seeking to block: ";
00386         message += strerror(errno);
00387         throw Xapian::DatabaseError(message);
00388     }
00389 
00390     sys_write_bytes(handle, block_size, (const char *)p);
00391 #endif
00392 }
00393 
00394 
00395 /* A note on cursors:
00396 
00397    Each B-tree level has a corresponding array element C[j] in a
00398    cursor, C. C[0] is the leaf (or data) level, and C[B->level] is the
00399    root block level. Within a level j,
00400 
00401        C[j].p  addresses the block
00402        C[j].c  is the offset into the directory entry in the block
00403        C[j].n  is the number of the block at C[j].p
00404 
00405    A look up in the B-tree causes navigation of the blocks starting
00406    from the root. In each block, p, we find an offset, c, to an item
00407    which gives the number, n, of the block for the next level. This
00408    leads to an array of values p,c,n which are held inside the cursor.
00409 
00410    Any Btree object B has a built-in cursor, at B->C. But other cursors may
00411    be created.  If BC is a created cursor, BC->C is the cursor in the
00412    sense given above, and BC->B is the handle for the B-tree again.
00413 */
00414 
00415 
00416 void
00417 Btree::set_overwritten() const
00418 {
00419     DEBUGCALL(DB, void, "Btree::set_overwritten", "");
00420     // If we're writable, there shouldn't be another writer who could cause
00421     // overwritten to be flagged, so that's a DatabaseCorruptError.
00422     if (writable)
00423         throw Xapian::DatabaseCorruptError("Db block overwritten - are there multiple writers?");
00424     throw Xapian::DatabaseModifiedError("The revision being read has been discarded - you should call Xapian::Database::reopen() and retry the operation");
00425 }
00426 
00427 /* block_to_cursor(C, j, n) puts block n into position C[j] of cursor
00428    C, writing the block currently at C[j] back to disk if necessary.
00429    Note that
00430 
00431        C[j].rewrite
00432 
00433    is true iff C[j].n is different from block n in file DB. If it is
00434    false no rewriting is necessary.
00435 */
00436 
00437 void
00438 Btree::block_to_cursor(Cursor * C_, int j, uint4 n) const
00439 {
00440     DEBUGCALL(DB, void, "Btree::block_to_cursor", (void*)C_ << ", " << j << ", " << n);
00441     if (n == C_[j].n) return;
00442     byte * p = C_[j].p;
00443     Assert(p);
00444 
00445     // FIXME: only needs to be done in write mode
00446     if (C_[j].rewrite) {
00447         Assert(writable);
00448         Assert(C == C_);
00449         write_block(C_[j].n, p);
00450         C_[j].rewrite = false;
00451     }
00452     // Check if the block is in the built-in cursor (potentially in
00453     // modified form).
00454     if (writable && n == C[j].n) {
00455         memcpy(p, C[j].p, block_size);
00456     } else {
00457         read_block(n, p);
00458     }
00459 
00460     C_[j].n = n;
00461     if (j < level) {
00462         /* unsigned comparison */
00463         if (REVISION(p) > REVISION(C_[j + 1].p)) {
00464             set_overwritten();
00465             return;
00466         }
00467     }
00468     AssertEq(j, GET_LEVEL(p));
00469 }
00470 
00492 void
00493 Btree::alter()
00494 {
00495     DEBUGCALL(DB, void, "Btree::alter", "");
00496     Assert(writable);
00497 #ifdef DANGEROUS
00498     C[0].rewrite = true;
00499 #else
00500     int j = 0;
00501     byte * p = C[j].p;
00502     while (true) {
00503         if (C[j].rewrite) return; /* all new, so return */
00504         C[j].rewrite = true;
00505 
00506         uint4 n = C[j].n;
00507         if (base.block_free_at_start(n)) {
00508             Assert(REVISION(p) == latest_revision_number + 1);
00509             return;
00510         }
00511         Assert(REVISION(p) < latest_revision_number + 1);
00512         base.free_block(n);
00513         n = base.next_free_block();
00514         C[j].n = n;
00515         SET_REVISION(p, latest_revision_number + 1);
00516 
00517         if (j == level) return;
00518         j++;
00519         p = C[j].p;
00520         Item_wr(p, C[j].c).set_block_given_by(n);
00521     }
00522 #endif
00523 }
00524 
00537 int Btree::find_in_block(const byte * p, Key key, bool leaf, int c)
00538 {
00539     DEBUGCALL_STATIC(DB, int, "Btree::find_in_block", reinterpret_cast<const void*>(p) << ", " << reinterpret_cast<const void*>(key.get_address()) << ", " << leaf << ", " << c);
00540     int i = DIR_START;
00541     if (leaf) i -= D2;
00542     int j = DIR_END(p);
00543 
00544     if (c != -1) {
00545         if (c < j && i < c && Item(p, c).key() <= key)
00546             i = c;
00547         c += D2;
00548         if (c < j && i < c && key < Item(p, c).key())
00549             j = c;
00550     }
00551 
00552     while (j - i > D2) {
00553         int k = i + ((j - i)/(D2 * 2))*D2; /* mid way */
00554         if (key < Item(p, k).key()) j = k; else i = k;
00555     }
00556     RETURN(i);
00557 }
00558 
00566 bool
00567 Btree::find(Cursor * C_) const
00568 {
00569     DEBUGCALL(DB, bool, "Btree::find", (void*)C_);
00570     // Note: the parameter is needed when we're called by BCursor
00571     const byte * p;
00572     int c;
00573     Key key = kt.key();
00574     for (int j = level; j > 0; --j) {
00575         p = C_[j].p;
00576         c = find_in_block(p, key, false, C_[j].c);
00577 #ifdef BTREE_DEBUG_FULL
00578         printf("Block in Btree:find - code position 1");
00579         report_block_full(j, C_[j].n, p);
00580 #endif /* BTREE_DEBUG_FULL */
00581         C_[j].c = c;
00582         block_to_cursor(C_, j - 1, Item(p, c).block_given_by());
00583     }
00584     p = C_[0].p;
00585     c = find_in_block(p, key, true, C_[0].c);
00586 #ifdef BTREE_DEBUG_FULL
00587     printf("Block in Btree:find - code position 2");
00588     report_block_full(0, C_[0].n, p);
00589 #endif /* BTREE_DEBUG_FULL */
00590     C_[0].c = c;
00591     if (c < DIR_START) RETURN(false);
00592     RETURN(Item(p, c).key() == key);
00593 }
00594 
00600 void
00601 Btree::compact(byte * p)
00602 {
00603     DEBUGCALL(DB, void, "Btree::compact", (void*)p);
00604     Assert(writable);
00605     Assert(p);
00606     Assert(buffer);
00607     int e = block_size;
00608     byte * b = buffer;
00609     int dir_end = DIR_END(p);
00610     for (int c = DIR_START; c < dir_end; c += D2) {
00611         Item item(p, c);
00612         int l = item.size();
00613         Assert(e >= dir_end);
00614         e -= l;
00615         memmove(b + e, item.get_address(), l);
00616         SETD(p, c, e);  /* reform in b */
00617     }
00618     memmove(p + e, b + e, block_size - e);  /* copy back */
00619     e -= dir_end;
00620     SET_TOTAL_FREE(p, e);
00621     SET_MAX_FREE(p, e);
00622 }
00623 
00627 void
00628 Btree::split_root(uint4 split_n)
00629 {
00630     DEBUGCALL(DB, void, "Btree::split_root", split_n);
00631     /* gain a level */
00632     ++level;
00633 
00634     /* check level overflow - this isn't something that should ever happen
00635      * but deserves more than an Assert()... */
00636     if (level == BTREE_CURSOR_LEVELS) {
00637         throw Xapian::DatabaseCorruptError("Btree has grown impossibly large ("STRINGIZE(BTREE_CURSOR_LEVELS)" levels)");
00638     }
00639 
00640     byte * q = zeroed_new(block_size);
00641     if (q == 0) {
00642         throw std::bad_alloc();
00643     }
00644     C[level].p = q;
00645     C[level].c = DIR_START;
00646     C[level].n = base.next_free_block();
00647     C[level].rewrite = true;
00648     SET_REVISION(q, latest_revision_number + 1);
00649     SET_LEVEL(q, level);
00650     SET_DIR_END(q, DIR_START);
00651     compact(q);   /* to reset TOTAL_FREE, MAX_FREE */
00652 
00653     /* form a null key in b with a pointer to the old root */
00654     byte b[10]; /* 7 is exact */
00655     Item_wr item(b);
00656     item.form_null_key(split_n);
00657     add_item(item, level);
00658 }
00659 
00675 void
00676 Btree::enter_key(int j, Key prevkey, Key newkey)
00677 {
00678     Assert(writable);
00679     Assert(prevkey < newkey);
00680     Assert(j >= 1);
00681 
00682     uint4 blocknumber = C[j - 1].n;
00683 
00684     // FIXME update to use Key
00685     // Keys are truncated here: but don't truncate the count at the end away.
00686     const int newkey_len = newkey.length();
00687     int i;
00688 
00689     if (j == 1) {
00690         // Truncate the key to the minimal key which differs from prevkey,
00691         // the preceding key in the block.
00692         i = 0;
00693         const int min_len = min(newkey_len, prevkey.length());
00694         while (i < min_len && prevkey[i] == newkey[i]) {
00695             i++;
00696         }
00697 
00698         // Want one byte of difference.
00699         if (i < newkey_len) i++;
00700     } else {
00701         /* Can't truncate between branch levels, since the separated keys
00702          * are in at the leaf level, and truncating again will change the
00703          * branch point.
00704          */
00705         i = newkey_len;
00706     }
00707 
00708     byte b[UCHAR_MAX + 6];
00709     Item_wr item(b);
00710     Assert(i <= 256 - I2 - C2);
00711     Assert(i <= (int)sizeof(b) - I2 - C2 - 4);
00712     item.set_key_and_block(newkey, i, blocknumber);
00713 
00714     // When j > 1 we can make the first key of block p null.  This is probably
00715     // worthwhile as it trades a small amount of CPU and RAM use for a small
00716     // saving in disk use.  Other redundant keys will still creep in though.
00717     if (j > 1) {
00718         byte * p = C[j - 1].p;
00719         uint4 n = get_int4(newkey.get_address(), newkey_len + K1 + C2);
00720         int new_total_free = TOTAL_FREE(p) + newkey_len + C2;
00721         // FIXME: incredibly icky going from key to item like this...
00722         Item_wr(const_cast<byte *>(newkey.get_address()) - I2).form_null_key(n);
00723         SET_TOTAL_FREE(p, new_total_free);
00724     }
00725 
00726     C[j].c = find_in_block(C[j].p, item.key(), false, 0) + D2;
00727     C[j].rewrite = true; /* a subtle point: this *is* required. */
00728     add_item(item, j);
00729 }
00730 
00735 int
00736 Btree::mid_point(byte * p)
00737 {
00738     int n = 0;
00739     int dir_end = DIR_END(p);
00740     int size = block_size - TOTAL_FREE(p) - dir_end;
00741     for (int c = DIR_START; c < dir_end; c += D2) {
00742         int l = Item(p, c).size();
00743         n += 2 * l;
00744         if (n >= size) {
00745             if (l < n - size) return c;
00746             return c + D2;
00747         }
00748     }
00749 
00750     /* falling out of mid_point */
00751     Assert(false);
00752     return 0; /* Stop compiler complaining about end of method. */
00753 }
00754 
00763 void
00764 Btree::add_item_to_block(byte * p, Item_wr kt_, int c)
00765 {
00766     Assert(writable);
00767     int dir_end = DIR_END(p);
00768     int kt_len = kt_.size();
00769     int needed = kt_len + D2;
00770     int new_total = TOTAL_FREE(p) - needed;
00771     int new_max = MAX_FREE(p) - needed;
00772 
00773     Assert(new_total >= 0);
00774 
00775     if (new_max < 0) {
00776         compact(p);
00777         new_max = MAX_FREE(p) - needed;
00778         Assert(new_max >= 0);
00779     }
00780     Assert(dir_end >= c);
00781 
00782     memmove(p + c + D2, p + c, dir_end - c);
00783     dir_end += D2;
00784     SET_DIR_END(p, dir_end);
00785 
00786     int o = dir_end + new_max;
00787     SETD(p, c, o);
00788     memmove(p + o, kt_.get_address(), kt_len);
00789 
00790     SET_MAX_FREE(p, new_max);
00791 
00792     SET_TOTAL_FREE(p, new_total);
00793 }
00794 
00800 void
00801 Btree::add_item(Item_wr kt_, int j)
00802 {
00803     Assert(writable);
00804     byte * p = C[j].p;
00805     int c = C[j].c;
00806     uint4 n;
00807 
00808     int needed = kt_.size() + D2;
00809     if (TOTAL_FREE(p) < needed) {
00810         int m;
00811         // Prepare to split p. After splitting, the block is in two halves, the
00812         // lower half is split_p, the upper half p again. add_to_upper_half
00813         // becomes true when the item gets added to p, false when it gets added
00814         // to split_p.
00815 
00816         if (seq_count < 0) {
00817             // If we're not in sequential mode, we split at the mid point
00818             // of the node.
00819             m = mid_point(p);
00820         } else {
00821             // During sequential addition, split at the insert point
00822             m = c;
00823         }
00824 
00825         uint4 split_n = C[j].n;
00826         C[j].n = base.next_free_block();
00827 
00828         memcpy(split_p, p, block_size);  // replicate the whole block in split_p
00829         SET_DIR_END(split_p, m);
00830         compact(split_p);      /* to reset TOTAL_FREE, MAX_FREE */
00831 
00832         {
00833             int residue = DIR_END(p) - m;
00834             int new_dir_end = DIR_START + residue;
00835             memmove(p + DIR_START, p + m, residue);
00836             SET_DIR_END(p, new_dir_end);
00837         }
00838 
00839         compact(p);      /* to reset TOTAL_FREE, MAX_FREE */
00840 
00841         bool add_to_upper_half;
00842         if (seq_count < 0) {
00843             add_to_upper_half = (c >= m);
00844         } else {
00845             // And add item to lower half if split_p has room, otherwise upper
00846             // half
00847             add_to_upper_half = (TOTAL_FREE(split_p) < needed);
00848         }
00849 
00850         if (add_to_upper_half) {
00851             c -= (m - DIR_START);
00852             Assert(seq_count < 0 || c <= DIR_START + D2);
00853             Assert(c >= DIR_START);
00854             Assert(c <= DIR_END(p));
00855             add_item_to_block(p, kt_, c);
00856             n = C[j].n;
00857         } else {
00858             Assert(c >= DIR_START);
00859             Assert(c <= DIR_END(split_p));
00860             add_item_to_block(split_p, kt_, c);
00861             n = split_n;
00862         }
00863         write_block(split_n, split_p);
00864 
00865         // Check if we're splitting the root block.
00866         if (j == level) split_root(split_n);
00867 
00868         /* Enter a separating key at level j + 1 between */
00869         /* the last key of block split_p, and the first key of block p */
00870         enter_key(j + 1,
00871                   Item(split_p, DIR_END(split_p) - D2).key(),
00872                   Item(p, DIR_START).key());
00873     } else {
00874         add_item_to_block(p, kt_, c);
00875         n = C[j].n;
00876     }
00877     if (j == 0) {
00878         changed_n = n;
00879         changed_c = c;
00880     }
00881 }
00882 
00890 void
00891 Btree::delete_item(int j, bool repeatedly)
00892 {
00893     Assert(writable);
00894     byte * p = C[j].p;
00895     int c = C[j].c;
00896     int kt_len = Item(p, c).size(); /* size of the item to be deleted */
00897     int dir_end = DIR_END(p) - D2;   /* directory length will go down by 2 bytes */
00898 
00899     memmove(p + c, p + c + D2, dir_end - c);
00900     SET_DIR_END(p, dir_end);
00901     SET_MAX_FREE(p, MAX_FREE(p) + D2);
00902     SET_TOTAL_FREE(p, TOTAL_FREE(p) + kt_len + D2);
00903 
00904     if (!repeatedly) return;
00905     if (j < level) {
00906         if (dir_end == DIR_START) {
00907             base.free_block(C[j].n);
00908             C[j].rewrite = false;
00909             C[j].n = BLK_UNUSED;
00910             C[j + 1].rewrite = true;  /* *is* necessary */
00911             delete_item(j + 1, true);
00912         }
00913     } else {
00914         Assert(j == level);
00915         while (dir_end == DIR_START + D2 && level > 0) {
00916             /* single item in the root block, so lose a level */
00917             uint4 new_root = Item(p, DIR_START).block_given_by();
00918             delete [] p;
00919             C[level].p = 0;
00920             base.free_block(C[level].n);
00921             C[level].rewrite = false;
00922             C[level].n = BLK_UNUSED;
00923             level--;
00924 
00925             block_to_cursor(C, level, new_root);
00926 
00927             p = C[level].p;
00928             dir_end = DIR_END(p); /* prepare for the loop */
00929         }
00930     }
00931 }
00932 
00933 /* debugging aid:
00934 static addcount = 0;
00935 */
00936 
00962 int
00963 Btree::add_kt(bool found)
00964 {
00965     Assert(writable);
00966     int components = 0;
00967 
00968     /*
00969     {
00970         printf("%d) %s ", addcount++, (found ? "replacing" : "adding"));
00971         print_bytes(kt[I2] - K1 - C2, kt + I2 + K1); putchar('\n');
00972     }
00973     */
00974     alter();
00975 
00976     if (found) { /* replacement */
00977         seq_count = SEQ_START_POINT;
00978         sequential = false;
00979 
00980         byte * p = C[0].p;
00981         int c = C[0].c;
00982         Item item(p, c);
00983         int kt_size = kt.size();
00984         int needed = kt_size - item.size();
00985 
00986         components = Item(p, c).components_of();
00987 
00988         if (needed <= 0) {
00989             /* simple replacement */
00990             memmove(const_cast<byte *>(item.get_address()),
00991                     kt.get_address(), kt_size);
00992             SET_TOTAL_FREE(p, TOTAL_FREE(p) - needed);
00993         } else {
00994             /* new item into the block's freespace */
00995             int new_max = MAX_FREE(p) - kt_size;
00996             if (new_max >= 0) {
00997                 int o = DIR_END(p) + new_max;
00998                 memmove(p + o, kt.get_address(), kt_size);
00999                 SETD(p, c, o);
01000                 SET_MAX_FREE(p, new_max);
01001                 SET_TOTAL_FREE(p, TOTAL_FREE(p) - needed);
01002             } else {
01003                 /* do it the long way */
01004                 delete_item(0, false);
01005                 add_item(kt, 0);
01006             }
01007         }
01008     } else {
01009         /* addition */
01010         if (changed_n == C[0].n && changed_c == C[0].c) {
01011             if (seq_count < 0) seq_count++;
01012         } else {
01013             seq_count = SEQ_START_POINT;
01014             sequential = false;
01015         }
01016         C[0].c += D2;
01017         add_item(kt, 0);
01018     }
01019     return components;
01020 }
01021 
01022 /* delete_kt() corresponds to add_kt(found), but there are only
01023    two cases: if the key is not found nothing is done, and if it is
01024    found the corresponding item is deleted with delete_item.
01025 */
01026 
01027 int
01028 Btree::delete_kt()
01029 {
01030     Assert(writable);
01031 
01032     bool found = find(C);
01033 
01034     int components = 0;
01035     seq_count = SEQ_START_POINT;
01036     sequential = false;
01037 
01038     /*
01039     {
01040         printf("%d) %s ", addcount++, (found ? "deleting " : "ignoring "));
01041         print_bytes(B->kt[I2] - K1 - C2, B->kt + I2 + K1); putchar('\n');
01042     }
01043     */
01044     if (found) {
01045         components = Item(C[0].p, C[0].c).components_of();
01046         alter();
01047         delete_item(0, true);
01048     }
01049     return components;
01050 }
01051 
01052 /* Btree::form_key(key) treats address kt as an item holder and fills in
01053 the key part:
01054 
01055            (I) K key c (C tag)
01056 
01057 The bracketed parts are left blank. The key is filled in with key_len bytes and
01058 K set accordingly. c is set to 1.
01059 */
01060 
01061 void Btree::form_key(const string & key) const
01062 {
01063     kt.form_key(key);
01064 }
01065 
01066 /* Btree::add(key, tag) adds the key/tag item to the
01067    B-tree, replacing any existing item with the same key.
01068 
01069    For a long tag, we end up having to add m components, of the form
01070 
01071        key 1 m tag1
01072        key 2 m tag2
01073        ...
01074        key m m tagm
01075 
01076    and tag1+tag2+...+tagm are equal to tag. These in their turn may be replacing
01077    n components of the form
01078 
01079        key 1 n TAG1
01080        key 2 n TAG2
01081        ...
01082        key n n TAGn
01083 
01084    and n may be greater than, equal to, or less than m. These cases are dealt
01085    with in the code below. If m < n for example, we end up with a series of
01086    deletions.
01087 */
01088 
01089 bool
01090 Btree::add(const string &key, string tag)
01091 {
01092     DEBUGCALL(DB, bool, "Btree::add", key << ", " << tag);
01093     Assert(writable);
01094 
01095     if (key.size() > BTREE_MAX_KEY_LEN) {
01096         throw Xapian::InvalidArgumentError(
01097                 "Key too long: length was " +
01098                 om_tostring(key.size()) +
01099                 " bytes, maximum length of a key is " + 
01100                 STRINGIZE(BTREE_MAX_KEY_LEN) + " bytes");
01101     }
01102 
01103     form_key(key);
01104 
01105     // sort of matching kt.append_chunk(), but setting the chunk
01106     const size_t cd = kt.key().length() + K1 + I2 + C2 + C2;  // offset to the tag data
01107     const size_t L = max_item_size - cd; // largest amount of tag data for any chunk
01108     size_t first_L = L;                  // - amount for tag1
01109     bool found = find(C);
01110     if (!found) {
01111         byte * p = C[0].p;
01112         size_t n = TOTAL_FREE(p) % (max_item_size + D2);
01113         if (n > D2 + cd) {
01114             n -= (D2 + cd);
01115             // if n >= last then fully filling this block won't produce
01116             // an extra item, so we might as well do this even if
01117             // full_compaction isn't active.
01118             //
01119             // In the full_compaction case, it turns out we shouldn't always
01120             // try to fill every last byte.  Doing so can actually increase the
01121             // total space required (I believe this effect is due to longer
01122             // dividing keys being required in the index blocks).  Empirically,
01123             // n >= key.size() + K appears a good criterion for K ~= 34.  This
01124             // seems to save about 0.2% in total database size over always
01125             // splitting the tag.  It'll also give be slightly faster retrieval
01126             // as we can avoid reading an extra block occasionally.
01127             size_t last = tag.length() % L;
01128             if (n >= last || (full_compaction && n >= key.size() + 34))
01129                 first_L = n;
01130         }
01131     }
01132 
01133     // a null tag must be added in of course
01134     int m = tag.empty() ? 1 : (tag.length() - first_L + L - 1) / L + 1;
01135                                       // there are m items to add
01136     /* FIXME: sort out this error higher up and turn this into
01137      * an assert.
01138      */
01139     if (m >= BYTE_PAIR_RANGE) RETURN(false);
01140 
01141     int n = 0; // initialise to shut off warning
01142                                       // - and there will be n to delete
01143     int o = 0;                        // Offset into the tag
01144     size_t residue = tag.length();    // Bytes of the tag remaining to add in
01145     int replacement = false;          // Has there been a replacement ?
01146     int i;
01147     kt.set_components_of(m);
01148     for (i = 1; i <= m; i++) {
01149         size_t l = (i == m ? residue : (i == 1 ? first_L : L));
01150         Assert(cd + l <= block_size);
01151         Assert(string::size_type(o + l) <= tag.length());
01152         kt.set_tag(cd, tag.data() + o, l);
01153         kt.set_component_of(i);
01154         
01155         o += l;
01156         residue -= l;
01157 
01158         if (i > 1) found = find(C);
01159         n = add_kt(found);
01160         if (n > 0) replacement = true;
01161     }
01162     /* o == tag.length() here, and n may be zero */
01163     for (i = m + 1; i <= n; i++) {
01164         kt.set_component_of(i);
01165         delete_kt();
01166     }
01167     if (!replacement) ++item_count;
01168     Btree_modified = true;
01169     RETURN(true);
01170 }
01171 
01172 /* Btree::del(key) returns false if the key is not in the B-tree,
01173    otherwise deletes it and returns true.
01174 
01175    Again, this is parallel to Btree::add, but simpler in form.
01176 */
01177 
01178 bool
01179 Btree::del(const string &key)
01180 {
01181     DEBUGCALL(DB, bool, "Btree::del", key);
01182     Assert(writable);
01183 
01184     // We can't delete a key which we is too long for us to store.
01185     if (key.size() > BTREE_MAX_KEY_LEN) RETURN(false);
01186 
01187     if (key.empty()) RETURN(false);
01188     form_key(key);
01189 
01190     int n = delete_kt();  /* there are n items to delete */
01191     if (n <= 0) RETURN(false);
01192 
01193     for (int i = 2; i <= n; i++) {
01194         kt.set_component_of(i);
01195         delete_kt();
01196     }
01197 
01198     item_count--;
01199     Btree_modified = true;
01200     RETURN(true);
01201 }
01202 
01203 bool
01204 Btree::get_exact_entry(const string &key, string & tag) const
01205 {
01206     DEBUGCALL(DB, bool, "Btree::get_exact_entry", key << ", " << tag);
01207     Assert(!key.empty());
01208 
01209     // An oversized key can't exist, so attempting to search for it should fail.
01210     if (key.size() > BTREE_MAX_KEY_LEN) RETURN(false);
01211 
01212     RETURN(find_tag(key, &tag));
01213 }
01214 
01215 bool
01216 Btree::find_tag(const string &key, string * tag) const
01217 {
01218     DEBUGCALL(DB, bool, "Btree::find_tag", key << ", &tag");
01219     form_key(key);
01220     if (!find(C)) RETURN(false);
01221 
01222     read_tag(C, tag);
01223     RETURN(true);
01224 }
01225 
01226 void
01227 Btree::read_tag(Cursor * C_, string *tag) const
01228 {
01229     Item item(C_[0].p, C_[0].c);
01230 
01231     /* n components to join */
01232     int n = item.components_of();
01233 
01234     tag->resize(0);
01235     // max_item_size also includes K1 + I2 + C2 + C2 bytes overhead and the key
01236     // (which is at least 1 byte long).
01237     if (n > 1) tag->reserve((max_item_size - (1 + K1 + I2 + C2 + C2)) * n);
01238 
01239     item.append_chunk(tag);
01240 
01241     for (int i = 2; i <= n; i++) {
01242         if (!next(C_, 0)) {
01243             throw Xapian::DatabaseCorruptError("Unexpected end of table when reading continuation of tag");
01244         }
01245         (void)Item(C_[0].p, C_[0].c).append_chunk(tag);
01246     }
01247     // At this point the cursor is on the last item - calling next will move
01248     // it to the next key (Bcursor::get_tag() relies on this).
01249 }
01250 
01251 void
01252 Btree::set_full_compaction(bool parity)
01253 {
01254     Assert(writable);
01255 
01256     if (parity) seq_count = 0;
01257     full_compaction = parity;
01258 }
01259 
01260 Bcursor * Btree::cursor_get() const {
01261     // FIXME Ick - casting away const is nasty
01262     return new Bcursor(const_cast<Btree *>(this));
01263 }
01264 
01265 /************ B-tree opening and closing ************/
01266 
01267 bool
01268 Btree::basic_open(bool revision_supplied, quartz_revision_number_t revision_)
01269 {
01270     int ch = 'X'; /* will be 'A' or 'B' */
01271 
01272     {
01273         const size_t BTREE_BASES = 2;
01274         string err_msg;
01275         static const char basenames[BTREE_BASES] = { 'A', 'B' };
01276 
01277         Btree_base bases[BTREE_BASES];
01278         bool base_ok[BTREE_BASES];
01279 
01280         both_bases = true;
01281         bool valid_base = false;
01282         {
01283             for (size_t i = 0; i < BTREE_BASES; ++i) {
01284                 bool ok = bases[i].read(name, basenames[i], err_msg);
01285                 base_ok[i] = ok;
01286                 if (ok) {
01287                     valid_base = true;
01288                 } else {
01289                     both_bases = false;
01290                 }
01291             }
01292         }
01293 
01294         if (!valid_base) {
01295             string message = "Error opening table `";
01296             message += name;
01297             message += "':\n";
01298             message += err_msg;
01299             throw Xapian::DatabaseOpeningError(message);
01300         }
01301 
01302         if (revision_supplied) {
01303             bool found_revision = false;
01304             for (size_t i = 0; i < BTREE_BASES; ++i) {
01305                 if (base_ok[i] && bases[i].get_revision() == revision_) {
01306                     ch = basenames[i];
01307                     found_revision = true;
01308                     break;
01309                 }
01310             }
01311             if (!found_revision) {
01312                 /* Couldn't open the revision that was asked for.
01313                  * This shouldn't throw an exception, but should just return
01314                  * false to upper levels.
01315                  */
01316                 return false;
01317             }
01318         } else {
01319             quartz_revision_number_t highest_revision = 0;
01320             for (size_t i = 0; i < BTREE_BASES; ++i) {
01321                 if (base_ok[i] && bases[i].get_revision() >= highest_revision) {
01322                     ch = basenames[i];
01323                     highest_revision = bases[i].get_revision();
01324                 }
01325             }
01326         }
01327 
01328         Btree_base *basep = 0;
01329         Btree_base *other_base = 0;
01330 
01331         for (size_t i = 0; i < BTREE_BASES; ++i) {
01332             DEBUGLINE(UNKNOWN, "Checking (ch == " << ch << ") against "
01333                       "basenames[" << i << "] == " << basenames[i]);
01334             DEBUGLINE(UNKNOWN, "bases[" << i << "].get_revision() == " <<
01335                       bases[i].get_revision());
01336             DEBUGLINE(UNKNOWN, "base_ok[" << i << "] == " << base_ok[i]);
01337             if (ch == basenames[i]) {
01338                 basep = &bases[i];
01339 
01340                 // FIXME: assuming only two bases for other_base
01341                 size_t otherbase_num = 1-i;
01342                 if (base_ok[otherbase_num]) {
01343                     other_base = &bases[otherbase_num];
01344                 }
01345                 break;
01346             }
01347         }
01348         Assert(basep);
01349 
01350         /* basep now points to the most recent base block */
01351 
01352         /* Avoid copying the bitmap etc. - swap contents with the base
01353          * object in the vector, since it'll be destroyed anyway soon.
01354          */
01355         base.swap(*basep);
01356 
01357         revision_number =  base.get_revision();
01358         block_size =       base.get_block_size();
01359         root =             base.get_root();
01360         level =            base.get_level();
01361         //bit_map_size =     basep->get_bit_map_size();
01362         item_count =       base.get_item_count();
01363         faked_root_block = base.get_have_fakeroot();
01364         sequential =       base.get_sequential();
01365 
01366         if (other_base != 0) {
01367             latest_revision_number = other_base->get_revision();
01368             if (revision_number > latest_revision_number)
01369                 latest_revision_number = revision_number;
01370         } else {
01371             latest_revision_number = revision_number;
01372         }
01373     }
01374 
01375     /* kt holds constructed items as well as keys */
01376     kt = Item_wr(zeroed_new(block_size));
01377     if (kt.get_address() == 0) {
01378         throw std::bad_alloc();
01379     }
01380 
01381     set_max_item_size(BLOCK_CAPACITY);
01382 
01383     base_letter = ch;
01384 
01385     /* ready to open the main file */
01386 
01387     return true;
01388 }
01389 
01390 void
01391 Btree::read_root()
01392 {
01393     if (faked_root_block) {
01394         /* root block for an unmodified database. */
01395         byte * p = C[0].p;
01396         Assert(p);
01397 
01398         /* clear block - shouldn't be neccessary, but is a bit nicer,
01399          * and means that the same operations should always produce
01400          * the same database. */
01401         memset(p, 0, block_size);
01402 
01403         int o = block_size - I2 - K1 - C2 - C2;
01404         Item_wr(p + o).fake_root_item();
01405 
01406         SETD(p, DIR_START, o);         // its directory entry
01407         SET_DIR_END(p, DIR_START + D2);// the directory size
01408 
01409         o -= (DIR_START + D2);
01410         SET_MAX_FREE(p, o);
01411         SET_TOTAL_FREE(p, o);
01412         SET_LEVEL(p, 0);
01413 
01414         if (!writable) {
01415             /* reading - revision number doesn't matter as long as
01416              * it's not greater than the current one. */
01417             SET_REVISION(p, 0);
01418             C[0].n = 0;
01419         } else {
01420             /* writing - */
01421             SET_REVISION(p, latest_revision_number + 1);
01422             C[0].n = base.next_free_block();
01423         }
01424     } else {
01425         /* using a root block stored on disk */
01426         block_to_cursor(C, level, root);
01427 
01428         if (REVISION(C[level].p) > revision_number) set_overwritten();
01429         /* although this is unlikely */
01430     }
01431 }
01432 
01433 bool
01434 Btree::do_open_to_write(bool revision_supplied, quartz_revision_number_t revision_)
01435 {
01436     /* FIXME: do the exception safety the right way, by making all the
01437      * parts into sensible objects.
01438      */
01439     if (!basic_open(revision_supplied, revision_)) {
01440         if (!revision_supplied) {
01441             throw Xapian::DatabaseOpeningError("Failed to open for writing");
01442         }
01443         /* When the revision is supplied, it's not an exceptional
01444          * case when open failed, so we just return false here.
01445          */
01446         return false;
01447     }
01448 
01449     writable = true;
01450 
01451     handle = sys_open_for_readwrite(name + "DB");
01452 
01453     prev_ptr = &Btree::prev_default;
01454     next_ptr = &Btree::next_default;
01455 
01456     for (int j = 0; j <= level; j++) {
01457         C[j].n = BLK_UNUSED;
01458         C[j].p = new byte[block_size];
01459         if (C[j].p == 0) {
01460             throw std::bad_alloc();
01461         }
01462     }
01463     split_p = new byte[block_size];
01464     if (split_p == 0) {
01465         throw std::bad_alloc();
01466     }
01467     read_root();
01468 
01469     buffer = zeroed_new(block_size);
01470     if (buffer == 0) {
01471         throw std::bad_alloc();
01472     }
01473 
01474     // swap for writing
01475     other_base_letter = base_letter == 'A' ? 'B' : 'A';
01476 
01477     changed_n = 0;
01478     changed_c = DIR_START;
01479     seq_count = SEQ_START_POINT;
01480 
01481     return true;
01482 }
01483 
01484 Btree::Btree(string path_, bool readonly_)
01485         : revision_number(0),
01486           item_count(0),
01487           block_size(0),
01488           latest_revision_number(0),
01489           both_bases(false),
01490           base_letter('A'),
01491           faked_root_block(true),
01492           sequential(true),
01493           handle(-1),
01494           level(0),
01495           root(0),
01496           kt(0),
01497           buffer(0),
01498           base(),
01499           other_base_letter(0),
01500           name(path_),
01501           seq_count(0),
01502           changed_n(0),
01503           changed_c(0),
01504           max_item_size(0),
01505           Btree_modified(false),
01506           full_compaction(false),
01507           writable(!readonly_),
01508           dont_close_handle(false),
01509           split_p(0)
01510 {
01511     DEBUGCALL(DB, void, "Btree::Btree", path_ << ", " << readonly_);
01512 }
01513 
01514 bool
01515 Btree::exists() const {
01516     DEBUGCALL(DB, bool, "Btree::exists", "");
01517     return (file_exists(name + "DB") &&
01518             (file_exists(name + "baseA") || file_exists(name + "baseB")));
01519 }
01520 
01524 void
01525 sys_unlink_if_exists(const string & filename)
01526 {
01527     if (unlink(filename) == -1) {
01528         if (errno == ENOENT) return;
01529         throw Xapian::DatabaseError("Can't delete file: `" + filename +
01530                               "': " + strerror(errno));
01531     }
01532 }
01533 
01534 void
01535 Btree::create(unsigned int block_size_)
01536 {
01537     DEBUGCALL(DB, void, "Btree::create", block_size_);
01538     close();
01539 
01540     // Block size must in the range 2048..BYTE_PAIR_RANGE, and a power of two.
01541     if (block_size_ < 2048 || block_size_ > BYTE_PAIR_RANGE ||
01542         (block_size_ & (block_size_ - 1)) != 0) {
01543         block_size_ = 8192;
01544     }
01545 
01546     // FIXME: it would be good to arrange that this works such that there's
01547     // always a valid table in place...
01548 
01549     /* write initial values to files */
01550 
01551     /* create the base file */
01552     Btree_base base_;
01553     base_.set_block_size(block_size_);
01554     base_.set_have_fakeroot(true);
01555     base_.set_sequential(true);
01556     base_.write_to_file(name + "baseA");
01557 
01558     /* remove the alternative base file, if any */
01559     sys_unlink_if_exists(name + "baseB");
01560 
01561     /* create the main file */
01562     int h = sys_open_to_write(name + "DB");
01563     if (h == -1 || !sys_close(h)) {
01564         string message = "Error creating DB file: ";
01565         message += strerror(errno);
01566         throw Xapian::DatabaseOpeningError(message);
01567     }
01568 }
01569 
01570 Btree::~Btree() {
01571     DEBUGCALL(DB, void, "Btree::~Btree", "");
01572     Btree::close();
01573 }
01574 
01575 void Btree::close() {
01576     DEBUGCALL(DB, void, "Btree::close", "");
01577 
01578     if (handle != -1) {
01579         // If an error occurs here, we just ignore it, since we're just
01580         // trying to free everything.
01581         if (!dont_close_handle) (void)sys_close(handle);
01582         handle = -1;
01583     }
01584 
01585     for (int j = level; j >= 0; j--) {
01586         delete [] C[j].p;
01587         C[j].p = 0;
01588     }
01589     delete [] split_p;
01590     split_p = 0;
01591 
01592     delete [] kt.get_address();
01593     kt = 0;
01594     delete [] buffer;
01595     buffer = 0;
01596 }
01597 
01598 void
01599 Btree::commit(quartz_revision_number_t revision)
01600 {
01601     DEBUGCALL(DB, void, "Btree::commit", revision);
01602     Assert(writable);
01603 
01604     if (revision <= revision_number) {
01605         throw Xapian::DatabaseError("New revision too low");
01606     }
01607 
01608     // FIXME: this doesn't work (probably because the table revisions get
01609     // out of step) but it's wasteful to keep applying changes to value
01610     // and position if they're never used...
01611     //
01612     // if (!Btree_modified) return;
01613 
01614     for (int j = level; j >= 0; j--) {
01615         if (C[j].rewrite) {
01616             write_block(C[j].n, C[j].p);
01617         }
01618     }
01619 
01620     if (!sys_flush(handle)) {
01621         if (!dont_close_handle) (void)sys_close(handle);
01622         handle = -1;
01623         throw Xapian::DatabaseError("Can't commit new revision - failed to close DB");
01624     }
01625 
01626     if (Btree_modified) {
01627         faked_root_block = false;
01628     }
01629 
01630     if (faked_root_block) {
01631         /* We will use a dummy bitmap. */
01632         base.clear_bit_map();
01633     }
01634 
01635     base.set_revision(revision);
01636     base.set_root(C[level].n);
01637     base.set_level(level);
01638     base.set_item_count(item_count);
01639     base.set_have_fakeroot(faked_root_block);
01640     base.set_sequential(sequential);
01641 
01642     {
01643         int tmp = base_letter;
01644         base_letter = other_base_letter;
01645         other_base_letter = tmp;
01646     }
01647     both_bases = true;
01648     latest_revision_number = revision_number = revision;
01649     root = C[level].n;
01650 
01651     Btree_modified = false;
01652 
01653     for (int i = 0; i < BTREE_CURSOR_LEVELS; ++i) {
01654         C[i].n = BLK_UNUSED;
01655         C[i].c = -1;
01656         C[i].rewrite = false;
01657     }
01658 
01659     base.write_to_file(name + "base" + char(base_letter));
01660     // Save to "<table>.tmp" and then rename to "<table>.base<letter>" so that
01661     // a reader can't try to read a partially written base file.
01662     string tmp = name;
01663     tmp += "tmp";
01664     string basefile = name;
01665     basefile += "base";
01666     basefile += char(base_letter);
01667     base.write_to_file(tmp);
01668 #if defined __WIN32__
01669     if (msvc_posix_rename(tmp.c_str(), basefile.c_str()) < 0) {
01670 #else
01671     if (rename(tmp.c_str(), basefile.c_str()) < 0) {
01672 #endif
01673         // With NFS, rename() failing may just mean that the server crashed
01674         // after successfully renaming, but before reporting this, and then
01675         // the retried operation fails.  So we need to check if the source
01676         // file still exists, which we do by calling unlink(), since we want
01677         // to remove the temporary file anyway.
01678         int saved_errno = errno;
01679         if (unlink(tmp) == 0 || errno != ENOENT) {
01680             string msg("Couldn't update base file ");
01681             msg += basefile;
01682             msg += ": ";
01683             msg += strerror(saved_errno);
01684             throw Xapian::DatabaseError(msg);
01685         }
01686     }
01687     base.commit();
01688 
01689     read_root();
01690 
01691     changed_n = 0;
01692     changed_c = DIR_START;
01693     seq_count = SEQ_START_POINT;
01694 }
01695 
01696 void
01697 Btree::cancel()
01698 {
01699     DEBUGCALL(DB, void, "Btree::cancel", "");
01700     Assert(writable);
01701 
01702     // This causes problems: if (!Btree_modified) return;
01703 
01704     string err_msg;
01705     if (!base.read(name, base_letter, err_msg)) {
01706         throw Xapian::DatabaseCorruptError(string("Couldn't reread base ") + char(base_letter));
01707     }
01708 
01709     revision_number =  base.get_revision();
01710     block_size =       base.get_block_size();
01711     root =             base.get_root();
01712     level =            base.get_level();
01713     //bit_map_size =     basep->get_bit_map_size();
01714     item_count =       base.get_item_count();
01715     faked_root_block = base.get_have_fakeroot();
01716     sequential =       base.get_sequential();
01717 
01718     latest_revision_number = revision_number; // FIXME: we can end up reusing a revision if we opened a btree at an older revision, start to modify it, then cancel...
01719 
01720     prev_ptr = &Btree::prev_default;
01721     next_ptr = &Btree::next_default;
01722 
01723     for (int j = 0; j <= level; j++) {
01724         C[j].n = BLK_UNUSED;
01725         C[j].rewrite = false;
01726     }
01727     read_root();
01728 
01729     changed_n = 0;
01730     changed_c = DIR_START;
01731     seq_count = SEQ_START_POINT;
01732 }
01733 
01734 /************ B-tree reading ************/
01735 
01736 bool
01737 Btree::do_open_to_read(bool revision_supplied, quartz_revision_number_t revision_)
01738 {
01739     if (!basic_open(revision_supplied, revision_)) {
01740         if (revision_supplied) {
01741             // The requested revision was not available.
01742             // This could be because the database was modified underneath us, or
01743             // because a base file is missing.  Return false, and work out what
01744             // the problem was at a higher level.
01745             return false;
01746         }
01747         throw Xapian::DatabaseOpeningError("Failed to open table for reading");
01748     }
01749 
01750     handle = sys_open_to_read(name + "DB");
01751 
01752     if (sequential) {
01753         prev_ptr = &Btree::prev_for_sequential;
01754         next_ptr = &Btree::next_for_sequential;
01755     } else {
01756         prev_ptr = &Btree::prev_default;
01757         next_ptr = &Btree::next_default;
01758     }
01759 
01760     for (int j = 0; j <= level; j++) {
01761         C[j].n = BLK_UNUSED;
01762         C[j].p = new byte[block_size];
01763         if (C[j].p == 0) {
01764             throw std::bad_alloc();
01765         }
01766     }
01767 
01768     read_root();
01769     return true;
01770 }
01771 
01772 void
01773 Btree::open()
01774 {
01775     DEBUGCALL(DB, void, "Btree::open", "");
01776     DEBUGLINE(DB, "opening at path " << name);
01777     close();
01778 
01779     if (!writable) {
01780         // Any errors are thrown if revision_supplied is false
01781         (void)do_open_to_read(false, 0);
01782         return;
01783     }
01784 
01785     // Any errors are thrown if revision_supplied is false
01786     (void)do_open_to_write(false, 0);
01787 }
01788 
01789 bool
01790 Btree::open(quartz_revision_number_t revision)
01791 {
01792     DEBUGCALL(DB, bool, "Btree::open", revision);
01793     DEBUGLINE(DB, "opening for particular revision at path " << name);
01794     close();
01795 
01796     if (!writable) {
01797         if (do_open_to_read(true, revision)) {
01798             AssertEq(revision_number, revision);
01799             RETURN(true);
01800         } else {
01801             close();
01802             RETURN(false);
01803         }
01804     }
01805 
01806     if (!do_open_to_write(true, revision)) {
01807         // Can't open at the requested revision.
01808         close();
01809         RETURN(false);
01810     }
01811 
01812     AssertEq(revision_number, revision);
01813     RETURN(true);
01814 }
01815 
01816 bool
01817 Btree::prev_for_sequential(Cursor * C_, int /*dummy*/) const
01818 {
01819     int c = C_[0].c;
01820     if (c == DIR_START) {
01821         byte * p = C_[0].p;
01822         Assert(p);
01823         uint4 n = C_[0].n;
01824         while (true) {
01825             if (n == 0) return false;
01826             n--;
01827             if (writable) {
01828                 if (n == C[0].n) {
01829                     // Block is a leaf block in the built-in cursor
01830                     // (potentially in modified form).
01831                     memcpy(p, C[0].p, block_size);
01832                 } else {
01833                     // Blocks in the built-in cursor may not have been written
01834                     // to disk yet, so we have to check that the block number
01835                     // isn't in the built-in cursor or we'll read an
01836                     // uninitialised block (for which GET_LEVEL(p) will
01837                     // probably return 0).
01838                     int j;
01839                     for (j = 1; j <= level; ++j) {
01840                         if (n == C[j].n) break;
01841                     }
01842                     if (j <= level) continue;
01843 
01844                     // Block isn't in the built-in cursor, so the form on disk
01845                     // is valid, so read it to check if it's the next level 0
01846                     // block.
01847                     read_block(n, p);
01848                 }
01849             } else {
01850                 read_block(n, p);
01851             }
01852             if (REVISION(p) > 1) {
01853                 set_overwritten();
01854                 return false;
01855             }
01856             if (GET_LEVEL(p) == 0) break;
01857         }
01858         c = DIR_END(p);
01859         C_[0].n = n;
01860     }
01861     c -= D2;
01862     C_[0].c = c;
01863     return true;
01864 }
01865 
01866 bool
01867 Btree::next_for_sequential(Cursor * C_, int /*dummy*/) const
01868 {
01869     byte * p = C_[0].p;
01870     Assert(p);
01871     int c = C_[0].c;
01872     c += D2;
01873     if (c == DIR_END(p)) {
01874         uint4 n = C_[0].n;
01875         while (true) {
01876             n++;
01877             if (n > base.get_last_block()) return false;
01878             if (writable) {
01879                 if (n == C[0].n) {
01880                     // Block is a leaf block in the built-in cursor
01881                     // (potentially in modified form).
01882                     memcpy(p, C[0].p, block_size);
01883                 } else {
01884                     // Blocks in the built-in cursor may not have been written
01885                     // to disk yet, so we have to check that the block number
01886                     // isn't in the built-in cursor or we'll read an
01887                     // uninitialised block (for which GET_LEVEL(p) will
01888                     // probably return 0).
01889                     int j;
01890                     for (j = 1; j <= level; ++j) {
01891                         if (n == C[j].n) break;
01892                     }
01893                     if (j <= level) continue;
01894 
01895                     // Block isn't in the built-in cursor, so the form on disk
01896                     // is valid, so read it to check if it's the next level 0
01897                     // block.
01898                     read_block(n, p);
01899                 }
01900             } else {
01901                 read_block(n, p);
01902             }
01903             if (REVISION(p) > 1) {
01904                 set_overwritten();
01905                 return false;
01906             }
01907             if (GET_LEVEL(p) == 0) break;
01908         }
01909         c = DIR_START;
01910         C_[0].n = n;
01911     }
01912     C_[0].c = c;
01913     return true;
01914 }
01915 
01916 bool
01917 Btree::prev_default(Cursor * C_, int j) const
01918 {
01919     byte * p = C_[j].p;
01920     int c = C_[j].c;
01921     Assert(c >= DIR_START);
01922     Assert((unsigned)c < block_size);
01923     Assert(c <= DIR_END(p));
01924     if (c == DIR_START) {
01925         if (j == level) return false;
01926         if (!prev_default(C_, j + 1)) return false;
01927         c = DIR_END(p);
01928     }
01929     c -= D2;
01930     C_[j].c = c;
01931     if (j > 0) {
01932         block_to_cursor(C_, j - 1, Item(p, c).block_given_by());
01933     }
01934     return true;
01935 }
01936 
01937 bool
01938 Btree::next_default(Cursor * C_, int j) const
01939 {
01940     byte * p = C_[j].p;
01941     int c = C_[j].c;
01942     Assert(c >= DIR_START);
01943     c += D2;
01944     Assert((unsigned)c < block_size);
01945     // Sometimes c can be DIR_END(p) + 2 here it appears...
01946     if (c > DIR_END(p)) c = DIR_END(p);
01947     Assert(c <= DIR_END(p));
01948     if (c == DIR_END(p)) {
01949         if (j == level) return false;
01950         if (!next_default(C_, j + 1)) return false;
01951         c = DIR_START;
01952     }
01953     C_[j].c = c;
01954     if (j > 0) {
01955         block_to_cursor(C_, j - 1, Item(p, c).block_given_by());
01956 #ifdef BTREE_DEBUG_FULL
01957         printf("Block in Btree:next_default");
01958         report_block_full(j - 1, C_[j - 1].n, C_[j - 1].p);
01959 #endif /* BTREE_DEBUG_FULL */
01960     }
01961     return true;
01962 }
01963 
01979 bool Key::operator<(Key key2) const
01980 {
01981     DEBUGCALL(DB, bool, "Key::operator<", reinterpret_cast<const void*>(key2.p));
01982     int key1_len = length();
01983     int key2_len = key2.length();
01984     if (key1_len == key2_len) {
01985         // The keys are the same length, so we can compare the counts
01986         // in the same operation since they're stored as 2 byte
01987         // bigendian numbers.
01988         RETURN(memcmp(p + K1, key2.p + K1, key1_len + C2) < 0);
01989     }
01990 
01991     int k_smaller = (key2_len < key1_len ? key2_len : key1_len);
01992 
01993     // Compare the common part of the keys
01994     int diff = memcmp(p + K1, key2.p + K1, k_smaller);
01995     if (diff != 0) RETURN(diff < 0);
01996 
01997     // We dealt with the "same length" case above so we never need to check
01998     // the count here.
01999     RETURN(key1_len < key2_len);
02000 }
02001 
02002 bool Key::operator==(Key key2) const
02003 {
02004     DEBUGCALL(DB, bool, "Key::operator==", reinterpret_cast<const void*>(key2.p));
02005     int key1_len = length();
02006     if (key1_len != key2.length()) return false;
02007     // The keys are the same length, so we can compare the counts
02008     // in the same operation since they're stored as 2 byte
02009     // bigendian numbers.
02010     RETURN(memcmp(p + K1, key2.p + K1, key1_len + C2) == 0);
02011 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.