Main Page | Directories | File List

pager.c

00001 /*
00002 ** 2001 September 15
00003 **
00004 ** The author disclaims copyright to this source code.  In place of
00005 ** a legal notice, here is a blessing:
00006 **
00007 **    May you do good and not evil.
00008 **    May you find forgiveness for yourself and forgive others.
00009 **    May you share freely, never taking more than you give.
00010 **
00011 *************************************************************************
00012 ** This is the implementation of the page cache subsystem or "pager".
00013 ** 
00014 ** The pager is used to access a database disk file.  It implements
00015 ** atomic commit and rollback through the use of a journal file that
00016 ** is separate from the database file.  The pager also implements file
00017 ** locking to prevent two processes from writing the same database
00018 ** file simultaneously, or one process from reading the database while
00019 ** another is writing.
00020 **
00021 ** @(#) $Id: pager.c,v 1.101.2.1 2005/12/19 17:37:10 drh Exp $
00022 */
00023 #include "os.h"         /* Must be first to enable large file support */
00024 #include "sqliteInt.h"
00025 #include "pager.h"
00026 #include <assert.h>
00027 #include <string.h>
00028 
/*
** Debugging aids for the pager.  They are compiled out by default;
** change the "#if 0" below to "#if 1" to activate them.
**
** When active, the first pager handed to SET_PAGER() is remembered in
** mainPager, CLR_PAGER() forgets it again, and the TRACEn() macros
** print to stderr only while the local variable named pPager refers to
** that remembered pager.  When inactive, every macro expands to nothing.
*/
#if 0
static Pager *mainPager = 0;
# define SET_PAGER(X)   if( mainPager==0 ) mainPager = (X)
# define CLR_PAGER(X)   if( mainPager==(X) ) mainPager = 0
# define TRACE1(X)      if( pPager==mainPager ) fprintf(stderr,X)
# define TRACE2(X,Y)    if( pPager==mainPager ) fprintf(stderr,X,Y)
# define TRACE3(X,Y,Z)  if( pPager==mainPager ) fprintf(stderr,X,Y,Z)
#else
# define SET_PAGER(X)
# define CLR_PAGER(X)
# define TRACE1(X)
# define TRACE2(X,Y)
# define TRACE3(X,Y,Z)
#endif
00046 
00047 
/*
** Lock states of the page cache.  Pager.state always holds exactly one
** of these values, and the numeric order matters because the code
** compares states with < and >=.
**
**   SQLITE_UNLOCK      Initial state.  The database file is neither
**                      being read nor written and no data is held in
**                      memory.
**
**   SQLITE_READLOCK    The database is being read.  Writing is not
**                      permitted.  Several readers may access the same
**                      database file at once.
**
**   SQLITE_WRITELOCK   The database is being written.  Access is
**                      exclusive: no other process or thread may read
**                      or write while one process is writing.
**
** Transitions:
**
**   UNLOCK    -> READLOCK   when the first page is fetched.
**   READLOCK  -> UNLOCK     when the last outstanding page is released.
**   READLOCK  -> WRITELOCK  on the first request to write a page.  A
**                           page must already be outstanding to be
**                           written, so the pager is always in
**                           READLOCK before it reaches WRITELOCK.
**   WRITELOCK -> READLOCK   on commit or rollback.
*/
#define SQLITE_UNLOCK      0    /* Nothing locked, nothing cached */
#define SQLITE_READLOCK    1    /* Shared access for reading */
#define SQLITE_WRITELOCK   2    /* Exclusive access for writing */
00081 
00082 
00083 /*
00084 ** Each in-memory image of a page begins with the following header.
00085 ** This header is only visible to this pager module.  The client
00086 ** code that calls pager sees only the data that follows the header.
00087 **
00088 ** Client code should call sqlitepager_write() on a page prior to making
00089 ** any modifications to that page.  The first time sqlitepager_write()
00090 ** is called, the original page contents are written into the rollback
00091 ** journal and PgHdr.inJournal and PgHdr.needSync are set.  Later, once
00092 ** the journal page has made it onto the disk surface, PgHdr.needSync
00093 ** is cleared.  The modified page cannot be written back into the original
00094 ** database file until the journal pages has been synced to disk and the
00095 ** PgHdr.needSync has been cleared.
00096 **
00097 ** The PgHdr.dirty flag is set when sqlitepager_write() is called and
00098 ** is cleared again when the page content is written back to the original
00099 ** database file.
00100 */
00101 typedef struct PgHdr PgHdr;
00102 struct PgHdr {
00103   Pager *pPager;                 /* The pager to which this page belongs */
00104   Pgno pgno;                     /* The page number for this page */
00105   PgHdr *pNextHash, *pPrevHash;  /* Hash collision chain for PgHdr.pgno */
00106   int nRef;                      /* Number of users of this page */
00107   PgHdr *pNextFree, *pPrevFree;  /* Freelist of pages where nRef==0 */
00108   PgHdr *pNextAll, *pPrevAll;    /* A list of all pages */
00109   PgHdr *pNextCkpt, *pPrevCkpt;  /* List of pages in the checkpoint journal */
00110   u8 inJournal;                  /* TRUE if has been written to journal */
00111   u8 inCkpt;                     /* TRUE if written to the checkpoint journal */
00112   u8 dirty;                      /* TRUE if we need to write back changes */
00113   u8 needSync;                   /* Sync journal before writing this page */
00114   u8 alwaysRollback;             /* Disable dont_rollback() for this page */
00115   PgHdr *pDirty;                 /* Dirty pages sorted by PgHdr.pgno */
00116   /* SQLITE_PAGE_SIZE bytes of page data follow this header */
00117   /* Pager.nExtra bytes of local data follow the page data */
00118 };
00119 
00120 
/*
** Invoke the page codec, if one is installed.
**
**   P   Pager whose xCodec/pCodecArg members are consulted
**   D   Pointer to the page data handed to the codec
**   N   Page number handed to the codec
**   X   Operation code handed to the codec as its last argument
**
** Every argument is parenthesized in the expansion so that arbitrary
** expressions can be passed safely.  P is evaluated more than once, so
** it must not have side effects.  Without SQLITE_HAS_CODEC the macro
** expands to nothing and none of its arguments are evaluated.
*/
#ifdef SQLITE_HAS_CODEC
# define CODEC(P,D,N,X) \
    if( (P)->xCodec ){ (P)->xCodec((P)->pCodecArg,(D),(N),(X)); }
#else
# define CODEC(P,D,N,X)
#endif
00129 
/*
** Conversions between a page header and the memory that follows it.
**
**   PGHDR_TO_DATA(P)   page data, which starts right after the header
**   DATA_TO_PGHDR(D)   header that sits right in front of page data D
**   PGHDR_TO_EXTRA(P)  extra space, SQLITE_PAGE_SIZE bytes past the data
*/
#define PGHDR_TO_DATA(P)  ((void*)((P)+1))
#define DATA_TO_PGHDR(D)  (((PgHdr*)(D))-1)
#define PGHDR_TO_EXTRA(P) ((void*)(((char*)((P)+1))+SQLITE_PAGE_SIZE))
00137 
/*
** Number of buckets in the hash table that locates in-memory pages by
** page number.  pager_hash() reduces a page number with a bit mask, so
** this value has to be a power of two.
*/
#define N_PG_HASH 2048

/*
** Map page number PN to its bucket index in the range 0..N_PG_HASH-1.
*/
#define pager_hash(PN)  ((N_PG_HASH-1)&(PN))
00148 
00149 /*
00150 ** A open page cache is an instance of the following structure.
00151 */
00152 struct Pager {
00153   char *zFilename;            /* Name of the database file */
00154   char *zJournal;             /* Name of the journal file */
00155   char *zDirectory;           /* Directory hold database and journal files */
00156   OsFile fd, jfd;             /* File descriptors for database and journal */
00157   OsFile cpfd;                /* File descriptor for the checkpoint journal */
00158   int dbSize;                 /* Number of pages in the file */
00159   int origDbSize;             /* dbSize before the current change */
00160   int ckptSize;               /* Size of database (in pages) at ckpt_begin() */
00161   off_t ckptJSize;            /* Size of journal at ckpt_begin() */
00162   int nRec;                   /* Number of pages written to the journal */
00163   u32 cksumInit;              /* Quasi-random value added to every checksum */
00164   int ckptNRec;               /* Number of records in the checkpoint journal */
00165   int nExtra;                 /* Add this many bytes to each in-memory page */
00166   void (*xDestructor)(void*); /* Call this routine when freeing pages */
00167   int nPage;                  /* Total number of in-memory pages */
00168   int nRef;                   /* Number of in-memory pages with PgHdr.nRef>0 */
00169   int mxPage;                 /* Maximum number of pages to hold in cache */
00170   int nHit, nMiss, nOvfl;     /* Cache hits, missing, and LRU overflows */
00171   void (*xCodec)(void*,void*,Pgno,int); /* Routine for en/decoding data */
00172   void *pCodecArg;            /* First argument to xCodec() */
00173   u8 journalOpen;             /* True if journal file descriptors is valid */
00174   u8 journalStarted;          /* True if header of journal is synced */
00175   u8 useJournal;              /* Use a rollback journal on this file */
00176   u8 ckptOpen;                /* True if the checkpoint journal is open */
00177   u8 ckptInUse;               /* True we are in a checkpoint */
00178   u8 ckptAutoopen;            /* Open ckpt journal when main journal is opened*/
00179   u8 noSync;                  /* Do not sync the journal if true */
00180   u8 fullSync;                /* Do extra syncs of the journal for robustness */
00181   u8 state;                   /* SQLITE_UNLOCK, _READLOCK or _WRITELOCK */
00182   u8 errMask;                 /* One of several kinds of errors */
00183   u8 tempFile;                /* zFilename is a temporary file */
00184   u8 readOnly;                /* True for a read-only database */
00185   u8 needSync;                /* True if an fsync() is needed on the journal */
00186   u8 dirtyFile;               /* True if database file has changed in any way */
00187   u8 alwaysRollback;          /* Disable dont_rollback() for all pages */
00188   u8 *aInJournal;             /* One bit for each page in the database file */
00189   u8 *aInCkpt;                /* One bit for each page in the database */
00190   PgHdr *pFirst, *pLast;      /* List of free pages */
00191   PgHdr *pFirstSynced;        /* First free page with PgHdr.needSync==0 */
00192   PgHdr *pAll;                /* List of all pages */
00193   PgHdr *pCkpt;               /* List of pages in the checkpoint journal */
00194   PgHdr *aHash[N_PG_HASH];    /* Hash table to map page number of PgHdr */
00195 };
00196 
/*
** Error bits that may be OR-ed together in Pager.errMask.
** pager_errcode() translates the mask into an SQLITE_* result code.
*/
#define PAGER_ERR_FULL     (1<<0)  /* a write() failed */
#define PAGER_ERR_MEM      (1<<1)  /* malloc() failed */
#define PAGER_ERR_LOCK     (1<<2)  /* error in the locking protocol */
#define PAGER_ERR_CORRUPT  (1<<3)  /* database or journal corruption */
#define PAGER_ERR_DISK     (1<<4)  /* general disk I/O error - bad hard drive? */
00205 
00206 /*
00207 ** The journal file contains page records in the following
00208 ** format.
00209 **
00210 ** Actually, this structure is the complete page record for pager
00211 ** formats less than 3.  Beginning with format 3, this record is surrounded
00212 ** by two checksums.
00213 */
00214 typedef struct PageRecord PageRecord;
00215 struct PageRecord {
00216   Pgno pgno;                      /* The page number */
00217   char aData[SQLITE_PAGE_SIZE];   /* Original data for page pgno */
00218 };
00219 
/*
** Every journal file starts with one of the magic strings below.  The
** bytes came from /dev/random and serve only as a sanity check.  The
** three strings differ in their final byte, which identifies the
** journal format:
**
**   Format 1  32-bit integers are stored in the byte order of the host.
**
**   Format 2  32-bit integers are stored big-endian.
**
**   Format 3  (added for 2.8.0) Like format 2, plus extra sanity
**             checking data.
**
** New journals are always written in the newest format, but the older
** formats must remain readable so that journals left behind by older
** versions of the library can still be rolled back.
**
** Why format 3 exists: if power fails while the journal is being
** written, semi-random garbage may appear in the journal once power
** returns.  Rolling such a journal back could corrupt the database.
** The extra data is an attempt to detect the garbage and ignore it.
**
** The extra data is a 32-bit checksum on each page record, covering the
** page number and the SQLITE_PAGE_SIZE bytes of page data.  The checksum
** starts from a 32-bit random value stored in the journal right after
** the header.  The random starting value matters because garbage at the
** end of a journal is likely to be data from other, since-deleted
** files.  If that data came from an obsolete journal its checksums
** might otherwise be correct; using a starting value that differs for
** every journal minimizes that risk.
*/
static const unsigned char aJournalMagic1[] = {
  0xd9, 0xd5, 0x05, 0xf9,
  0x20, 0xa1, 0x63, 0xd4,
};
static const unsigned char aJournalMagic2[] = {
  0xd9, 0xd5, 0x05, 0xf9,
  0x20, 0xa1, 0x63, 0xd5,
};
static const unsigned char aJournalMagic3[] = {
  0xd9, 0xd5, 0x05, 0xf9,
  0x20, 0xa1, 0x63, 0xd6,
};
#define JOURNAL_FORMAT_1   1    /* Identified by aJournalMagic1 */
#define JOURNAL_FORMAT_2   2    /* Identified by aJournalMagic2 */
#define JOURNAL_FORMAT_3   3    /* Identified by aJournalMagic3 */
00261 
/*
** Format used when creating new primary journal files.  The default is
** always format 3.  In test builds (SQLITE_TEST) this is a writable
** variable so that tests can select an older format and verify that the
** library is still able to roll back older journal files; otherwise it
** is a compile-time constant.
**
** Note that checkpoint journals always use format 2 and omit the header.
*/
#ifndef SQLITE_TEST
# define journal_format 3
#else
int journal_format = 3;
#endif
00276 
/*
** Size in bytes of the journal header and of a single page record for
** journal format X.  Both depend on the format: format 3 and later add
** two u32 values to the header and one u32 to each page record.
*/
#define JOURNAL_HDR_SZ(X) \
   (sizeof(aJournalMagic1) + sizeof(Pgno) + ((X)>=3 ? 2*sizeof(u32) : 0))
#define JOURNAL_PG_SZ(X) \
   (SQLITE_PAGE_SIZE + sizeof(Pgno) + ((X)>=3 ? sizeof(u32) : 0))
00286 
/*
** Reference count tracking, available in test builds only.
**
** When pager_refinfo_enable is non-zero, REFINFO(X) prints the page
** number, the address of the page data and the current reference count
** of page header X to standard output.  In non-test builds REFINFO()
** expands to nothing.
**
** The data address is printed with %p so that it is not truncated on
** platforms where a pointer is wider than an int.
*/
#ifdef SQLITE_TEST
  int pager_refinfo_enable = 0;
  static void pager_refinfo(PgHdr *p){
    static int cnt = 0;
    if( !pager_refinfo_enable ) return;
    printf(
       "REFCNT: %4d addr=%p nRef=%d\n",
       (int)p->pgno, PGHDR_TO_DATA(p), p->nRef
    );
    cnt++;   /* Something to set a breakpoint on */
  }
# define REFINFO(X)  pager_refinfo(X)
#else
# define REFINFO(X)
#endif
00305 
00306 /*
00307 ** Read a 32-bit integer from the given file descriptor.  Store the integer
00308 ** that is read in *pRes.  Return SQLITE_OK if everything worked, or an
00309 ** error code is something goes wrong.
00310 **
00311 ** If the journal format is 2 or 3, read a big-endian integer.  If the
00312 ** journal format is 1, read an integer in the native byte-order of the
00313 ** host machine.
00314 */
00315 static int read32bits(int format, OsFile *fd, u32 *pRes){
00316   u32 res;
00317   int rc;
00318   rc = sqliteOsRead(fd, &res, sizeof(res));
00319   if( rc==SQLITE_OK && format>JOURNAL_FORMAT_1 ){
00320     unsigned char ac[4];
00321     memcpy(ac, &res, 4);
00322     res = (ac[0]<<24) | (ac[1]<<16) | (ac[2]<<8) | ac[3];
00323   }
00324   *pRes = res;
00325   return rc;
00326 }
00327 
00328 /*
00329 ** Write a 32-bit integer into the given file descriptor.  Return SQLITE_OK
00330 ** on success or an error code is something goes wrong.
00331 **
00332 ** If the journal format is 2 or 3, write the integer as 4 big-endian
00333 ** bytes.  If the journal format is 1, write the integer in the native
00334 ** byte order.  In normal operation, only formats 2 and 3 are used.
00335 ** Journal format 1 is only used for testing.
00336 */
00337 static int write32bits(OsFile *fd, u32 val){
00338   unsigned char ac[4];
00339   if( journal_format<=1 ){
00340     return sqliteOsWrite(fd, &val, 4);
00341   }
00342   ac[0] = (val>>24) & 0xff;
00343   ac[1] = (val>>16) & 0xff;
00344   ac[2] = (val>>8) & 0xff;
00345   ac[3] = val & 0xff;
00346   return sqliteOsWrite(fd, ac, 4);
00347 }
00348 
00349 /*
00350 ** Write a 32-bit integer into a page header right before the
00351 ** page data.  This will overwrite the PgHdr.pDirty pointer.
00352 **
00353 ** The integer is big-endian for formats 2 and 3 and native byte order
00354 ** for journal format 1.
00355 */
00356 static void store32bits(u32 val, PgHdr *p, int offset){
00357   unsigned char *ac;
00358   ac = &((unsigned char*)PGHDR_TO_DATA(p))[offset];
00359   if( journal_format<=1 ){
00360     memcpy(ac, &val, 4);
00361   }else{
00362     ac[0] = (val>>24) & 0xff;
00363     ac[1] = (val>>16) & 0xff;
00364     ac[2] = (val>>8) & 0xff;
00365     ac[3] = val & 0xff;
00366   }
00367 }
00368 
00369 
00370 /*
00371 ** Convert the bits in the pPager->errMask into an approprate
00372 ** return code.
00373 */
00374 static int pager_errcode(Pager *pPager){
00375   int rc = SQLITE_OK;
00376   if( pPager->errMask & PAGER_ERR_LOCK )    rc = SQLITE_PROTOCOL;
00377   if( pPager->errMask & PAGER_ERR_DISK )    rc = SQLITE_IOERR;
00378   if( pPager->errMask & PAGER_ERR_FULL )    rc = SQLITE_FULL;
00379   if( pPager->errMask & PAGER_ERR_MEM )     rc = SQLITE_NOMEM;
00380   if( pPager->errMask & PAGER_ERR_CORRUPT ) rc = SQLITE_CORRUPT;
00381   return rc;
00382 }
00383 
00384 /*
00385 ** Add or remove a page from the list of all pages that are in the
00386 ** checkpoint journal.
00387 **
00388 ** The Pager keeps a separate list of pages that are currently in
00389 ** the checkpoint journal.  This helps the sqlitepager_ckpt_commit()
00390 ** routine run MUCH faster for the common case where there are many
00391 ** pages in memory but only a few are in the checkpoint journal.
00392 */
00393 static void page_add_to_ckpt_list(PgHdr *pPg){
00394   Pager *pPager = pPg->pPager;
00395   if( pPg->inCkpt ) return;
00396   assert( pPg->pPrevCkpt==0 && pPg->pNextCkpt==0 );
00397   pPg->pPrevCkpt = 0;
00398   if( pPager->pCkpt ){
00399     pPager->pCkpt->pPrevCkpt = pPg;
00400   }
00401   pPg->pNextCkpt = pPager->pCkpt;
00402   pPager->pCkpt = pPg;
00403   pPg->inCkpt = 1;
00404 }
00405 static void page_remove_from_ckpt_list(PgHdr *pPg){
00406   if( !pPg->inCkpt ) return;
00407   if( pPg->pPrevCkpt ){
00408     assert( pPg->pPrevCkpt->pNextCkpt==pPg );
00409     pPg->pPrevCkpt->pNextCkpt = pPg->pNextCkpt;
00410   }else{
00411     assert( pPg->pPager->pCkpt==pPg );
00412     pPg->pPager->pCkpt = pPg->pNextCkpt;
00413   }
00414   if( pPg->pNextCkpt ){
00415     assert( pPg->pNextCkpt->pPrevCkpt==pPg );
00416     pPg->pNextCkpt->pPrevCkpt = pPg->pPrevCkpt;
00417   }
00418   pPg->pNextCkpt = 0;
00419   pPg->pPrevCkpt = 0;
00420   pPg->inCkpt = 0;
00421 }
00422 
00423 /*
00424 ** Find a page in the hash table given its page number.  Return
00425 ** a pointer to the page or NULL if not found.
00426 */
00427 static PgHdr *pager_lookup(Pager *pPager, Pgno pgno){
00428   PgHdr *p = pPager->aHash[pager_hash(pgno)];
00429   while( p && p->pgno!=pgno ){
00430     p = p->pNextHash;
00431   }
00432   return p;
00433 }
00434 
00435 /*
00436 ** Unlock the database and clear the in-memory cache.  This routine
00437 ** sets the state of the pager back to what it was when it was first
00438 ** opened.  Any outstanding pages are invalidated and subsequent attempts
00439 ** to access those pages will likely result in a coredump.
00440 */
00441 static void pager_reset(Pager *pPager){
00442   PgHdr *pPg, *pNext;
00443   for(pPg=pPager->pAll; pPg; pPg=pNext){
00444     pNext = pPg->pNextAll;
00445     sqliteFree(pPg);
00446   }
00447   pPager->pFirst = 0;
00448   pPager->pFirstSynced = 0;
00449   pPager->pLast = 0;
00450   pPager->pAll = 0;
00451   memset(pPager->aHash, 0, sizeof(pPager->aHash));
00452   pPager->nPage = 0;
00453   if( pPager->state>=SQLITE_WRITELOCK ){
00454     sqlitepager_rollback(pPager);
00455   }
00456   sqliteOsUnlock(&pPager->fd);
00457   pPager->state = SQLITE_UNLOCK;
00458   pPager->dbSize = -1;
00459   pPager->nRef = 0;
00460   assert( pPager->journalOpen==0 );
00461 }
00462 
00463 /*
00464 ** When this routine is called, the pager has the journal file open and
00465 ** a write lock on the database.  This routine releases the database
00466 ** write lock and acquires a read lock in its place.  The journal file
00467 ** is deleted and closed.
00468 **
00469 ** TODO: Consider keeping the journal file open for temporary databases.
00470 ** This might give a performance improvement on windows where opening
00471 ** a file is an expensive operation.
00472 */
00473 static int pager_unwritelock(Pager *pPager){
00474   int rc;
00475   PgHdr *pPg;
00476   if( pPager->state<SQLITE_WRITELOCK ) return SQLITE_OK;
00477   sqlitepager_ckpt_commit(pPager);
00478   if( pPager->ckptOpen ){
00479     sqliteOsClose(&pPager->cpfd);
00480     pPager->ckptOpen = 0;
00481   }
00482   if( pPager->journalOpen ){
00483     sqliteOsClose(&pPager->jfd);
00484     pPager->journalOpen = 0;
00485     sqliteOsDelete(pPager->zJournal);
00486     sqliteFree( pPager->aInJournal );
00487     pPager->aInJournal = 0;
00488     for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
00489       pPg->inJournal = 0;
00490       pPg->dirty = 0;
00491       pPg->needSync = 0;
00492     }
00493   }else{
00494     assert( pPager->dirtyFile==0 || pPager->useJournal==0 );
00495   }
00496   rc = sqliteOsReadLock(&pPager->fd);
00497   if( rc==SQLITE_OK ){
00498     pPager->state = SQLITE_READLOCK;
00499   }else{
00500     /* This can only happen if a process does a BEGIN, then forks and the
00501     ** child process does the COMMIT.  Because of the semantics of unix
00502     ** file locking, the unlock will fail.
00503     */
00504     pPager->state = SQLITE_UNLOCK;
00505   }
00506   return rc;
00507 }
00508 
00509 /*
00510 ** Compute and return a checksum for the page of data.
00511 **
00512 ** This is not a real checksum.  It is really just the sum of the 
00513 ** random initial value and the page number.  We considered do a checksum
00514 ** of the database, but that was found to be too slow.
00515 */
00516 static u32 pager_cksum(Pager *pPager, Pgno pgno, const char *aData){
00517   u32 cksum = pPager->cksumInit + pgno;
00518   return cksum;
00519 }
00520 
00521 /*
00522 ** Read a single page from the journal file opened on file descriptor
00523 ** jfd.  Playback this one page.
00524 **
00525 ** There are three different journal formats.  The format parameter determines
00526 ** which format is used by the journal that is played back.
00527 */
00528 static int pager_playback_one_page(Pager *pPager, OsFile *jfd, int format){
00529   int rc;
00530   PgHdr *pPg;              /* An existing page in the cache */
00531   PageRecord pgRec;
00532   u32 cksum;
00533 
00534   rc = read32bits(format, jfd, &pgRec.pgno);
00535   if( rc!=SQLITE_OK ) return rc;
00536   rc = sqliteOsRead(jfd, &pgRec.aData, sizeof(pgRec.aData));
00537   if( rc!=SQLITE_OK ) return rc;
00538 
00539   /* Sanity checking on the page.  This is more important that I originally
00540   ** thought.  If a power failure occurs while the journal is being written,
00541   ** it could cause invalid data to be written into the journal.  We need to
00542   ** detect this invalid data (with high probability) and ignore it.
00543   */
00544   if( pgRec.pgno==0 ){
00545     return SQLITE_DONE;
00546   }
00547   if( pgRec.pgno>(unsigned)pPager->dbSize ){
00548     return SQLITE_OK;
00549   }
00550   if( format>=JOURNAL_FORMAT_3 ){
00551     rc = read32bits(format, jfd, &cksum);
00552     if( rc ) return rc;
00553     if( pager_cksum(pPager, pgRec.pgno, pgRec.aData)!=cksum ){
00554       return SQLITE_DONE;
00555     }
00556   }
00557 
00558   /* Playback the page.  Update the in-memory copy of the page
00559   ** at the same time, if there is one.
00560   */
00561   pPg = pager_lookup(pPager, pgRec.pgno);
00562   TRACE2("PLAYBACK %d\n", pgRec.pgno);
00563   sqliteOsSeek(&pPager->fd, (pgRec.pgno-1)*(off_t)SQLITE_PAGE_SIZE);
00564   rc = sqliteOsWrite(&pPager->fd, pgRec.aData, SQLITE_PAGE_SIZE);
00565   if( pPg ){
00566     /* No page should ever be rolled back that is in use, except for page
00567     ** 1 which is held in use in order to keep the lock on the database
00568     ** active.
00569     */
00570     assert( pPg->nRef==0 || pPg->pgno==1 );
00571     memcpy(PGHDR_TO_DATA(pPg), pgRec.aData, SQLITE_PAGE_SIZE);
00572     memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra);
00573     pPg->dirty = 0;
00574     pPg->needSync = 0;
00575     CODEC(pPager, PGHDR_TO_DATA(pPg), pPg->pgno, 3);
00576   }
00577   return rc;
00578 }
00579 
00580 /*
00581 ** Playback the journal and thus restore the database file to
00582 ** the state it was in before we started making changes.  
00583 **
00584 ** The journal file format is as follows: 
00585 **
00586 **    *  8 byte prefix.  One of the aJournalMagic123 vectors defined
00587 **       above.  The format of the journal file is determined by which
00588 **       of the three prefix vectors is seen.
00589 **    *  4 byte big-endian integer which is the number of valid page records
00590 **       in the journal.  If this value is 0xffffffff, then compute the
00591 **       number of page records from the journal size.  This field appears
00592 **       in format 3 only.
00593 **    *  4 byte big-endian integer which is the initial value for the 
00594 **       sanity checksum.  This field appears in format 3 only.
00595 **    *  4 byte integer which is the number of pages to truncate the
00596 **       database to during a rollback.
00597 **    *  Zero or more pages instances, each as follows:
00598 **        +  4 byte page number.
00599 **        +  SQLITE_PAGE_SIZE bytes of data.
00600 **        +  4 byte checksum (format 3 only)
00601 **
00602 ** When we speak of the journal header, we mean the first 4 bullets above.
00603 ** Each entry in the journal is an instance of the 5th bullet.  Note that
00604 ** bullets 2 and 3 only appear in format-3 journals.
00605 **
00606 ** Call the value from the second bullet "nRec".  nRec is the number of
00607 ** valid page entries in the journal.  In most cases, you can compute the
00608 ** value of nRec from the size of the journal file.  But if a power
00609 ** failure occurred while the journal was being written, it could be the
00610 ** case that the size of the journal file had already been increased but
00611 ** the extra entries had not yet made it safely to disk.  In such a case,
00612 ** the value of nRec computed from the file size would be too large.  For
00613 ** that reason, we always use the nRec value in the header.
00614 **
00615 ** If the nRec value is 0xffffffff it means that nRec should be computed
00616 ** from the file size.  This value is used when the user selects the
00617 ** no-sync option for the journal.  A power failure could lead to corruption
00618 ** in this case.  But for things like temporary table (which will be
00619 ** deleted when the power is restored) we don't care.  
00620 **
00621 ** Journal formats 1 and 2 do not have an nRec value in the header so we
00622 ** have to compute nRec from the file size.  This has risks (as described
00623 ** above) which is why all persistent tables have been changed to use
00624 ** format 3.
00625 **
00626 ** If the file opened as the journal file is not a well-formed
00627 ** journal file then the database will likely already be
00628 ** corrupted, so the PAGER_ERR_CORRUPT bit is set in pPager->errMask
00629 ** and SQLITE_CORRUPT is returned.  If it all works, then this routine
00630 ** returns SQLITE_OK.
00631 */
00632 static int pager_playback(Pager *pPager, int useJournalSize){
00633   off_t szJ;               /* Size of the journal file in bytes */
00634   int nRec;                /* Number of Records in the journal */
00635   int i;                   /* Loop counter */
00636   Pgno mxPg = 0;           /* Size of the original file in pages */
00637   int format;              /* Format of the journal file. */
00638   unsigned char aMagic[sizeof(aJournalMagic1)];
00639   int rc;
00640 
00641   /* Figure out how many records are in the journal.  Abort early if
00642   ** the journal is empty.
00643   */
00644   assert( pPager->journalOpen );
00645   sqliteOsSeek(&pPager->jfd, 0);
00646   rc = sqliteOsFileSize(&pPager->jfd, &szJ);
00647   if( rc!=SQLITE_OK ){
00648     goto end_playback;
00649   }
00650 
00651   /* If the journal file is too small to contain a complete header,
00652   ** it must mean that the process that created the journal was just
00653   ** beginning to write the journal file when it died.  In that case,
00654   ** the database file should have still been completely unchanged.
00655   ** Nothing needs to be rolled back.  We can safely ignore this journal.
00656   */
00657   if( szJ < sizeof(aMagic)+sizeof(Pgno) ){
00658     goto end_playback;
00659   }
00660 
00661   /* Read the beginning of the journal and truncate the
00662   ** database file back to its original size.
00663   */
00664   rc = sqliteOsRead(&pPager->jfd, aMagic, sizeof(aMagic));
00665   if( rc!=SQLITE_OK ){
00666     rc = SQLITE_PROTOCOL;
00667     goto end_playback;
00668   }
00669   if( memcmp(aMagic, aJournalMagic3, sizeof(aMagic))==0 ){
00670     format = JOURNAL_FORMAT_3;
00671   }else if( memcmp(aMagic, aJournalMagic2, sizeof(aMagic))==0 ){
00672     format = JOURNAL_FORMAT_2;
00673   }else if( memcmp(aMagic, aJournalMagic1, sizeof(aMagic))==0 ){
00674     format = JOURNAL_FORMAT_1;
00675   }else{
00676     rc = SQLITE_PROTOCOL;
00677     goto end_playback;
00678   }
00679   if( format>=JOURNAL_FORMAT_3 ){
00680     if( szJ < sizeof(aMagic) + 3*sizeof(u32) ){
00681       /* Ignore the journal if it is too small to contain a complete
00682       ** header.  We already did this test once above, but at the prior
00683       ** test, we did not know the journal format and so we had to assume
00684       ** the smallest possible header.  Now we know the header is bigger
00685       ** than the minimum so we test again.
00686       */
00687       goto end_playback;
00688     }
00689     rc = read32bits(format, &pPager->jfd, (u32*)&nRec);
00690     if( rc ) goto end_playback;
00691     rc = read32bits(format, &pPager->jfd, &pPager->cksumInit);
00692     if( rc ) goto end_playback;
00693     if( nRec==0xffffffff || useJournalSize ){
00694       nRec = (szJ - JOURNAL_HDR_SZ(3))/JOURNAL_PG_SZ(3);
00695     }
00696   }else{
00697     nRec = (szJ - JOURNAL_HDR_SZ(2))/JOURNAL_PG_SZ(2);
00698     assert( nRec*JOURNAL_PG_SZ(2)+JOURNAL_HDR_SZ(2)==szJ );
00699   }
00700   rc = read32bits(format, &pPager->jfd, &mxPg);
00701   if( rc!=SQLITE_OK ){
00702     goto end_playback;
00703   }
00704   assert( pPager->origDbSize==0 || pPager->origDbSize==mxPg );
00705   rc = sqliteOsTruncate(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)mxPg);
00706   if( rc!=SQLITE_OK ){
00707     goto end_playback;
00708   }
00709   pPager->dbSize = mxPg;
00710   
00711   /* Copy original pages out of the journal and back into the database file.
00712   */
00713   for(i=0; i<nRec; i++){
00714     rc = pager_playback_one_page(pPager, &pPager->jfd, format);
00715     if( rc!=SQLITE_OK ){
00716       if( rc==SQLITE_DONE ){
00717         rc = SQLITE_OK;
00718       }
00719       break;
00720     }
00721   }
00722 
00723   /* Pages that have been written to the journal but never synced
00724   ** where not restored by the loop above.  We have to restore those
00725   ** pages by reading them back from the original database.
00726   */
00727   if( rc==SQLITE_OK ){
00728     PgHdr *pPg;
00729     for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
00730       char zBuf[SQLITE_PAGE_SIZE];
00731       if( !pPg->dirty ) continue;
00732       if( (int)pPg->pgno <= pPager->origDbSize ){
00733         sqliteOsSeek(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)(pPg->pgno-1));
00734         rc = sqliteOsRead(&pPager->fd, zBuf, SQLITE_PAGE_SIZE);
00735         TRACE2("REFETCH %d\n", pPg->pgno);
00736         CODEC(pPager, zBuf, pPg->pgno, 2);
00737         if( rc ) break;
00738       }else{
00739         memset(zBuf, 0, SQLITE_PAGE_SIZE);
00740       }
00741       if( pPg->nRef==0 || memcmp(zBuf, PGHDR_TO_DATA(pPg), SQLITE_PAGE_SIZE) ){
00742         memcpy(PGHDR_TO_DATA(pPg), zBuf, SQLITE_PAGE_SIZE);
00743         memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra);
00744       }
00745       pPg->needSync = 0;
00746       pPg->dirty = 0;
00747     }
00748   }
00749 
00750 end_playback:
00751   if( rc!=SQLITE_OK ){
00752     pager_unwritelock(pPager);
00753     pPager->errMask |= PAGER_ERR_CORRUPT;
00754     rc = SQLITE_CORRUPT;
00755   }else{
00756     rc = pager_unwritelock(pPager);
00757   }
00758   return rc;
00759 }
00760 
00761 /*
00762 ** Playback the checkpoint journal.
00763 **
00764 ** This is similar to playing back the transaction journal but with
00765 ** a few extra twists.
00766 **
00767 **    (1)  The number of pages in the database file at the start of
00768 **         the checkpoint is stored in pPager->ckptSize, not in the
00769 **         journal file itself.
00770 **
00771 **    (2)  In addition to playing back the checkpoint journal, also
00772 **         playback all pages of the transaction journal beginning
00773 **         at offset pPager->ckptJSize.
00774 */
00775 static int pager_ckpt_playback(Pager *pPager){
00776   off_t szJ;               /* Size of the full journal */
00777   int nRec;                /* Number of Records */
00778   int i;                   /* Loop counter */
00779   int rc;
00780 
00781   /* Truncate the database back to its original size.
00782   */
00783   rc = sqliteOsTruncate(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)pPager->ckptSize);
00784   pPager->dbSize = pPager->ckptSize;
00785 
00786   /* Figure out how many records are in the checkpoint journal.
00787   */
00788   assert( pPager->ckptInUse && pPager->journalOpen );
00789   sqliteOsSeek(&pPager->cpfd, 0);
00790   nRec = pPager->ckptNRec;
00791   
00792   /* Copy original pages out of the checkpoint journal and back into the
00793   ** database file.  Note that the checkpoint journal always uses format
00794   ** 2 instead of format 3 since it does not need to be concerned with
00795   ** power failures corrupting the journal and can thus omit the checksums.
00796   */
00797   for(i=nRec-1; i>=0; i--){
00798     rc = pager_playback_one_page(pPager, &pPager->cpfd, 2);
00799     assert( rc!=SQLITE_DONE );
00800     if( rc!=SQLITE_OK ) goto end_ckpt_playback;
00801   }
00802 
00803   /* Figure out how many pages need to be copied out of the transaction
00804   ** journal.
00805   */
00806   rc = sqliteOsSeek(&pPager->jfd, pPager->ckptJSize);
00807   if( rc!=SQLITE_OK ){
00808     goto end_ckpt_playback;
00809   }
00810   rc = sqliteOsFileSize(&pPager->jfd, &szJ);
00811   if( rc!=SQLITE_OK ){
00812     goto end_ckpt_playback;
00813   }
00814   nRec = (szJ - pPager->ckptJSize)/JOURNAL_PG_SZ(journal_format);
00815   for(i=nRec-1; i>=0; i--){
00816     rc = pager_playback_one_page(pPager, &pPager->jfd, journal_format);
00817     if( rc!=SQLITE_OK ){
00818       assert( rc!=SQLITE_DONE );
00819       goto end_ckpt_playback;
00820     }
00821   }
00822   
00823 end_ckpt_playback:
00824   if( rc!=SQLITE_OK ){
00825     pPager->errMask |= PAGER_ERR_CORRUPT;
00826     rc = SQLITE_CORRUPT;
00827   }
00828   return rc;
00829 }
00830 
00831 /*
00832 ** Change the maximum number of in-memory pages that are allowed.
00833 **
00834 ** The maximum number is the absolute value of the mxPage parameter.
00835 ** If mxPage is negative, the noSync flag is also set.  noSync bypasses
00836 ** calls to sqliteOsSync().  The pager runs much faster with noSync on,
00837 ** but if the operating system crashes or there is an abrupt power 
00838 ** failure, the database file might be left in an inconsistent and
00839 ** unrepairable state.  
00840 */
00841 void sqlitepager_set_cachesize(Pager *pPager, int mxPage){
00842   if( mxPage>=0 ){
00843     pPager->noSync = pPager->tempFile;
00844     if( pPager->noSync==0 ) pPager->needSync = 0;
00845   }else{
00846     pPager->noSync = 1;
00847     mxPage = -mxPage;
00848   }
00849   if( mxPage>10 ){
00850     pPager->mxPage = mxPage;
00851   }
00852 }
00853 
00854 /*
00855 ** Adjust the robustness of the database to damage due to OS crashes
00856 ** or power failures by changing the number of syncs()s when writing
00857 ** the rollback journal.  There are three levels:
00858 **
00859 **    OFF       sqliteOsSync() is never called.  This is the default
00860 **              for temporary and transient files.
00861 **
00862 **    NORMAL    The journal is synced once before writes begin on the
00863 **              database.  This is normally adequate protection, but
00864 **              it is theoretically possible, though very unlikely,
00865 **              that an inopertune power failure could leave the journal
00866 **              in a state which would cause damage to the database
00867 **              when it is rolled back.
00868 **
00869 **    FULL      The journal is synced twice before writes begin on the
00870 **              database (with some additional information - the nRec field
00871 **              of the journal header - being written in between the two
00872 **              syncs).  If we assume that writing a
00873 **              single disk sector is atomic, then this mode provides
00874 **              assurance that the journal will not be corrupted to the
00875 **              point of causing damage to the database during rollback.
00876 **
00877 ** Numeric values associated with these states are OFF==1, NORMAL=2,
00878 ** and FULL=3.
00879 */
00880 void sqlitepager_set_safety_level(Pager *pPager, int level){
00881   pPager->noSync =  level==1 || pPager->tempFile;
00882   pPager->fullSync = level==3 && !pPager->tempFile;
00883   if( pPager->noSync==0 ) pPager->needSync = 0;
00884 }
00885 
00886 /*
00887 ** Open a temporary file.  Write the name of the file into zName
00888 ** (zName must be at least SQLITE_TEMPNAME_SIZE bytes long.)  Write
00889 ** the file descriptor into *fd.  Return SQLITE_OK on success or some
00890 ** other error code if we fail.
00891 **
00892 ** The OS will automatically delete the temporary file when it is
00893 ** closed.
00894 */
00895 static int sqlitepager_opentemp(char *zFile, OsFile *fd){
00896   int cnt = 8;
00897   int rc;
00898   do{
00899     cnt--;
00900     sqliteOsTempFileName(zFile);
00901     rc = sqliteOsOpenExclusive(zFile, fd, 1);
00902   }while( cnt>0 && rc!=SQLITE_OK );
00903   return rc;
00904 }
00905 
00906 /*
00907 ** Create a new page cache and put a pointer to the page cache in *ppPager.
00908 ** The file to be cached need not exist.  The file is not locked until
00909 ** the first call to sqlitepager_get() and is only held open until the
00910 ** last page is released using sqlitepager_unref().
00911 **
00912 ** If zFilename is NULL then a randomly-named temporary file is created
00913 ** and used as the file to be cached.  The file will be deleted
00914 ** automatically when it is closed.
00915 */
00916 int sqlitepager_open(
00917   Pager **ppPager,         /* Return the Pager structure here */
00918   const char *zFilename,   /* Name of the database file to open */
00919   int mxPage,              /* Max number of in-memory cache pages */
00920   int nExtra,              /* Extra bytes append to each in-memory page */
00921   int useJournal           /* TRUE to use a rollback journal on this file */
00922 ){
00923   Pager *pPager;
00924   char *zFullPathname;
00925   int nameLen;
00926   OsFile fd;
00927   int rc, i;
00928   int tempFile;
00929   int readOnly = 0;
00930   char zTemp[SQLITE_TEMPNAME_SIZE];
00931 
00932   *ppPager = 0;
00933   if( sqlite_malloc_failed ){
00934     return SQLITE_NOMEM;
00935   }
00936   if( zFilename && zFilename[0] ){
00937     zFullPathname = sqliteOsFullPathname(zFilename);
00938     rc = sqliteOsOpenReadWrite(zFullPathname, &fd, &readOnly);
00939     tempFile = 0;
00940   }else{
00941     rc = sqlitepager_opentemp(zTemp, &fd);
00942     zFilename = zTemp;
00943     zFullPathname = sqliteOsFullPathname(zFilename);
00944     tempFile = 1;
00945   }
00946   if( sqlite_malloc_failed ){
00947     return SQLITE_NOMEM;
00948   }
00949   if( rc!=SQLITE_OK ){
00950     sqliteFree(zFullPathname);
00951     return SQLITE_CANTOPEN;
00952   }
00953   nameLen = strlen(zFullPathname);
00954   pPager = sqliteMalloc( sizeof(*pPager) + nameLen*3 + 30 );
00955   if( pPager==0 ){
00956     sqliteOsClose(&fd);
00957     sqliteFree(zFullPathname);
00958     return SQLITE_NOMEM;
00959   }
00960   SET_PAGER(pPager);
00961   pPager->zFilename = (char*)&pPager[1];
00962   pPager->zDirectory = &pPager->zFilename[nameLen+1];
00963   pPager->zJournal = &pPager->zDirectory[nameLen+1];
00964   strcpy(pPager->zFilename, zFullPathname);
00965   strcpy(pPager->zDirectory, zFullPathname);
00966   for(i=nameLen; i>0 && pPager->zDirectory[i-1]!='/'; i--){}
00967   if( i>0 ) pPager->zDirectory[i-1] = 0;
00968   strcpy(pPager->zJournal, zFullPathname);
00969   sqliteFree(zFullPathname);
00970   strcpy(&pPager->zJournal[nameLen], "-journal");
00971   pPager->fd = fd;
00972   pPager->journalOpen = 0;
00973   pPager->useJournal = useJournal;
00974   pPager->ckptOpen = 0;
00975   pPager->ckptInUse = 0;
00976   pPager->nRef = 0;
00977   pPager->dbSize = -1;
00978   pPager->ckptSize = 0;
00979   pPager->ckptJSize = 0;
00980   pPager->nPage = 0;
00981   pPager->mxPage = mxPage>5 ? mxPage : 10;
00982   pPager->state = SQLITE_UNLOCK;
00983   pPager->errMask = 0;
00984   pPager->tempFile = tempFile;
00985   pPager->readOnly = readOnly;
00986   pPager->needSync = 0;
00987   pPager->noSync = pPager->tempFile || !useJournal;
00988   pPager->pFirst = 0;
00989   pPager->pFirstSynced = 0;
00990   pPager->pLast = 0;
00991   pPager->nExtra = nExtra;
00992   memset(pPager->aHash, 0, sizeof(pPager->aHash));
00993   *ppPager = pPager;
00994   return SQLITE_OK;
00995 }
00996 
00997 /*
00998 ** Set the destructor for this pager.  If not NULL, the destructor is called
00999 ** when the reference count on each page reaches zero.  The destructor can
01000 ** be used to clean up information in the extra segment appended to each page.
01001 **
01002 ** The destructor is not called as a result sqlitepager_close().  
01003 ** Destructors are only called by sqlitepager_unref().
01004 */
01005 void sqlitepager_set_destructor(Pager *pPager, void (*xDesc)(void*)){
01006   pPager->xDestructor = xDesc;
01007 }
01008 
01009 /*
01010 ** Return the total number of pages in the disk file associated with
01011 ** pPager.
01012 */
01013 int sqlitepager_pagecount(Pager *pPager){
01014   off_t n;
01015   assert( pPager!=0 );
01016   if( pPager->dbSize>=0 ){
01017     return pPager->dbSize;
01018   }
01019   if( sqliteOsFileSize(&pPager->fd, &n)!=SQLITE_OK ){
01020     pPager->errMask |= PAGER_ERR_DISK;
01021     return 0;
01022   }
01023   n /= SQLITE_PAGE_SIZE;
01024   if( pPager->state!=SQLITE_UNLOCK ){
01025     pPager->dbSize = n;
01026   }
01027   return n;
01028 }
01029 
01030 /*
01031 ** Forward declaration
01032 */
01033 static int syncJournal(Pager*);
01034 
01035 /*
01036 ** Truncate the file to the number of pages specified.
01037 */
01038 int sqlitepager_truncate(Pager *pPager, Pgno nPage){
01039   int rc;
01040   if( pPager->dbSize<0 ){
01041     sqlitepager_pagecount(pPager);
01042   }
01043   if( pPager->errMask!=0 ){
01044     rc = pager_errcode(pPager);
01045     return rc;
01046   }
01047   if( nPage>=(unsigned)pPager->dbSize ){
01048     return SQLITE_OK;
01049   }
01050   syncJournal(pPager);
01051   rc = sqliteOsTruncate(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)nPage);
01052   if( rc==SQLITE_OK ){
01053     pPager->dbSize = nPage;
01054   }
01055   return rc;
01056 }
01057 
01058 /*
01059 ** Shutdown the page cache.  Free all memory and close all files.
01060 **
01061 ** If a transaction was in progress when this routine is called, that
01062 ** transaction is rolled back.  All outstanding pages are invalidated
01063 ** and their memory is freed.  Any attempt to use a page associated
01064 ** with this page cache after this function returns will likely
01065 ** result in a coredump.
01066 */
01067 int sqlitepager_close(Pager *pPager){
01068   PgHdr *pPg, *pNext;
01069   switch( pPager->state ){
01070     case SQLITE_WRITELOCK: {
01071       sqlitepager_rollback(pPager);
01072       sqliteOsUnlock(&pPager->fd);
01073       assert( pPager->journalOpen==0 );
01074       break;
01075     }
01076     case SQLITE_READLOCK: {
01077       sqliteOsUnlock(&pPager->fd);
01078       break;
01079     }
01080     default: {
01081       /* Do nothing */
01082       break;
01083     }
01084   }
01085   for(pPg=pPager->pAll; pPg; pPg=pNext){
01086     pNext = pPg->pNextAll;
01087     sqliteFree(pPg);
01088   }
01089   sqliteOsClose(&pPager->fd);
01090   assert( pPager->journalOpen==0 );
01091   /* Temp files are automatically deleted by the OS
01092   ** if( pPager->tempFile ){
01093   **   sqliteOsDelete(pPager->zFilename);
01094   ** }
01095   */
01096   CLR_PAGER(pPager);
01097   if( pPager->zFilename!=(char*)&pPager[1] ){
01098     assert( 0 );  /* Cannot happen */
01099     sqliteFree(pPager->zFilename);
01100     sqliteFree(pPager->zJournal);
01101     sqliteFree(pPager->zDirectory);
01102   }
01103   sqliteFree(pPager);
01104   return SQLITE_OK;
01105 }
01106 
01107 /*
01108 ** Return the page number for the given page data.
01109 */
01110 Pgno sqlitepager_pagenumber(void *pData){
01111   PgHdr *p = DATA_TO_PGHDR(pData);
01112   return p->pgno;
01113 }
01114 
01115 /*
01116 ** Increment the reference count for a page.  If the page is
01117 ** currently on the freelist (the reference count is zero) then
01118 ** remove it from the freelist.
01119 */
01120 #define page_ref(P)   ((P)->nRef==0?_page_ref(P):(void)(P)->nRef++)
01121 static void _page_ref(PgHdr *pPg){
01122   if( pPg->nRef==0 ){
01123     /* The page is currently on the freelist.  Remove it. */
01124     if( pPg==pPg->pPager->pFirstSynced ){
01125       PgHdr *p = pPg->pNextFree;
01126       while( p && p->needSync ){ p = p->pNextFree; }
01127       pPg->pPager->pFirstSynced = p;
01128     }
01129     if( pPg->pPrevFree ){
01130       pPg->pPrevFree->pNextFree = pPg->pNextFree;
01131     }else{
01132       pPg->pPager->pFirst = pPg->pNextFree;
01133     }
01134     if( pPg->pNextFree ){
01135       pPg->pNextFree->pPrevFree = pPg->pPrevFree;
01136     }else{
01137       pPg->pPager->pLast = pPg->pPrevFree;
01138     }
01139     pPg->pPager->nRef++;
01140   }
01141   pPg->nRef++;
01142   REFINFO(pPg);
01143 }
01144 
01145 /*
01146 ** Increment the reference count for a page.  The input pointer is
01147 ** a reference to the page data.
01148 */
01149 int sqlitepager_ref(void *pData){
01150   PgHdr *pPg = DATA_TO_PGHDR(pData);
01151   page_ref(pPg);
01152   return SQLITE_OK;
01153 }
01154 
01155 /*
01156 ** Sync the journal.  In other words, make sure all the pages that have
01157 ** been written to the journal have actually reached the surface of the
01158 ** disk.  It is not safe to modify the original database file until after
01159 ** the journal has been synced.  If the original database is modified before
01160 ** the journal is synced and a power failure occurs, the unsynced journal
01161 ** data would be lost and we would be unable to completely rollback the
01162 ** database changes.  Database corruption would occur.
01163 ** 
01164 ** This routine also updates the nRec field in the header of the journal.
01165 ** (See comments on the pager_playback() routine for additional information.)
01166 ** If the sync mode is FULL, two syncs will occur.  First the whole journal
01167 ** is synced, then the nRec field is updated, then a second sync occurs.
01168 **
01169 ** For temporary databases, we do not care if we are able to rollback
01170 ** after a power failure, so sync occurs.
01171 **
01172 ** This routine clears the needSync field of every page current held in
01173 ** memory.
01174 */
01175 static int syncJournal(Pager *pPager){
01176   PgHdr *pPg;
01177   int rc = SQLITE_OK;
01178 
01179   /* Sync the journal before modifying the main database
01180   ** (assuming there is a journal and it needs to be synced.)
01181   */
01182   if( pPager->needSync ){
01183     if( !pPager->tempFile ){
01184       assert( pPager->journalOpen );
01185       /* assert( !pPager->noSync ); // noSync might be set if synchronous
01186       ** was turned off after the transaction was started.  Ticket #615 */
01187 #ifndef NDEBUG
01188       {
01189         /* Make sure the pPager->nRec counter we are keeping agrees
01190         ** with the nRec computed from the size of the journal file.
01191         */
01192         off_t hdrSz, pgSz, jSz;
01193         hdrSz = JOURNAL_HDR_SZ(journal_format);
01194         pgSz = JOURNAL_PG_SZ(journal_format);
01195         rc = sqliteOsFileSize(&pPager->jfd, &jSz);
01196         if( rc!=0 ) return rc;
01197         assert( pPager->nRec*pgSz+hdrSz==jSz );
01198       }
01199 #endif
01200       if( journal_format>=3 ){
01201         /* Write the nRec value into the journal file header */
01202         off_t szJ;
01203         if( pPager->fullSync ){
01204           TRACE1("SYNC\n");
01205           rc = sqliteOsSync(&pPager->jfd);
01206           if( rc!=0 ) return rc;
01207         }
01208         sqliteOsSeek(&pPager->jfd, sizeof(aJournalMagic1));
01209         rc = write32bits(&pPager->jfd, pPager->nRec);
01210         if( rc ) return rc;
01211         szJ = JOURNAL_HDR_SZ(journal_format) +
01212                  pPager->nRec*JOURNAL_PG_SZ(journal_format);
01213         sqliteOsSeek(&pPager->jfd, szJ);
01214       }
01215       TRACE1("SYNC\n");
01216       rc = sqliteOsSync(&pPager->jfd);
01217       if( rc!=0 ) return rc;
01218       pPager->journalStarted = 1;
01219     }
01220     pPager->needSync = 0;
01221 
01222     /* Erase the needSync flag from every page.
01223     */
01224     for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
01225       pPg->needSync = 0;
01226     }
01227     pPager->pFirstSynced = pPager->pFirst;
01228   }
01229 
01230 #ifndef NDEBUG
01231   /* If the Pager.needSync flag is clear then the PgHdr.needSync
01232   ** flag must also be clear for all pages.  Verify that this
01233   ** invariant is true.
01234   */
01235   else{
01236     for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
01237       assert( pPg->needSync==0 );
01238     }
01239     assert( pPager->pFirstSynced==pPager->pFirst );
01240   }
01241 #endif
01242 
01243   return rc;
01244 }
01245 
01246 /*
01247 ** Given a list of pages (connected by the PgHdr.pDirty pointer) write
01248 ** every one of those pages out to the database file and mark them all
01249 ** as clean.
01250 */
01251 static int pager_write_pagelist(PgHdr *pList){
01252   Pager *pPager;
01253   int rc;
01254 
01255   if( pList==0 ) return SQLITE_OK;
01256   pPager = pList->pPager;
01257   while( pList ){
01258     assert( pList->dirty );
01259     sqliteOsSeek(&pPager->fd, (pList->pgno-1)*(off_t)SQLITE_PAGE_SIZE);
01260     CODEC(pPager, PGHDR_TO_DATA(pList), pList->pgno, 6);
01261     TRACE2("STORE %d\n", pList->pgno);
01262     rc = sqliteOsWrite(&pPager->fd, PGHDR_TO_DATA(pList), SQLITE_PAGE_SIZE);
01263     CODEC(pPager, PGHDR_TO_DATA(pList), pList->pgno, 0);
01264     if( rc ) return rc;
01265     pList->dirty = 0;
01266     pList = pList->pDirty;
01267   }
01268   return SQLITE_OK;
01269 }
01270 
01271 /*
01272 ** Collect every dirty page into a dirty list and
01273 ** return a pointer to the head of that list.  All pages are
01274 ** collected even if they are still in use.
01275 */
01276 static PgHdr *pager_get_all_dirty_pages(Pager *pPager){
01277   PgHdr *p, *pList;
01278   pList = 0;
01279   for(p=pPager->pAll; p; p=p->pNextAll){
01280     if( p->dirty ){
01281       p->pDirty = pList;
01282       pList = p;
01283     }
01284   }
01285   return pList;
01286 }
01287 
01288 /*
01289 ** Acquire a page.
01290 **
01291 ** A read lock on the disk file is obtained when the first page is acquired. 
01292 ** This read lock is dropped when the last page is released.
01293 **
01294 ** A _get works for any page number greater than 0.  If the database
01295 ** file is smaller than the requested page, then no actual disk
01296 ** read occurs and the memory image of the page is initialized to
01297 ** all zeros.  The extra data appended to a page is always initialized
01298 ** to zeros the first time a page is loaded into memory.
01299 **
01300 ** The acquisition might fail for several reasons.  In all cases,
01301 ** an appropriate error code is returned and *ppPage is set to NULL.
01302 **
01303 ** See also sqlitepager_lookup().  Both this routine and _lookup() attempt
01304 ** to find a page in the in-memory cache first.  If the page is not already
01305 ** in memory, this routine goes to disk to read it in whereas _lookup()
01306 ** just returns 0.  This routine acquires a read-lock the first time it
01307 ** has to go to disk, and could also playback an old journal if necessary.
01308 ** Since _lookup() never goes to disk, it never has to deal with locks
01309 ** or journal files.
01310 */
01311 int sqlitepager_get(Pager *pPager, Pgno pgno, void **ppPage){
01312   PgHdr *pPg;
01313   int rc;
01314 
01315   /* Make sure we have not hit any critical errors.
01316   */ 
01317   assert( pPager!=0 );
01318   assert( pgno!=0 );
01319   *ppPage = 0;
01320   if( pPager->errMask & ~(PAGER_ERR_FULL) ){
01321     return pager_errcode(pPager);
01322   }
01323 
01324   /* If this is the first page accessed, then get a read lock
01325   ** on the database file.
01326   */
01327   if( pPager->nRef==0 ){
01328     rc = sqliteOsReadLock(&pPager->fd);
01329     if( rc!=SQLITE_OK ){
01330       return rc;
01331     }
01332     pPager->state = SQLITE_READLOCK;
01333 
01334     /* If a journal file exists, try to play it back.
01335     */
01336     if( pPager->useJournal && sqliteOsFileExists(pPager->zJournal) ){
01337        int rc;
01338 
01339        /* Get a write lock on the database
01340        */
01341        rc = sqliteOsWriteLock(&pPager->fd);
01342        if( rc!=SQLITE_OK ){
01343          if( sqliteOsUnlock(&pPager->fd)!=SQLITE_OK ){
01344            /* This should never happen! */
01345            rc = SQLITE_INTERNAL;
01346          }
01347          return rc;
01348        }
01349        pPager->state = SQLITE_WRITELOCK;
01350 
01351        /* Open the journal for reading only.  Return SQLITE_BUSY if
01352        ** we are unable to open the journal file. 
01353        **
01354        ** The journal file does not need to be locked itself.  The
01355        ** journal file is never open unless the main database file holds
01356        ** a write lock, so there is never any chance of two or more
01357        ** processes opening the journal at the same time.
01358        */
01359        rc = sqliteOsOpenReadOnly(pPager->zJournal, &pPager->jfd);
01360        if( rc!=SQLITE_OK ){
01361          rc = sqliteOsUnlock(&pPager->fd);
01362          assert( rc==SQLITE_OK );
01363          return SQLITE_BUSY;
01364        }
01365        pPager->journalOpen = 1;
01366        pPager->journalStarted = 0;
01367 
01368        /* Playback and delete the journal.  Drop the database write
01369        ** lock and reacquire the read lock.
01370        */
01371        rc = pager_playback(pPager, 0);
01372        if( rc!=SQLITE_OK ){
01373          return rc;
01374        }
01375     }
01376     pPg = 0;
01377   }else{
01378     /* Search for page in cache */
01379     pPg = pager_lookup(pPager, pgno);
01380   }
01381   if( pPg==0 ){
01382     /* The requested page is not in the page cache. */
01383     int h;
01384     pPager->nMiss++;
01385     if( pPager->nPage<pPager->mxPage || pPager->pFirst==0 ){
01386       /* Create a new page */
01387       pPg = sqliteMallocRaw( sizeof(*pPg) + SQLITE_PAGE_SIZE 
01388                               + sizeof(u32) + pPager->nExtra );
01389       if( pPg==0 ){
01390         pager_unwritelock(pPager);
01391         pPager->errMask |= PAGER_ERR_MEM;
01392         return SQLITE_NOMEM;
01393       }
01394       memset(pPg, 0, sizeof(*pPg));
01395       pPg->pPager = pPager;
01396       pPg->pNextAll = pPager->pAll;
01397       if( pPager->pAll ){
01398         pPager->pAll->pPrevAll = pPg;
01399       }
01400       pPg->pPrevAll = 0;
01401       pPager->pAll = pPg;
01402       pPager->nPage++;
01403     }else{
01404       /* Find a page to recycle.  Try to locate a page that does not
01405       ** require us to do an fsync() on the journal.
01406       */
01407       pPg = pPager->pFirstSynced;
01408 
01409       /* If we could not find a page that does not require an fsync()
01410       ** on the journal file then fsync the journal file.  This is a
01411       ** very slow operation, so we work hard to avoid it.  But sometimes
01412       ** it can't be helped.
01413       */
01414       if( pPg==0 ){
01415         int rc = syncJournal(pPager);
01416         if( rc!=0 ){
01417           sqlitepager_rollback(pPager);
01418           return SQLITE_IOERR;
01419         }
01420         pPg = pPager->pFirst;
01421       }
01422       assert( pPg->nRef==0 );
01423 
01424       /* Write the page to the database file if it is dirty.
01425       */
01426       if( pPg->dirty ){
01427         assert( pPg->needSync==0 );
01428         pPg->pDirty = 0;
01429         rc = pager_write_pagelist( pPg );
01430         if( rc!=SQLITE_OK ){
01431           sqlitepager_rollback(pPager);
01432           return SQLITE_IOERR;
01433         }
01434       }
01435       assert( pPg->dirty==0 );
01436 
01437       /* If the page we are recycling is marked as alwaysRollback, then
01438       ** set the global alwaysRollback flag, thus disabling the
01439       ** sqlite_dont_rollback() optimization for the rest of this transaction.
01440       ** It is necessary to do this because the page marked alwaysRollback
01441       ** might be reloaded at a later time but at that point we won't remember
01442       ** that is was marked alwaysRollback.  This means that all pages must
01443       ** be marked as alwaysRollback from here on out.
01444       */
01445       if( pPg->alwaysRollback ){
01446         pPager->alwaysRollback = 1;
01447       }
01448 
01449       /* Unlink the old page from the free list and the hash table
01450       */
01451       if( pPg==pPager->pFirstSynced ){
01452         PgHdr *p = pPg->pNextFree;
01453         while( p && p->needSync ){ p = p->pNextFree; }
01454         pPager->pFirstSynced = p;
01455       }
01456       if( pPg->pPrevFree ){
01457         pPg->pPrevFree->pNextFree = pPg->pNextFree;
01458       }else{
01459         assert( pPager->pFirst==pPg );
01460         pPager->pFirst = pPg->pNextFree;
01461       }
01462       if( pPg->pNextFree ){
01463         pPg->pNextFree->pPrevFree = pPg->pPrevFree;
01464       }else{
01465         assert( pPager->pLast==pPg );
01466         pPager->pLast = pPg->pPrevFree;
01467       }
01468       pPg->pNextFree = pPg->pPrevFree = 0;
01469       if( pPg->pNextHash ){
01470         pPg->pNextHash->pPrevHash = pPg->pPrevHash;
01471       }
01472       if( pPg->pPrevHash ){
01473         pPg->pPrevHash->pNextHash = pPg->pNextHash;
01474       }else{
01475         h = pager_hash(pPg->pgno);
01476         assert( pPager->aHash[h]==pPg );
01477         pPager->aHash[h] = pPg->pNextHash;
01478       }
01479       pPg->pNextHash = pPg->pPrevHash = 0;
01480       pPager->nOvfl++;
01481     }
01482     pPg->pgno = pgno;
01483     if( pPager->aInJournal && (int)pgno<=pPager->origDbSize ){
01484       sqliteCheckMemory(pPager->aInJournal, pgno/8);
01485       assert( pPager->journalOpen );
01486       pPg->inJournal = (pPager->aInJournal[pgno/8] & (1<<(pgno&7)))!=0;
01487       pPg->needSync = 0;
01488     }else{
01489       pPg->inJournal = 0;
01490       pPg->needSync = 0;
01491     }
01492     if( pPager->aInCkpt && (int)pgno<=pPager->ckptSize
01493              && (pPager->aInCkpt[pgno/8] & (1<<(pgno&7)))!=0 ){
01494       page_add_to_ckpt_list(pPg);
01495     }else{
01496       page_remove_from_ckpt_list(pPg);
01497     }
01498     pPg->dirty = 0;
01499     pPg->nRef = 1;
01500     REFINFO(pPg);
01501     pPager->nRef++;
01502     h = pager_hash(pgno);
01503     pPg->pNextHash = pPager->aHash[h];
01504     pPager->aHash[h] = pPg;
01505     if( pPg->pNextHash ){
01506       assert( pPg->pNextHash->pPrevHash==0 );
01507       pPg->pNextHash->pPrevHash = pPg;
01508     }
01509     if( pPager->nExtra>0 ){
01510       memset(PGHDR_TO_EXTRA(pPg), 0, pPager->nExtra);
01511     }
01512     if( pPager->dbSize<0 ) sqlitepager_pagecount(pPager);
01513     if( pPager->errMask!=0 ){
01514       sqlitepager_unref(PGHDR_TO_DATA(pPg));
01515       rc = pager_errcode(pPager);
01516       return rc;
01517     }
01518     if( pPager->dbSize<(int)pgno ){
01519       memset(PGHDR_TO_DATA(pPg), 0, SQLITE_PAGE_SIZE);
01520     }else{
01521       int rc;
01522       sqliteOsSeek(&pPager->fd, (pgno-1)*(off_t)SQLITE_PAGE_SIZE);
01523       rc = sqliteOsRead(&pPager->fd, PGHDR_TO_DATA(pPg), SQLITE_PAGE_SIZE);
01524       TRACE2("FETCH %d\n", pPg->pgno);
01525       CODEC(pPager, PGHDR_TO_DATA(pPg), pPg->pgno, 3);
01526       if( rc!=SQLITE_OK ){
01527         off_t fileSize;
01528         if( sqliteOsFileSize(&pPager->fd,&fileSize)!=SQLITE_OK
01529                || fileSize>=pgno*SQLITE_PAGE_SIZE ){
01530           sqlitepager_unref(PGHDR_TO_DATA(pPg));
01531           return rc;
01532         }else{
01533           memset(PGHDR_TO_DATA(pPg), 0, SQLITE_PAGE_SIZE);
01534         }
01535       }
01536     }
01537   }else{
01538     /* The requested page is in the page cache. */
01539     pPager->nHit++;
01540     page_ref(pPg);
01541   }
01542   *ppPage = PGHDR_TO_DATA(pPg);
01543   return SQLITE_OK;
01544 }
01545 
01546 /*
01547 ** Acquire a page if it is already in the in-memory cache.  Do
01548 ** not read the page from disk.  Return a pointer to the page,
01549 ** or 0 if the page is not in cache.
01550 **
01551 ** See also sqlitepager_get().  The difference between this routine
01552 ** and sqlitepager_get() is that _get() will go to the disk and read
01553 ** in the page if the page is not already in cache.  This routine
01554 ** returns NULL if the page is not in cache or if a disk I/O error 
01555 ** has ever happened.
01556 */
01557 void *sqlitepager_lookup(Pager *pPager, Pgno pgno){
01558   PgHdr *pPg;
01559 
01560   assert( pPager!=0 );
01561   assert( pgno!=0 );
01562   if( pPager->errMask & ~(PAGER_ERR_FULL) ){
01563     return 0;
01564   }
01565   /* if( pPager->nRef==0 ){
01566   **  return 0;
01567   ** }
01568   */
01569   pPg = pager_lookup(pPager, pgno);
01570   if( pPg==0 ) return 0;
01571   page_ref(pPg);
01572   return PGHDR_TO_DATA(pPg);
01573 }
01574 
01575 /*
01576 ** Release a page.
01577 **
01578 ** If the number of references to the page drop to zero, then the
01579 ** page is added to the LRU list.  When all references to all pages
01580 ** are released, a rollback occurs and the lock on the database is
01581 ** removed.
01582 */
01583 int sqlitepager_unref(void *pData){
01584   PgHdr *pPg;
01585 
01586   /* Decrement the reference count for this page
01587   */
01588   pPg = DATA_TO_PGHDR(pData);
01589   assert( pPg->nRef>0 );
01590   pPg->nRef--;
01591   REFINFO(pPg);
01592 
01593   /* When the number of references to a page reach 0, call the
01594   ** destructor and add the page to the freelist.
01595   */
01596   if( pPg->nRef==0 ){
01597     Pager *pPager;
01598     pPager = pPg->pPager;
01599     pPg->pNextFree = 0;
01600     pPg->pPrevFree = pPager->pLast;
01601     pPager->pLast = pPg;
01602     if( pPg->pPrevFree ){
01603       pPg->pPrevFree->pNextFree = pPg;
01604     }else{
01605       pPager->pFirst = pPg;
01606     }
01607     if( pPg->needSync==0 && pPager->pFirstSynced==0 ){
01608       pPager->pFirstSynced = pPg;
01609     }
01610     if( pPager->xDestructor ){
01611       pPager->xDestructor(pData);
01612     }
01613   
01614     /* When all pages reach the freelist, drop the read lock from
01615     ** the database file.
01616     */
01617     pPager->nRef--;
01618     assert( pPager->nRef>=0 );
01619     if( pPager->nRef==0 ){
01620       pager_reset(pPager);
01621     }
01622   }
01623   return SQLITE_OK;
01624 }
01625 
01626 /*
01627 ** Create a journal file for pPager.  There should already be a write
01628 ** lock on the database file when this routine is called.
01629 **
01630 ** Return SQLITE_OK if everything.  Return an error code and release the
01631 ** write lock if anything goes wrong.
01632 */
01633 static int pager_open_journal(Pager *pPager){
01634   int rc;
01635   assert( pPager->state==SQLITE_WRITELOCK );
01636   assert( pPager->journalOpen==0 );
01637   assert( pPager->useJournal );
01638   sqlitepager_pagecount(pPager);
01639   pPager->aInJournal = sqliteMalloc( pPager->dbSize/8 + 1 );
01640   if( pPager->aInJournal==0 ){
01641     sqliteOsReadLock(&pPager->fd);
01642     pPager->state = SQLITE_READLOCK;
01643     return SQLITE_NOMEM;
01644   }
01645   rc = sqliteOsOpenExclusive(pPager->zJournal, &pPager->jfd,pPager->tempFile);
01646   if( rc!=SQLITE_OK ){
01647     sqliteFree(pPager->aInJournal);
01648     pPager->aInJournal = 0;
01649     sqliteOsReadLock(&pPager->fd);
01650     pPager->state = SQLITE_READLOCK;
01651     return SQLITE_CANTOPEN;
01652   }
01653   sqliteOsOpenDirectory(pPager->zDirectory, &pPager->jfd);
01654   pPager->journalOpen = 1;
01655   pPager->journalStarted = 0;
01656   pPager->needSync = 0;
01657   pPager->alwaysRollback = 0;
01658   pPager->nRec = 0;
01659   if( pPager->errMask!=0 ){
01660     rc = pager_errcode(pPager);
01661     return rc;
01662   }
01663   pPager->origDbSize = pPager->dbSize;
01664   if( journal_format==JOURNAL_FORMAT_3 ){
01665     rc = sqliteOsWrite(&pPager->jfd, aJournalMagic3, sizeof(aJournalMagic3));
01666     if( rc==SQLITE_OK ){
01667       rc = write32bits(&pPager->jfd, pPager->noSync ? 0xffffffff : 0);
01668     }
01669     if( rc==SQLITE_OK ){
01670       sqliteRandomness(sizeof(pPager->cksumInit), &pPager->cksumInit);
01671       rc = write32bits(&pPager->jfd, pPager->cksumInit);
01672     }
01673   }else if( journal_format==JOURNAL_FORMAT_2 ){
01674     rc = sqliteOsWrite(&pPager->jfd, aJournalMagic2, sizeof(aJournalMagic2));
01675   }else{
01676     assert( journal_format==JOURNAL_FORMAT_1 );
01677     rc = sqliteOsWrite(&pPager->jfd, aJournalMagic1, sizeof(aJournalMagic1));
01678   }
01679   if( rc==SQLITE_OK ){
01680     rc = write32bits(&pPager->jfd, pPager->dbSize);
01681   }
01682   if( pPager->ckptAutoopen && rc==SQLITE_OK ){
01683     rc = sqlitepager_ckpt_begin(pPager);
01684   }
01685   if( rc!=SQLITE_OK ){
01686     rc = pager_unwritelock(pPager);
01687     if( rc==SQLITE_OK ){
01688       rc = SQLITE_FULL;
01689     }
01690   }
01691   return rc;  
01692 }
01693 
01694 /*
01695 ** Acquire a write-lock on the database.  The lock is removed when
01696 ** the any of the following happen:
01697 **
01698 **   *  sqlitepager_commit() is called.
01699 **   *  sqlitepager_rollback() is called.
01700 **   *  sqlitepager_close() is called.
01701 **   *  sqlitepager_unref() is called to on every outstanding page.
01702 **
01703 ** The parameter to this routine is a pointer to any open page of the
01704 ** database file.  Nothing changes about the page - it is used merely
01705 ** to acquire a pointer to the Pager structure and as proof that there
01706 ** is already a read-lock on the database.
01707 **
01708 ** A journal file is opened if this is not a temporary file.  For
01709 ** temporary files, the opening of the journal file is deferred until
01710 ** there is an actual need to write to the journal.
01711 **
01712 ** If the database is already write-locked, this routine is a no-op.
01713 */
01714 int sqlitepager_begin(void *pData){
01715   PgHdr *pPg = DATA_TO_PGHDR(pData);
01716   Pager *pPager = pPg->pPager;
01717   int rc = SQLITE_OK;
01718   assert( pPg->nRef>0 );
01719   assert( pPager->state!=SQLITE_UNLOCK );
01720   if( pPager->state==SQLITE_READLOCK ){
01721     assert( pPager->aInJournal==0 );
01722     rc = sqliteOsWriteLock(&pPager->fd);
01723     if( rc!=SQLITE_OK ){
01724       return rc;
01725     }
01726     pPager->state = SQLITE_WRITELOCK;
01727     pPager->dirtyFile = 0;
01728     TRACE1("TRANSACTION\n");
01729     if( pPager->useJournal && !pPager->tempFile ){
01730       rc = pager_open_journal(pPager);
01731     }
01732   }
01733   return rc;
01734 }
01735 
01736 /*
01737 ** Mark a data page as writeable.  The page is written into the journal 
01738 ** if it is not there already.  This routine must be called before making
01739 ** changes to a page.
01740 **
01741 ** The first time this routine is called, the pager creates a new
01742 ** journal and acquires a write lock on the database.  If the write
01743 ** lock could not be acquired, this routine returns SQLITE_BUSY.  The
01744 ** calling routine must check for that return value and be careful not to
01745 ** change any page data until this routine returns SQLITE_OK.
01746 **
01747 ** If the journal file could not be written because the disk is full,
01748 ** then this routine returns SQLITE_FULL and does an immediate rollback.
01749 ** All subsequent write attempts also return SQLITE_FULL until there
01750 ** is a call to sqlitepager_commit() or sqlitepager_rollback() to
01751 ** reset.
01752 */
01753 int sqlitepager_write(void *pData){
01754   PgHdr *pPg = DATA_TO_PGHDR(pData);
01755   Pager *pPager = pPg->pPager;
01756   int rc = SQLITE_OK;
01757 
01758   /* Check for errors
01759   */
01760   if( pPager->errMask ){ 
01761     return pager_errcode(pPager);
01762   }
01763   if( pPager->readOnly ){
01764     return SQLITE_PERM;
01765   }
01766 
01767   /* Mark the page as dirty.  If the page has already been written
01768   ** to the journal then we can return right away.
01769   */
01770   pPg->dirty = 1;
01771   if( pPg->inJournal && (pPg->inCkpt || pPager->ckptInUse==0) ){
01772     pPager->dirtyFile = 1;
01773     return SQLITE_OK;
01774   }
01775 
01776   /* If we get this far, it means that the page needs to be
01777   ** written to the transaction journal or the ckeckpoint journal
01778   ** or both.
01779   **
01780   ** First check to see that the transaction journal exists and
01781   ** create it if it does not.
01782   */
01783   assert( pPager->state!=SQLITE_UNLOCK );
01784   rc = sqlitepager_begin(pData);
01785   if( rc!=SQLITE_OK ){
01786     return rc;
01787   }
01788   assert( pPager->state==SQLITE_WRITELOCK );
01789   if( !pPager->journalOpen && pPager->useJournal ){
01790     rc = pager_open_journal(pPager);
01791     if( rc!=SQLITE_OK ) return rc;
01792   }
01793   assert( pPager->journalOpen || !pPager->useJournal );
01794   pPager->dirtyFile = 1;
01795 
01796   /* The transaction journal now exists and we have a write lock on the
01797   ** main database file.  Write the current page to the transaction 
01798   ** journal if it is not there already.
01799   */
01800   if( !pPg->inJournal && pPager->useJournal ){
01801     if( (int)pPg->pgno <= pPager->origDbSize ){
01802       int szPg;
01803       u32 saved;
01804       if( journal_format>=JOURNAL_FORMAT_3 ){
01805         u32 cksum = pager_cksum(pPager, pPg->pgno, pData);
01806         saved = *(u32*)PGHDR_TO_EXTRA(pPg);
01807         store32bits(cksum, pPg, SQLITE_PAGE_SIZE);
01808         szPg = SQLITE_PAGE_SIZE+8;
01809       }else{
01810         szPg = SQLITE_PAGE_SIZE+4;
01811       }
01812       store32bits(pPg->pgno, pPg, -4);
01813       CODEC(pPager, pData, pPg->pgno, 7);
01814       rc = sqliteOsWrite(&pPager->jfd, &((char*)pData)[-4], szPg);
01815       TRACE3("JOURNAL %d %d\n", pPg->pgno, pPg->needSync);
01816       CODEC(pPager, pData, pPg->pgno, 0);
01817       if( journal_format>=JOURNAL_FORMAT_3 ){
01818         *(u32*)PGHDR_TO_EXTRA(pPg) = saved;
01819       }
01820       if( rc!=SQLITE_OK ){
01821         sqlitepager_rollback(pPager);
01822         pPager->errMask |= PAGER_ERR_FULL;
01823         return rc;
01824       }
01825       pPager->nRec++;
01826       assert( pPager->aInJournal!=0 );
01827       pPager->aInJournal[pPg->pgno/8] |= 1<<(pPg->pgno&7);
01828       pPg->needSync = !pPager->noSync;
01829       pPg->inJournal = 1;
01830       if( pPager->ckptInUse ){
01831         pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
01832         page_add_to_ckpt_list(pPg);
01833       }
01834     }else{
01835       pPg->needSync = !pPager->journalStarted && !pPager->noSync;
01836       TRACE3("APPEND %d %d\n", pPg->pgno, pPg->needSync);
01837     }
01838     if( pPg->needSync ){
01839       pPager->needSync = 1;
01840     }
01841   }
01842 
01843   /* If the checkpoint journal is open and the page is not in it,
01844   ** then write the current page to the checkpoint journal.  Note that
01845   ** the checkpoint journal always uses the simplier format 2 that lacks
01846   ** checksums.  The header is also omitted from the checkpoint journal.
01847   */
01848   if( pPager->ckptInUse && !pPg->inCkpt && (int)pPg->pgno<=pPager->ckptSize ){
01849     assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize );
01850     store32bits(pPg->pgno, pPg, -4);
01851     CODEC(pPager, pData, pPg->pgno, 7);
01852     rc = sqliteOsWrite(&pPager->cpfd, &((char*)pData)[-4], SQLITE_PAGE_SIZE+4);
01853     TRACE2("CKPT-JOURNAL %d\n", pPg->pgno);
01854     CODEC(pPager, pData, pPg->pgno, 0);
01855     if( rc!=SQLITE_OK ){
01856       sqlitepager_rollback(pPager);
01857       pPager->errMask |= PAGER_ERR_FULL;
01858       return rc;
01859     }
01860     pPager->ckptNRec++;
01861     assert( pPager->aInCkpt!=0 );
01862     pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
01863     page_add_to_ckpt_list(pPg);
01864   }
01865 
01866   /* Update the database size and return.
01867   */
01868   if( pPager->dbSize<(int)pPg->pgno ){
01869     pPager->dbSize = pPg->pgno;
01870   }
01871   return rc;
01872 }
01873 
01874 /*
01875 ** Return TRUE if the page given in the argument was previously passed
01876 ** to sqlitepager_write().  In other words, return TRUE if it is ok
01877 ** to change the content of the page.
01878 */
01879 int sqlitepager_iswriteable(void *pData){
01880   PgHdr *pPg = DATA_TO_PGHDR(pData);
01881   return pPg->dirty;
01882 }
01883 
01884 /*
01885 ** Replace the content of a single page with the information in the third
01886 ** argument.
01887 */
01888 int sqlitepager_overwrite(Pager *pPager, Pgno pgno, void *pData){
01889   void *pPage;
01890   int rc;
01891 
01892   rc = sqlitepager_get(pPager, pgno, &pPage);
01893   if( rc==SQLITE_OK ){
01894     rc = sqlitepager_write(pPage);
01895     if( rc==SQLITE_OK ){
01896       memcpy(pPage, pData, SQLITE_PAGE_SIZE);
01897     }
01898     sqlitepager_unref(pPage);
01899   }
01900   return rc;
01901 }
01902 
01903 /*
01904 ** A call to this routine tells the pager that it is not necessary to
01905 ** write the information on page "pgno" back to the disk, even though
01906 ** that page might be marked as dirty.
01907 **
01908 ** The overlying software layer calls this routine when all of the data
01909 ** on the given page is unused.  The pager marks the page as clean so
01910 ** that it does not get written to disk.
01911 **
01912 ** Tests show that this optimization, together with the
01913 ** sqlitepager_dont_rollback() below, more than double the speed
01914 ** of large INSERT operations and quadruple the speed of large DELETEs.
01915 **
01916 ** When this routine is called, set the alwaysRollback flag to true.
01917 ** Subsequent calls to sqlitepager_dont_rollback() for the same page
01918 ** will thereafter be ignored.  This is necessary to avoid a problem
01919 ** where a page with data is added to the freelist during one part of
01920 ** a transaction then removed from the freelist during a later part
01921 ** of the same transaction and reused for some other purpose.  When it
01922 ** is first added to the freelist, this routine is called.  When reused,
01923 ** the dont_rollback() routine is called.  But because the page contains
01924 ** critical data, we still need to be sure it gets rolled back in spite
01925 ** of the dont_rollback() call.
01926 */
01927 void sqlitepager_dont_write(Pager *pPager, Pgno pgno){
01928   PgHdr *pPg;
01929 
01930   pPg = pager_lookup(pPager, pgno);
01931   pPg->alwaysRollback = 1;
01932   if( pPg && pPg->dirty && !pPager->ckptInUse ){
01933     if( pPager->dbSize==(int)pPg->pgno && pPager->origDbSize<pPager->dbSize ){
01934       /* If this pages is the last page in the file and the file has grown
01935       ** during the current transaction, then do NOT mark the page as clean.
01936       ** When the database file grows, we must make sure that the last page
01937       ** gets written at least once so that the disk file will be the correct
01938       ** size. If you do not write this page and the size of the file
01939       ** on the disk ends up being too small, that can lead to database
01940       ** corruption during the next transaction.
01941       */
01942     }else{
01943       TRACE2("DONT_WRITE %d\n", pgno);
01944       pPg->dirty = 0;
01945     }
01946   }
01947 }
01948 
01949 /*
01950 ** A call to this routine tells the pager that if a rollback occurs,
01951 ** it is not necessary to restore the data on the given page.  This
01952 ** means that the pager does not have to record the given page in the
01953 ** rollback journal.
01954 */
01955 void sqlitepager_dont_rollback(void *pData){
01956   PgHdr *pPg = DATA_TO_PGHDR(pData);
01957   Pager *pPager = pPg->pPager;
01958 
01959   if( pPager->state!=SQLITE_WRITELOCK || pPager->journalOpen==0 ) return;
01960   if( pPg->alwaysRollback || pPager->alwaysRollback ) return;
01961   if( !pPg->inJournal && (int)pPg->pgno <= pPager->origDbSize ){
01962     assert( pPager->aInJournal!=0 );
01963     pPager->aInJournal[pPg->pgno/8] |= 1<<(pPg->pgno&7);
01964     pPg->inJournal = 1;
01965     if( pPager->ckptInUse ){
01966       pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
01967       page_add_to_ckpt_list(pPg);
01968     }
01969     TRACE2("DONT_ROLLBACK %d\n", pPg->pgno);
01970   }
01971   if( pPager->ckptInUse && !pPg->inCkpt && (int)pPg->pgno<=pPager->ckptSize ){
01972     assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize );
01973     assert( pPager->aInCkpt!=0 );
01974     pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7);
01975     page_add_to_ckpt_list(pPg);
01976   }
01977 }
01978 
01979 /*
01980 ** Commit all changes to the database and release the write lock.
01981 **
01982 ** If the commit fails for any reason, a rollback attempt is made
01983 ** and an error code is returned.  If the commit worked, SQLITE_OK
01984 ** is returned.
01985 */
01986 int sqlitepager_commit(Pager *pPager){
01987   int rc;
01988   PgHdr *pPg;
01989 
01990   if( pPager->errMask==PAGER_ERR_FULL ){
01991     rc = sqlitepager_rollback(pPager);
01992     if( rc==SQLITE_OK ){
01993       rc = SQLITE_FULL;
01994     }
01995     return rc;
01996   }
01997   if( pPager->errMask!=0 ){
01998     rc = pager_errcode(pPager);
01999     return rc;
02000   }
02001   if( pPager->state!=SQLITE_WRITELOCK ){
02002     return SQLITE_ERROR;
02003   }
02004   TRACE1("COMMIT\n");
02005   if( pPager->dirtyFile==0 ){
02006     /* Exit early (without doing the time-consuming sqliteOsSync() calls)
02007     ** if there have been no changes to the database file. */
02008     assert( pPager->needSync==0 );
02009     rc = pager_unwritelock(pPager);
02010     pPager->dbSize = -1;
02011     return rc;
02012   }
02013   assert( pPager->journalOpen );
02014   rc = syncJournal(pPager);
02015   if( rc!=SQLITE_OK ){
02016     goto commit_abort;
02017   }
02018   pPg = pager_get_all_dirty_pages(pPager);
02019   if( pPg ){
02020     rc = pager_write_pagelist(pPg);
02021     if( rc || (!pPager->noSync && sqliteOsSync(&pPager->fd)!=SQLITE_OK) ){
02022       goto commit_abort;
02023     }
02024   }
02025   rc = pager_unwritelock(pPager);
02026   pPager->dbSize = -1;
02027   return rc;
02028 
02029   /* Jump here if anything goes wrong during the commit process.
02030   */
02031 commit_abort:
02032   rc = sqlitepager_rollback(pPager);
02033   if( rc==SQLITE_OK ){
02034     rc = SQLITE_FULL;
02035   }
02036   return rc;
02037 }
02038 
02039 /*
02040 ** Rollback all changes.  The database falls back to read-only mode.
02041 ** All in-memory cache pages revert to their original data contents.
02042 ** The journal is deleted.
02043 **
02044 ** This routine cannot fail unless some other process is not following
02045 ** the correct locking protocol (SQLITE_PROTOCOL) or unless some other
02046 ** process is writing trash into the journal file (SQLITE_CORRUPT) or
02047 ** unless a prior malloc() failed (SQLITE_NOMEM).  Appropriate error
02048 ** codes are returned for all these occasions.  Otherwise,
02049 ** SQLITE_OK is returned.
02050 */
02051 int sqlitepager_rollback(Pager *pPager){
02052   int rc;
02053   TRACE1("ROLLBACK\n");
02054   if( !pPager->dirtyFile || !pPager->journalOpen ){
02055     rc = pager_unwritelock(pPager);
02056     pPager->dbSize = -1;
02057     return rc;
02058   }
02059 
02060   if( pPager->errMask!=0 && pPager->errMask!=PAGER_ERR_FULL ){
02061     if( pPager->state>=SQLITE_WRITELOCK ){
02062       pager_playback(pPager, 1);
02063     }
02064     return pager_errcode(pPager);
02065   }
02066   if( pPager->state!=SQLITE_WRITELOCK ){
02067     return SQLITE_OK;
02068   }
02069   rc = pager_playback(pPager, 1);
02070   if( rc!=SQLITE_OK ){
02071     rc = SQLITE_CORRUPT;
02072     pPager->errMask |= PAGER_ERR_CORRUPT;
02073   }
02074   pPager->dbSize = -1;
02075   return rc;
02076 }
02077 
02078 /*
02079 ** Return TRUE if the database file is opened read-only.  Return FALSE
02080 ** if the database is (in theory) writable.
02081 */
02082 int sqlitepager_isreadonly(Pager *pPager){
02083   return pPager->readOnly;
02084 }
02085 
02086 /*
02087 ** This routine is used for testing and analysis only.
02088 */
02089 int *sqlitepager_stats(Pager *pPager){
02090   static int a[9];
02091   a[0] = pPager->nRef;
02092   a[1] = pPager->nPage;
02093   a[2] = pPager->mxPage;
02094   a[3] = pPager->dbSize;
02095   a[4] = pPager->state;
02096   a[5] = pPager->errMask;
02097   a[6] = pPager->nHit;
02098   a[7] = pPager->nMiss;
02099   a[8] = pPager->nOvfl;
02100   return a;
02101 }
02102 
02103 /*
02104 ** Set the checkpoint.
02105 **
02106 ** This routine should be called with the transaction journal already
02107 ** open.  A new checkpoint journal is created that can be used to rollback
02108 ** changes of a single SQL command within a larger transaction.
02109 */
02110 int sqlitepager_ckpt_begin(Pager *pPager){
02111   int rc;
02112   char zTemp[SQLITE_TEMPNAME_SIZE];
02113   if( !pPager->journalOpen ){
02114     pPager->ckptAutoopen = 1;
02115     return SQLITE_OK;
02116   }
02117   assert( pPager->journalOpen );
02118   assert( !pPager->ckptInUse );
02119   pPager->aInCkpt = sqliteMalloc( pPager->dbSize/8 + 1 );
02120   if( pPager->aInCkpt==0 ){
02121     sqliteOsReadLock(&pPager->fd);
02122     return SQLITE_NOMEM;
02123   }
02124 #ifndef NDEBUG
02125   rc = sqliteOsFileSize(&pPager->jfd, &pPager->ckptJSize);
02126   if( rc ) goto ckpt_begin_failed;
02127   assert( pPager->ckptJSize == 
02128     pPager->nRec*JOURNAL_PG_SZ(journal_format)+JOURNAL_HDR_SZ(journal_format) );
02129 #endif
02130   pPager->ckptJSize = pPager->nRec*JOURNAL_PG_SZ(journal_format)
02131                          + JOURNAL_HDR_SZ(journal_format);
02132   pPager->ckptSize = pPager->dbSize;
02133   if( !pPager->ckptOpen ){
02134     rc = sqlitepager_opentemp(zTemp, &pPager->cpfd);
02135     if( rc ) goto ckpt_begin_failed;
02136     pPager->ckptOpen = 1;
02137     pPager->ckptNRec = 0;
02138   }
02139   pPager->ckptInUse = 1;
02140   return SQLITE_OK;
02141  
02142 ckpt_begin_failed:
02143   if( pPager->aInCkpt ){
02144     sqliteFree(pPager->aInCkpt);
02145     pPager->aInCkpt = 0;
02146   }
02147   return rc;
02148 }
02149 
02150 /*
02151 ** Commit a checkpoint.
02152 */
02153 int sqlitepager_ckpt_commit(Pager *pPager){
02154   if( pPager->ckptInUse ){
02155     PgHdr *pPg, *pNext;
02156     sqliteOsSeek(&pPager->cpfd, 0);
02157     /* sqliteOsTruncate(&pPager->cpfd, 0); */
02158     pPager->ckptNRec = 0;
02159     pPager->ckptInUse = 0;
02160     sqliteFree( pPager->aInCkpt );
02161     pPager->aInCkpt = 0;
02162     for(pPg=pPager->pCkpt; pPg; pPg=pNext){
02163       pNext = pPg->pNextCkpt;
02164       assert( pPg->inCkpt );
02165       pPg->inCkpt = 0;
02166       pPg->pPrevCkpt = pPg->pNextCkpt = 0;
02167     }
02168     pPager->pCkpt = 0;
02169   }
02170   pPager->ckptAutoopen = 0;
02171   return SQLITE_OK;
02172 }
02173 
02174 /*
02175 ** Rollback a checkpoint.
02176 */
02177 int sqlitepager_ckpt_rollback(Pager *pPager){
02178   int rc;
02179   if( pPager->ckptInUse ){
02180     rc = pager_ckpt_playback(pPager);
02181     sqlitepager_ckpt_commit(pPager);
02182   }else{
02183     rc = SQLITE_OK;
02184   }
02185   pPager->ckptAutoopen = 0;
02186   return rc;
02187 }
02188 
02189 /*
02190 ** Return the full pathname of the database file.
02191 */
02192 const char *sqlitepager_filename(Pager *pPager){
02193   return pPager->zFilename;
02194 }
02195 
02196 /*
02197 ** Set the codec for this pager
02198 */
02199 void sqlitepager_set_codec(
02200   Pager *pPager,
02201   void (*xCodec)(void*,void*,Pgno,int),
02202   void *pCodecArg
02203 ){
02204   pPager->xCodec = xCodec;
02205   pPager->pCodecArg = pCodecArg;
02206 }
02207 
#ifdef SQLITE_TEST
/*
** Print a listing of all referenced pages and their ref count.
**
** Every page on the pager's pAll list with a positive reference count
** is written to standard output, one per line, with its page number,
** the address of its data and its reference count.
**
** The address is printed with %p.  Casting the pointer to int would
** truncate it on platforms where pointers are wider than int.
*/
void sqlitepager_refdump(Pager *pPager){
  PgHdr *pPg;
  for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
    if( pPg->nRef<=0 ) continue;
    printf("PAGE %3d addr=%p nRef=%d\n", 
       (int)pPg->pgno, (void*)PGHDR_TO_DATA(pPg), pPg->nRef);
  }
}
#endif

Generated on Sun Dec 25 12:29:52 2005 for sqlite 2.8.17 by  doxygen 1.4.2