Berkeley DB 4.4.16: /home/huihoo/src/db/db-4.4.16/os/os_rw.c

00001 /*-
00002  * See the file LICENSE for redistribution information.
00003  *
00004  * Copyright (c) 1997-2005
00005  *      Sleepycat Software.  All rights reserved.
00006  *
00007  * $Id: os_rw.c,v 12.5 2005/08/10 15:47:26 bostic Exp $
00008  */
00009 
00010 #include "db_config.h"
00011 
00012 #ifndef NO_SYSTEM_INCLUDES
00013 #include <sys/types.h>
00014 #include <sys/stat.h>
00015 
00016 #include <string.h>
00017 #endif
00018 
00019 #include "db_int.h"
00020 
00021 #ifdef HAVE_FILESYSTEM_NOTZERO
00022 static int __os_zerofill __P((DB_ENV *, DB_FH *));
00023 #endif
00024 static int __os_physwrite __P((DB_ENV *, DB_FH *, void *, size_t, size_t *));
00025 
00026 /*
00027  * __os_io --
00028  *      Do an I/O.
00029  *
00030  * PUBLIC: int __os_io __P((DB_ENV *,
00031  * PUBLIC:     int, DB_FH *, db_pgno_t, u_int32_t, u_int8_t *, size_t *));
00032  */
00033 int
00034 __os_io(dbenv, op, fhp, pgno, pagesize, buf, niop)
00035         DB_ENV *dbenv;
00036         int op;
00037         DB_FH *fhp;
00038         db_pgno_t pgno;
00039         u_int32_t pagesize;
00040         u_int8_t *buf;
00041         size_t *niop;
00042 {
00043 #if defined(HAVE_PREAD) && defined(HAVE_PWRITE)
00044         ssize_t nio;
00045 #endif
00046         int ret;
00047 
00048         /* Check for illegal usage. */
00049         DB_ASSERT(F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);
00050 
00051 #if defined(HAVE_PREAD) && defined(HAVE_PWRITE)
00052         switch (op) {
00053         case DB_IO_READ:
00054                 if (DB_GLOBAL(j_read) != NULL)
00055                         goto slow;
00056                 nio = DB_GLOBAL(j_pread) != NULL ? DB_GLOBAL(j_pread)
00057                         (fhp->fd, buf, pagesize, (off_t)pgno * pagesize) :
00058                         pread(fhp->fd, buf, pagesize, (off_t)pgno * pagesize);
00059                 break;
00060         case DB_IO_WRITE:
00061                 if (DB_GLOBAL(j_write) != NULL)
00062                         goto slow;
00063 #ifdef HAVE_FILESYSTEM_NOTZERO
00064                 if (__os_fs_notzero())
00065                         goto slow;
00066 #endif
00067                 nio = DB_GLOBAL(j_pwrite) != NULL ? DB_GLOBAL(j_pwrite)
00068                         (fhp->fd, buf, pagesize, (off_t)pgno * pagesize) :
00069                         pwrite(fhp->fd, buf, pagesize, (off_t)pgno * pagesize);
00070                 break;
00071         default:
00072                 return (EINVAL);
00073         }
00074         if (nio == (ssize_t)pagesize) {
00075                 *niop = pagesize;
00076                 return (0);
00077         }
00078 slow:
00079 #endif
00080         MUTEX_LOCK(dbenv, fhp->mtx_fh);
00081 
00082         if ((ret = __os_seek(dbenv, fhp,
00083             pagesize, pgno, 0, 0, DB_OS_SEEK_SET)) != 0)
00084                 goto err;
00085         switch (op) {
00086         case DB_IO_READ:
00087                 ret = __os_read(dbenv, fhp, buf, pagesize, niop);
00088                 break;
00089         case DB_IO_WRITE:
00090                 ret = __os_write(dbenv, fhp, buf, pagesize, niop);
00091                 break;
00092         default:
00093                 ret = EINVAL;
00094                 break;
00095         }
00096 
00097 err:    MUTEX_UNLOCK(dbenv, fhp->mtx_fh);
00098 
00099         return (ret);
00100 
00101 }
00102 
00103 /*
00104  * __os_read --
00105  *      Read from a file handle.
00106  *
00107  * PUBLIC: int __os_read __P((DB_ENV *, DB_FH *, void *, size_t, size_t *));
00108  */
00109 int
00110 __os_read(dbenv, fhp, addr, len, nrp)
00111         DB_ENV *dbenv;
00112         DB_FH *fhp;
00113         void *addr;
00114         size_t len;
00115         size_t *nrp;
00116 {
00117         size_t offset;
00118         ssize_t nr;
00119         int ret;
00120         u_int8_t *taddr;
00121 
00122         ret = 0;
00123 
00124         /* Check for illegal usage. */
00125         DB_ASSERT(F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);
00126 
00127         if (DB_GLOBAL(j_read) != NULL) {
00128                 *nrp = len;
00129                 if (DB_GLOBAL(j_read)(fhp->fd, addr, len) != (ssize_t)len) {
00130                         ret = __os_get_errno();
00131                         __db_err(dbenv, "read: %#lx, %lu: %s",
00132                             P_TO_ULONG(addr), (u_long)len, strerror(ret));
00133                 }
00134                 return (ret);
00135         }
00136 
00137         for (taddr = addr, offset = 0;
00138             offset < len; taddr += nr, offset += (u_int32_t)nr) {
00139                 RETRY_CHK(((nr = read(
00140                     fhp->fd, taddr, len - offset)) < 0 ? 1 : 0), ret);
00141                 if (nr == 0 || ret != 0)
00142                         break;
00143         }
00144         *nrp = (size_t)(taddr - (u_int8_t *)addr);
00145         if (ret != 0)
00146                 __db_err(dbenv, "read: %#lx, %lu: %s",
00147                     P_TO_ULONG(taddr), (u_long)len - offset, strerror(ret));
00148         return (ret);
00149 }
00150 
00151 /*
00152  * __os_write --
00153  *      Write to a file handle.
00154  *
00155  * PUBLIC: int __os_write __P((DB_ENV *, DB_FH *, void *, size_t, size_t *));
00156  */
00157 int
00158 __os_write(dbenv, fhp, addr, len, nwp)
00159         DB_ENV *dbenv;
00160         DB_FH *fhp;
00161         void *addr;
00162         size_t len;
00163         size_t *nwp;
00164 {
00165         /* Check for illegal usage. */
00166         DB_ASSERT(F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);
00167 
00168 #ifdef HAVE_FILESYSTEM_NOTZERO
00169         /* Zero-fill as necessary. */
00170         if (__os_fs_notzero()) {
00171                 int ret;
00172                 if ((ret = __os_zerofill(dbenv, fhp)) != 0)
00173                         return (ret);
00174         }
00175 #endif
00176         return (__os_physwrite(dbenv, fhp, addr, len, nwp));
00177 }
00178 
00179 /*
00180  * __os_physwrite --
00181  *      Physical write to a file handle.
00182  */
00183 static int
00184 __os_physwrite(dbenv, fhp, addr, len, nwp)
00185         DB_ENV *dbenv;
00186         DB_FH *fhp;
00187         void *addr;
00188         size_t len;
00189         size_t *nwp;
00190 {
00191         size_t offset;
00192         ssize_t nw;
00193         int ret;
00194         u_int8_t *taddr;
00195 
00196         ret = 0;
00197 
00198 #if defined(HAVE_FILESYSTEM_NOTZERO) && defined(DIAGNOSTIC)
00199         if (__os_fs_notzero()) {
00200                 struct stat sb;
00201                 off_t cur_off;
00202 
00203                 DB_ASSERT(fstat(fhp->fd, &sb) != -1 &&
00204                     (cur_off = lseek(fhp->fd, (off_t)0, SEEK_CUR)) != -1 &&
00205                     cur_off <= sb.st_size);
00206         }
00207 #endif
00208 
00209         /*
00210          * Make a last "panic" check.  Imagine a thread of control running in
00211          * Berkeley DB, going to sleep.  Another thread of control decides to
00212          * run recovery because the environment is broken.  The first thing
00213          * recovery does is panic the existing environment, but we only check
00214          * the panic flag when crossing the public API.  If the sleeping thread
00215          * wakes up and writes something, we could have two threads of control
00216          * writing the log files at the same time.  So, before writing, make a
00217          * last panic check.  Obviously, there's still a window, but it's very,
00218          * very small.
00219          */
00220         PANIC_CHECK(dbenv);
00221 
00222         if (DB_GLOBAL(j_write) != NULL) {
00223                 *nwp = len;
00224                 if (DB_GLOBAL(j_write)(fhp->fd, addr, len) != (ssize_t)len) {
00225                         ret = __os_get_errno();
00226                         __db_err(dbenv, "write: %#lx, %lu: %s",
00227                             P_TO_ULONG(addr), (u_long)len, strerror(ret));
00228                 }
00229                 return (ret);
00230         }
00231 
00232         for (taddr = addr, offset = 0;
00233             offset < len; taddr += nw, offset += (u_int32_t)nw) {
00234                 RETRY_CHK(((nw = write(
00235                     fhp->fd, taddr, len - offset)) < 0 ? 1 : 0), ret);
00236                 if (ret != 0)
00237                         break;
00238         }
00239         *nwp = len;
00240         if (ret != 0)
00241                 __db_err(dbenv, "write: %#lx, %lu: %s",
00242                     P_TO_ULONG(taddr), (u_long)len - offset, strerror(ret));
00243         return (ret);
00244 }
00245 
00246 #ifdef HAVE_FILESYSTEM_NOTZERO
00247 /*
00248  * __os_zerofill --
00249  *      Zero out bytes in the file.
00250  *
00251  *      Pages allocated by writing pages past end-of-file are not zeroed,
00252  *      on some systems.  Recovery could theoretically be fooled by a page
00253  *      showing up that contained garbage.  In order to avoid this, we
00254  *      have to write the pages out to disk, and flush them.  The reason
00255  *      for the flush is because if we don't sync, the allocation of another
00256  *      page subsequent to this one might reach the disk first, and if we
00257  *      crashed at the right moment, leave us with this page as the one
00258  *      allocated by writing a page past it in the file.
00259  */
00260 static int
00261 __os_zerofill(dbenv, fhp)
00262         DB_ENV *dbenv;
00263         DB_FH *fhp;
00264 {
00265         off_t stat_offset, write_offset;
00266         size_t blen, nw;
00267         u_int32_t bytes, mbytes;
00268         int group_sync, need_free, ret;
00269         u_int8_t buf[8 * 1024], *bp;
00270 
00271         /* Calculate the byte offset of the next write. */
00272         write_offset = (off_t)fhp->pgno * fhp->pgsize + fhp->offset;
00273 
00274         /* Stat the file. */
00275         if ((ret = __os_ioinfo(dbenv, NULL, fhp, &mbytes, &bytes, NULL)) != 0)
00276                 return (ret);
00277         stat_offset = (off_t)mbytes * MEGABYTE + bytes;
00278 
00279         /* Check if the file is large enough. */
00280         if (stat_offset >= write_offset)
00281                 return (0);
00282 
00283         /* Get a large buffer if we're writing lots of data. */
00284 #undef  ZF_LARGE_WRITE
00285 #define ZF_LARGE_WRITE  (64 * 1024)
00286         if (write_offset - stat_offset > ZF_LARGE_WRITE) {
00287                 if ((ret = __os_calloc(dbenv, 1, ZF_LARGE_WRITE, &bp)) != 0)
00288                             return (ret);
00289                 blen = ZF_LARGE_WRITE;
00290                 need_free = 1;
00291         } else {
00292                 bp = buf;
00293                 blen = sizeof(buf);
00294                 need_free = 0;
00295                 memset(buf, 0, sizeof(buf));
00296         }
00297 
00298         /* Seek to the current end of the file. */
00299         if ((ret = __os_seek(
00300             dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET)) != 0)
00301                 goto err;
00302 
00303         /*
00304          * Hash is the only access method that allocates groups of pages.  Hash
00305          * uses the existence of the last page in a group to signify the entire
00306          * group is OK; so, write all the pages but the last one in the group,
00307          * flush them to disk, then write the last one to disk and flush it.
00308          */
00309         for (group_sync = 0; stat_offset < write_offset; group_sync = 1) {
00310                 if (write_offset - stat_offset <= blen) {
00311                         blen = (size_t)(write_offset - stat_offset);
00312                         if (group_sync && (ret = __os_fsync(dbenv, fhp)) != 0)
00313                                 goto err;
00314                 }
00315                 if ((ret = __os_physwrite(dbenv, fhp, bp, blen, &nw)) != 0)
00316                         goto err;
00317                 stat_offset += blen;
00318         }
00319         if ((ret = __os_fsync(dbenv, fhp)) != 0)
00320                 goto err;
00321 
00322         /* Seek back to where we started. */
00323         mbytes = (u_int32_t)(write_offset / MEGABYTE);
00324         bytes = (u_int32_t)(write_offset % MEGABYTE);
00325         ret = __os_seek(dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET);
00326 
00327 err:    if (need_free)
00328                 __os_free(dbenv, bp);
00329         return (ret);
00330 }
00331 #endif