Berkeley DB 4.4.16: /home/huihoo/src/db/db-4.4.16/os_win32/os_rw.c

00001 /*-
00002  * See the file LICENSE for redistribution information.
00003  *
00004  * Copyright (c) 1997-2005
00005  *      Sleepycat Software.  All rights reserved.
00006  *
00007  * $Id: os_rw.c,v 12.4 2005/08/10 15:47:28 bostic Exp $
00008  */
00009 
00010 #include "db_config.h"
00011 
#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>

#include <errno.h>
#include <string.h>
#endif
00017 
00018 #include "db_int.h"
00019 
00020 #ifdef HAVE_FILESYSTEM_NOTZERO
00021 static int __os_zerofill __P((DB_ENV *, DB_FH *));
00022 #endif
00023 static int __os_physwrite __P((DB_ENV *, DB_FH *, void *, size_t, size_t *));
00024 
00025 /*
00026  * __os_io --
00027  *      Do an I/O.
00028  */
00029 int
00030 __os_io(dbenv, op, fhp, pgno, pagesize, buf, niop)
00031         DB_ENV *dbenv;
00032         int op;
00033         DB_FH *fhp;
00034         db_pgno_t pgno;
00035         u_int32_t pagesize;
00036         u_int8_t *buf;
00037         size_t *niop;
00038 {
00039         int ret;
00040 
00041         if (__os_is_winnt()) {
00042                 ULONG64 off = (ULONG64)pagesize * pgno;
00043                 OVERLAPPED over;
00044                 DWORD nbytes;
00045                 over.Offset = (DWORD)(off & 0xffffffff);
00046                 over.OffsetHigh = (DWORD)(off >> 32);
00047                 over.hEvent = 0; /* we don't want asynchronous notifications */
00048 
00049                 switch (op) {
00050                 case DB_IO_READ:
00051                         if (DB_GLOBAL(j_read) != NULL)
00052                                 goto slow;
00053                         if (!ReadFile(fhp->handle,
00054                             buf, (DWORD)pagesize, &nbytes, &over))
00055                                 goto slow;
00056                         break;
00057                 case DB_IO_WRITE:
00058                         if (DB_GLOBAL(j_write) != NULL)
00059                                 goto slow;
00060 #ifdef HAVE_FILESYSTEM_NOTZERO
00061                         if (__os_fs_notzero())
00062                                 goto slow;
00063 #endif
00064                         if (!WriteFile(fhp->handle,
00065                             buf, (DWORD)pagesize, &nbytes, &over))
00066                                 goto slow;
00067                         break;
00068                 }
00069                 if (nbytes == pagesize) {
00070                         *niop = (size_t)nbytes;
00071                         return (0);
00072                 }
00073         }
00074 
00075 slow:   MUTEX_LOCK(dbenv, fhp->mtx_fh);
00076 
00077         if ((ret = __os_seek(dbenv, fhp,
00078             pagesize, pgno, 0, 0, DB_OS_SEEK_SET)) != 0)
00079                 goto err;
00080 
00081         switch (op) {
00082         case DB_IO_READ:
00083                 ret = __os_read(dbenv, fhp, buf, pagesize, niop);
00084                 break;
00085         case DB_IO_WRITE:
00086                 ret = __os_write(dbenv, fhp, buf, pagesize, niop);
00087                 break;
00088         }
00089 
00090 err:    MUTEX_UNLOCK(dbenv, fhp->mtx_fh);
00091 
00092         return (ret);
00093 }
00094 
00095 /*
00096  * __os_read --
00097  *      Read from a file handle.
00098  */
00099 int
00100 __os_read(dbenv, fhp, addr, len, nrp)
00101         DB_ENV *dbenv;
00102         DB_FH *fhp;
00103         void *addr;
00104         size_t len;
00105         size_t *nrp;
00106 {
00107         size_t offset, nr;
00108         DWORD count;
00109         int ret;
00110         u_int8_t *taddr;
00111 
00112         ret = 0;
00113 
00114         if (DB_GLOBAL(j_read) != NULL) {
00115                 *nrp = len;
00116                 if (DB_GLOBAL(j_read)(fhp->fd, addr, len) != (ssize_t)len) {
00117                         ret = __os_get_errno();
00118                         __db_err(dbenv, "read: %#lx, %lu: %s",
00119                             P_TO_ULONG(addr), (u_long)len, strerror(ret));
00120                 }
00121                 return (ret);
00122         }
00123 
00124         ret = 0;
00125         for (taddr = addr,
00126             offset = 0; offset < len; taddr += nr, offset += nr) {
00127                 RETRY_CHK((!ReadFile(fhp->handle,
00128                     taddr, (DWORD)(len - offset), &count, NULL)), ret);
00129                 if (count == 0 || ret != 0)
00130                         break;
00131                 nr = (size_t)count;
00132         }
00133         *nrp = taddr - (u_int8_t *)addr;
00134         if (ret != 0)
00135                 __db_err(dbenv, "read: 0x%lx, %lu: %s",
00136                     P_TO_ULONG(taddr), (u_long)len - offset, strerror(ret));
00137         return (ret);
00138 }
00139 
00140 /*
00141  * __os_write --
00142  *      Write to a file handle.
00143  */
00144 int
00145 __os_write(dbenv, fhp, addr, len, nwp)
00146         DB_ENV *dbenv;
00147         DB_FH *fhp;
00148         void *addr;
00149         size_t len;
00150         size_t *nwp;
00151 {
00152         int ret;
00153 
00154 #ifdef HAVE_FILESYSTEM_NOTZERO
00155         /* Zero-fill as necessary. */
00156         if (__os_fs_notzero() && (ret = __os_zerofill(dbenv, fhp)) != 0)
00157                 return (ret);
00158 #endif
00159         return (__os_physwrite(dbenv, fhp, addr, len, nwp));
00160 }
00161 
00162 /*
00163  * __os_physwrite --
00164  *      Physical write to a file handle.
00165  */
00166 static int
00167 __os_physwrite(dbenv, fhp, addr, len, nwp)
00168         DB_ENV *dbenv;
00169         DB_FH *fhp;
00170         void *addr;
00171         size_t len;
00172         size_t *nwp;
00173 {
00174         size_t offset, nw;
00175         DWORD count;
00176         int ret;
00177         u_int8_t *taddr;
00178 
00179         /*
00180          * Make a last "panic" check.  Imagine a thread of control running in
00181          * Berkeley DB, going to sleep.  Another thread of control decides to
00182          * run recovery because the environment is broken.  The first thing
00183          * recovery does is panic the existing environment, but we only check
00184          * the panic flag when crossing the public API.  If the sleeping thread
00185          * wakes up and writes something, we could have two threads of control
00186          * writing the log files at the same time.  So, before writing, make a
00187          * last panic check.  Obviously, there's still a window, but it's very,
00188          * very small.
00189          */
00190         PANIC_CHECK(dbenv);
00191 
00192         if (DB_GLOBAL(j_write) != NULL) {
00193                 *nwp = len;
00194                 if (DB_GLOBAL(j_write)(fhp->fd, addr, len) != (ssize_t)len) {
00195                         ret = __os_get_errno();
00196                         __db_err(dbenv, "write: %#lx, %lu: %s",
00197                             P_TO_ULONG(addr), (u_long)len, strerror(ret));
00198                 }
00199                 return (ret);
00200         }
00201 
00202         ret = 0;
00203         for (taddr = addr,
00204             offset = 0; offset < len; taddr += nw, offset += nw) {
00205                 RETRY_CHK((!WriteFile(fhp->handle,
00206                     taddr, (DWORD)(len - offset), &count, NULL)), ret);
00207                 if (ret != 0)
00208                         break;
00209                 nw = (size_t)count;
00210         }
00211         *nwp = len;
00212         if (ret != 0)
00213                 __db_err(dbenv, "write: %#lx, %lu: %s",
00214                     P_TO_ULONG(taddr), (u_long)len - offset, strerror(ret));
00215         return (ret);
00216 }
00217 
#ifdef HAVE_FILESYSTEM_NOTZERO
/*
 * __os_zerofill --
 *      Zero out bytes in the file.
 *
 *      On some systems, writing a page beyond end-of-file allocates the
 *      intervening pages without zeroing them, and recovery could in theory
 *      be misled by a page of garbage.  To prevent that, the gap between
 *      the current end of the file and the position of the pending write
 *      is explicitly written with zeroes and flushed.  The flush matters:
 *      without it, a later page allocation could reach the disk first, and
 *      a crash at the wrong moment would leave this page as one that was
 *      allocated only by a write past it.
 *
 *      Returns 0 if the file is already large enough or the fill succeeds,
 *      otherwise the error from the failing helper.  On success the file
 *      position is restored to the pending write's offset.
 */
static int
__os_zerofill(dbenv, fhp)
        DB_ENV *dbenv;
        DB_FH *fhp;
{
        unsigned __int64 file_end, target;
        size_t chunk, nw;
        u_int32_t bytes, mbytes;
        int first_write, on_heap, ret;
        u_int8_t local_buf[8 * 1024], *zeros;

        /* Byte offset at which the caller's next write will land. */
        target = (unsigned __int64)fhp->pgno * fhp->pgsize + fhp->offset;

        /* Current file size, reported as megabytes plus leftover bytes. */
        if ((ret = __os_ioinfo(dbenv, NULL, fhp, &mbytes, &bytes, NULL)) != 0)
                return (ret);
        file_end = (unsigned __int64)mbytes * MEGABYTE + bytes;

        /* Nothing to fill if the file already reaches the write offset. */
        if (file_end >= target)
                return (0);

        /*
         * Pick the source of zeroes: a heap buffer when the gap is large,
         * otherwise the buffer on the stack.
         */
#undef  ZF_LARGE_WRITE
#define ZF_LARGE_WRITE  (64 * 1024)
        if (target - file_end > ZF_LARGE_WRITE) {
                if ((ret = __os_calloc(dbenv, 1, ZF_LARGE_WRITE, &zeros)) != 0)
                        return (ret);
                chunk = ZF_LARGE_WRITE;
                on_heap = 1;
        } else {
                memset(local_buf, 0, sizeof(local_buf));
                zeros = local_buf;
                chunk = sizeof(local_buf);
                on_heap = 0;
        }

        /* Position the handle at the current end of the file. */
        if ((ret = __os_seek(
            dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET)) != 0)
                goto err;

        /*
         * Hash is the only access method that allocates groups of pages, and
         * it treats the existence of the group's last page as proof that the
         * whole group is valid.  So: write everything except the final
         * chunk, flush, and only then write and flush the final chunk.  The
         * intermediate flush is skipped when the final chunk is also the
         * first one, because nothing has been written yet.
         */
        first_write = 1;
        while (file_end < target) {
                if (target - file_end <= chunk) {
                        chunk = (size_t)(target - file_end);
                        if (!first_write &&
                            (ret = __os_fsync(dbenv, fhp)) != 0)
                                goto err;
                }
                if ((ret = __os_physwrite(dbenv, fhp, zeros, chunk, &nw)) != 0)
                        goto err;
                file_end += chunk;
                first_write = 0;
        }
        if ((ret = __os_fsync(dbenv, fhp)) != 0)
                goto err;

        /* Return the handle to the offset of the pending write. */
        mbytes = (u_int32_t)(target / MEGABYTE);
        bytes = (u_int32_t)(target % MEGABYTE);
        ret = __os_seek(dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET);

err:    if (on_heap)
                __os_free(dbenv, zeros);
        return (ret);
}
#endif