00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "postgres.h"
00023
00024 #include <unistd.h>
00025 #include <fcntl.h>
00026 #include <sys/file.h>
00027
00028 #include "miscadmin.h"
00029 #include "access/xlog.h"
00030 #include "catalog/catalog.h"
00031 #include "common/relpath.h"
00032 #include "portability/instr_time.h"
00033 #include "postmaster/bgwriter.h"
00034 #include "storage/fd.h"
00035 #include "storage/bufmgr.h"
00036 #include "storage/relfilenode.h"
00037 #include "storage/smgr.h"
00038 #include "utils/hsearch.h"
00039 #include "utils/memutils.h"
00040 #include "pg_trace.h"
00041
00042
00043
/* Number of fsyncs mdsync() performs between calls to AbsorbFsyncRequests(). */
00044 #define FSYNCS_PER_ABSORB 10
/* Number of unlinks mdpostckpt() performs between calls to AbsorbFsyncRequests(). */
00045 #define UNLINKS_PER_ABSORB 10
00046
00047
00048
00049
00050
00051
00052
00053
/*
 * Special "segment numbers" recognized by RememberFsyncRequest().  They are
 * taken from the top of the BlockNumber range and select a command instead
 * of naming a real segment:
 *	FORGET_RELATION_FSYNC	- cancel pending fsyncs for one relation (fork)
 *	FORGET_DATABASE_FSYNC	- cancel pending fsyncs/unlinks for a database
 *	UNLINK_RELATION_REQUEST - queue a relation's main fork for later unlink
 */
00054 #define FORGET_RELATION_FSYNC (InvalidBlockNumber)
00055 #define FORGET_DATABASE_FSYNC (InvalidBlockNumber-1)
00056 #define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
00057
00058
00059
00060
00061
00062
00063
00064
/*
 * FILE_POSSIBLY_DELETED(err) -- true when an errno value is one that callers
 * in this file treat as "the file may simply have been removed".  ENOENT
 * always qualifies; on WIN32 builds EACCES qualifies as well.
 * NOTE(review): presumably EACCES is included because Windows reports it for
 * files that are pending deletion -- confirm.
 */
00065 #ifndef WIN32
00066 #define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT)
00067 #else
00068 #define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT || (err) == EACCES)
00069 #endif
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108
00109
00110
00111
/*
 * MdfdVec -- one open segment file of a relation fork.
 *
 * The segments of a fork form a singly linked list hanging off
 * reln->md_fd[forknum]; the list head is always segment 0 and each following
 * element is the next-higher segment number.
 */
00112 typedef struct _MdfdVec
00113 {
00114 File mdfd_vfd; /* file handle passed to FileSeek/FileRead/FileWrite/FileSync */
00115 BlockNumber mdfd_segno; /* segment number, starting at 0 */
00116 struct _MdfdVec *mdfd_chain; /* next segment in the chain, or NULL */
00117 } MdfdVec;
00118
00119 static MemoryContext MdCxt; /* created by mdinit(); holds MdfdVecs and pending-op data */
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136
00137
00138
00139
00140
/* Cycle counters are 16-bit and are compared only for equality (they wrap). */
00141 typedef uint16 CycleCtr;
00142
/*
 * Hash table entry of pendingOpsTable: the set of segments of one relation
 * that still need to be fsync'd, tracked separately for each fork.
 */
00143 typedef struct
00144 {
00145 RelFileNode rnode; /* hash key (keysize is sizeof(RelFileNode)); must be first */
00146 CycleCtr cycle_ctr; /* value of mdsync_cycle_ctr when the entry was created or last kept */
00147
00148 Bitmapset *requests[MAX_FORKNUM + 1]; /* per fork: segment numbers awaiting fsync */
00149
00150 bool canceled[MAX_FORKNUM + 1]; /* per fork: set when a forget request cleared the set */
00151 } PendingOperationEntry;
00152
/*
 * Element of the pendingUnlinks list: a relation whose main-fork file is to
 * be unlinked by mdpostckpt().
 */
00153 typedef struct
00154 {
00155 RelFileNode rnode; /* the relation whose file is to be removed */
00156 CycleCtr cycle_ctr; /* value of mdckpt_cycle_ctr when the request was queued */
00157 } PendingUnlinkEntry;
00158
/*
 * Pending-operation state.  pendingOpsTable is created by mdinit() only in
 * standalone mode, the startup process and the checkpointer; everywhere else
 * it stays NULL and requests are forwarded with ForwardFsyncRequest().
 */
00159 static HTAB *pendingOpsTable = NULL; /* RelFileNode -> PendingOperationEntry */
00160 static List *pendingUnlinks = NIL; /* list of PendingUnlinkEntry, appended in arrival order */
00161
00162 static CycleCtr mdsync_cycle_ctr = 0; /* incremented at the start of each mdsync() */
00163 static CycleCtr mdckpt_cycle_ctr = 0; /* incremented by each mdpreckpt() */
00164
00165
/*
 * What mdopen() / _mdfd_getseg() should do when a needed file is missing.
 */
00166 typedef enum
00167 {
00168 EXTENSION_FAIL, /* report an ERROR */
00169 EXTENSION_RETURN_NULL, /* return NULL if errno says the file may have been deleted */
00170 EXTENSION_CREATE /* create missing segments (padding the previous one first) */
00171 } ExtensionBehavior;
00172
00173
/* Prototypes for the file-local helper routines defined further down. */
00174 static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum,
00175 bool isRedo);
00176 static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum,
00177 ExtensionBehavior behavior);
00178 static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
00179 MdfdVec *seg);
00180 static void register_unlink(RelFileNodeBackend rnode);
00181 static MdfdVec *_fdvec_alloc(void);
00182 static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
00183 BlockNumber segno);
00184 static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno,
00185 BlockNumber segno, int oflags);
00186 static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno,
00187 BlockNumber blkno, bool skipFsync, ExtensionBehavior behavior);
00188 static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
00189 MdfdVec *seg);
00190
00191
00192
00193
00194
00195 void
00196 mdinit(void)
00197 {
00198 MdCxt = AllocSetContextCreate(TopMemoryContext,
00199 "MdSmgr",
00200 ALLOCSET_DEFAULT_MINSIZE,
00201 ALLOCSET_DEFAULT_INITSIZE,
00202 ALLOCSET_DEFAULT_MAXSIZE);
00203
00204
00205
00206
00207
00208
00209 if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess())
00210 {
00211 HASHCTL hash_ctl;
00212
00213 MemSet(&hash_ctl, 0, sizeof(hash_ctl));
00214 hash_ctl.keysize = sizeof(RelFileNode);
00215 hash_ctl.entrysize = sizeof(PendingOperationEntry);
00216 hash_ctl.hash = tag_hash;
00217 hash_ctl.hcxt = MdCxt;
00218 pendingOpsTable = hash_create("Pending Ops Table",
00219 100L,
00220 &hash_ctl,
00221 HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
00222 pendingUnlinks = NIL;
00223 }
00224 }
00225
00226
00227
00228
00229
00230
00231
00232 void
00233 SetForwardFsyncRequests(void)
00234 {
00235
00236 if (pendingOpsTable)
00237 {
00238 mdsync();
00239 hash_destroy(pendingOpsTable);
00240 }
00241 pendingOpsTable = NULL;
00242
00243
00244
00245
00246
00247 Assert(pendingUnlinks == NIL);
00248 }
00249
00250
00251
00252
00253
00254
00255 bool
00256 mdexists(SMgrRelation reln, ForkNumber forkNum)
00257 {
00258
00259
00260
00261
00262 mdclose(reln, forkNum);
00263
00264 return (mdopen(reln, forkNum, EXTENSION_RETURN_NULL) != NULL);
00265 }
00266
00267
00268
00269
00270
00271
00272 void
00273 mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
00274 {
00275 char *path;
00276 File fd;
00277
00278 if (isRedo && reln->md_fd[forkNum] != NULL)
00279 return;
00280
00281 Assert(reln->md_fd[forkNum] == NULL);
00282
00283 path = relpath(reln->smgr_rnode, forkNum);
00284
00285 fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
00286
00287 if (fd < 0)
00288 {
00289 int save_errno = errno;
00290
00291
00292
00293
00294
00295
00296
00297 if (isRedo || IsBootstrapProcessingMode())
00298 fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
00299 if (fd < 0)
00300 {
00301
00302 errno = save_errno;
00303 ereport(ERROR,
00304 (errcode_for_file_access(),
00305 errmsg("could not create file \"%s\": %m", path)));
00306 }
00307 }
00308
00309 pfree(path);
00310
00311 reln->md_fd[forkNum] = _fdvec_alloc();
00312
00313 reln->md_fd[forkNum]->mdfd_vfd = fd;
00314 reln->md_fd[forkNum]->mdfd_segno = 0;
00315 reln->md_fd[forkNum]->mdfd_chain = NULL;
00316 }
00317
00318
00319
00320
00321
00322
00323
00324
00325
00326
00327
00328
00329
00330
00331
00332
00333
00334
00335
00336
00337
00338
00339
00340
00341
00342
00343
00344
00345
00346
00347
00348
00349
00350
00351
00352
00353
00354
00355
00356
00357
00358
00359
00360
00361
00362
00363
00364
00365 void
00366 mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
00367 {
00368
00369
00370
00371
00372
00373
00374
00375 if (!RelFileNodeBackendIsTemp(rnode))
00376 ForgetRelationFsyncRequests(rnode.node, forkNum);
00377
00378
00379 if (forkNum == InvalidForkNumber)
00380 {
00381 for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++)
00382 mdunlinkfork(rnode, forkNum, isRedo);
00383 }
00384 else
00385 mdunlinkfork(rnode, forkNum, isRedo);
00386 }
00387
00388 static void
00389 mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
00390 {
00391 char *path;
00392 int ret;
00393
00394 path = relpath(rnode, forkNum);
00395
00396
00397
00398
00399 if (isRedo || forkNum != MAIN_FORKNUM || RelFileNodeBackendIsTemp(rnode))
00400 {
00401 ret = unlink(path);
00402 if (ret < 0 && errno != ENOENT)
00403 ereport(WARNING,
00404 (errcode_for_file_access(),
00405 errmsg("could not remove file \"%s\": %m", path)));
00406 }
00407 else
00408 {
00409
00410 int fd;
00411
00412 fd = OpenTransientFile(path, O_RDWR | PG_BINARY, 0);
00413 if (fd >= 0)
00414 {
00415 int save_errno;
00416
00417 ret = ftruncate(fd, 0);
00418 save_errno = errno;
00419 CloseTransientFile(fd);
00420 errno = save_errno;
00421 }
00422 else
00423 ret = -1;
00424 if (ret < 0 && errno != ENOENT)
00425 ereport(WARNING,
00426 (errcode_for_file_access(),
00427 errmsg("could not truncate file \"%s\": %m", path)));
00428
00429
00430 register_unlink(rnode);
00431 }
00432
00433
00434
00435
00436 if (ret >= 0)
00437 {
00438 char *segpath = (char *) palloc(strlen(path) + 12);
00439 BlockNumber segno;
00440
00441
00442
00443
00444
00445 for (segno = 1;; segno++)
00446 {
00447 sprintf(segpath, "%s.%u", path, segno);
00448 if (unlink(segpath) < 0)
00449 {
00450
00451 if (errno != ENOENT)
00452 ereport(WARNING,
00453 (errcode_for_file_access(),
00454 errmsg("could not remove file \"%s\": %m", segpath)));
00455 break;
00456 }
00457 }
00458 pfree(segpath);
00459 }
00460
00461 pfree(path);
00462 }
00463
00464
00465
00466
00467
00468
00469
00470
00471
00472
00473 void
00474 mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
00475 char *buffer, bool skipFsync)
00476 {
00477 off_t seekpos;
00478 int nbytes;
00479 MdfdVec *v;
00480
00481
00482 #ifdef CHECK_WRITE_VS_EXTEND
00483 Assert(blocknum >= mdnblocks(reln, forknum));
00484 #endif
00485
00486
00487
00488
00489
00490
00491 if (blocknum == InvalidBlockNumber)
00492 ereport(ERROR,
00493 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
00494 errmsg("cannot extend file \"%s\" beyond %u blocks",
00495 relpath(reln->smgr_rnode, forknum),
00496 InvalidBlockNumber)));
00497
00498 v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
00499
00500 seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
00501
00502 Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
00503
00504
00505
00506
00507
00508
00509
00510
00511
00512
00513 if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
00514 ereport(ERROR,
00515 (errcode_for_file_access(),
00516 errmsg("could not seek to block %u in file \"%s\": %m",
00517 blocknum, FilePathName(v->mdfd_vfd))));
00518
00519 if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
00520 {
00521 if (nbytes < 0)
00522 ereport(ERROR,
00523 (errcode_for_file_access(),
00524 errmsg("could not extend file \"%s\": %m",
00525 FilePathName(v->mdfd_vfd)),
00526 errhint("Check free disk space.")));
00527
00528 ereport(ERROR,
00529 (errcode(ERRCODE_DISK_FULL),
00530 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
00531 FilePathName(v->mdfd_vfd),
00532 nbytes, BLCKSZ, blocknum),
00533 errhint("Check free disk space.")));
00534 }
00535
00536 if (!skipFsync && !SmgrIsTemp(reln))
00537 register_dirty_segment(reln, forknum, v);
00538
00539 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
00540 }
00541
00542
00543
00544
00545
00546
00547
00548
00549
00550
00551
00552 static MdfdVec *
00553 mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior)
00554 {
00555 MdfdVec *mdfd;
00556 char *path;
00557 File fd;
00558
00559
00560 if (reln->md_fd[forknum])
00561 return reln->md_fd[forknum];
00562
00563 path = relpath(reln->smgr_rnode, forknum);
00564
00565 fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
00566
00567 if (fd < 0)
00568 {
00569
00570
00571
00572
00573
00574
00575 if (IsBootstrapProcessingMode())
00576 fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
00577 if (fd < 0)
00578 {
00579 if (behavior == EXTENSION_RETURN_NULL &&
00580 FILE_POSSIBLY_DELETED(errno))
00581 {
00582 pfree(path);
00583 return NULL;
00584 }
00585 ereport(ERROR,
00586 (errcode_for_file_access(),
00587 errmsg("could not open file \"%s\": %m", path)));
00588 }
00589 }
00590
00591 pfree(path);
00592
00593 reln->md_fd[forknum] = mdfd = _fdvec_alloc();
00594
00595 mdfd->mdfd_vfd = fd;
00596 mdfd->mdfd_segno = 0;
00597 mdfd->mdfd_chain = NULL;
00598 Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
00599
00600 return mdfd;
00601 }
00602
00603
00604
00605
00606 void
00607 mdclose(SMgrRelation reln, ForkNumber forknum)
00608 {
00609 MdfdVec *v = reln->md_fd[forknum];
00610
00611
00612 if (v == NULL)
00613 return;
00614
00615 reln->md_fd[forknum] = NULL;
00616
00617 while (v != NULL)
00618 {
00619 MdfdVec *ov = v;
00620
00621
00622 if (v->mdfd_vfd >= 0)
00623 FileClose(v->mdfd_vfd);
00624
00625 v = v->mdfd_chain;
00626 pfree(ov);
00627 }
00628 }
00629
00630
00631
00632
00633 void
00634 mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
00635 {
00636 #ifdef USE_PREFETCH
00637 off_t seekpos;
00638 MdfdVec *v;
00639
00640 v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);
00641
00642 seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
00643
00644 Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
00645
00646 (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ);
00647 #endif
00648 }
00649
00650
00651
00652
00653
00654 void
00655 mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
00656 char *buffer)
00657 {
00658 off_t seekpos;
00659 int nbytes;
00660 MdfdVec *v;
00661
00662 TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
00663 reln->smgr_rnode.node.spcNode,
00664 reln->smgr_rnode.node.dbNode,
00665 reln->smgr_rnode.node.relNode,
00666 reln->smgr_rnode.backend);
00667
00668 v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);
00669
00670 seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
00671
00672 Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
00673
00674 if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
00675 ereport(ERROR,
00676 (errcode_for_file_access(),
00677 errmsg("could not seek to block %u in file \"%s\": %m",
00678 blocknum, FilePathName(v->mdfd_vfd))));
00679
00680 nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ);
00681
00682 TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
00683 reln->smgr_rnode.node.spcNode,
00684 reln->smgr_rnode.node.dbNode,
00685 reln->smgr_rnode.node.relNode,
00686 reln->smgr_rnode.backend,
00687 nbytes,
00688 BLCKSZ);
00689
00690 if (nbytes != BLCKSZ)
00691 {
00692 if (nbytes < 0)
00693 ereport(ERROR,
00694 (errcode_for_file_access(),
00695 errmsg("could not read block %u in file \"%s\": %m",
00696 blocknum, FilePathName(v->mdfd_vfd))));
00697
00698
00699
00700
00701
00702
00703
00704
00705
00706 if (zero_damaged_pages || InRecovery)
00707 MemSet(buffer, 0, BLCKSZ);
00708 else
00709 ereport(ERROR,
00710 (errcode(ERRCODE_DATA_CORRUPTED),
00711 errmsg("could not read block %u in file \"%s\": read only %d of %d bytes",
00712 blocknum, FilePathName(v->mdfd_vfd),
00713 nbytes, BLCKSZ)));
00714 }
00715 }
00716
00717
00718
00719
00720
00721
00722
00723
00724 void
00725 mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
00726 char *buffer, bool skipFsync)
00727 {
00728 off_t seekpos;
00729 int nbytes;
00730 MdfdVec *v;
00731
00732
00733 #ifdef CHECK_WRITE_VS_EXTEND
00734 Assert(blocknum < mdnblocks(reln, forknum));
00735 #endif
00736
00737 TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
00738 reln->smgr_rnode.node.spcNode,
00739 reln->smgr_rnode.node.dbNode,
00740 reln->smgr_rnode.node.relNode,
00741 reln->smgr_rnode.backend);
00742
00743 v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_FAIL);
00744
00745 seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
00746
00747 Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
00748
00749 if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
00750 ereport(ERROR,
00751 (errcode_for_file_access(),
00752 errmsg("could not seek to block %u in file \"%s\": %m",
00753 blocknum, FilePathName(v->mdfd_vfd))));
00754
00755 nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ);
00756
00757 TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
00758 reln->smgr_rnode.node.spcNode,
00759 reln->smgr_rnode.node.dbNode,
00760 reln->smgr_rnode.node.relNode,
00761 reln->smgr_rnode.backend,
00762 nbytes,
00763 BLCKSZ);
00764
00765 if (nbytes != BLCKSZ)
00766 {
00767 if (nbytes < 0)
00768 ereport(ERROR,
00769 (errcode_for_file_access(),
00770 errmsg("could not write block %u in file \"%s\": %m",
00771 blocknum, FilePathName(v->mdfd_vfd))));
00772
00773 ereport(ERROR,
00774 (errcode(ERRCODE_DISK_FULL),
00775 errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes",
00776 blocknum,
00777 FilePathName(v->mdfd_vfd),
00778 nbytes, BLCKSZ),
00779 errhint("Check free disk space.")));
00780 }
00781
00782 if (!skipFsync && !SmgrIsTemp(reln))
00783 register_dirty_segment(reln, forknum, v);
00784 }
00785
00786
00787
00788
00789
00790
00791
00792
00793
00794 BlockNumber
00795 mdnblocks(SMgrRelation reln, ForkNumber forknum)
00796 {
00797 MdfdVec *v = mdopen(reln, forknum, EXTENSION_FAIL);
00798 BlockNumber nblocks;
00799 BlockNumber segno = 0;
00800
00801
00802
00803
00804
00805
00806
00807
00808
00809
00810
00811
00812
00813
00814 while (v->mdfd_chain != NULL)
00815 {
00816 segno++;
00817 v = v->mdfd_chain;
00818 }
00819
00820 for (;;)
00821 {
00822 nblocks = _mdnblocks(reln, forknum, v);
00823 if (nblocks > ((BlockNumber) RELSEG_SIZE))
00824 elog(FATAL, "segment too big");
00825 if (nblocks < ((BlockNumber) RELSEG_SIZE))
00826 return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
00827
00828
00829
00830
00831 segno++;
00832
00833 if (v->mdfd_chain == NULL)
00834 {
00835
00836
00837
00838
00839
00840
00841 v->mdfd_chain = _mdfd_openseg(reln, forknum, segno, O_CREAT);
00842 if (v->mdfd_chain == NULL)
00843 ereport(ERROR,
00844 (errcode_for_file_access(),
00845 errmsg("could not open file \"%s\": %m",
00846 _mdfd_segpath(reln, forknum, segno))));
00847 }
00848
00849 v = v->mdfd_chain;
00850 }
00851 }
00852
00853
00854
00855
00856 void
00857 mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
00858 {
00859 MdfdVec *v;
00860 BlockNumber curnblk;
00861 BlockNumber priorblocks;
00862
00863
00864
00865
00866
00867 curnblk = mdnblocks(reln, forknum);
00868 if (nblocks > curnblk)
00869 {
00870
00871 if (InRecovery)
00872 return;
00873 ereport(ERROR,
00874 (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
00875 relpath(reln->smgr_rnode, forknum),
00876 nblocks, curnblk)));
00877 }
00878 if (nblocks == curnblk)
00879 return;
00880
00881 v = mdopen(reln, forknum, EXTENSION_FAIL);
00882
00883 priorblocks = 0;
00884 while (v != NULL)
00885 {
00886 MdfdVec *ov = v;
00887
00888 if (priorblocks > nblocks)
00889 {
00890
00891
00892
00893
00894
00895 if (FileTruncate(v->mdfd_vfd, 0) < 0)
00896 ereport(ERROR,
00897 (errcode_for_file_access(),
00898 errmsg("could not truncate file \"%s\": %m",
00899 FilePathName(v->mdfd_vfd))));
00900
00901 if (!SmgrIsTemp(reln))
00902 register_dirty_segment(reln, forknum, v);
00903 v = v->mdfd_chain;
00904 Assert(ov != reln->md_fd[forknum]);
00905
00906 pfree(ov);
00907 }
00908 else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
00909 {
00910
00911
00912
00913
00914
00915
00916
00917
00918 BlockNumber lastsegblocks = nblocks - priorblocks;
00919
00920 if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ) < 0)
00921 ereport(ERROR,
00922 (errcode_for_file_access(),
00923 errmsg("could not truncate file \"%s\" to %u blocks: %m",
00924 FilePathName(v->mdfd_vfd),
00925 nblocks)));
00926 if (!SmgrIsTemp(reln))
00927 register_dirty_segment(reln, forknum, v);
00928 v = v->mdfd_chain;
00929 ov->mdfd_chain = NULL;
00930 }
00931 else
00932 {
00933
00934
00935
00936
00937 v = v->mdfd_chain;
00938 }
00939 priorblocks += RELSEG_SIZE;
00940 }
00941 }
00942
00943
00944
00945
00946
00947
00948
00949 void
00950 mdimmedsync(SMgrRelation reln, ForkNumber forknum)
00951 {
00952 MdfdVec *v;
00953
00954
00955
00956
00957
00958 mdnblocks(reln, forknum);
00959
00960 v = mdopen(reln, forknum, EXTENSION_FAIL);
00961
00962 while (v != NULL)
00963 {
00964 if (FileSync(v->mdfd_vfd) < 0)
00965 ereport(ERROR,
00966 (errcode_for_file_access(),
00967 errmsg("could not fsync file \"%s\": %m",
00968 FilePathName(v->mdfd_vfd))));
00969 v = v->mdfd_chain;
00970 }
00971 }
00972
00973
00974
00975
/*
 * mdsync() -- fsync every segment recorded in pendingOpsTable.
 *
 * Raises ERROR if this process has no pendingOpsTable.  Entries that were in
 * the table when the scan started are processed and then removed, unless new
 * requests for the same relation arrived during the scan; entries created
 * during the scan are left for the next call.  A failed fsync is retried once
 * if errno satisfies FILE_POSSIBLY_DELETED, and the failure is forgiven if
 * the request turns out to have been cancelled; otherwise it is an ERROR.
 *
 * On completion the number of synced files, the longest single sync time and
 * the total sync time (microseconds) are stored in CheckpointStats.
 */
00976 void
00977 mdsync(void)
00978 {
00979 static bool mdsync_in_progress = false; /* still true on entry => a previous call did not run to completion */
00980
00981 HASH_SEQ_STATUS hstat;
00982 PendingOperationEntry *entry;
00983 int absorb_counter;
00984
00985 /* Statistics gathered for CheckpointStats */
00986 int processed = 0;
00987 instr_time sync_start,
00988 sync_end,
00989 sync_diff;
00990 uint64 elapsed;
00991 uint64 longest = 0;
00992 uint64 total_elapsed = 0;
00993
00994 /*
00995  * Only processes that own a pendingOpsTable (see mdinit) may sync;
00996  * anything else is a caller error.
00997  */
00998 if (!pendingOpsTable)
00999 elog(ERROR, "cannot sync without a pendingOpsTable");
01000
01001 /*
01002  * Pick up outstanding requests before starting the scan.
01003  * NOTE(review): AbsorbFsyncRequests() is defined outside this file;
01004  * presumably it feeds queued requests from other processes into
01005  * RememberFsyncRequest() -- confirm.
01006  */
01007
01008
01009
01010 AbsorbFsyncRequests();
01011
01012 /*
01013  * The main loop below only processes entries whose cycle_ctr is exactly
01014  * one behind mdsync_cycle_ctr (see the Assert there), and skips entries
01015  * carrying the current value because they were added during the scan.
01016  */
01017 /*
01018  * If a previous call was interrupted after bumping the counter, the table
01019  * may hold a mix of old-cycle and current-cycle entries.  Stamp all of
01020  * them with the current counter value so that, after the increment
01021  * below, every one of them looks exactly one cycle old and is processed.
01022  */
01023
01024
01025
01026
01027
01028
01029
01030
01031
01032
01033
01034
01035
01036 if (mdsync_in_progress)
01037 {
01038 /* prior try failed, so update any stale cycle_ctr values */
01039 hash_seq_init(&hstat, pendingOpsTable);
01040 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
01041 {
01042 entry->cycle_ctr = mdsync_cycle_ctr;
01043 }
01044 }
01045
01046 /* Advance the counter so that entries created from now on are distinguishable */
01047 mdsync_cycle_ctr++;
01048
01049 /* Set flag to detect an incomplete run the next time we are called */
01050 mdsync_in_progress = true;
01051
01052 /* Now scan the hashtable for fsync requests to process */
01053 absorb_counter = FSYNCS_PER_ABSORB;
01054 hash_seq_init(&hstat, pendingOpsTable);
01055 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
01056 {
01057 ForkNumber forknum;
01058
01059 /*
01060  * Entries stamped with the current counter were created (by
01061  * RememberFsyncRequest) after the scan began; leave them for the
01062  * next mdsync() call.
01063  */
01064 if (entry->cycle_ctr == mdsync_cycle_ctr)
01065 continue;
01066
01067 /* Otherwise the entry must be exactly one cycle old (modulo wraparound) */
01068 Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);
01069
01070 /*
01071  * For each fork, take ownership of the current request set and install
01072  * an empty one in the entry.  Requests that arrive while we are
01073  * working (via the AbsorbFsyncRequests calls below) accumulate in the
01074  * fresh set and are noticed after the fork loop.
01075  */
01076
01077
01078
01079
01080
01081 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
01082 {
01083 Bitmapset *requests = entry->requests[forknum];
01084 int segno;
01085
01086 entry->requests[forknum] = NULL;
01087 entry->canceled[forknum] = false;
01088
01089 while ((segno = bms_first_member(requests)) >= 0) /* loop ends once bms_first_member returns a negative value */
01090 {
01091 int failures;
01092
01093 /*
01094  * With fsync disabled there is nothing to do for this segment;
01095  * the loop still runs so that the request set gets drained.
01096  */
01097
01098 if (!enableFsync)
01099 continue;
01100
01101 /*
01102  * Call AbsorbFsyncRequests() once every FSYNCS_PER_ABSORB
01103  * fsyncs rather than before each one.
01104  */
01105
01106
01107
01108 if (--absorb_counter <= 0)
01109 {
01110 AbsorbFsyncRequests();
01111 absorb_counter = FSYNCS_PER_ABSORB;
01112 }
01113
01114 /*
01115  * Retry loop.  "failures" counts failed attempts for this
01116  * segment.  The loop is left by a successful fsync, by the
01117  * ERROR raised below, or when the request is found to have
01118  * been cancelled.
01119  */
01120
01121
01122
01123
01124
01125
01126 for (failures = 0;; failures++)
01127 {
01128 SMgrRelation reln;
01129 MdfdVec *seg;
01130 char *path;
01131 int save_errno;
01132
01133 /*
01134  * The relation is looked up again on every attempt, and the
01135  * segment is opened with EXTENSION_RETURN_NULL, so a file
01136  * that may have been deleted yields seg == NULL here instead
01137  * of an ERROR inside _mdfd_getseg().
01138  */
01139
01140
01141
01142
01143
01144
01145
01146
01147 reln = smgropen(entry->rnode, InvalidBackendId);
01148
01149 /* Locate the segment via its first block number */
01150 seg = _mdfd_getseg(reln, forknum,
01151 (BlockNumber) segno * (BlockNumber) RELSEG_SIZE,
01152 false, EXTENSION_RETURN_NULL);
01153
01154 INSTR_TIME_SET_CURRENT(sync_start);
01155
01156 if (seg != NULL &&
01157 FileSync(seg->mdfd_vfd) >= 0)
01158 {
01159 /* Success; update statistics about sync timing */
01160 INSTR_TIME_SET_CURRENT(sync_end);
01161 sync_diff = sync_end;
01162 INSTR_TIME_SUBTRACT(sync_diff, sync_start);
01163 elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
01164 if (elapsed > longest)
01165 longest = elapsed;
01166 total_elapsed += elapsed;
01167 processed++;
01168 if (log_checkpoints)
01169 elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
01170 processed,
01171 FilePathName(seg->mdfd_vfd),
01172 (double) elapsed / 1000);
01173
01174 break; /* out of retry loop */
01175 }
01176
01177 /* Build the path for the message without losing the failure's errno */
01178 save_errno = errno;
01179 path = _mdfd_segpath(reln, forknum, (BlockNumber) segno);
01180 errno = save_errno;
01181
01182 /*
01183  * A failure is fatal (ERROR) unless errno says the file may
01184  * have been deleted AND this is the first failure for this
01185  * segment.  In that one case only a DEBUG1 message is logged
01186  * and we go on to check whether the request was cancelled.
01187  */
01188
01189
01190
01191
01192
01193
01194 if (!FILE_POSSIBLY_DELETED(errno) ||
01195 failures > 0)
01196 ereport(ERROR,
01197 (errcode_for_file_access(),
01198 errmsg("could not fsync file \"%s\": %m",
01199 path)));
01200 else
01201 ereport(DEBUG1,
01202 (errcode_for_file_access(),
01203 errmsg("could not fsync file \"%s\" but retrying: %m",
01204 path)));
01205 pfree(path);
01206
01207 /*
01208  * Absorb incoming requests; a forget request for this fork
01209  * would set entry->canceled[forknum] (see RememberFsyncRequest).
01210  */
01211 AbsorbFsyncRequests();
01212 absorb_counter = FSYNCS_PER_ABSORB; /* might as well restart the countdown */
01213
01214 if (entry->canceled[forknum])
01215 break; /* request was cancelled: give up on the rest of this fork */
01216 }
01217 }
01218 bms_free(requests);
01219 }
01220
01221 /*
01222  * All forks of this entry have been processed.  If new requests
01223  * arrived meanwhile (some requests[] is non-NULL again), keep the
01224  * entry and stamp it with the current cycle so that it is handled by
01225  * the next mdsync(); otherwise remove it from the table.
01226  */
01227 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
01228 {
01229 if (entry->requests[forknum] != NULL)
01230 break;
01231 }
01232 if (forknum <= MAX_FORKNUM)
01233 entry->cycle_ctr = mdsync_cycle_ctr;
01234 else
01235 {
01236 /* Okay to remove it */
01237 if (hash_search(pendingOpsTable, &entry->rnode,
01238 HASH_REMOVE, NULL) == NULL)
01239 elog(ERROR, "pendingOpsTable corrupted");
01240 }
01241 } /* end loop over hashtable entries */
01242
01243 /* Return sync performance metrics for report at checkpoint end */
01244 CheckpointStats.ckpt_sync_rels = processed;
01245 CheckpointStats.ckpt_longest_sync = longest;
01246 CheckpointStats.ckpt_agg_sync_time = total_elapsed;
01247
01248 /* Flag successful completion of mdsync */
01249 mdsync_in_progress = false;
01250 }
01251
01252
01253
01254
01255
01256
01257
01258
01259
01260
01261
01262
01263
01264
01265
01266 void
01267 mdpreckpt(void)
01268 {
01269
01270
01271
01272
01273 mdckpt_cycle_ctr++;
01274 }
01275
01276
01277
01278
01279
01280
01281 void
01282 mdpostckpt(void)
01283 {
01284 int absorb_counter;
01285
01286 absorb_counter = UNLINKS_PER_ABSORB;
01287 while (pendingUnlinks != NIL)
01288 {
01289 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
01290 char *path;
01291
01292
01293
01294
01295
01296
01297
01298
01299
01300
01301 if (entry->cycle_ctr == mdckpt_cycle_ctr)
01302 break;
01303
01304
01305 path = relpathperm(entry->rnode, MAIN_FORKNUM);
01306 if (unlink(path) < 0)
01307 {
01308
01309
01310
01311
01312
01313
01314
01315 if (errno != ENOENT)
01316 ereport(WARNING,
01317 (errcode_for_file_access(),
01318 errmsg("could not remove file \"%s\": %m", path)));
01319 }
01320 pfree(path);
01321
01322
01323 pendingUnlinks = list_delete_first(pendingUnlinks);
01324 pfree(entry);
01325
01326
01327
01328
01329
01330
01331
01332 if (--absorb_counter <= 0)
01333 {
01334 AbsorbFsyncRequests();
01335 absorb_counter = UNLINKS_PER_ABSORB;
01336 }
01337 }
01338 }
01339
01340
01341
01342
01343
01344
01345
01346
01347
01348
01349 static void
01350 register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
01351 {
01352
01353 Assert(!SmgrIsTemp(reln));
01354
01355 if (pendingOpsTable)
01356 {
01357
01358 RememberFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno);
01359 }
01360 else
01361 {
01362 if (ForwardFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno))
01363 return;
01364
01365 ereport(DEBUG1,
01366 (errmsg("could not forward fsync request because request queue is full")));
01367
01368 if (FileSync(seg->mdfd_vfd) < 0)
01369 ereport(ERROR,
01370 (errcode_for_file_access(),
01371 errmsg("could not fsync file \"%s\": %m",
01372 FilePathName(seg->mdfd_vfd))));
01373 }
01374 }
01375
01376
01377
01378
01379
01380
01381
01382
01383
01384
01385 static void
01386 register_unlink(RelFileNodeBackend rnode)
01387 {
01388
01389 Assert(!RelFileNodeBackendIsTemp(rnode));
01390
01391 if (pendingOpsTable)
01392 {
01393
01394 RememberFsyncRequest(rnode.node, MAIN_FORKNUM,
01395 UNLINK_RELATION_REQUEST);
01396 }
01397 else
01398 {
01399
01400
01401
01402
01403
01404
01405
01406 Assert(IsUnderPostmaster);
01407 while (!ForwardFsyncRequest(rnode.node, MAIN_FORKNUM,
01408 UNLINK_RELATION_REQUEST))
01409 pg_usleep(10000L);
01410 }
01411 }
01412
01413
01414
01415
01416
01417
01418
01419
01420
01421
01422
01423
01424
01425
01426
01427
01428
01429
01430
01431
01432
01433
01434 void
01435 RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
01436 {
01437 Assert(pendingOpsTable);
01438
01439 if (segno == FORGET_RELATION_FSYNC)
01440 {
01441
01442 PendingOperationEntry *entry;
01443
01444 entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
01445 &rnode,
01446 HASH_FIND,
01447 NULL);
01448 if (entry)
01449 {
01450
01451
01452
01453
01454
01455
01456
01457 if (forknum == InvalidForkNumber)
01458 {
01459
01460 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
01461 {
01462 bms_free(entry->requests[forknum]);
01463 entry->requests[forknum] = NULL;
01464 entry->canceled[forknum] = true;
01465 }
01466 }
01467 else
01468 {
01469
01470 bms_free(entry->requests[forknum]);
01471 entry->requests[forknum] = NULL;
01472 entry->canceled[forknum] = true;
01473 }
01474 }
01475 }
01476 else if (segno == FORGET_DATABASE_FSYNC)
01477 {
01478
01479 HASH_SEQ_STATUS hstat;
01480 PendingOperationEntry *entry;
01481 ListCell *cell,
01482 *prev,
01483 *next;
01484
01485
01486 hash_seq_init(&hstat, pendingOpsTable);
01487 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
01488 {
01489 if (entry->rnode.dbNode == rnode.dbNode)
01490 {
01491
01492 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
01493 {
01494 bms_free(entry->requests[forknum]);
01495 entry->requests[forknum] = NULL;
01496 entry->canceled[forknum] = true;
01497 }
01498 }
01499 }
01500
01501
01502 prev = NULL;
01503 for (cell = list_head(pendingUnlinks); cell; cell = next)
01504 {
01505 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
01506
01507 next = lnext(cell);
01508 if (entry->rnode.dbNode == rnode.dbNode)
01509 {
01510 pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
01511 pfree(entry);
01512 }
01513 else
01514 prev = cell;
01515 }
01516 }
01517 else if (segno == UNLINK_RELATION_REQUEST)
01518 {
01519
01520 MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
01521 PendingUnlinkEntry *entry;
01522
01523
01524 Assert(forknum == MAIN_FORKNUM);
01525
01526 entry = palloc(sizeof(PendingUnlinkEntry));
01527 entry->rnode = rnode;
01528 entry->cycle_ctr = mdckpt_cycle_ctr;
01529
01530 pendingUnlinks = lappend(pendingUnlinks, entry);
01531
01532 MemoryContextSwitchTo(oldcxt);
01533 }
01534 else
01535 {
01536
01537 MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
01538 PendingOperationEntry *entry;
01539 bool found;
01540
01541 entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
01542 &rnode,
01543 HASH_ENTER,
01544 &found);
01545
01546 if (!found)
01547 {
01548 entry->cycle_ctr = mdsync_cycle_ctr;
01549 MemSet(entry->requests, 0, sizeof(entry->requests));
01550 MemSet(entry->canceled, 0, sizeof(entry->canceled));
01551 }
01552
01553
01554
01555
01556
01557
01558
01559 entry->requests[forknum] = bms_add_member(entry->requests[forknum],
01560 (int) segno);
01561
01562 MemoryContextSwitchTo(oldcxt);
01563 }
01564 }
01565
01566
01567
01568
01569
01570
01571
01572 void
01573 ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
01574 {
01575 if (pendingOpsTable)
01576 {
01577
01578 RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC);
01579 }
01580 else if (IsUnderPostmaster)
01581 {
01582
01583
01584
01585
01586
01587
01588
01589
01590
01591
01592 while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
01593 pg_usleep(10000L);
01594
01595
01596
01597
01598
01599 }
01600 }
01601
01602
01603
01604
01605 void
01606 ForgetDatabaseFsyncRequests(Oid dbid)
01607 {
01608 RelFileNode rnode;
01609
01610 rnode.dbNode = dbid;
01611 rnode.spcNode = 0;
01612 rnode.relNode = 0;
01613
01614 if (pendingOpsTable)
01615 {
01616
01617 RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC);
01618 }
01619 else if (IsUnderPostmaster)
01620 {
01621
01622 while (!ForwardFsyncRequest(rnode, InvalidForkNumber,
01623 FORGET_DATABASE_FSYNC))
01624 pg_usleep(10000L);
01625 }
01626 }
01627
01628
01629
01630
01631
01632 static MdfdVec *
01633 _fdvec_alloc(void)
01634 {
01635 return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
01636 }
01637
01638
01639
01640
01641
01642 static char *
01643 _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
01644 {
01645 char *path,
01646 *fullpath;
01647
01648 path = relpath(reln->smgr_rnode, forknum);
01649
01650 if (segno > 0)
01651 {
01652
01653 fullpath = (char *) palloc(strlen(path) + 12);
01654 sprintf(fullpath, "%s.%u", path, segno);
01655 pfree(path);
01656 }
01657 else
01658 fullpath = path;
01659
01660 return fullpath;
01661 }
01662
01663
01664
01665
01666
01667 static MdfdVec *
01668 _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
01669 int oflags)
01670 {
01671 MdfdVec *v;
01672 int fd;
01673 char *fullpath;
01674
01675 fullpath = _mdfd_segpath(reln, forknum, segno);
01676
01677
01678 fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
01679
01680 pfree(fullpath);
01681
01682 if (fd < 0)
01683 return NULL;
01684
01685
01686 v = _fdvec_alloc();
01687
01688
01689 v->mdfd_vfd = fd;
01690 v->mdfd_segno = segno;
01691 v->mdfd_chain = NULL;
01692 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
01693
01694
01695 return v;
01696 }
01697
01698
01699
01700
01701
01702
01703
01704
01705
01706 static MdfdVec *
01707 _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
01708 bool skipFsync, ExtensionBehavior behavior)
01709 {
01710 MdfdVec *v = mdopen(reln, forknum, behavior);
01711 BlockNumber targetseg;
01712 BlockNumber nextsegno;
01713
01714 if (!v)
01715 return NULL;
01716
01717 targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
01718 for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
01719 {
01720 Assert(nextsegno == v->mdfd_segno + 1);
01721
01722 if (v->mdfd_chain == NULL)
01723 {
01724
01725
01726
01727
01728
01729
01730
01731
01732
01733
01734
01735
01736
01737
01738 if (behavior == EXTENSION_CREATE || InRecovery)
01739 {
01740 if (_mdnblocks(reln, forknum, v) < RELSEG_SIZE)
01741 {
01742 char *zerobuf = palloc0(BLCKSZ);
01743
01744 mdextend(reln, forknum,
01745 nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
01746 zerobuf, skipFsync);
01747 pfree(zerobuf);
01748 }
01749 v->mdfd_chain = _mdfd_openseg(reln, forknum, +nextsegno, O_CREAT);
01750 }
01751 else
01752 {
01753
01754 v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, 0);
01755 }
01756 if (v->mdfd_chain == NULL)
01757 {
01758 if (behavior == EXTENSION_RETURN_NULL &&
01759 FILE_POSSIBLY_DELETED(errno))
01760 return NULL;
01761 ereport(ERROR,
01762 (errcode_for_file_access(),
01763 errmsg("could not open file \"%s\" (target block %u): %m",
01764 _mdfd_segpath(reln, forknum, nextsegno),
01765 blkno)));
01766 }
01767 }
01768 v = v->mdfd_chain;
01769 }
01770 return v;
01771 }
01772
01773
01774
01775
01776 static BlockNumber
01777 _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
01778 {
01779 off_t len;
01780
01781 len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
01782 if (len < 0)
01783 ereport(ERROR,
01784 (errcode_for_file_access(),
01785 errmsg("could not seek to end of file \"%s\": %m",
01786 FilePathName(seg->mdfd_vfd))));
01787
01788 return (BlockNumber) (len / BLCKSZ);
01789 }