00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include "postgres.h"
00024
00025 #include "access/nbtree.h"
00026 #include "access/transam.h"
00027 #include "miscadmin.h"
00028 #include "storage/indexfsm.h"
00029 #include "storage/lmgr.h"
00030 #include "storage/predicate.h"
00031 #include "utils/inval.h"
00032 #include "utils/snapmgr.h"
00033
00034
00035
00036
00037
00038 void
00039 _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
00040 {
00041 BTMetaPageData *metad;
00042 BTPageOpaque metaopaque;
00043
00044 _bt_pageinit(page, BLCKSZ);
00045
00046 metad = BTPageGetMeta(page);
00047 metad->btm_magic = BTREE_MAGIC;
00048 metad->btm_version = BTREE_VERSION;
00049 metad->btm_root = rootbknum;
00050 metad->btm_level = level;
00051 metad->btm_fastroot = rootbknum;
00052 metad->btm_fastlevel = level;
00053
00054 metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
00055 metaopaque->btpo_flags = BTP_META;
00056
00057
00058
00059
00060
00061 ((PageHeader) page)->pd_lower =
00062 ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
00063 }
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093 Buffer
00094 _bt_getroot(Relation rel, int access)
00095 {
00096 Buffer metabuf;
00097 Page metapg;
00098 BTPageOpaque metaopaque;
00099 Buffer rootbuf;
00100 Page rootpage;
00101 BTPageOpaque rootopaque;
00102 BlockNumber rootblkno;
00103 uint32 rootlevel;
00104 BTMetaPageData *metad;
00105
00106
00107
00108
00109
00110
00111 if (rel->rd_amcache != NULL)
00112 {
00113 metad = (BTMetaPageData *) rel->rd_amcache;
00114
00115 Assert(metad->btm_magic == BTREE_MAGIC);
00116 Assert(metad->btm_version == BTREE_VERSION);
00117 Assert(metad->btm_root != P_NONE);
00118
00119 rootblkno = metad->btm_fastroot;
00120 Assert(rootblkno != P_NONE);
00121 rootlevel = metad->btm_fastlevel;
00122
00123 rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
00124 rootpage = BufferGetPage(rootbuf);
00125 rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
00126
00127
00128
00129
00130
00131
00132
00133
00134 if (!P_IGNORE(rootopaque) &&
00135 rootopaque->btpo.level == rootlevel &&
00136 P_LEFTMOST(rootopaque) &&
00137 P_RIGHTMOST(rootopaque))
00138 {
00139
00140 return rootbuf;
00141 }
00142 _bt_relbuf(rel, rootbuf);
00143
00144 if (rel->rd_amcache)
00145 pfree(rel->rd_amcache);
00146 rel->rd_amcache = NULL;
00147 }
00148
00149 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
00150 metapg = BufferGetPage(metabuf);
00151 metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
00152 metad = BTPageGetMeta(metapg);
00153
00154
00155 if (!(metaopaque->btpo_flags & BTP_META) ||
00156 metad->btm_magic != BTREE_MAGIC)
00157 ereport(ERROR,
00158 (errcode(ERRCODE_INDEX_CORRUPTED),
00159 errmsg("index \"%s\" is not a btree",
00160 RelationGetRelationName(rel))));
00161
00162 if (metad->btm_version != BTREE_VERSION)
00163 ereport(ERROR,
00164 (errcode(ERRCODE_INDEX_CORRUPTED),
00165 errmsg("version mismatch in index \"%s\": file version %d, code version %d",
00166 RelationGetRelationName(rel),
00167 metad->btm_version, BTREE_VERSION)));
00168
00169
00170 if (metad->btm_root == P_NONE)
00171 {
00172
00173 if (access == BT_READ)
00174 {
00175 _bt_relbuf(rel, metabuf);
00176 return InvalidBuffer;
00177 }
00178
00179
00180 LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
00181 LockBuffer(metabuf, BT_WRITE);
00182
00183
00184
00185
00186
00187
00188 if (metad->btm_root != P_NONE)
00189 {
00190
00191
00192
00193
00194
00195
00196 _bt_relbuf(rel, metabuf);
00197 return _bt_getroot(rel, access);
00198 }
00199
00200
00201
00202
00203
00204
00205 rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
00206 rootblkno = BufferGetBlockNumber(rootbuf);
00207 rootpage = BufferGetPage(rootbuf);
00208 rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
00209 rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
00210 rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
00211 rootopaque->btpo.level = 0;
00212 rootopaque->btpo_cycleid = 0;
00213
00214
00215 START_CRIT_SECTION();
00216
00217 metad->btm_root = rootblkno;
00218 metad->btm_level = 0;
00219 metad->btm_fastroot = rootblkno;
00220 metad->btm_fastlevel = 0;
00221
00222 MarkBufferDirty(rootbuf);
00223 MarkBufferDirty(metabuf);
00224
00225
00226 if (RelationNeedsWAL(rel))
00227 {
00228 xl_btree_newroot xlrec;
00229 XLogRecPtr recptr;
00230 XLogRecData rdata;
00231
00232 xlrec.node = rel->rd_node;
00233 xlrec.rootblk = rootblkno;
00234 xlrec.level = 0;
00235
00236 rdata.data = (char *) &xlrec;
00237 rdata.len = SizeOfBtreeNewroot;
00238 rdata.buffer = InvalidBuffer;
00239 rdata.next = NULL;
00240
00241 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, &rdata);
00242
00243 PageSetLSN(rootpage, recptr);
00244 PageSetLSN(metapg, recptr);
00245 }
00246
00247 END_CRIT_SECTION();
00248
00249
00250
00251
00252
00253 CacheInvalidateRelcache(rel);
00254
00255
00256
00257
00258
00259
00260 LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
00261 LockBuffer(rootbuf, BT_READ);
00262
00263
00264 _bt_relbuf(rel, metabuf);
00265 }
00266 else
00267 {
00268 rootblkno = metad->btm_fastroot;
00269 Assert(rootblkno != P_NONE);
00270 rootlevel = metad->btm_fastlevel;
00271
00272
00273
00274
00275 rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
00276 sizeof(BTMetaPageData));
00277 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
00278
00279
00280
00281
00282
00283 rootbuf = metabuf;
00284
00285 for (;;)
00286 {
00287 rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
00288 rootpage = BufferGetPage(rootbuf);
00289 rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
00290
00291 if (!P_IGNORE(rootopaque))
00292 break;
00293
00294
00295 if (P_RIGHTMOST(rootopaque))
00296 elog(ERROR, "no live root page found in index \"%s\"",
00297 RelationGetRelationName(rel));
00298 rootblkno = rootopaque->btpo_next;
00299 }
00300
00301
00302 if (rootopaque->btpo.level != rootlevel)
00303 elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
00304 rootblkno, RelationGetRelationName(rel),
00305 rootopaque->btpo.level, rootlevel);
00306 }
00307
00308
00309
00310
00311
00312 return rootbuf;
00313 }
00314
00315
00316
00317
00318
00319
00320
00321
00322
00323
00324
00325
00326
00327
00328
00329 Buffer
00330 _bt_gettrueroot(Relation rel)
00331 {
00332 Buffer metabuf;
00333 Page metapg;
00334 BTPageOpaque metaopaque;
00335 Buffer rootbuf;
00336 Page rootpage;
00337 BTPageOpaque rootopaque;
00338 BlockNumber rootblkno;
00339 uint32 rootlevel;
00340 BTMetaPageData *metad;
00341
00342
00343
00344
00345
00346
00347
00348 if (rel->rd_amcache)
00349 pfree(rel->rd_amcache);
00350 rel->rd_amcache = NULL;
00351
00352 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
00353 metapg = BufferGetPage(metabuf);
00354 metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
00355 metad = BTPageGetMeta(metapg);
00356
00357 if (!(metaopaque->btpo_flags & BTP_META) ||
00358 metad->btm_magic != BTREE_MAGIC)
00359 ereport(ERROR,
00360 (errcode(ERRCODE_INDEX_CORRUPTED),
00361 errmsg("index \"%s\" is not a btree",
00362 RelationGetRelationName(rel))));
00363
00364 if (metad->btm_version != BTREE_VERSION)
00365 ereport(ERROR,
00366 (errcode(ERRCODE_INDEX_CORRUPTED),
00367 errmsg("version mismatch in index \"%s\": file version %d, code version %d",
00368 RelationGetRelationName(rel),
00369 metad->btm_version, BTREE_VERSION)));
00370
00371
00372 if (metad->btm_root == P_NONE)
00373 {
00374 _bt_relbuf(rel, metabuf);
00375 return InvalidBuffer;
00376 }
00377
00378 rootblkno = metad->btm_root;
00379 rootlevel = metad->btm_level;
00380
00381
00382
00383
00384
00385 rootbuf = metabuf;
00386
00387 for (;;)
00388 {
00389 rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
00390 rootpage = BufferGetPage(rootbuf);
00391 rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
00392
00393 if (!P_IGNORE(rootopaque))
00394 break;
00395
00396
00397 if (P_RIGHTMOST(rootopaque))
00398 elog(ERROR, "no live root page found in index \"%s\"",
00399 RelationGetRelationName(rel));
00400 rootblkno = rootopaque->btpo_next;
00401 }
00402
00403
00404 if (rootopaque->btpo.level != rootlevel)
00405 elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
00406 rootblkno, RelationGetRelationName(rel),
00407 rootopaque->btpo.level, rootlevel);
00408
00409 return rootbuf;
00410 }
00411
00412
00413
00414
00415
00416
00417
00418
00419
00420
00421
00422
00423 int
00424 _bt_getrootheight(Relation rel)
00425 {
00426 BTMetaPageData *metad;
00427
00428
00429
00430
00431
00432 if (rel->rd_amcache == NULL)
00433 {
00434 Buffer metabuf;
00435 Page metapg;
00436 BTPageOpaque metaopaque;
00437
00438 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
00439 metapg = BufferGetPage(metabuf);
00440 metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
00441 metad = BTPageGetMeta(metapg);
00442
00443
00444 if (!(metaopaque->btpo_flags & BTP_META) ||
00445 metad->btm_magic != BTREE_MAGIC)
00446 ereport(ERROR,
00447 (errcode(ERRCODE_INDEX_CORRUPTED),
00448 errmsg("index \"%s\" is not a btree",
00449 RelationGetRelationName(rel))));
00450
00451 if (metad->btm_version != BTREE_VERSION)
00452 ereport(ERROR,
00453 (errcode(ERRCODE_INDEX_CORRUPTED),
00454 errmsg("version mismatch in index \"%s\": file version %d, code version %d",
00455 RelationGetRelationName(rel),
00456 metad->btm_version, BTREE_VERSION)));
00457
00458
00459
00460
00461
00462
00463 if (metad->btm_root == P_NONE)
00464 {
00465 _bt_relbuf(rel, metabuf);
00466 return 0;
00467 }
00468
00469
00470
00471
00472 rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
00473 sizeof(BTMetaPageData));
00474 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
00475
00476 _bt_relbuf(rel, metabuf);
00477 }
00478
00479 metad = (BTMetaPageData *) rel->rd_amcache;
00480
00481 Assert(metad->btm_magic == BTREE_MAGIC);
00482 Assert(metad->btm_version == BTREE_VERSION);
00483 Assert(metad->btm_fastroot != P_NONE);
00484
00485 return metad->btm_fastlevel;
00486 }
00487
00488
00489
00490
00491 void
00492 _bt_checkpage(Relation rel, Buffer buf)
00493 {
00494 Page page = BufferGetPage(buf);
00495
00496
00497
00498
00499
00500
00501
00502 if (PageIsNew(page))
00503 ereport(ERROR,
00504 (errcode(ERRCODE_INDEX_CORRUPTED),
00505 errmsg("index \"%s\" contains unexpected zero page at block %u",
00506 RelationGetRelationName(rel),
00507 BufferGetBlockNumber(buf)),
00508 errhint("Please REINDEX it.")));
00509
00510
00511
00512
00513 if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
00514 ereport(ERROR,
00515 (errcode(ERRCODE_INDEX_CORRUPTED),
00516 errmsg("index \"%s\" contains corrupted page at block %u",
00517 RelationGetRelationName(rel),
00518 BufferGetBlockNumber(buf)),
00519 errhint("Please REINDEX it.")));
00520 }
00521
00522
00523
00524
00525 static void
00526 _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid)
00527 {
00528 if (!RelationNeedsWAL(rel))
00529 return;
00530
00531
00532 START_CRIT_SECTION();
00533
00534
00535
00536
00537
00538
00539
00540 {
00541 XLogRecData rdata[1];
00542 xl_btree_reuse_page xlrec_reuse;
00543
00544 xlrec_reuse.node = rel->rd_node;
00545 xlrec_reuse.block = blkno;
00546 xlrec_reuse.latestRemovedXid = latestRemovedXid;
00547 rdata[0].data = (char *) &xlrec_reuse;
00548 rdata[0].len = SizeOfBtreeReusePage;
00549 rdata[0].buffer = InvalidBuffer;
00550 rdata[0].next = NULL;
00551
00552 XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE, rdata);
00553
00554
00555
00556
00557
00558 }
00559
00560 END_CRIT_SECTION();
00561 }
00562
00563
00564
00565
00566
00567
00568
00569
00570
00571
00572
00573
00574 Buffer
00575 _bt_getbuf(Relation rel, BlockNumber blkno, int access)
00576 {
00577 Buffer buf;
00578
00579 if (blkno != P_NEW)
00580 {
00581
00582 buf = ReadBuffer(rel, blkno);
00583 LockBuffer(buf, access);
00584 _bt_checkpage(rel, buf);
00585 }
00586 else
00587 {
00588 bool needLock;
00589 Page page;
00590
00591 Assert(access == BT_WRITE);
00592
00593
00594
00595
00596
00597
00598
00599
00600
00601
00602
00603
00604
00605
00606
00607
00608
00609
00610
00611
00612
00613
00614
00615
00616
00617 for (;;)
00618 {
00619 blkno = GetFreeIndexPage(rel);
00620 if (blkno == InvalidBlockNumber)
00621 break;
00622 buf = ReadBuffer(rel, blkno);
00623 if (ConditionalLockBuffer(buf))
00624 {
00625 page = BufferGetPage(buf);
00626 if (_bt_page_recyclable(page))
00627 {
00628
00629
00630
00631
00632
00633 if (XLogStandbyInfoActive())
00634 {
00635 BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
00636
00637 _bt_log_reuse_page(rel, blkno, opaque->btpo.xact);
00638 }
00639
00640
00641 _bt_pageinit(page, BufferGetPageSize(buf));
00642 return buf;
00643 }
00644 elog(DEBUG2, "FSM returned nonrecyclable page");
00645 _bt_relbuf(rel, buf);
00646 }
00647 else
00648 {
00649 elog(DEBUG2, "FSM returned nonlockable page");
00650
00651 ReleaseBuffer(buf);
00652 }
00653 }
00654
00655
00656
00657
00658
00659
00660
00661
00662
00663 needLock = !RELATION_IS_LOCAL(rel);
00664
00665 if (needLock)
00666 LockRelationForExtension(rel, ExclusiveLock);
00667
00668 buf = ReadBuffer(rel, P_NEW);
00669
00670
00671 LockBuffer(buf, BT_WRITE);
00672
00673
00674
00675
00676
00677
00678
00679 if (needLock)
00680 UnlockRelationForExtension(rel, ExclusiveLock);
00681
00682
00683 page = BufferGetPage(buf);
00684 Assert(PageIsNew(page));
00685 _bt_pageinit(page, BufferGetPageSize(buf));
00686 }
00687
00688
00689 return buf;
00690 }
00691
00692
00693
00694
00695
00696
00697
00698
00699
00700
00701
00702
00703
00704
00705 Buffer
00706 _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
00707 {
00708 Buffer buf;
00709
00710 Assert(blkno != P_NEW);
00711 if (BufferIsValid(obuf))
00712 LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
00713 buf = ReleaseAndReadBuffer(obuf, rel, blkno);
00714 LockBuffer(buf, access);
00715 _bt_checkpage(rel, buf);
00716 return buf;
00717 }
00718
00719
00720
00721
00722
00723
00724 void
00725 _bt_relbuf(Relation rel, Buffer buf)
00726 {
00727 UnlockReleaseBuffer(buf);
00728 }
00729
00730
00731
00732
00733
00734
00735
00736 void
00737 _bt_pageinit(Page page, Size size)
00738 {
00739 PageInit(page, size, sizeof(BTPageOpaqueData));
00740 }
00741
00742
00743
00744
00745
00746
00747
00748 bool
00749 _bt_page_recyclable(Page page)
00750 {
00751 BTPageOpaque opaque;
00752
00753
00754
00755
00756
00757
00758
00759 if (PageIsNew(page))
00760 return true;
00761
00762
00763
00764
00765
00766 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
00767 if (P_ISDELETED(opaque) &&
00768 TransactionIdPrecedes(opaque->btpo.xact, RecentGlobalXmin))
00769 return true;
00770 return false;
00771 }
00772
00773
00774
00775
00776
00777
00778
00779
00780
00781
00782
00783
00784
00785
00786
00787
00788
00789
00790
00791
00792
00793 void
00794 _bt_delitems_vacuum(Relation rel, Buffer buf,
00795 OffsetNumber *itemnos, int nitems,
00796 BlockNumber lastBlockVacuumed)
00797 {
00798 Page page = BufferGetPage(buf);
00799 BTPageOpaque opaque;
00800
00801
00802 START_CRIT_SECTION();
00803
00804
00805 if (nitems > 0)
00806 PageIndexMultiDelete(page, itemnos, nitems);
00807
00808
00809
00810
00811
00812 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
00813 opaque->btpo_cycleid = 0;
00814
00815
00816
00817
00818
00819
00820
00821
00822 opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
00823
00824 MarkBufferDirty(buf);
00825
00826
00827 if (RelationNeedsWAL(rel))
00828 {
00829 XLogRecPtr recptr;
00830 XLogRecData rdata[2];
00831 xl_btree_vacuum xlrec_vacuum;
00832
00833 xlrec_vacuum.node = rel->rd_node;
00834 xlrec_vacuum.block = BufferGetBlockNumber(buf);
00835
00836 xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed;
00837 rdata[0].data = (char *) &xlrec_vacuum;
00838 rdata[0].len = SizeOfBtreeVacuum;
00839 rdata[0].buffer = InvalidBuffer;
00840 rdata[0].next = &(rdata[1]);
00841
00842
00843
00844
00845
00846
00847 if (nitems > 0)
00848 {
00849 rdata[1].data = (char *) itemnos;
00850 rdata[1].len = nitems * sizeof(OffsetNumber);
00851 }
00852 else
00853 {
00854 rdata[1].data = NULL;
00855 rdata[1].len = 0;
00856 }
00857 rdata[1].buffer = buf;
00858 rdata[1].buffer_std = true;
00859 rdata[1].next = NULL;
00860
00861 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM, rdata);
00862
00863 PageSetLSN(page, recptr);
00864 }
00865
00866 END_CRIT_SECTION();
00867 }
00868
00869
00870
00871
00872
00873
00874
00875
00876
00877
00878
00879
00880
00881 void
00882 _bt_delitems_delete(Relation rel, Buffer buf,
00883 OffsetNumber *itemnos, int nitems,
00884 Relation heapRel)
00885 {
00886 Page page = BufferGetPage(buf);
00887 BTPageOpaque opaque;
00888
00889
00890 Assert(nitems > 0);
00891
00892
00893 START_CRIT_SECTION();
00894
00895
00896 PageIndexMultiDelete(page, itemnos, nitems);
00897
00898
00899
00900
00901
00902
00903
00904
00905
00906
00907
00908
00909
00910 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
00911 opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
00912
00913 MarkBufferDirty(buf);
00914
00915
00916 if (RelationNeedsWAL(rel))
00917 {
00918 XLogRecPtr recptr;
00919 XLogRecData rdata[3];
00920 xl_btree_delete xlrec_delete;
00921
00922 xlrec_delete.node = rel->rd_node;
00923 xlrec_delete.hnode = heapRel->rd_node;
00924 xlrec_delete.block = BufferGetBlockNumber(buf);
00925 xlrec_delete.nitems = nitems;
00926
00927 rdata[0].data = (char *) &xlrec_delete;
00928 rdata[0].len = SizeOfBtreeDelete;
00929 rdata[0].buffer = InvalidBuffer;
00930 rdata[0].next = &(rdata[1]);
00931
00932
00933
00934
00935
00936
00937 rdata[1].data = (char *) itemnos;
00938 rdata[1].len = nitems * sizeof(OffsetNumber);
00939 rdata[1].buffer = InvalidBuffer;
00940 rdata[1].next = &(rdata[2]);
00941
00942 rdata[2].data = NULL;
00943 rdata[2].len = 0;
00944 rdata[2].buffer = buf;
00945 rdata[2].buffer_std = true;
00946 rdata[2].next = NULL;
00947
00948 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);
00949
00950 PageSetLSN(page, recptr);
00951 }
00952
00953 END_CRIT_SECTION();
00954 }
00955
00956
00957
00958
00959
00960
00961
00962
00963
00964
00965
00966
00967
00968
00969
00970
00971
00972
00973 static bool
00974 _bt_parent_deletion_safe(Relation rel, BlockNumber target, BTStack stack)
00975 {
00976 BlockNumber parent;
00977 OffsetNumber poffset,
00978 maxoff;
00979 Buffer pbuf;
00980 Page page;
00981 BTPageOpaque opaque;
00982
00983
00984
00985
00986
00987
00988 if (InRecovery)
00989 return true;
00990
00991
00992 ItemPointerSet(&(stack->bts_btentry.t_tid), target, P_HIKEY);
00993 pbuf = _bt_getstackbuf(rel, stack, BT_READ);
00994 if (pbuf == InvalidBuffer)
00995 elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u",
00996 RelationGetRelationName(rel), target);
00997 parent = stack->bts_blkno;
00998 poffset = stack->bts_offset;
00999
01000 page = BufferGetPage(pbuf);
01001 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01002 maxoff = PageGetMaxOffsetNumber(page);
01003
01004
01005
01006
01007
01008 if (poffset >= maxoff)
01009 {
01010
01011 if (poffset == P_FIRSTDATAKEY(opaque))
01012 {
01013
01014
01015
01016
01017
01018 if (P_RIGHTMOST(opaque) || P_ISROOT(opaque))
01019 {
01020 _bt_relbuf(rel, pbuf);
01021 return false;
01022 }
01023
01024 _bt_relbuf(rel, pbuf);
01025 return _bt_parent_deletion_safe(rel, parent, stack->bts_parent);
01026 }
01027 else
01028 {
01029
01030 _bt_relbuf(rel, pbuf);
01031 return false;
01032 }
01033 }
01034 else
01035 {
01036
01037 _bt_relbuf(rel, pbuf);
01038 return true;
01039 }
01040 }
01041
01042
01043
01044
01045
01046
01047
01048
01049
01050
01051
01052
01053
01054
01055
01056
01057
01058
01059
01060
01061
01062
01063
01064
01065
01066 int
01067 _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
01068 {
01069 int result;
01070 BlockNumber target,
01071 leftsib,
01072 rightsib,
01073 parent;
01074 OffsetNumber poffset,
01075 maxoff;
01076 uint32 targetlevel,
01077 ilevel;
01078 ItemId itemid;
01079 IndexTuple targetkey,
01080 itup;
01081 ScanKey itup_scankey;
01082 Buffer lbuf,
01083 rbuf,
01084 pbuf;
01085 bool parent_half_dead;
01086 bool parent_one_child;
01087 bool rightsib_empty;
01088 Buffer metabuf = InvalidBuffer;
01089 Page metapg = NULL;
01090 BTMetaPageData *metad = NULL;
01091 Page page;
01092 BTPageOpaque opaque;
01093
01094
01095
01096
01097
01098 page = BufferGetPage(buf);
01099 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01100 if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
01101 P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
01102 {
01103
01104 Assert(!P_ISHALFDEAD(opaque));
01105
01106 _bt_relbuf(rel, buf);
01107 return 0;
01108 }
01109
01110
01111
01112
01113
01114 target = BufferGetBlockNumber(buf);
01115 targetlevel = opaque->btpo.level;
01116 leftsib = opaque->btpo_prev;
01117 itemid = PageGetItemId(page, P_HIKEY);
01118 targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
01119
01120
01121
01122
01123
01124 _bt_relbuf(rel, buf);
01125
01126
01127
01128
01129
01130
01131
01132
01133
01134 if (stack == NULL)
01135 {
01136 if (!InRecovery)
01137 {
01138
01139 itup_scankey = _bt_mkscankey(rel, targetkey);
01140
01141 stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, false,
01142 &lbuf, BT_READ);
01143
01144 _bt_relbuf(rel, lbuf);
01145
01146
01147
01148
01149
01150
01151 ilevel = 0;
01152 for (;;)
01153 {
01154 if (stack == NULL)
01155 elog(ERROR, "not enough stack items");
01156 if (ilevel == targetlevel)
01157 break;
01158 stack = stack->bts_parent;
01159 ilevel++;
01160 }
01161 }
01162 else
01163 {
01164
01165
01166
01167
01168
01169
01170
01171
01172
01173 pbuf = _bt_get_endpoint(rel, targetlevel + 1, false);
01174 stack = (BTStack) palloc(sizeof(BTStackData));
01175 stack->bts_blkno = BufferGetBlockNumber(pbuf);
01176 stack->bts_offset = InvalidOffsetNumber;
01177
01178 stack->bts_parent = NULL;
01179 _bt_relbuf(rel, pbuf);
01180 }
01181 }
01182
01183
01184
01185
01186
01187
01188
01189
01190 if (targetlevel == 0 &&
01191 !_bt_parent_deletion_safe(rel, target, stack))
01192 return 0;
01193
01194
01195
01196
01197
01198
01199
01200
01201
01202
01203
01204
01205
01206 if (leftsib != P_NONE)
01207 {
01208 lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
01209 page = BufferGetPage(lbuf);
01210 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01211 while (P_ISDELETED(opaque) || opaque->btpo_next != target)
01212 {
01213
01214 leftsib = opaque->btpo_next;
01215 _bt_relbuf(rel, lbuf);
01216 if (leftsib == P_NONE)
01217 {
01218 elog(LOG, "no left sibling (concurrent deletion?) in \"%s\"",
01219 RelationGetRelationName(rel));
01220 return 0;
01221 }
01222 lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
01223 page = BufferGetPage(lbuf);
01224 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01225 }
01226 }
01227 else
01228 lbuf = InvalidBuffer;
01229
01230
01231
01232
01233
01234
01235 buf = _bt_getbuf(rel, target, BT_WRITE);
01236 page = BufferGetPage(buf);
01237 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01238
01239
01240
01241
01242
01243
01244 if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
01245 P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
01246 {
01247 _bt_relbuf(rel, buf);
01248 if (BufferIsValid(lbuf))
01249 _bt_relbuf(rel, lbuf);
01250 return 0;
01251 }
01252 if (opaque->btpo_prev != leftsib)
01253 elog(ERROR, "left link changed unexpectedly in block %u of index \"%s\"",
01254 target, RelationGetRelationName(rel));
01255
01256
01257
01258
01259 rightsib = opaque->btpo_next;
01260 rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
01261 page = BufferGetPage(rbuf);
01262 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01263 if (opaque->btpo_prev != target)
01264 elog(ERROR, "right sibling's left-link doesn't match: "
01265 "block %u links to %u instead of expected %u in index \"%s\"",
01266 rightsib, opaque->btpo_prev, target,
01267 RelationGetRelationName(rel));
01268
01269
01270
01271
01272
01273 PredicateLockPageCombine(rel, target, rightsib);
01274
01275
01276
01277
01278
01279 ItemPointerSet(&(stack->bts_btentry.t_tid), target, P_HIKEY);
01280 pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
01281 if (pbuf == InvalidBuffer)
01282 elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u",
01283 RelationGetRelationName(rel), target);
01284 parent = stack->bts_blkno;
01285 poffset = stack->bts_offset;
01286
01287
01288
01289
01290
01291
01292
01293 page = BufferGetPage(pbuf);
01294 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01295 maxoff = PageGetMaxOffsetNumber(page);
01296 parent_half_dead = false;
01297 parent_one_child = false;
01298 if (poffset >= maxoff)
01299 {
01300 if (poffset == P_FIRSTDATAKEY(opaque))
01301 parent_half_dead = true;
01302 else
01303 elog(ERROR, "failed to delete rightmost child %u of block %u in index \"%s\"",
01304 target, parent, RelationGetRelationName(rel));
01305 }
01306 else
01307 {
01308
01309 if (OffsetNumberNext(P_FIRSTDATAKEY(opaque)) == maxoff)
01310 parent_one_child = true;
01311 }
01312
01313
01314
01315
01316
01317
01318
01319
01320
01321
01322
01323
01324
01325 if (leftsib == P_NONE && !parent_half_dead)
01326 {
01327 page = BufferGetPage(rbuf);
01328 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01329 Assert(opaque->btpo.level == targetlevel);
01330 if (P_RIGHTMOST(opaque))
01331 {
01332
01333 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
01334 metapg = BufferGetPage(metabuf);
01335 metad = BTPageGetMeta(metapg);
01336
01337
01338
01339
01340
01341
01342 if (metad->btm_fastlevel > targetlevel + 1)
01343 {
01344
01345 _bt_relbuf(rel, metabuf);
01346 metabuf = InvalidBuffer;
01347 }
01348 }
01349 }
01350
01351
01352
01353
01354
01355
01356
01357
01358
01359
01360
01361 page = BufferGetPage(pbuf);
01362 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01363
01364 #ifdef USE_ASSERT_CHECKING
01365 itemid = PageGetItemId(page, poffset);
01366 itup = (IndexTuple) PageGetItem(page, itemid);
01367 Assert(ItemPointerGetBlockNumber(&(itup->t_tid)) == target);
01368 #endif
01369
01370 if (!parent_half_dead)
01371 {
01372 OffsetNumber nextoffset;
01373
01374 nextoffset = OffsetNumberNext(poffset);
01375 itemid = PageGetItemId(page, nextoffset);
01376 itup = (IndexTuple) PageGetItem(page, itemid);
01377 if (ItemPointerGetBlockNumber(&(itup->t_tid)) != rightsib)
01378 elog(ERROR, "right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
01379 rightsib, target, ItemPointerGetBlockNumber(&(itup->t_tid)),
01380 parent, RelationGetRelationName(rel));
01381 }
01382
01383
01384
01385
01386
01387
01388 START_CRIT_SECTION();
01389
01390
01391
01392
01393
01394
01395
01396 if (parent_half_dead)
01397 {
01398 PageIndexTupleDelete(page, poffset);
01399 opaque->btpo_flags |= BTP_HALF_DEAD;
01400 }
01401 else
01402 {
01403 OffsetNumber nextoffset;
01404
01405 itemid = PageGetItemId(page, poffset);
01406 itup = (IndexTuple) PageGetItem(page, itemid);
01407 ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY);
01408
01409 nextoffset = OffsetNumberNext(poffset);
01410 PageIndexTupleDelete(page, nextoffset);
01411 }
01412
01413
01414
01415
01416
01417
01418 if (BufferIsValid(lbuf))
01419 {
01420 page = BufferGetPage(lbuf);
01421 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01422 Assert(opaque->btpo_next == target);
01423 opaque->btpo_next = rightsib;
01424 }
01425 page = BufferGetPage(rbuf);
01426 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01427 Assert(opaque->btpo_prev == target);
01428 opaque->btpo_prev = leftsib;
01429 rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
01430
01431
01432
01433
01434
01435
01436
01437
01438
01439
01440
01441 page = BufferGetPage(buf);
01442 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01443 opaque->btpo_flags &= ~BTP_HALF_DEAD;
01444 opaque->btpo_flags |= BTP_DELETED;
01445 opaque->btpo.xact = ReadNewTransactionId();
01446
01447
01448 if (BufferIsValid(metabuf))
01449 {
01450 metad->btm_fastroot = rightsib;
01451 metad->btm_fastlevel = targetlevel;
01452 MarkBufferDirty(metabuf);
01453 }
01454
01455
01456 MarkBufferDirty(pbuf);
01457 MarkBufferDirty(rbuf);
01458 MarkBufferDirty(buf);
01459 if (BufferIsValid(lbuf))
01460 MarkBufferDirty(lbuf);
01461
01462
01463 if (RelationNeedsWAL(rel))
01464 {
01465 xl_btree_delete_page xlrec;
01466 xl_btree_metadata xlmeta;
01467 uint8 xlinfo;
01468 XLogRecPtr recptr;
01469 XLogRecData rdata[5];
01470 XLogRecData *nextrdata;
01471
01472 xlrec.target.node = rel->rd_node;
01473 ItemPointerSet(&(xlrec.target.tid), parent, poffset);
01474 xlrec.deadblk = target;
01475 xlrec.leftblk = leftsib;
01476 xlrec.rightblk = rightsib;
01477 xlrec.btpo_xact = opaque->btpo.xact;
01478
01479 rdata[0].data = (char *) &xlrec;
01480 rdata[0].len = SizeOfBtreeDeletePage;
01481 rdata[0].buffer = InvalidBuffer;
01482 rdata[0].next = nextrdata = &(rdata[1]);
01483
01484 if (BufferIsValid(metabuf))
01485 {
01486 xlmeta.root = metad->btm_root;
01487 xlmeta.level = metad->btm_level;
01488 xlmeta.fastroot = metad->btm_fastroot;
01489 xlmeta.fastlevel = metad->btm_fastlevel;
01490
01491 nextrdata->data = (char *) &xlmeta;
01492 nextrdata->len = sizeof(xl_btree_metadata);
01493 nextrdata->buffer = InvalidBuffer;
01494 nextrdata->next = nextrdata + 1;
01495 nextrdata++;
01496 xlinfo = XLOG_BTREE_DELETE_PAGE_META;
01497 }
01498 else if (parent_half_dead)
01499 xlinfo = XLOG_BTREE_DELETE_PAGE_HALF;
01500 else
01501 xlinfo = XLOG_BTREE_DELETE_PAGE;
01502
01503 nextrdata->data = NULL;
01504 nextrdata->len = 0;
01505 nextrdata->next = nextrdata + 1;
01506 nextrdata->buffer = pbuf;
01507 nextrdata->buffer_std = true;
01508 nextrdata++;
01509
01510 nextrdata->data = NULL;
01511 nextrdata->len = 0;
01512 nextrdata->buffer = rbuf;
01513 nextrdata->buffer_std = true;
01514 nextrdata->next = NULL;
01515
01516 if (BufferIsValid(lbuf))
01517 {
01518 nextrdata->next = nextrdata + 1;
01519 nextrdata++;
01520 nextrdata->data = NULL;
01521 nextrdata->len = 0;
01522 nextrdata->buffer = lbuf;
01523 nextrdata->buffer_std = true;
01524 nextrdata->next = NULL;
01525 }
01526
01527 recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);
01528
01529 if (BufferIsValid(metabuf))
01530 {
01531 PageSetLSN(metapg, recptr);
01532 }
01533 page = BufferGetPage(pbuf);
01534 PageSetLSN(page, recptr);
01535 page = BufferGetPage(rbuf);
01536 PageSetLSN(page, recptr);
01537 page = BufferGetPage(buf);
01538 PageSetLSN(page, recptr);
01539 if (BufferIsValid(lbuf))
01540 {
01541 page = BufferGetPage(lbuf);
01542 PageSetLSN(page, recptr);
01543 }
01544 }
01545
01546 END_CRIT_SECTION();
01547
01548
01549 if (BufferIsValid(metabuf))
01550 {
01551 CacheInvalidateRelcache(rel);
01552 _bt_relbuf(rel, metabuf);
01553 }
01554
01555 if (BufferIsValid(lbuf))
01556 _bt_relbuf(rel, lbuf);
01557
01558
01559
01560
01561
01562
01563
01564
01565
01566
01567
01568
01569
01570
01571
01572
01573 if (parent_half_dead)
01574 {
01575
01576 _bt_relbuf(rel, rbuf);
01577 result = _bt_pagedel(rel, pbuf, stack->bts_parent) + 1;
01578 _bt_relbuf(rel, buf);
01579 }
01580 else if (parent_one_child && rightsib_empty)
01581 {
01582 _bt_relbuf(rel, pbuf);
01583 _bt_relbuf(rel, buf);
01584
01585 result = _bt_pagedel(rel, rbuf, stack) + 1;
01586 }
01587 else
01588 {
01589 _bt_relbuf(rel, pbuf);
01590 _bt_relbuf(rel, buf);
01591 _bt_relbuf(rel, rbuf);
01592 result = 1;
01593 }
01594
01595 return result;
01596 }