00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067 #include "postgres.h"
00068
00069 #include "access/heapam_xlog.h"
00070 #include "access/nbtree.h"
00071 #include "miscadmin.h"
00072 #include "storage/smgr.h"
00073 #include "tcop/tcopprot.h"
00074 #include "utils/rel.h"
00075 #include "utils/tuplesort.h"
00076
00077
00078
00079
00080
00081
00082
00083 struct BTSpool
00084 {
00085 Tuplesortstate *sortstate;
00086 Relation heap;
00087 Relation index;
00088 bool isunique;
00089 };
00090
00091
00092
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104 typedef struct BTPageState
00105 {
00106 Page btps_page;
00107 BlockNumber btps_blkno;
00108 IndexTuple btps_minkey;
00109 OffsetNumber btps_lastoff;
00110 uint32 btps_level;
00111 Size btps_full;
00112 struct BTPageState *btps_next;
00113 } BTPageState;
00114
00115
00116
00117
00118 typedef struct BTWriteState
00119 {
00120 Relation heap;
00121 Relation index;
00122 bool btws_use_wal;
00123 BlockNumber btws_pages_alloced;
00124 BlockNumber btws_pages_written;
00125 Page btws_zeropage;
00126 } BTWriteState;
00127
00128
00129 static Page _bt_blnewpage(uint32 level);
00130 static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
00131 static void _bt_slideleft(Page page);
00132 static void _bt_sortaddtup(Page page, Size itemsize,
00133 IndexTuple itup, OffsetNumber itup_off);
00134 static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
00135 IndexTuple itup);
00136 static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
00137 static void _bt_load(BTWriteState *wstate,
00138 BTSpool *btspool, BTSpool *btspool2);
00139
00140
00141
00142
00143
00144
00145
00146
00147
00148
00149 BTSpool *
00150 _bt_spoolinit(Relation heap, Relation index, bool isunique, bool isdead)
00151 {
00152 BTSpool *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
00153 int btKbytes;
00154
00155 btspool->heap = heap;
00156 btspool->index = index;
00157 btspool->isunique = isunique;
00158
00159
00160
00161
00162
00163
00164
00165
00166
00167 btKbytes = isdead ? work_mem : maintenance_work_mem;
00168 btspool->sortstate = tuplesort_begin_index_btree(heap, index, isunique,
00169 btKbytes, false);
00170
00171 return btspool;
00172 }
00173
00174
00175
00176
00177 void
00178 _bt_spooldestroy(BTSpool *btspool)
00179 {
00180 tuplesort_end(btspool->sortstate);
00181 pfree(btspool);
00182 }
00183
00184
00185
00186
00187 void
00188 _bt_spool(IndexTuple itup, BTSpool *btspool)
00189 {
00190 tuplesort_putindextuple(btspool->sortstate, itup);
00191 }
00192
00193
00194
00195
00196
00197 void
00198 _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
00199 {
00200 BTWriteState wstate;
00201
00202 #ifdef BTREE_BUILD_STATS
00203 if (log_btree_build_stats)
00204 {
00205 ShowUsage("BTREE BUILD (Spool) STATISTICS");
00206 ResetUsage();
00207 }
00208 #endif
00209
00210 tuplesort_performsort(btspool->sortstate);
00211 if (btspool2)
00212 tuplesort_performsort(btspool2->sortstate);
00213
00214 wstate.heap = btspool->heap;
00215 wstate.index = btspool->index;
00216
00217
00218
00219
00220
00221 wstate.btws_use_wal = XLogIsNeeded() && RelationNeedsWAL(wstate.index);
00222
00223
00224 wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
00225 wstate.btws_pages_written = 0;
00226 wstate.btws_zeropage = NULL;
00227
00228 _bt_load(&wstate, btspool, btspool2);
00229 }
00230
00231
00232
00233
00234
00235
00236
00237
00238
00239
00240 static Page
00241 _bt_blnewpage(uint32 level)
00242 {
00243 Page page;
00244 BTPageOpaque opaque;
00245
00246 page = (Page) palloc(BLCKSZ);
00247
00248
00249 _bt_pageinit(page, BLCKSZ);
00250
00251
00252 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
00253 opaque->btpo_prev = opaque->btpo_next = P_NONE;
00254 opaque->btpo.level = level;
00255 opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
00256 opaque->btpo_cycleid = 0;
00257
00258
00259 ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
00260
00261 return page;
00262 }
00263
00264
00265
00266
00267 static void
00268 _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
00269 {
00270
00271 RelationOpenSmgr(wstate->index);
00272
00273
00274 if (wstate->btws_use_wal)
00275 {
00276
00277 log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page);
00278 }
00279
00280
00281
00282
00283
00284
00285
00286
00287 while (blkno > wstate->btws_pages_written)
00288 {
00289 if (!wstate->btws_zeropage)
00290 wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
00291
00292 smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM,
00293 wstate->btws_pages_written++,
00294 (char *) wstate->btws_zeropage,
00295 true);
00296 }
00297
00298 PageSetChecksumInplace(page, blkno);
00299
00300
00301
00302
00303
00304 if (blkno == wstate->btws_pages_written)
00305 {
00306
00307 smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
00308 (char *) page, true);
00309 wstate->btws_pages_written++;
00310 }
00311 else
00312 {
00313
00314 smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
00315 (char *) page, true);
00316 }
00317
00318 pfree(page);
00319 }
00320
00321
00322
00323
00324
00325 static BTPageState *
00326 _bt_pagestate(BTWriteState *wstate, uint32 level)
00327 {
00328 BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
00329
00330
00331 state->btps_page = _bt_blnewpage(level);
00332
00333
00334 state->btps_blkno = wstate->btws_pages_alloced++;
00335
00336 state->btps_minkey = NULL;
00337
00338 state->btps_lastoff = P_HIKEY;
00339 state->btps_level = level;
00340
00341 if (level > 0)
00342 state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
00343 else
00344 state->btps_full = RelationGetTargetPageFreeSpace(wstate->index,
00345 BTREE_DEFAULT_FILLFACTOR);
00346
00347 state->btps_next = NULL;
00348
00349 return state;
00350 }
00351
00352
00353
00354
00355
00356
00357
00358 static void
00359 _bt_slideleft(Page page)
00360 {
00361 OffsetNumber off;
00362 OffsetNumber maxoff;
00363 ItemId previi;
00364 ItemId thisii;
00365
00366 if (!PageIsEmpty(page))
00367 {
00368 maxoff = PageGetMaxOffsetNumber(page);
00369 previi = PageGetItemId(page, P_HIKEY);
00370 for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
00371 {
00372 thisii = PageGetItemId(page, off);
00373 *previi = *thisii;
00374 previi = thisii;
00375 }
00376 ((PageHeader) page)->pd_lower -= sizeof(ItemIdData);
00377 }
00378 }
00379
00380
00381
00382
00383
00384
00385
00386
00387
00388
00389
00390
00391
00392
00393 static void
00394 _bt_sortaddtup(Page page,
00395 Size itemsize,
00396 IndexTuple itup,
00397 OffsetNumber itup_off)
00398 {
00399 BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
00400 IndexTupleData trunctuple;
00401
00402 if (!P_ISLEAF(opaque) && itup_off == P_FIRSTKEY)
00403 {
00404 trunctuple = *itup;
00405 trunctuple.t_info = sizeof(IndexTupleData);
00406 itup = &trunctuple;
00407 itemsize = sizeof(IndexTupleData);
00408 }
00409
00410 if (PageAddItem(page, (Item) itup, itemsize, itup_off,
00411 false, false) == InvalidOffsetNumber)
00412 elog(ERROR, "failed to add item to the index page");
00413 }
00414
00415
00416
00417
00418
00419
00420
00421
00422
00423
00424
00425
00426
00427
00428
00429
00430
00431
00432
00433
00434
00435
00436
00437
00438
00439
00440
00441
00442
00443
00444
00445
00446
00447
00448 static void
00449 _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
00450 {
00451 Page npage;
00452 BlockNumber nblkno;
00453 OffsetNumber last_off;
00454 Size pgspc;
00455 Size itupsz;
00456
00457
00458
00459
00460
00461 CHECK_FOR_INTERRUPTS();
00462
00463 npage = state->btps_page;
00464 nblkno = state->btps_blkno;
00465 last_off = state->btps_lastoff;
00466
00467 pgspc = PageGetFreeSpace(npage);
00468 itupsz = IndexTupleDSize(*itup);
00469 itupsz = MAXALIGN(itupsz);
00470
00471
00472
00473
00474
00475
00476
00477
00478
00479
00480
00481
00482 if (itupsz > BTMaxItemSize(npage))
00483 ereport(ERROR,
00484 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
00485 errmsg("index row size %lu exceeds maximum %lu for index \"%s\"",
00486 (unsigned long) itupsz,
00487 (unsigned long) BTMaxItemSize(npage),
00488 RelationGetRelationName(wstate->index)),
00489 errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
00490 "Consider a function index of an MD5 hash of the value, "
00491 "or use full text indexing."),
00492 errtableconstraint(wstate->heap,
00493 RelationGetRelationName(wstate->index))));
00494
00495
00496
00497
00498
00499
00500
00501 if (pgspc < itupsz || (pgspc < state->btps_full && last_off > P_FIRSTKEY))
00502 {
00503
00504
00505
00506 Page opage = npage;
00507 BlockNumber oblkno = nblkno;
00508 ItemId ii;
00509 ItemId hii;
00510 IndexTuple oitup;
00511
00512
00513 npage = _bt_blnewpage(state->btps_level);
00514
00515
00516 nblkno = wstate->btws_pages_alloced++;
00517
00518
00519
00520
00521
00522
00523
00524
00525 Assert(last_off > P_FIRSTKEY);
00526 ii = PageGetItemId(opage, last_off);
00527 oitup = (IndexTuple) PageGetItem(opage, ii);
00528 _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY);
00529
00530
00531
00532
00533 hii = PageGetItemId(opage, P_HIKEY);
00534 *hii = *ii;
00535 ItemIdSetUnused(ii);
00536 ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
00537
00538
00539
00540
00541
00542
00543 if (state->btps_next == NULL)
00544 state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
00545
00546 Assert(state->btps_minkey != NULL);
00547 ItemPointerSet(&(state->btps_minkey->t_tid), oblkno, P_HIKEY);
00548 _bt_buildadd(wstate, state->btps_next, state->btps_minkey);
00549 pfree(state->btps_minkey);
00550
00551
00552
00553
00554
00555
00556 state->btps_minkey = CopyIndexTuple(oitup);
00557
00558
00559
00560
00561 {
00562 BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
00563 BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);
00564
00565 oopaque->btpo_next = nblkno;
00566 nopaque->btpo_prev = oblkno;
00567 nopaque->btpo_next = P_NONE;
00568 }
00569
00570
00571
00572
00573
00574 _bt_blwritepage(wstate, opage, oblkno);
00575
00576
00577
00578
00579 last_off = P_FIRSTKEY;
00580 }
00581
00582
00583
00584
00585
00586
00587
00588 if (last_off == P_HIKEY)
00589 {
00590 Assert(state->btps_minkey == NULL);
00591 state->btps_minkey = CopyIndexTuple(itup);
00592 }
00593
00594
00595
00596
00597 last_off = OffsetNumberNext(last_off);
00598 _bt_sortaddtup(npage, itupsz, itup, last_off);
00599
00600 state->btps_page = npage;
00601 state->btps_blkno = nblkno;
00602 state->btps_lastoff = last_off;
00603 }
00604
00605
00606
00607
00608 static void
00609 _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
00610 {
00611 BTPageState *s;
00612 BlockNumber rootblkno = P_NONE;
00613 uint32 rootlevel = 0;
00614 Page metapage;
00615
00616
00617
00618
00619 for (s = state; s != NULL; s = s->btps_next)
00620 {
00621 BlockNumber blkno;
00622 BTPageOpaque opaque;
00623
00624 blkno = s->btps_blkno;
00625 opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);
00626
00627
00628
00629
00630
00631
00632
00633
00634
00635 if (s->btps_next == NULL)
00636 {
00637 opaque->btpo_flags |= BTP_ROOT;
00638 rootblkno = blkno;
00639 rootlevel = s->btps_level;
00640 }
00641 else
00642 {
00643 Assert(s->btps_minkey != NULL);
00644 ItemPointerSet(&(s->btps_minkey->t_tid), blkno, P_HIKEY);
00645 _bt_buildadd(wstate, s->btps_next, s->btps_minkey);
00646 pfree(s->btps_minkey);
00647 s->btps_minkey = NULL;
00648 }
00649
00650
00651
00652
00653
00654 _bt_slideleft(s->btps_page);
00655 _bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
00656 s->btps_page = NULL;
00657 }
00658
00659
00660
00661
00662
00663
00664
00665 metapage = (Page) palloc(BLCKSZ);
00666 _bt_initmetapage(metapage, rootblkno, rootlevel);
00667 _bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
00668 }
00669
00670
00671
00672
00673
00674 static void
00675 _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
00676 {
00677 BTPageState *state = NULL;
00678 bool merge = (btspool2 != NULL);
00679 IndexTuple itup,
00680 itup2 = NULL;
00681 bool should_free,
00682 should_free2,
00683 load1;
00684 TupleDesc tupdes = RelationGetDescr(wstate->index);
00685 int i,
00686 keysz = RelationGetNumberOfAttributes(wstate->index);
00687 ScanKey indexScanKey = NULL;
00688
00689 if (merge)
00690 {
00691
00692
00693
00694
00695
00696
00697 itup = tuplesort_getindextuple(btspool->sortstate,
00698 true, &should_free);
00699 itup2 = tuplesort_getindextuple(btspool2->sortstate,
00700 true, &should_free2);
00701 indexScanKey = _bt_mkscankey_nodata(wstate->index);
00702
00703 for (;;)
00704 {
00705 load1 = true;
00706 if (itup2 == NULL)
00707 {
00708 if (itup == NULL)
00709 break;
00710 }
00711 else if (itup != NULL)
00712 {
00713 for (i = 1; i <= keysz; i++)
00714 {
00715 ScanKey entry;
00716 Datum attrDatum1,
00717 attrDatum2;
00718 bool isNull1,
00719 isNull2;
00720 int32 compare;
00721
00722 entry = indexScanKey + i - 1;
00723 attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
00724 attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
00725 if (isNull1)
00726 {
00727 if (isNull2)
00728 compare = 0;
00729 else if (entry->sk_flags & SK_BT_NULLS_FIRST)
00730 compare = -1;
00731 else
00732 compare = 1;
00733 }
00734 else if (isNull2)
00735 {
00736 if (entry->sk_flags & SK_BT_NULLS_FIRST)
00737 compare = 1;
00738 else
00739 compare = -1;
00740 }
00741 else
00742 {
00743 compare =
00744 DatumGetInt32(FunctionCall2Coll(&entry->sk_func,
00745 entry->sk_collation,
00746 attrDatum1,
00747 attrDatum2));
00748
00749 if (entry->sk_flags & SK_BT_DESC)
00750 compare = -compare;
00751 }
00752 if (compare > 0)
00753 {
00754 load1 = false;
00755 break;
00756 }
00757 else if (compare < 0)
00758 break;
00759 }
00760 }
00761 else
00762 load1 = false;
00763
00764
00765 if (state == NULL)
00766 state = _bt_pagestate(wstate, 0);
00767
00768 if (load1)
00769 {
00770 _bt_buildadd(wstate, state, itup);
00771 if (should_free)
00772 pfree(itup);
00773 itup = tuplesort_getindextuple(btspool->sortstate,
00774 true, &should_free);
00775 }
00776 else
00777 {
00778 _bt_buildadd(wstate, state, itup2);
00779 if (should_free2)
00780 pfree(itup2);
00781 itup2 = tuplesort_getindextuple(btspool2->sortstate,
00782 true, &should_free2);
00783 }
00784 }
00785 _bt_freeskey(indexScanKey);
00786 }
00787 else
00788 {
00789
00790 while ((itup = tuplesort_getindextuple(btspool->sortstate,
00791 true, &should_free)) != NULL)
00792 {
00793
00794 if (state == NULL)
00795 state = _bt_pagestate(wstate, 0);
00796
00797 _bt_buildadd(wstate, state, itup);
00798 if (should_free)
00799 pfree(itup);
00800 }
00801 }
00802
00803
00804 _bt_uppershutdown(wstate, state);
00805
00806
00807
00808
00809
00810
00811
00812
00813
00814
00815
00816
00817
00818
00819
00820
00821 if (RelationNeedsWAL(wstate->index))
00822 {
00823 RelationOpenSmgr(wstate->index);
00824 smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
00825 }
00826 }