#include "postgres.h"#include "access/heapam_xlog.h"#include "access/nbtree.h"#include "miscadmin.h"#include "storage/smgr.h"#include "tcop/tcopprot.h"#include "utils/rel.h"#include "utils/tuplesort.h"
Go to the source code of this file.
Data Structures | |
| struct | BTSpool |
| struct | BTPageState |
| struct | BTWriteState |
Typedefs | |
| typedef struct BTPageState | BTPageState |
| typedef struct BTWriteState | BTWriteState |
Functions | |
| static Page | _bt_blnewpage (uint32 level) |
| static BTPageState * | _bt_pagestate (BTWriteState *wstate, uint32 level) |
| static void | _bt_slideleft (Page page) |
| static void | _bt_sortaddtup (Page page, Size itemsize, IndexTuple itup, OffsetNumber itup_off) |
| static void | _bt_buildadd (BTWriteState *wstate, BTPageState *state, IndexTuple itup) |
| static void | _bt_uppershutdown (BTWriteState *wstate, BTPageState *state) |
| static void | _bt_load (BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) |
| BTSpool * | _bt_spoolinit (Relation heap, Relation index, bool isunique, bool isdead) |
| void | _bt_spooldestroy (BTSpool *btspool) |
| void | _bt_spool (IndexTuple itup, BTSpool *btspool) |
| void | _bt_leafbuild (BTSpool *btspool, BTSpool *btspool2) |
| static void | _bt_blwritepage (BTWriteState *wstate, Page page, BlockNumber blkno) |
| typedef struct BTPageState BTPageState |
| typedef struct BTWriteState BTWriteState |
Definition at line 241 of file nbtsort.c.
References _bt_pageinit(), BTP_LEAF, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTPageOpaqueData::level, PageGetSpecialPointer, and palloc().
Referenced by _bt_buildadd(), and _bt_pagestate().
{
Page page;
BTPageOpaque opaque;
page = (Page) palloc(BLCKSZ);
/* Zero the page and set up standard page header info */
_bt_pageinit(page, BLCKSZ);
/* Initialize BT opaque state */
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
opaque->btpo_prev = opaque->btpo_next = P_NONE;
opaque->btpo.level = level;
opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
opaque->btpo_cycleid = 0;
/* Make the P_HIKEY line pointer appear allocated */
((PageHeader) page)->pd_lower += sizeof(ItemIdData);
return page;
}
| static void _bt_blwritepage | ( | BTWriteState * | wstate, | |
| Page | page, | |||
| BlockNumber | blkno | |||
| ) | [static] |
Definition at line 268 of file nbtsort.c.
References BTWriteState::btws_pages_written, BTWriteState::btws_use_wal, BTWriteState::btws_zeropage, BTWriteState::index, log_newpage(), MAIN_FORKNUM, PageSetChecksumInplace(), palloc0(), pfree(), RelationData::rd_node, RelationData::rd_smgr, RelationOpenSmgr, smgrextend(), and smgrwrite().
Referenced by _bt_buildadd(), and _bt_uppershutdown().
{
/* Ensure rd_smgr is open (could have been closed by relcache flush!) */
RelationOpenSmgr(wstate->index);
/* XLOG stuff */
if (wstate->btws_use_wal)
{
/* We use the heap NEWPAGE record type for this */
log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page);
}
/*
* If we have to write pages nonsequentially, fill in the space with
* zeroes until we come back and overwrite. This is not logically
* necessary on standard Unix filesystems (unwritten space will read as
* zeroes anyway), but it should help to avoid fragmentation. The dummy
* pages aren't WAL-logged though.
*/
while (blkno > wstate->btws_pages_written)
{
if (!wstate->btws_zeropage)
wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
/* don't set checksum for all-zero page */
smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM,
wstate->btws_pages_written++,
(char *) wstate->btws_zeropage,
true);
}
PageSetChecksumInplace(page, blkno);
/*
* Now write the page. There's no need for smgr to schedule an fsync for
* this write; we'll do it ourselves before ending the build.
*/
if (blkno == wstate->btws_pages_written)
{
/* extending the file... */
smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
(char *) page, true);
wstate->btws_pages_written++;
}
else
{
/* overwriting a block we zero-filled before */
smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
(char *) page, true);
}
pfree(page);
}
| static void _bt_buildadd | ( | BTWriteState * | wstate, | |
| BTPageState * | state, | |||
| IndexTuple | itup | |||
| ) | [static] |
Definition at line 449 of file nbtsort.c.
References _bt_blnewpage(), _bt_blwritepage(), _bt_pagestate(), _bt_sortaddtup(), Assert, BTMaxItemSize, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTPageState::btps_blkno, BTPageState::btps_lastoff, BTPageState::btps_level, BTPageState::btps_minkey, BTPageState::btps_next, BTPageState::btps_page, BTWriteState::btws_pages_alloced, CHECK_FOR_INTERRUPTS, CopyIndexTuple(), ereport, errcode(), errhint(), errmsg(), ERROR, errtableconstraint(), BTWriteState::heap, BTWriteState::index, IndexTupleDSize, ItemIdGetLength, ItemIdSetUnused, ItemPointerSet, MAXALIGN, NULL, OffsetNumberNext, P_FIRSTKEY, P_HIKEY, PageGetFreeSpace(), PageGetItem, PageGetItemId, PageGetSpecialPointer, pfree(), RelationGetRelationName, and IndexTupleData::t_tid.
Referenced by _bt_load(), and _bt_uppershutdown().
{
Page npage;
BlockNumber nblkno;
OffsetNumber last_off;
Size pgspc;
Size itupsz;
/*
* This is a handy place to check for cancel interrupts during the btree
* load phase of index creation.
*/
CHECK_FOR_INTERRUPTS();
npage = state->btps_page;
nblkno = state->btps_blkno;
last_off = state->btps_lastoff;
pgspc = PageGetFreeSpace(npage);
itupsz = IndexTupleDSize(*itup);
itupsz = MAXALIGN(itupsz);
/*
* Check whether the item can fit on a btree page at all. (Eventually, we
* ought to try to apply TOAST methods if not.) We actually need to be
* able to fit three items on every page, so restrict any one item to 1/3
* the per-page available space. Note that at this point, itupsz doesn't
* include the ItemId.
*
* NOTE: similar code appears in _bt_insertonpg() to defend against
* oversize items being inserted into an already-existing index. But
* during creation of an index, we don't go through there.
*/
if (itupsz > BTMaxItemSize(npage))
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("index row size %lu exceeds maximum %lu for index \"%s\"",
(unsigned long) itupsz,
(unsigned long) BTMaxItemSize(npage),
RelationGetRelationName(wstate->index)),
errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
"Consider a function index of an MD5 hash of the value, "
"or use full text indexing."),
errtableconstraint(wstate->heap,
RelationGetRelationName(wstate->index))));
/*
* Check to see if page is "full". It's definitely full if the item won't
* fit. Otherwise, compare to the target freespace derived from the
* fillfactor. However, we must put at least two items on each page, so
* disregard fillfactor if we don't have that many.
*/
if (pgspc < itupsz || (pgspc < state->btps_full && last_off > P_FIRSTKEY))
{
/*
* Finish off the page and write it out.
*/
Page opage = npage;
BlockNumber oblkno = nblkno;
ItemId ii;
ItemId hii;
IndexTuple oitup;
/* Create new page of same level */
npage = _bt_blnewpage(state->btps_level);
/* and assign it a page position */
nblkno = wstate->btws_pages_alloced++;
/*
* We copy the last item on the page into the new page, and then
* rearrange the old page so that the 'last item' becomes its high key
* rather than a true data item. There had better be at least two
* items on the page already, else the page would be empty of useful
* data.
*/
Assert(last_off > P_FIRSTKEY);
ii = PageGetItemId(opage, last_off);
oitup = (IndexTuple) PageGetItem(opage, ii);
_bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY);
/*
* Move 'last' into the high key position on opage
*/
hii = PageGetItemId(opage, P_HIKEY);
*hii = *ii;
ItemIdSetUnused(ii); /* redundant */
((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
/*
* Link the old page into its parent, using its minimum key. If we
* don't have a parent, we have to create one; this adds a new btree
* level.
*/
if (state->btps_next == NULL)
state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
Assert(state->btps_minkey != NULL);
ItemPointerSet(&(state->btps_minkey->t_tid), oblkno, P_HIKEY);
_bt_buildadd(wstate, state->btps_next, state->btps_minkey);
pfree(state->btps_minkey);
/*
* Save a copy of the minimum key for the new page. We have to copy
* it off the old page, not the new one, in case we are not at leaf
* level.
*/
state->btps_minkey = CopyIndexTuple(oitup);
/*
* Set the sibling links for both pages.
*/
{
BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);
oopaque->btpo_next = nblkno;
nopaque->btpo_prev = oblkno;
nopaque->btpo_next = P_NONE; /* redundant */
}
/*
* Write out the old page. We never need to touch it again, so we can
* free the opage workspace too.
*/
_bt_blwritepage(wstate, opage, oblkno);
/*
* Reset last_off to point to new page
*/
last_off = P_FIRSTKEY;
}
/*
* If the new item is the first for its page, stash a copy for later. Note
* this will only happen for the first item on a level; on later pages,
* the first item for a page is copied from the prior page in the code
* above.
*/
if (last_off == P_HIKEY)
{
Assert(state->btps_minkey == NULL);
state->btps_minkey = CopyIndexTuple(itup);
}
/*
* Add the new item into the current page.
*/
last_off = OffsetNumberNext(last_off);
_bt_sortaddtup(npage, itupsz, itup, last_off);
state->btps_page = npage;
state->btps_blkno = nblkno;
state->btps_lastoff = last_off;
}
Definition at line 198 of file nbtsort.c.
References _bt_load(), BTREE_METAPAGE, BTWriteState::btws_pages_alloced, BTWriteState::btws_pages_written, BTWriteState::btws_use_wal, BTWriteState::btws_zeropage, BTSpool::heap, BTWriteState::heap, BTSpool::index, BTWriteState::index, log_btree_build_stats, RelationNeedsWAL, ResetUsage(), ShowUsage(), BTSpool::sortstate, tuplesort_performsort(), and XLogIsNeeded.
Referenced by btbuild().
{
BTWriteState wstate;
#ifdef BTREE_BUILD_STATS
if (log_btree_build_stats)
{
ShowUsage("BTREE BUILD (Spool) STATISTICS");
ResetUsage();
}
#endif /* BTREE_BUILD_STATS */
tuplesort_performsort(btspool->sortstate);
if (btspool2)
tuplesort_performsort(btspool2->sortstate);
wstate.heap = btspool->heap;
wstate.index = btspool->index;
/*
* We need to log index creation in WAL iff WAL archiving/streaming is
* enabled UNLESS the index isn't WAL-logged anyway.
*/
wstate.btws_use_wal = XLogIsNeeded() && RelationNeedsWAL(wstate.index);
/* reserve the metapage */
wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
wstate.btws_pages_written = 0;
wstate.btws_zeropage = NULL; /* until needed */
_bt_load(&wstate, btspool, btspool2);
}
| static void _bt_load | ( | BTWriteState * | wstate, | |
| BTSpool * | btspool, | |||
| BTSpool * | btspool2 | |||
| ) | [static] |
Definition at line 675 of file nbtsort.c.
References _bt_buildadd(), _bt_freeskey(), _bt_mkscankey_nodata(), _bt_pagestate(), _bt_uppershutdown(), DatumGetInt32, FunctionCall2Coll(), i, BTWriteState::index, index_getattr, MAIN_FORKNUM, NULL, pfree(), RelationData::rd_smgr, RelationGetDescr, RelationGetNumberOfAttributes, RelationNeedsWAL, RelationOpenSmgr, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, smgrimmedsync(), BTSpool::sortstate, and tuplesort_getindextuple().
Referenced by _bt_leafbuild().
{
BTPageState *state = NULL;
bool merge = (btspool2 != NULL);
IndexTuple itup,
itup2 = NULL;
bool should_free,
should_free2,
load1;
TupleDesc tupdes = RelationGetDescr(wstate->index);
int i,
keysz = RelationGetNumberOfAttributes(wstate->index);
ScanKey indexScanKey = NULL;
if (merge)
{
/*
* Another BTSpool for dead tuples exists. Now we have to merge
* btspool and btspool2.
*/
/* the preparation of merge */
itup = tuplesort_getindextuple(btspool->sortstate,
true, &should_free);
itup2 = tuplesort_getindextuple(btspool2->sortstate,
true, &should_free2);
indexScanKey = _bt_mkscankey_nodata(wstate->index);
for (;;)
{
load1 = true; /* load BTSpool next ? */
if (itup2 == NULL)
{
if (itup == NULL)
break;
}
else if (itup != NULL)
{
for (i = 1; i <= keysz; i++)
{
ScanKey entry;
Datum attrDatum1,
attrDatum2;
bool isNull1,
isNull2;
int32 compare;
entry = indexScanKey + i - 1;
attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
if (isNull1)
{
if (isNull2)
compare = 0; /* NULL "=" NULL */
else if (entry->sk_flags & SK_BT_NULLS_FIRST)
compare = -1; /* NULL "<" NOT_NULL */
else
compare = 1; /* NULL ">" NOT_NULL */
}
else if (isNull2)
{
if (entry->sk_flags & SK_BT_NULLS_FIRST)
compare = 1; /* NOT_NULL ">" NULL */
else
compare = -1; /* NOT_NULL "<" NULL */
}
else
{
compare =
DatumGetInt32(FunctionCall2Coll(&entry->sk_func,
entry->sk_collation,
attrDatum1,
attrDatum2));
if (entry->sk_flags & SK_BT_DESC)
compare = -compare;
}
if (compare > 0)
{
load1 = false;
break;
}
else if (compare < 0)
break;
}
}
else
load1 = false;
/* When we see first tuple, create first index page */
if (state == NULL)
state = _bt_pagestate(wstate, 0);
if (load1)
{
_bt_buildadd(wstate, state, itup);
if (should_free)
pfree(itup);
itup = tuplesort_getindextuple(btspool->sortstate,
true, &should_free);
}
else
{
_bt_buildadd(wstate, state, itup2);
if (should_free2)
pfree(itup2);
itup2 = tuplesort_getindextuple(btspool2->sortstate,
true, &should_free2);
}
}
_bt_freeskey(indexScanKey);
}
else
{
/* merge is unnecessary */
while ((itup = tuplesort_getindextuple(btspool->sortstate,
true, &should_free)) != NULL)
{
/* When we see first tuple, create first index page */
if (state == NULL)
state = _bt_pagestate(wstate, 0);
_bt_buildadd(wstate, state, itup);
if (should_free)
pfree(itup);
}
}
/* Close down final pages and write the metapage */
_bt_uppershutdown(wstate, state);
/*
* If the index is WAL-logged, we must fsync it down to disk before it's
* safe to commit the transaction. (For a non-WAL-logged index we don't
* care since the index will be uninteresting after a crash anyway.)
*
* It's obvious that we must do this when not WAL-logging the build. It's
* less obvious that we have to do it even if we did WAL-log the index
* pages. The reason is that since we're building outside shared buffers,
* a CHECKPOINT occurring during the build has no way to flush the
* previously written data to disk (indeed it won't know the index even
* exists). A crash later on would replay WAL from the checkpoint,
* therefore it wouldn't replay our earlier WAL entries. If we do not
* fsync those pages here, they might still not be on disk when the crash
* occurs.
*/
if (RelationNeedsWAL(wstate->index))
{
RelationOpenSmgr(wstate->index);
smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
}
}
| static BTPageState * _bt_pagestate | ( | BTWriteState * | wstate, | |
| uint32 | level | |||
| ) | [static] |
Definition at line 326 of file nbtsort.c.
References _bt_blnewpage(), BTPageState::btps_blkno, BTPageState::btps_full, BTPageState::btps_lastoff, BTPageState::btps_level, BTPageState::btps_minkey, BTPageState::btps_next, BTPageState::btps_page, BTREE_DEFAULT_FILLFACTOR, BTWriteState::btws_pages_alloced, BTWriteState::index, palloc0(), and RelationGetTargetPageFreeSpace.
Referenced by _bt_buildadd(), and _bt_load().
{
BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
/* create initial page for level */
state->btps_page = _bt_blnewpage(level);
/* and assign it a page position */
state->btps_blkno = wstate->btws_pages_alloced++;
state->btps_minkey = NULL;
/* initialize lastoff so first item goes into P_FIRSTKEY */
state->btps_lastoff = P_HIKEY;
state->btps_level = level;
/* set "full" threshold based on level. See notes at head of file. */
if (level > 0)
state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
else
state->btps_full = RelationGetTargetPageFreeSpace(wstate->index,
BTREE_DEFAULT_FILLFACTOR);
/* no parent level, yet */
state->btps_next = NULL;
return state;
}
| static void _bt_slideleft | ( | Page | page | ) | [static] |
Definition at line 359 of file nbtsort.c.
References OffsetNumberNext, P_FIRSTKEY, P_HIKEY, PageGetItemId, PageGetMaxOffsetNumber, and PageIsEmpty.
Referenced by _bt_uppershutdown().
{
OffsetNumber off;
OffsetNumber maxoff;
ItemId previi;
ItemId thisii;
if (!PageIsEmpty(page))
{
maxoff = PageGetMaxOffsetNumber(page);
previi = PageGetItemId(page, P_HIKEY);
for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
{
thisii = PageGetItemId(page, off);
*previi = *thisii;
previi = thisii;
}
((PageHeader) page)->pd_lower -= sizeof(ItemIdData);
}
}
| static void _bt_sortaddtup | ( | Page | page, | |
| Size | itemsize, | |||
| IndexTuple | itup, | |||
| OffsetNumber | itup_off | |||
| ) | [static] |
Definition at line 394 of file nbtsort.c.
References elog, ERROR, InvalidOffsetNumber, P_ISLEAF, PageAddItem(), PageGetSpecialPointer, and IndexTupleData::t_info.
Referenced by _bt_buildadd().
{
BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
IndexTupleData trunctuple;
if (!P_ISLEAF(opaque) && itup_off == P_FIRSTKEY)
{
trunctuple = *itup;
trunctuple.t_info = sizeof(IndexTupleData);
itup = &trunctuple;
itemsize = sizeof(IndexTupleData);
}
if (PageAddItem(page, (Item) itup, itemsize, itup_off,
false, false) == InvalidOffsetNumber)
elog(ERROR, "failed to add item to the index page");
}
| void _bt_spool | ( | IndexTuple | itup, | |
| BTSpool * | btspool | |||
| ) |
Definition at line 188 of file nbtsort.c.
References BTSpool::sortstate, and tuplesort_putindextuple().
Referenced by btbuildCallback().
{
tuplesort_putindextuple(btspool->sortstate, itup);
}
| void _bt_spooldestroy | ( | BTSpool * | btspool | ) |
Definition at line 178 of file nbtsort.c.
References pfree(), BTSpool::sortstate, and tuplesort_end().
Referenced by btbuild().
{
tuplesort_end(btspool->sortstate);
pfree(btspool);
}
Definition at line 150 of file nbtsort.c.
References BTSpool::heap, BTSpool::index, BTSpool::isunique, maintenance_work_mem, palloc0(), BTSpool::sortstate, tuplesort_begin_index_btree(), and work_mem.
Referenced by btbuild().
{
BTSpool *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
int btKbytes;
btspool->heap = heap;
btspool->index = index;
btspool->isunique = isunique;
/*
* We size the sort area as maintenance_work_mem rather than work_mem to
* speed index creation. This should be OK since a single backend can't
* run multiple index creations in parallel. Note that creation of a
* unique index actually requires two BTSpool objects. We expect that the
* second one (for dead tuples) won't get very full, so we give it only
* work_mem.
*/
btKbytes = isdead ? work_mem : maintenance_work_mem;
btspool->sortstate = tuplesort_begin_index_btree(heap, index, isunique,
btKbytes, false);
return btspool;
}
| static void _bt_uppershutdown | ( | BTWriteState * | wstate, | |
| BTPageState * | state | |||
| ) | [static] |
Definition at line 609 of file nbtsort.c.
References _bt_blwritepage(), _bt_buildadd(), _bt_initmetapage(), _bt_slideleft(), Assert, BTPageOpaqueData::btpo_flags, BTPageState::btps_blkno, BTPageState::btps_level, BTPageState::btps_minkey, BTPageState::btps_next, BTPageState::btps_page, BTREE_METAPAGE, ItemPointerSet, NULL, P_HIKEY, PageGetSpecialPointer, palloc(), pfree(), and IndexTupleData::t_tid.
Referenced by _bt_load().
{
BTPageState *s;
BlockNumber rootblkno = P_NONE;
uint32 rootlevel = 0;
Page metapage;
/*
* Each iteration of this loop completes one more level of the tree.
*/
for (s = state; s != NULL; s = s->btps_next)
{
BlockNumber blkno;
BTPageOpaque opaque;
blkno = s->btps_blkno;
opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);
/*
* We have to link the last page on this level to somewhere.
*
* If we're at the top, it's the root, so attach it to the metapage.
* Otherwise, add an entry for it to its parent using its minimum key.
* This may cause the last page of the parent level to split, but
* that's not a problem -- we haven't gotten to it yet.
*/
if (s->btps_next == NULL)
{
opaque->btpo_flags |= BTP_ROOT;
rootblkno = blkno;
rootlevel = s->btps_level;
}
else
{
Assert(s->btps_minkey != NULL);
ItemPointerSet(&(s->btps_minkey->t_tid), blkno, P_HIKEY);
_bt_buildadd(wstate, s->btps_next, s->btps_minkey);
pfree(s->btps_minkey);
s->btps_minkey = NULL;
}
/*
* This is the rightmost page, so the ItemId array needs to be slid
* back one slot. Then we can dump out the page.
*/
_bt_slideleft(s->btps_page);
_bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
s->btps_page = NULL; /* writepage freed the workspace */
}
/*
* As the last step in the process, construct the metapage and make it
* point to the new root (unless we had no data at all, in which case it's
* set to point to "P_NONE"). This changes the index to the "valid" state
* by filling in a valid magic number in the metapage.
*/
metapage = (Page) palloc(BLCKSZ);
_bt_initmetapage(metapage, rootblkno, rootlevel);
_bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
}
1.7.1