#include "postgres.h"
#include "trgm.h"
#include "regex/regexport.h"
#include "tsearch/ts_locale.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
Go to the source code of this file.
#define COLOR_BLANK (-2) |
Definition at line 256 of file trgm_regexp.c.
Referenced by addArcs(), addKey(), expandColorTrigrams(), selectColorTrigrams(), and validArcLabel().
#define COLOR_COUNT_LIMIT 256 |
Definition at line 207 of file trgm_regexp.c.
Referenced by getColorInfo().
#define COLOR_UNKNOWN (-1) |
Definition at line 255 of file trgm_regexp.c.
Referenced by prefixContains(), and validArcLabel().
#define MAX_EXPANDED_ARCS 1024 |
Definition at line 205 of file trgm_regexp.c.
Referenced by transformGraph().
#define MAX_EXPANDED_STATES 128 |
Definition at line 204 of file trgm_regexp.c.
Referenced by transformGraph().
#define MAX_TRGM_COUNT 256 |
Definition at line 206 of file trgm_regexp.c.
Referenced by selectColorTrigrams().
typedef int TrgmColor |
Definition at line 252 of file trgm_regexp.c.
static void addArc | ( | TrgmNFA * | trgmNFA, | |
TrgmState * | state, | |||
TrgmStateKey * | key, | |||
TrgmColor | co, | |||
TrgmStateKey * | destKey | |||
) | [static] |
Definition at line 1250 of file trgm_regexp.c.
References TrgmState::arcs, TrgmNFA::arcsCount, TrgmPrefix::colors, ColorTrgm::colors, TrgmArc::ctrgm, TrgmState::enterKeys, lappend(), lfirst, TrgmStateKey::nstate, palloc(), TrgmStateKey::prefix, prefixContains(), TrgmArc::target, and validArcLabel().
Referenced by addArcs().
{ TrgmArc *arc; ListCell *cell; /* Do nothing if this wouldn't be a valid arc label trigram */ if (!validArcLabel(key, co)) return; /* * Check if we are going to reach key which is covered by a key which is * already listed in this state. If so arc is useless: the NFA can bypass * it through a path that doesn't require any predictable trigram, so * whether the arc's trigram is present or not doesn't really matter. */ foreach(cell, state->enterKeys) { TrgmStateKey *existingKey = (TrgmStateKey *) lfirst(cell); if (existingKey->nstate == destKey->nstate && prefixContains(&existingKey->prefix, &destKey->prefix)) return; } /* Checks were successful, add new arc */ arc = (TrgmArc *) palloc(sizeof(TrgmArc)); arc->target = getState(trgmNFA, destKey); arc->ctrgm.colors[0] = key->prefix.colors[0]; arc->ctrgm.colors[1] = key->prefix.colors[1]; arc->ctrgm.colors[2] = co; state->arcs = lappend(state->arcs, arc); trgmNFA->arcsCount++; }
Definition at line 1158 of file trgm_regexp.c.
References addArc(), regex_arc_t::co, COLOR_BLANK, TrgmNFA::colorInfo, TrgmPrefix::colors, TrgmColorInfo::containsNonWord, TrgmState::enterKeys, TrgmColorInfo::expandable, i, lfirst, MemSet, TrgmStateKey::nstate, palloc(), pfree(), pg_reg_getnumoutarcs(), pg_reg_getoutarcs(), TrgmStateKey::prefix, TrgmNFA::regex, regex_arc_t::to, and TrgmColorInfo::wordCharsCount.
Referenced by processState().
{ TrgmStateKey destKey; ListCell *cell; regex_arc_t *arcs; int arcsCount, i; /* * Ensure any pad bytes in destKey are zero, since it may get used as a * hashtable key by getState. */ MemSet(&destKey, 0, sizeof(destKey)); /* * Iterate over enter keys associated with this expanded-graph state. This * includes both the state's own stateKey, and any enter keys we added to * it during addKey (which represent expanded-graph states that are not * distinguishable from this one by means of trigrams). For each such * enter key, examine all the out-arcs of the key's underlying NFA state, * and try to make a trigram arc leading to where the out-arc leads. * (addArc will deal with whether the arc is valid or not.) */ foreach(cell, state->enterKeys) { TrgmStateKey *key = (TrgmStateKey *) lfirst(cell); arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate); arcs = (regex_arc_t *) palloc(sizeof(regex_arc_t) * arcsCount); pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount); for (i = 0; i < arcsCount; i++) { regex_arc_t *arc = &arcs[i]; TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[arc->co]; /* * Ignore non-expandable colors; addKey already handled the case. * * We need no special check for begin/end pseudocolors here. We * don't need to do any processing for them, and they will be * marked non-expandable since the regex engine will have reported * them that way. */ if (!colorInfo->expandable) continue; if (colorInfo->containsNonWord) { /* * Color includes non-word character(s). * * Generate an arc, treating this transition as occurring on * BLANK. This allows word-ending trigrams to be manufactured * if possible. */ destKey.prefix.colors[0] = key->prefix.colors[1]; destKey.prefix.colors[1] = COLOR_BLANK; destKey.nstate = arc->to; addArc(trgmNFA, state, key, COLOR_BLANK, &destKey); } if (colorInfo->wordCharsCount > 0) { /* * Color includes word character(s). * * Generate an arc. Color is pushed into prefix of target * state. */ destKey.prefix.colors[0] = key->prefix.colors[1]; destKey.prefix.colors[1] = arc->co; destKey.nstate = arc->to; addArc(trgmNFA, state, key, arc->co, &destKey); } } pfree(arcs); } }
static void addKey | ( | TrgmNFA * | trgmNFA, | |
TrgmState * | state, | |||
TrgmStateKey * | key | |||
) | [static] |
Definition at line 975 of file trgm_regexp.c.
References addKeyToQueue(), regex_arc_t::co, COLOR_BLANK, TrgmNFA::colorInfo, TrgmPrefix::colors, TrgmColorInfo::containsNonWord, TrgmState::enterKeys, TrgmColorInfo::expandable, TrgmState::fin, i, lappend(), lfirst, list_delete_cell(), list_head(), lnext, MemSet, TrgmStateKey::nstate, palloc(), pfree(), pg_reg_colorisbegin(), pg_reg_colorisend(), pg_reg_getfinalstate(), pg_reg_getnumoutarcs(), pg_reg_getoutarcs(), TrgmStateKey::prefix, prefixContains(), TrgmNFA::regex, regex_arc_t::to, validArcLabel(), and TrgmColorInfo::wordCharsCount.
Referenced by processState().
{ regex_arc_t *arcs; TrgmStateKey destKey; ListCell *cell, *prev, *next; int i, arcsCount; /* * Ensure any pad bytes in destKey are zero, since it may get used as a * hashtable key by getState. */ MemSet(&destKey, 0, sizeof(destKey)); /* * Compare key to each existing enter key of the state to check for * redundancy. We can drop either old key(s) or the new key if we find * redundancy. */ prev = NULL; cell = list_head(state->enterKeys); while (cell) { TrgmStateKey *existingKey = (TrgmStateKey *) lfirst(cell); next = lnext(cell); if (existingKey->nstate == key->nstate) { if (prefixContains(&existingKey->prefix, &key->prefix)) { /* This old key already covers the new key. Nothing to do */ return; } if (prefixContains(&key->prefix, &existingKey->prefix)) { /* * The new key covers this old key. Remove the old key, it's * no longer needed once we add this key to the list. */ state->enterKeys = list_delete_cell(state->enterKeys, cell, prev); } else prev = cell; } else prev = cell; cell = next; } /* No redundancy, so add this key to the state's list */ state->enterKeys = lappend(state->enterKeys, key); /* If state is now known final, mark it and we're done */ if (key->nstate == pg_reg_getfinalstate(trgmNFA->regex)) { state->fin = true; return; } /* * Loop through all outgoing arcs of the corresponding state in the * original NFA. */ arcsCount = pg_reg_getnumoutarcs(trgmNFA->regex, key->nstate); arcs = (regex_arc_t *) palloc(sizeof(regex_arc_t) * arcsCount); pg_reg_getoutarcs(trgmNFA->regex, key->nstate, arcs, arcsCount); for (i = 0; i < arcsCount; i++) { regex_arc_t *arc = &arcs[i]; if (pg_reg_colorisbegin(trgmNFA->regex, arc->co)) { /* * Start of line/string (^). Trigram extraction treats start of * line same as start of word: double space prefix is added. * Hence, make an enter key showing we can reach the arc * destination with all-blank prefix. */ destKey.prefix.colors[0] = COLOR_BLANK; destKey.prefix.colors[1] = COLOR_BLANK; destKey.nstate = arc->to; /* Add enter key to this state */ addKeyToQueue(trgmNFA, &destKey); } else if (pg_reg_colorisend(trgmNFA->regex, arc->co)) { /* * End of line/string ($). We must consider this arc as a * transition that doesn't read anything. The reason for adding * this enter key to the state is that if the arc leads to the * NFA's final state, we must mark this expanded state as final. */ destKey.prefix.colors[0] = COLOR_UNKNOWN; destKey.prefix.colors[1] = COLOR_UNKNOWN; destKey.nstate = arc->to; /* Add enter key to this state */ addKeyToQueue(trgmNFA, &destKey); } else { /* Regular color */ TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[arc->co]; if (colorInfo->expandable) { if (colorInfo->containsNonWord && !validArcLabel(key, COLOR_BLANK)) { /* * We can reach the arc destination after reading a * non-word character, but the prefix is not something * that addArc will accept with COLOR_BLANK, so no trigram * arc can get made for this transition. We must make an * enter key to show that the arc destination is * reachable. Set it up with an all-blank prefix, since * that corresponds to what the trigram extraction code * will do at a word starting boundary. */ destKey.prefix.colors[0] = COLOR_BLANK; destKey.prefix.colors[1] = COLOR_BLANK; destKey.nstate = arc->to; addKeyToQueue(trgmNFA, &destKey); } if (colorInfo->wordCharsCount > 0 && !validArcLabel(key, arc->co)) { /* * We can reach the arc destination after reading a word * character, but the prefix is not something that addArc * will accept, so no trigram arc can get made for this * transition. We must make an enter key to show that the * arc destination is reachable. The prefix for the enter * key should reflect the info we have for this arc. */ destKey.prefix.colors[0] = key->prefix.colors[1]; destKey.prefix.colors[1] = arc->co; destKey.nstate = arc->to; addKeyToQueue(trgmNFA, &destKey); } } else { /* * Unexpandable color. Add enter key with ambiguous prefix, * showing we can reach the destination from this state, but * the preceding colors will be uncertain. (We do not set the * first prefix color to key->prefix.colors[1], because a * prefix of known followed by unknown is invalid.) */ destKey.prefix.colors[0] = COLOR_UNKNOWN; destKey.prefix.colors[1] = COLOR_UNKNOWN; destKey.nstate = arc->to; addKeyToQueue(trgmNFA, &destKey); } } } pfree(arcs); }
static void addKeyToQueue | ( | TrgmNFA * | trgmNFA, | |
TrgmStateKey * | key | |||
) | [static] |
Definition at line 1146 of file trgm_regexp.c.
References TrgmNFA::keysQueue, lappend(), and palloc().
Referenced by addKey().
{ TrgmStateKey *keyCopy = (TrgmStateKey *) palloc(sizeof(TrgmStateKey)); memcpy(keyCopy, key, sizeof(TrgmStateKey)); trgmNFA->keysQueue = lappend(trgmNFA->keysQueue, keyCopy); }
static int colorTrgmInfoCmp | ( | const void * | p1, | |
const void * | p2 | |||
) | [static] |
Definition at line 1739 of file trgm_regexp.c.
References ColorTrgmInfo::ctrgm, and memcmp().
Referenced by selectColorTrigrams().
{ const ColorTrgmInfo *c1 = (const ColorTrgmInfo *) p1; const ColorTrgmInfo *c2 = (const ColorTrgmInfo *) p2; return memcmp(&c1->ctrgm, &c2->ctrgm, sizeof(ColorTrgm)); }
static int colorTrgmInfoCountCmp | ( | const void * | p1, | |
const void * | p2 | |||
) | [static] |
Definition at line 1752 of file trgm_regexp.c.
References ColorTrgmInfo::count.
Referenced by selectColorTrigrams().
{ const ColorTrgmInfo *c1 = (const ColorTrgmInfo *) p1; const ColorTrgmInfo *c2 = (const ColorTrgmInfo *) p2; if (c1->count < c2->count) return 1; else if (c1->count == c2->count) return 0; else return -1; }
static bool convertPgWchar | ( | pg_wchar | c, | |
trgm_mb_char * | result | |||
) | [static] |
Definition at line 804 of file trgm_regexp.c.
References trgm_mb_char::bytes, lowerstr(), MAX_MULTIBYTE_CHAR_LEN, pfree(), and pg_wchar2mb_with_len().
Referenced by getColorInfo().
{ /* "s" has enough space for a multibyte character and a trailing NUL */ char s[MAX_MULTIBYTE_CHAR_LEN + 1]; /* * We can ignore the NUL character, since it can never appear in a PG text * string. This avoids the need for various special cases when * reconstructing trigrams. */ if (c == 0) return false; /* Do the conversion, making sure the result is NUL-terminated */ memset(s, 0, sizeof(s)); pg_wchar2mb_with_len(&c, s, 1); /* * In IGNORECASE mode, we can ignore uppercase characters. We assume that * the regex engine generated both uppercase and lowercase equivalents * within each color, since we used the REG_ICASE option; so there's no * need to process the uppercase version. * * XXX this code is dependent on the assumption that lowerstr() works the * same as the regex engine's internal case folding machinery. Might be * wiser to expose pg_wc_tolower and test whether c == pg_wc_tolower(c). * On the other hand, the trigrams in the index were created using * lowerstr(), so we're probably screwed if there's any incompatibility * anyway. */ #ifdef IGNORECASE { char *lowerCased = lowerstr(s); if (strcmp(lowerCased, s) != 0) { pfree(lowerCased); return false; } pfree(lowerCased); } #endif /* Fill result with exactly MAX_MULTIBYTE_CHAR_LEN bytes */ strncpy(result->bytes, s, MAX_MULTIBYTE_CHAR_LEN); return true; }
TRGM* createTrgmNFA | ( | text * | text_re, | |
Oid | collation, | |||
TrgmPackedGraph ** | graph, | |||
MemoryContext | rcontext | |||
) |
Definition at line 484 of file trgm_regexp.c.
References ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE, ALLOCSET_DEFAULT_MINSIZE, AllocSetContextCreate(), createTrgmNFAInternal(), CurrentMemoryContext, MemoryContextDelete(), MemoryContextSwitchTo(), PG_CATCH, PG_END_TRY, PG_RE_THROW, pg_regfree(), PG_TRY, RE_compile(), REG_ADVANCED, and REG_ICASE.
Referenced by gin_extract_query_trgm(), and gtrgm_consistent().
{ TRGM *trg; regex_t regex; MemoryContext tmpcontext; MemoryContext oldcontext; /* * This processing generates a great deal of cruft, which we'd like to * clean up before returning (since this function may be called in a * query-lifespan memory context). Make a temp context we can work in so * that cleanup is easy. */ tmpcontext = AllocSetContextCreate(CurrentMemoryContext, "createTrgmNFA temporary context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); oldcontext = MemoryContextSwitchTo(tmpcontext); /* * Stage 1: Compile the regexp into a NFA, using the regexp library. */ #ifdef IGNORECASE RE_compile(®ex, text_re, REG_ADVANCED | REG_ICASE, collation); #else RE_compile(®ex, text_re, REG_ADVANCED, collation); #endif /* * Since the regexp library allocates its internal data structures with * malloc, we need to use a PG_TRY block to ensure that pg_regfree() gets * done even if there's an error. */ PG_TRY(); { trg = createTrgmNFAInternal(®ex, graph, rcontext); } PG_CATCH(); { pg_regfree(®ex); PG_RE_THROW(); } PG_END_TRY(); pg_regfree(®ex); /* Clean up all the cruft we created */ MemoryContextSwitchTo(oldcontext); MemoryContextDelete(tmpcontext); return trg; }
static TRGM * createTrgmNFAInternal | ( | regex_t * | regex, | |
TrgmPackedGraph ** | graph, | |||
MemoryContext | rcontext | |||
) | [static] |
Definition at line 543 of file trgm_regexp.c.
References TrgmNFA::colorInfo, expandColorTrigrams(), TrgmState::fin, getColorInfo(), TrgmNFA::initState, TrgmNFA::ncolors, packGraph(), TrgmNFA::regex, selectColorTrigrams(), and transformGraph().
Referenced by createTrgmNFA().
{ TRGM *trg; TrgmNFA trgmNFA; trgmNFA.regex = regex; /* Collect color information from the regex */ getColorInfo(regex, &trgmNFA); #ifdef TRGM_REGEXP_DEBUG printSourceNFA(regex, trgmNFA.colorInfo, trgmNFA.ncolors); #endif /* * Stage 2: Create an expanded graph from the source NFA. */ transformGraph(&trgmNFA); #ifdef TRGM_REGEXP_DEBUG printTrgmNFA(&trgmNFA); #endif /* * Fail if we were unable to make a nontrivial graph, ie it is possible to * get from the initial state to the final state without reading any * predictable trigram. */ if (trgmNFA.initState->fin) return NULL; /* * Stage 3: Select color trigrams to expand. Fail if too many trigrams. */ if (!selectColorTrigrams(&trgmNFA)) return NULL; /* * Stage 4: Expand color trigrams and pack graph into final * representation. */ trg = expandColorTrigrams(&trgmNFA, rcontext); *graph = packGraph(&trgmNFA, rcontext); #ifdef TRGM_REGEXP_DEBUG printTrgmPackedGraph(*graph, trg); #endif return trg; }
static TRGM * expandColorTrigrams | ( | TrgmNFA * | trgmNFA, | |
MemoryContext | rcontext | |||
) | [static] |
Definition at line 1610 of file trgm_regexp.c.
References ARRKEY, trgm_mb_char::bytes, CALCGTSIZE, COLOR_BLANK, TrgmNFA::colorInfo, ColorTrgm::colors, TrgmNFA::colorTrgms, TrgmNFA::colorTrgmsCount, ColorTrgmInfo::ctrgm, ColorTrgmInfo::expanded, fillTrgm(), TRGM::flag, GETARR, i, MemoryContextAllocZero(), SET_VARSIZE, TrgmNFA::totalTrgmCount, TRGMHDRSIZE, TrgmColorInfo::wordChars, and TrgmColorInfo::wordCharsCount.
Referenced by createTrgmNFAInternal().
{ TRGM *trg; trgm *p; int i; TrgmColorInfo blankColor; trgm_mb_char blankChar; /* Set up "blank" color structure containing a single zero character */ memset(blankChar.bytes, 0, sizeof(blankChar.bytes)); blankColor.wordCharsCount = 1; blankColor.wordChars = &blankChar; /* Construct the trgm array */ trg = (TRGM *) MemoryContextAllocZero(rcontext, TRGMHDRSIZE + trgmNFA->totalTrgmCount * sizeof(trgm)); trg->flag = ARRKEY; SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, trgmNFA->totalTrgmCount)); p = GETARR(trg); for (i = 0; i < trgmNFA->colorTrgmsCount; i++) { ColorTrgmInfo *colorTrgm = &trgmNFA->colorTrgms[i]; TrgmColorInfo *c[3]; trgm_mb_char s[3]; int j, i1, i2, i3; /* Ignore any unexpanded trigrams ... */ if (!colorTrgm->expanded) continue; /* Get colors, substituting the dummy struct for COLOR_BLANK */ for (j = 0; j < 3; j++) { if (colorTrgm->ctrgm.colors[j] != COLOR_BLANK) c[j] = &trgmNFA->colorInfo[colorTrgm->ctrgm.colors[j]]; else c[j] = &blankColor; } /* Iterate over all possible combinations of colors' characters */ for (i1 = 0; i1 < c[0]->wordCharsCount; i1++) { s[0] = c[0]->wordChars[i1]; for (i2 = 0; i2 < c[1]->wordCharsCount; i2++) { s[1] = c[1]->wordChars[i2]; for (i3 = 0; i3 < c[2]->wordCharsCount; i3++) { s[2] = c[2]->wordChars[i3]; fillTrgm(p, s); p++; } } } } return trg; }
static void fillTrgm | ( | trgm * | ptrgm, | |
trgm_mb_char | s[3] | |||
) | [static] |
Definition at line 1678 of file trgm_regexp.c.
References trgm_mb_char::bytes, compact_trigram(), i, and MAX_MULTIBYTE_CHAR_LEN.
Referenced by expandColorTrigrams().
{ char str[3 * MAX_MULTIBYTE_CHAR_LEN], *p; int i, j; /* Write multibyte string into "str" (we don't need null termination) */ p = str; for (i = 0; i < 3; i++) { if (s[i].bytes[0] != 0) { for (j = 0; j < MAX_MULTIBYTE_CHAR_LEN && s[i].bytes[j]; j++) *p++ = s[i].bytes[j]; } else { /* Emit a space in place of COLOR_BLANK */ *p++ = ' '; } } /* Convert "str" to a standard trigram (possibly hashing it) */ compact_trigram(ptrgm, str, p - str); }
Definition at line 741 of file trgm_regexp.c.
References trgm_mb_char::bytes, chars, COLOR_COUNT_LIMIT, TrgmNFA::colorInfo, TrgmColorInfo::containsNonWord, convertPgWchar(), TrgmColorInfo::expandable, i, ISWORDCHR, TrgmNFA::ncolors, palloc(), palloc0(), pfree(), pg_reg_getcharacters(), pg_reg_getnumcharacters(), pg_reg_getnumcolors(), TrgmColorInfo::wordChars, and TrgmColorInfo::wordCharsCount.
Referenced by createTrgmNFAInternal().
{ int colorsCount = pg_reg_getnumcolors(regex); int i; trgmNFA->ncolors = colorsCount; trgmNFA->colorInfo = (TrgmColorInfo *) palloc0(colorsCount * sizeof(TrgmColorInfo)); /* * Loop over colors, filling TrgmColorInfo about each. */ for (i = 0; i < colorsCount; i++) { TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[i]; int charsCount = pg_reg_getnumcharacters(regex, i); pg_wchar *chars; int j; if (charsCount < 0 || charsCount > COLOR_COUNT_LIMIT) { /* Non expandable, or too large to work with */ colorInfo->expandable = false; continue; } colorInfo->expandable = true; colorInfo->containsNonWord = false; colorInfo->wordChars = (trgm_mb_char *) palloc(sizeof(trgm_mb_char) * charsCount); colorInfo->wordCharsCount = 0; /* Extract all the chars in this color */ chars = (pg_wchar *) palloc(sizeof(pg_wchar) * charsCount); pg_reg_getcharacters(regex, i, chars, charsCount); /* * Convert characters back to multibyte form, and save only those that * are word characters. Set "containsNonWord" if any non-word * character. (Note: it'd probably be nicer to keep the chars in * pg_wchar format for now, but ISWORDCHR wants to see multibyte.) */ for (j = 0; j < charsCount; j++) { trgm_mb_char c; if (!convertPgWchar(chars[j], &c)) continue; /* ok to ignore it altogether */ if (ISWORDCHR(c.bytes)) colorInfo->wordChars[colorInfo->wordCharsCount++] = c; else colorInfo->containsNonWord = true; } pfree(chars); } }
static TrgmState* getState | ( | TrgmNFA * | trgmNFA, | |
TrgmStateKey * | key | |||
) | [static] |
Definition at line 1347 of file trgm_regexp.c.
References TrgmState::arcs, TrgmState::children, TrgmState::enterKeys, TrgmState::fin, HASH_ENTER, hash_search(), TrgmState::init, lappend(), TrgmState::number, TrgmState::parent, TrgmNFA::queue, and TrgmNFA::states.
{ TrgmState *state; bool found; state = (TrgmState *) hash_search(trgmNFA->states, key, HASH_ENTER, &found); if (!found) { /* New state: initialize and queue it */ state->arcs = NIL; state->enterKeys = NIL; state->init = false; state->fin = false; state->parent = NULL; state->children = NIL; state->number = -1; trgmNFA->queue = lappend(trgmNFA->queue, state); } return state; }
Definition at line 1710 of file trgm_regexp.c.
References Assert, TrgmState::children, TrgmState::fin, TrgmState::init, lappend(), lfirst, list_concat(), and TrgmState::parent.
Referenced by selectColorTrigrams().
{ ListCell *cell; Assert(state1 != state2); Assert(!state1->parent); Assert(!state2->parent); /* state1 absorbs state2's init/fin flags */ state1->init |= state2->init; state1->fin |= state2->fin; /* state2, and all its children, become children of state1 */ foreach(cell, state2->children) { TrgmState *state = (TrgmState *) lfirst(cell); state->parent = state1; } state2->parent = state1; state1->children = list_concat(state1->children, state2->children); state1->children = lappend(state1->children, state2); state2->children = NIL; }
static int packArcInfoCmp | ( | const void * | a1, | |
const void * | a2 | |||
) | [static] |
Definition at line 1932 of file trgm_regexp.c.
References TrgmPackArcInfo::colorTrgm, TrgmPackArcInfo::sourceState, and TrgmPackArcInfo::targetState.
Referenced by packGraph().
{ const TrgmPackArcInfo *p1 = (const TrgmPackArcInfo *) a1; const TrgmPackArcInfo *p2 = (const TrgmPackArcInfo *) a2; if (p1->sourceState < p2->sourceState) return -1; if (p1->sourceState > p2->sourceState) return 1; if (p1->colorTrgm < p2->colorTrgm) return -1; if (p1->colorTrgm > p2->colorTrgm) return 1; if (p1->targetState < p2->targetState) return -1; if (p1->targetState > p2->targetState) return 1; return 0; }
static TrgmPackedGraph * packGraph | ( | TrgmNFA * | trgmNFA, | |
MemoryContext | rcontext | |||
) | [static] |
Definition at line 1777 of file trgm_regexp.c.
References TrgmPackedState::arcs, TrgmState::arcs, TrgmPackedState::arcsCount, TrgmNFA::arcsCount, Assert, TrgmPackedArc::colorTrgm, TrgmPackArcInfo::colorTrgm, TrgmNFA::colorTrgms, TrgmNFA::colorTrgmsCount, TrgmPackedGraph::colorTrigramGroups, TrgmPackedGraph::colorTrigramsActive, TrgmPackedGraph::colorTrigramsCount, ColorTrgmInfo::count, TrgmArc::ctrgm, ColorTrgmInfo::expanded, TrgmState::fin, hash_seq_init(), hash_seq_search(), i, TrgmState::init, lfirst, MemoryContextAlloc(), NULL, ColorTrgmInfo::number, TrgmState::number, packArcInfoCmp(), palloc(), TrgmState::parent, qsort, TrgmPackArcInfo::sourceState, TrgmPackedGraph::states, TrgmNFA::states, TrgmPackedGraph::statesActive, TrgmPackedGraph::statesCount, TrgmPackedGraph::statesQueue, TrgmArc::target, TrgmPackedArc::targetState, and TrgmPackArcInfo::targetState.
Referenced by createTrgmNFAInternal().
{ int number = 2, arcIndex, arcsCount; HASH_SEQ_STATUS scan_status; TrgmState *state; TrgmPackArcInfo *arcs, *p1, *p2; TrgmPackedArc *packedArcs; TrgmPackedGraph *result; int i, j; /* Enumerate surviving states, giving init and fin reserved numbers */ hash_seq_init(&scan_status, trgmNFA->states); while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL) { while (state->parent) state = state->parent; if (state->number < 0) { if (state->init) state->number = 0; else if (state->fin) state->number = 1; else { state->number = number; number++; } } } /* Collect array of all arcs */ arcs = (TrgmPackArcInfo *) palloc(sizeof(TrgmPackArcInfo) * trgmNFA->arcsCount); arcIndex = 0; hash_seq_init(&scan_status, trgmNFA->states); while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL) { TrgmState *source = state; ListCell *cell; while (source->parent) source = source->parent; foreach(cell, state->arcs) { TrgmArc *arc = (TrgmArc *) lfirst(cell); TrgmState *target = arc->target; while (target->parent) target = target->parent; if (source->number != target->number) { ColorTrgmInfo *ctrgm; ctrgm = (ColorTrgmInfo *) bsearch(&arc->ctrgm, trgmNFA->colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo), colorTrgmInfoCmp); Assert(ctrgm != NULL); Assert(ctrgm->expanded); arcs[arcIndex].sourceState = source->number; arcs[arcIndex].targetState = target->number; arcs[arcIndex].colorTrgm = ctrgm->number; arcIndex++; } } } /* Sort arcs to ease duplicate detection */ qsort(arcs, arcIndex, sizeof(TrgmPackArcInfo), packArcInfoCmp); /* We could have duplicates because states were merged. Remove them. */ /* p1 is probe point, p2 is last known non-duplicate. */ p2 = arcs; for (p1 = arcs + 1; p1 < arcs + arcIndex; p1++) { if (packArcInfoCmp(p1, p2) > 0) { p2++; *p2 = *p1; } } arcsCount = (p2 - arcs) + 1; /* Create packed representation */ result = (TrgmPackedGraph *) MemoryContextAlloc(rcontext, sizeof(TrgmPackedGraph)); /* Pack color trigrams information */ result->colorTrigramsCount = 0; for (i = 0; i < trgmNFA->colorTrgmsCount; i++) { if (trgmNFA->colorTrgms[i].expanded) result->colorTrigramsCount++; } result->colorTrigramGroups = (int *) MemoryContextAlloc(rcontext, sizeof(int) * result->colorTrigramsCount); j = 0; for (i = 0; i < trgmNFA->colorTrgmsCount; i++) { if (trgmNFA->colorTrgms[i].expanded) { result->colorTrigramGroups[j] = trgmNFA->colorTrgms[i].count; j++; } } /* Pack states and arcs information */ result->statesCount = number; result->states = (TrgmPackedState *) MemoryContextAlloc(rcontext, number * sizeof(TrgmPackedState)); packedArcs = (TrgmPackedArc *) MemoryContextAlloc(rcontext, arcsCount * sizeof(TrgmPackedArc)); j = 0; for (i = 0; i < number; i++) { int cnt = 0; result->states[i].arcs = &packedArcs[j]; while (j < arcsCount && arcs[j].sourceState == i) { packedArcs[j].targetState = arcs[j].targetState; packedArcs[j].colorTrgm = arcs[j].colorTrgm; cnt++; j++; } result->states[i].arcsCount = cnt; } /* Allocate working memory for trigramsMatchGraph() */ result->colorTrigramsActive = (bool *) MemoryContextAlloc(rcontext, sizeof(bool) * result->colorTrigramsCount); result->statesActive = (bool *) MemoryContextAlloc(rcontext, sizeof(bool) * result->statesCount); result->statesQueue = (int *) MemoryContextAlloc(rcontext, sizeof(int) * result->statesCount); return result; }
static bool prefixContains | ( | TrgmPrefix * | prefix1, | |
TrgmPrefix * | prefix2 | |||
) | [static] |
Definition at line 1377 of file trgm_regexp.c.
References COLOR_UNKNOWN, and TrgmPrefix::colors.
Referenced by addArc(), and addKey().
{ if (prefix1->colors[1] == COLOR_UNKNOWN) { /* Fully ambiguous prefix contains everything */ return true; } else if (prefix1->colors[0] == COLOR_UNKNOWN) { /* * Prefix with only first unknown color contains every prefix with * same second color. */ if (prefix1->colors[1] == prefix2->colors[1]) return true; else return false; } else { /* Exact prefix contains only the exact same prefix */ if (prefix1->colors[0] == prefix2->colors[0] && prefix1->colors[1] == prefix2->colors[1]) return true; else return false; } }
Definition at line 932 of file trgm_regexp.c.
References addArcs(), addKey(), TrgmState::fin, TrgmNFA::keysQueue, linitial, list_delete_first(), NIL, and TrgmState::stateKey.
Referenced by transformGraph().
{ /* keysQueue should be NIL already, but make sure */ trgmNFA->keysQueue = NIL; /* * Add state's own key, and then process all keys added to keysQueue until * queue is empty. But we can quit if the state gets marked final. */ addKey(trgmNFA, state, &state->stateKey); while (trgmNFA->keysQueue != NIL && !state->fin) { TrgmStateKey *key = (TrgmStateKey *) linitial(trgmNFA->keysQueue); trgmNFA->keysQueue = list_delete_first(trgmNFA->keysQueue); addKey(trgmNFA, state, key); } /* * Add outgoing arcs only if state isn't final (we have no interest in * outgoing arcs if we already match) */ if (!state->fin) addArcs(trgmNFA, state); }
Definition at line 697 of file trgm_regexp.c.
References ereport, errcode(), errmsg(), ERROR, palloc(), pfree(), pg_mb2wchar_with_len(), pg_regcomp(), pg_regerror(), REG_OKAY, VARDATA_ANY, and VARSIZE_ANY_EXHDR.
Referenced by createTrgmNFA().
{ int text_re_len = VARSIZE_ANY_EXHDR(text_re); char *text_re_val = VARDATA_ANY(text_re); pg_wchar *pattern; int pattern_len; int regcomp_result; char errMsg[100]; /* Convert pattern string to wide characters */ pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar)); pattern_len = pg_mb2wchar_with_len(text_re_val, pattern, text_re_len); /* Compile regex */ regcomp_result = pg_regcomp(regex, pattern, pattern_len, cflags, collation); pfree(pattern); if (regcomp_result != REG_OKAY) { /* re didn't compile (no need for pg_regfree, if so) */ pg_regerror(regcomp_result, regex, errMsg, sizeof(errMsg)); ereport(ERROR, (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), errmsg("invalid regular expression: %s", errMsg))); } }
Definition at line 1419 of file trgm_regexp.c.
References ColorTrgmInfo::arcs, TrgmState::arcs, TrgmNFA::arcsCount, Assert, COLOR_BLANK, TrgmNFA::colorInfo, ColorTrgm::colors, colorTrgmInfoCmp(), colorTrgmInfoCountCmp(), TrgmNFA::colorTrgms, TrgmNFA::colorTrgmsCount, ColorTrgmInfo::count, TrgmArc::ctrgm, ColorTrgmInfo::ctrgm, ColorTrgmInfo::expanded, TrgmState::fin, hash_seq_init(), hash_seq_search(), i, TrgmState::init, lfirst, list_concat(), list_make1, MAX_TRGM_COUNT, mergeStates(), NULL, ColorTrgmInfo::number, palloc(), TrgmState::parent, qsort, TrgmArcInfo::source, TrgmNFA::states, TrgmArc::target, TrgmArcInfo::target, TrgmNFA::totalTrgmCount, and TrgmColorInfo::wordCharsCount.
Referenced by createTrgmNFAInternal().
{ HASH_SEQ_STATUS scan_status; int arcsCount = trgmNFA->arcsCount, i; TrgmState *state; ColorTrgmInfo *colorTrgms; int64 totalTrgmCount; int number; /* Collect color trigrams from all arcs */ colorTrgms = (ColorTrgmInfo *) palloc(sizeof(ColorTrgmInfo) * arcsCount); trgmNFA->colorTrgms = colorTrgms; i = 0; hash_seq_init(&scan_status, trgmNFA->states); while ((state = (TrgmState *) hash_seq_search(&scan_status)) != NULL) { ListCell *cell; foreach(cell, state->arcs) { TrgmArc *arc = (TrgmArc *) lfirst(cell); TrgmArcInfo *arcInfo = (TrgmArcInfo *) palloc(sizeof(TrgmArcInfo)); arcInfo->source = state; arcInfo->target = arc->target; colorTrgms[i].arcs = list_make1(arcInfo); colorTrgms[i].expanded = true; colorTrgms[i].ctrgm = arc->ctrgm; i++; } } Assert(i == arcsCount); /* Remove duplicates, merging their arcs lists */ if (arcsCount >= 2) { ColorTrgmInfo *p1, *p2; /* Sort trigrams to ease duplicate detection */ qsort(colorTrgms, arcsCount, sizeof(ColorTrgmInfo), colorTrgmInfoCmp); /* p1 is probe point, p2 is last known non-duplicate. */ p2 = colorTrgms; for (p1 = colorTrgms + 1; p1 < colorTrgms + arcsCount; p1++) { if (colorTrgmInfoCmp(p1, p2) > 0) { p2++; *p2 = *p1; } else { p2->arcs = list_concat(p2->arcs, p1->arcs); } } trgmNFA->colorTrgmsCount = (p2 - colorTrgms) + 1; } else { trgmNFA->colorTrgmsCount = arcsCount; } /* * Count number of simple trigrams generated by each color trigram. * * Note: per-color-trigram counts cannot overflow an int so long as * COLOR_COUNT_LIMIT is not more than the cube root of INT_MAX, ie about * 1290. However, the grand total totalTrgmCount might conceivably * overflow an int, so we use int64 for that within this routine. */ totalTrgmCount = 0; for (i = 0; i < trgmNFA->colorTrgmsCount; i++) { ColorTrgmInfo *trgmInfo = &colorTrgms[i]; int j, count = 1; for (j = 0; j < 3; j++) { TrgmColor c = trgmInfo->ctrgm.colors[j]; if (c != COLOR_BLANK) count *= trgmNFA->colorInfo[c].wordCharsCount; } trgmInfo->count = count; totalTrgmCount += count; } /* Sort color trigrams in descending order of simple trigram counts */ qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo), colorTrgmInfoCountCmp); /* * Remove color trigrams from the graph so long as total number of simple * trigrams exceeds MAX_TRGM_COUNT. We prefer to remove color trigrams * with the most associated simple trigrams, since those are the most * promising for reducing the total number of simple trigrams. When * removing a color trigram we have to merge states connected by arcs * labeled with that trigram. It's necessary to not merge initial and * final states, because our graph becomes useless if that happens; so we * cannot always remove the trigram we'd prefer to. */ for (i = 0; (i < trgmNFA->colorTrgmsCount) && (totalTrgmCount > MAX_TRGM_COUNT); i++) { ColorTrgmInfo *trgmInfo = &colorTrgms[i]; bool canRemove = true; ListCell *cell; /* * Does any arc of this color trigram connect initial and final * states? If so we can't remove it. */ foreach(cell, trgmInfo->arcs) { TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell); TrgmState *source = arcInfo->source, *target = arcInfo->target; /* examine parent states, if any merging has already happened */ while (source->parent) source = source->parent; while (target->parent) target = target->parent; if ((source->init || target->init) && (source->fin || target->fin)) { canRemove = false; break; } } if (!canRemove) continue; /* OK, merge states linked by each arc labeled by the trigram */ foreach(cell, trgmInfo->arcs) { TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell); TrgmState *source = arcInfo->source, *target = arcInfo->target; while (source->parent) source = source->parent; while (target->parent) target = target->parent; if (source != target) mergeStates(source, target); } /* Mark trigram unexpanded, and update totalTrgmCount */ trgmInfo->expanded = false; totalTrgmCount -= trgmInfo->count; } /* Did we succeed in fitting into MAX_TRGM_COUNT? */ if (totalTrgmCount > MAX_TRGM_COUNT) return false; trgmNFA->totalTrgmCount = (int) totalTrgmCount; /* * Sort color trigrams by colors (will be useful for bsearch in packGraph) * and enumerate the color trigrams that are expanded. */ number = 0; qsort(colorTrgms, trgmNFA->colorTrgmsCount, sizeof(ColorTrgmInfo), colorTrgmInfoCmp); for (i = 0; i < trgmNFA->colorTrgmsCount; i++) { if (colorTrgms[i].expanded) { colorTrgms[i].number = number; number++; } } return true; }
static void transformGraph | ( | TrgmNFA * | trgmNFA | ) | [static] |
Definition at line 870 of file trgm_regexp.c.
References TrgmNFA::arcsCount, TrgmPrefix::colors, CurrentMemoryContext, HASHCTL::entrysize, TrgmState::fin, HASHCTL::hash, HASH_CONTEXT, hash_create(), HASH_ELEM, HASH_FUNCTION, hash_get_num_entries(), HASHCTL::hcxt, TrgmState::init, TrgmNFA::initState, HASHCTL::keysize, TrgmNFA::keysQueue, linitial, list_delete_first(), MAX_EXPANDED_ARCS, MAX_EXPANDED_STATES, MemSet, NIL, TrgmStateKey::nstate, TrgmNFA::overflowed, pg_reg_getinitialstate(), TrgmStateKey::prefix, processState(), TrgmNFA::queue, TrgmNFA::regex, and TrgmNFA::states.
Referenced by createTrgmNFAInternal().
{ HASHCTL hashCtl; TrgmStateKey initkey; TrgmState *initstate; /* Initialize this stage's workspace in trgmNFA struct */ trgmNFA->queue = NIL; trgmNFA->keysQueue = NIL; trgmNFA->arcsCount = 0; trgmNFA->overflowed = false; /* Create hashtable for states */ hashCtl.keysize = sizeof(TrgmStateKey); hashCtl.entrysize = sizeof(TrgmState); hashCtl.hcxt = CurrentMemoryContext; hashCtl.hash = tag_hash; trgmNFA->states = hash_create("Trigram NFA", 1024, &hashCtl, HASH_ELEM | HASH_CONTEXT | HASH_FUNCTION); /* Create initial state: ambiguous prefix, NFA's initial state */ MemSet(&initkey, 0, sizeof(initkey)); initkey.prefix.colors[0] = COLOR_UNKNOWN; initkey.prefix.colors[1] = COLOR_UNKNOWN; initkey.nstate = pg_reg_getinitialstate(trgmNFA->regex); initstate = getState(trgmNFA, &initkey); initstate->init = true; trgmNFA->initState = initstate; /* * Recursively build the expanded graph by processing queue of states * (breadth-first search). getState already put initstate in the queue. */ while (trgmNFA->queue != NIL) { TrgmState *state = (TrgmState *) linitial(trgmNFA->queue); trgmNFA->queue = list_delete_first(trgmNFA->queue); /* * If we overflowed then just mark state as final. Otherwise do * actual processing. */ if (trgmNFA->overflowed) state->fin = true; else processState(trgmNFA, state); /* Did we overflow? */ if (trgmNFA->arcsCount > MAX_EXPANDED_ARCS || hash_get_num_entries(trgmNFA->states) > MAX_EXPANDED_STATES) trgmNFA->overflowed = true; } }
bool trigramsMatchGraph | ( | TrgmPackedGraph * | graph, | |
bool * | check | |||
) |
Definition at line 604 of file trgm_regexp.c.
References TrgmPackedState::arcs, TrgmPackedState::arcsCount, TrgmPackedArc::colorTrgm, TrgmPackedGraph::colorTrigramGroups, TrgmPackedGraph::colorTrigramsActive, TrgmPackedGraph::colorTrigramsCount, i, TrgmPackedGraph::states, TrgmPackedGraph::statesActive, TrgmPackedGraph::statesCount, TrgmPackedGraph::statesQueue, and TrgmPackedArc::targetState.
Referenced by gin_trgm_consistent(), and gtrgm_consistent().
{ int i, j, k, queueIn, queueOut; /* * Reset temporary working areas. */ memset(graph->colorTrigramsActive, 0, sizeof(bool) * graph->colorTrigramsCount); memset(graph->statesActive, 0, sizeof(bool) * graph->statesCount); /* * Check which color trigrams were matched. A match for any simple * trigram associated with a color trigram counts as a match of the color * trigram. */ j = 0; for (i = 0; i < graph->colorTrigramsCount; i++) { int cnt = graph->colorTrigramGroups[i]; for (k = j; k < j + cnt; k++) { if (check[k]) { /* * Found one matched trigram in the group. Can skip the rest * of them and go to the next group. */ graph->colorTrigramsActive[i] = true; break; } } j = j + cnt; } /* * Initialize the statesQueue to hold just the initial state. Note: * statesQueue has room for statesCount entries, which is certainly enough * since no state will be put in the queue more than once. The * statesActive array marks which states have been queued. */ graph->statesActive[0] = true; graph->statesQueue[0] = 0; queueIn = 0; queueOut = 1; /* Process queued states as long as there are any. */ while (queueIn < queueOut) { int stateno = graph->statesQueue[queueIn++]; TrgmPackedState *state = &graph->states[stateno]; int cnt = state->arcsCount; /* Loop over state's out-arcs */ for (i = 0; i < cnt; i++) { TrgmPackedArc *arc = &state->arcs[i]; /* * If corresponding color trigram is present then activate the * corresponding state. We're done if that's the final state, * otherwise queue the state if it's not been queued already. */ if (graph->colorTrigramsActive[arc->colorTrgm]) { int nextstate = arc->targetState; if (nextstate == 1) return true; /* success: final state is reachable */ if (!graph->statesActive[nextstate]) { graph->statesActive[nextstate] = true; graph->statesQueue[queueOut++] = nextstate; } } } } /* Queue is empty, so match fails. */ return false; }
static bool validArcLabel | ( | TrgmStateKey * | key, | |
TrgmColor | co | |||
) | [static] |
Definition at line 1292 of file trgm_regexp.c.
References Assert, COLOR_BLANK, COLOR_UNKNOWN, TrgmPrefix::colors, and TrgmStateKey::prefix.
Referenced by addArc(), and addKey().
{ /* * We have to know full trigram in order to add outgoing arc. So we can't * do it if prefix is ambiguous. */ if (key->prefix.colors[0] == COLOR_UNKNOWN) return false; /* If key->prefix.colors[0] isn't unknown, its second color isn't either */ Assert(key->prefix.colors[1] != COLOR_UNKNOWN); /* And we should not be called with an unknown arc color anytime */ Assert(co != COLOR_UNKNOWN); /* * We don't bother with making arcs representing three non-word * characters, since that's useless for trigram extraction. */ if (key->prefix.colors[0] == COLOR_BLANK && key->prefix.colors[1] == COLOR_BLANK && co == COLOR_BLANK) return false; /* * We also reject nonblank-blank-anything. The nonblank-blank-nonblank * case doesn't correspond to any trigram the trigram extraction code * would make. The nonblank-blank-blank case is also not possible with * RPADDING = 1. (Note that in many cases we'd fail to generate such a * trigram even if it were valid, for example processing "foo bar" will * not result in considering the trigram "o ". So if you want to support * RPADDING = 2, there's more to do than just twiddle this test.) */ if (key->prefix.colors[0] != COLOR_BLANK && key->prefix.colors[1] == COLOR_BLANK) return false; /* * Other combinations involving blank are valid, in particular we assume * blank-blank-nonblank is valid, which presumes that LPADDING is 2. * * Note: Using again the example "foo bar", we will not consider the * trigram " b", though this trigram would be found by the trigram * extraction code. Since we will find " ba", it doesn't seem worth * trying to hack the algorithm to generate the additional trigram. */ /* arc label is valid */ return true; }