#include "postgres.h"
#include <ctype.h>
#include "trgm.h"
#include "catalog/pg_type.h"
#include "tsearch/ts_locale.h"
Go to the source code of this file.
Functions | |
PG_FUNCTION_INFO_V1 (set_limit) | |
Datum | set_limit (PG_FUNCTION_ARGS) |
PG_FUNCTION_INFO_V1 (show_limit) | |
Datum | show_limit (PG_FUNCTION_ARGS) |
PG_FUNCTION_INFO_V1 (show_trgm) | |
Datum | show_trgm (PG_FUNCTION_ARGS) |
PG_FUNCTION_INFO_V1 (similarity) | |
Datum | similarity (PG_FUNCTION_ARGS) |
PG_FUNCTION_INFO_V1 (similarity_dist) | |
Datum | similarity_dist (PG_FUNCTION_ARGS) |
PG_FUNCTION_INFO_V1 (similarity_op) | |
Datum | similarity_op (PG_FUNCTION_ARGS) |
static int | comp_trgm (const void *a, const void *b) |
static int | unique_array (trgm *a, int len) |
static char * | find_word (char *str, int lenstr, char **endword, int *charlen) |
void | compact_trigram (trgm *tptr, char *str, int bytelen) |
static trgm * | make_trigrams (trgm *tptr, char *str, int bytelen, int charlen) |
TRGM * | generate_trgm (char *str, int slen) |
static const char * | get_wildcard_part (const char *str, int lenstr, char *buf, int *bytelen, int *charlen) |
TRGM * | generate_wildcard_trgm (const char *str, int slen) |
uint32 | trgm2int (trgm *ptr) |
float4 | cnt_sml (TRGM *trg1, TRGM *trg2) |
bool | trgm_contained_by (TRGM *trg1, TRGM *trg2) |
bool * | trgm_presence_map (TRGM *query, TRGM *key) |
Variables | |
PG_MODULE_MAGIC | |
float4 | trgm_limit = 0.3f |
float4 cnt_sml (TRGM *trg1, TRGM *trg2)
Definition at line 539 of file trgm_op.c.
References ARRNELEM, CMPTRGM, and GETARR.
Referenced by gtrgm_consistent(), gtrgm_distance(), and similarity().
{
    /*
     * Similarity of two trigram arrays.  Both arrays are walked in step
     * using CMPTRGM ordering (the walk only pairs up equal trigrams when
     * the arrays are sorted that way, as generate_trgm() leaves them), the
     * trigrams present in both are counted, and the count is scaled either
     * by the size of the union (DIVUNION) or by the longer array.
     */
    trgm       *left = GETARR(trg1);
    trgm       *right = GETARR(trg2);
    const int   nleft = ARRNELEM(trg1);
    const int   nright = ARRNELEM(trg2);
    trgm       *left_end;
    trgm       *right_end;
    int         shared = 0;

    /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
    if (nleft <= 0 || nright <= 0)
        return (float4) 0.0;

    left_end = left + nleft;
    right_end = right + nright;

    while (left < left_end && right < right_end)
    {
        int         cmp = CMPTRGM(left, right);

        if (cmp == 0)
        {
            /* trigram present on both sides */
            shared++;
            left++;
            right++;
        }
        else if (cmp < 0)
            left++;
        else
            right++;
    }

#ifdef DIVUNION
    return ((float4) shared) / ((float4) (nleft + nright - shared));
#else
    return ((float4) shared) / ((float4) Max(nleft, nright));
#endif
}
static int comp_trgm | ( | const void * | a, | |
const void * | b | |||
) | [static] |
Definition at line 55 of file trgm_op.c.
References CMPTRGM.
Referenced by generate_trgm(), and generate_wildcard_trgm().
{
    /* qsort() comparator: the ordering is delegated entirely to CMPTRGM */
    int         order = CMPTRGM(a, b);

    return order;
}
void compact_trigram | ( | trgm * | tptr, | |
char * | str, | |||
int | bytelen | |||
) |
Definition at line 112 of file trgm_op.c.
References COMP_CRC32, CPTRGM, FIN_CRC32, and INIT_CRC32.
Referenced by fillTrgm(), and make_trigrams().
{
    pg_crc32    crc;

    /* Exactly three bytes: the input is stored as the trigram unchanged */
    if (bytelen == 3)
    {
        CPTRGM(tptr, str);
        return;
    }

    /*
     * Any other byte length is reduced to trigram size by running the
     * bytelen input bytes through CRC32.
     */
    INIT_CRC32(crc);
    COMP_CRC32(crc, str, bytelen);
    FIN_CRC32(crc);

    /*
     * Only three bytes of the crc are kept (whatever CPTRGM copies from
     * its address); hope, it's good enough hashing.
     */
    CPTRGM(tptr, &crc);
}
static char* find_word | ( | char * | str, | |
int | lenstr, | |||
char ** | endword, | |||
int * | charlen | |||
) | [static] |
Definition at line 85 of file trgm_op.c.
References ISWORDCHR, and pg_mblen().
Referenced by generate_trgm().
{
    /*
     * Locate the next word within the first lenstr bytes of str.
     *
     * Returns a pointer to the first word character, or NULL when none is
     * found (in which case *endword and *charlen are left untouched).  On
     * success *endword points just past the word and *charlen holds the
     * number of characters (not bytes) in it.
     */
    char       *limit = str + lenstr;
    char       *start = str;
    char       *stop;
    int         nchars = 0;

    /* skip everything that is not a word character */
    while (start < limit && !ISWORDCHR(start))
        start += pg_mblen(start);

    if (start >= limit)
        return NULL;

    /* consume the word one (possibly multibyte) character at a time */
    stop = start;
    while (stop < limit && ISWORDCHR(stop))
    {
        stop += pg_mblen(stop);
        nchars++;
    }

    *endword = stop;
    *charlen = nchars;

    return start;
}
TRGM* generate_trgm | ( | char * | str, | |
int | slen | |||
) |
Definition at line 180 of file trgm_op.c.
References ARRKEY, buf, CALCGTSIZE, comp_trgm(), find_word(), TRGM::flag, GETARR, lowerstr_with_len(), LPADDING, make_trigrams(), palloc(), pfree(), qsort, RPADDING, SET_VARSIZE, TRGMHDRSIZE, and unique_array().
Referenced by gin_extract_query_trgm(), gin_extract_value_trgm(), gtrgm_compress(), gtrgm_consistent(), gtrgm_distance(), show_trgm(), and similarity().
{
    /*
     * Build the trigram array for the slen bytes at str.
     *
     * Each word found by find_word() is copied into a scratch buffer
     * between LPADDING leading and RPADDING trailing blanks (lower-cased
     * first when IGNORECASE is defined) and handed to make_trigrams().
     * The collected trigrams are then sorted with comp_trgm and passed
     * through unique_array().
     *
     * Returns a palloc'd TRGM flagged ARRKEY; it contains no trigrams when
     * the input is empty or yields none.
     */
    TRGM       *trg;
    char       *buf;
    trgm       *tptr;
    int         len,
                charlen,
                bytelen;
    char       *bword,
               *eword;

    trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
    trg->flag = ARRKEY;
    SET_VARSIZE(trg, TRGMHDRSIZE);

    if (slen + LPADDING + RPADDING < 3 || slen == 0)
        return trg;

    tptr = GETARR(trg);

    /*
     * Scratch buffer for one blank-padded word.  With IGNORECASE the bytes
     * copied in come from lowerstr_with_len() and are measured with
     * strlen(), so their length is not bounded by slen: case folding may
     * produce a longer byte sequence than the original word.  Size the
     * buffer for slen characters of the widest encoding width (assuming
     * folding maps each character to one character) plus 4 bytes of
     * padding, instead of slen + 4, so the memcpy below cannot overrun it.
     * The arithmetic is done in Size so it cannot wrap in int; palloc
     * rejects oversized requests.
     */
    buf = (char *) palloc((Size) slen * pg_database_encoding_max_length() + 4);

    if (LPADDING > 0)
    {
        *buf = ' ';
        if (LPADDING > 1)
            *(buf + 1) = ' ';
    }

    eword = str;
    while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
    {
#ifdef IGNORECASE
        bword = lowerstr_with_len(bword, eword - bword);
        bytelen = strlen(bword);
#else
        bytelen = eword - bword;
#endif

        memcpy(buf + LPADDING, bword, bytelen);

#ifdef IGNORECASE
        pfree(bword);
#endif
        /* two trailing blanks are always written; RPADDING decides how many count */
        buf[LPADDING + bytelen] = ' ';
        buf[LPADDING + bytelen + 1] = ' ';

        /*
         * count trigrams
         */
        tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
                             charlen + LPADDING + RPADDING);
    }

    pfree(buf);

    if ((len = tptr - GETARR(trg)) == 0)
        return trg;

    if (len > 0)
    {
        qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
        len = unique_array(GETARR(trg), len);
    }

    SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));

    return trg;
}
TRGM* generate_wildcard_trgm | ( | const char * | str, | |
int | slen | |||
) |
Definition at line 411 of file trgm_op.c.
References ARRKEY, buf, CALCGTSIZE, comp_trgm(), TRGM::flag, get_wildcard_part(), GETARR, lowerstr_with_len(), LPADDING, make_trigrams(), NULL, palloc(), pfree(), qsort, RPADDING, SET_VARSIZE, TRGMHDRSIZE, and unique_array().
Referenced by gin_extract_query_trgm(), and gtrgm_consistent().
{
    /*
     * Build the trigram array for a wildcard pattern of slen bytes.
     *
     * get_wildcard_part() is called repeatedly; each part it writes into
     * the scratch buffer is (optionally lower-cased and) passed to
     * make_trigrams().  The result is sorted with comp_trgm and run through
     * unique_array(), and returned as a palloc'd TRGM flagged ARRKEY.
     */
    TRGM       *result;
    char       *part;           /* scratch buffer filled by get_wildcard_part */
    char       *folded;         /* text actually fed to make_trigrams */
    trgm       *next;           /* next free slot in the trigram array */
    const char *cursor;
    int         ntrgm;
    int         nchars;
    int         nbytes;

    result = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
    result->flag = ARRKEY;
    SET_VARSIZE(result, TRGMHDRSIZE);

    if (slen + LPADDING + RPADDING < 3 || slen == 0)
        return result;

    next = GETARR(result);
    part = palloc(sizeof(char) * (slen + 4));

    /*
     * Extract trigrams from each substring extracted by get_wildcard_part.
     */
    cursor = str;
    for (;;)
    {
        cursor = get_wildcard_part(cursor, slen - (cursor - str),
                                   part, &nbytes, &nchars);
        if (cursor == NULL)
            break;

#ifdef IGNORECASE
        folded = lowerstr_with_len(part, nbytes);
        nbytes = strlen(folded);
#else
        folded = part;
#endif

        /*
         * count trigrams
         */
        next = make_trigrams(next, folded, nbytes, nchars);

#ifdef IGNORECASE
        pfree(folded);
#endif
    }

    pfree(part);

    ntrgm = next - GETARR(result);
    if (ntrgm == 0)
        return result;

    /*
     * Make trigrams unique.
     */
    if (ntrgm > 0)
    {
        qsort((void *) GETARR(result), ntrgm, sizeof(trgm), comp_trgm);
        ntrgm = unique_array(GETARR(result), ntrgm);
    }

    SET_VARSIZE(result, CALCGTSIZE(ARRKEY, ntrgm));

    return result;
}
static const char* get_wildcard_part | ( | const char * | str, | |
int | lenstr, | |||
char * | buf, | |||
int * | bytelen, | |||
int * | charlen | |||
) | [static] |
Definition at line 266 of file trgm_op.c.
References ISESCAPECHAR, ISWILDCARDCHAR, ISWORDCHR, LPADDING, pg_mblen(), and RPADDING.
Referenced by generate_wildcard_trgm().
{
    /*
     * Copy the next run of word characters found in str (at most lenstr
     * bytes are examined) into buf, stripping escape characters.  Blank
     * padding is added on a side unless the run is adjacent to a wildcard
     * meta-character on that side.
     *
     * Returns the position at which scanning stopped, or NULL when no word
     * character is left; in the NULL case *bytelen and *charlen are not
     * written.  Otherwise *bytelen is the number of bytes stored in buf and
     * *charlen the number of characters, padding included.
     *
     * buf is written without any bounds check here, so the caller must
     * supply a buffer large enough for the run plus padding.
     */
    const char *beginword = str;
    const char *endword;
    char       *s = buf;        /* write position inside buf */
    bool        in_leading_wildcard_meta = false;
    bool        in_trailing_wildcard_meta = false;
    bool        in_escape = false;
    int         clen;           /* byte length of the current character */

    /*
     * Find the first word character, remembering whether preceding character
     * was wildcard meta-character.  Note that the in_escape state persists
     * from this loop to the next one, since we may exit at a word character
     * that is in_escape.
     */
    while (beginword - str < lenstr)
    {
        if (in_escape)
        {
            if (ISWORDCHR(beginword))
                break;
            /* an escaped non-word character is neither a wildcard nor a word start */
            in_escape = false;
            in_leading_wildcard_meta = false;
        }
        else
        {
            if (ISESCAPECHAR(beginword))
                in_escape = true;
            else if (ISWILDCARDCHAR(beginword))
                in_leading_wildcard_meta = true;
            else if (ISWORDCHR(beginword))
                break;
            else
                in_leading_wildcard_meta = false;
        }
        beginword += pg_mblen(beginword);
    }

    /*
     * Handle string end.
     */
    if (beginword - str >= lenstr)
        return NULL;

    /*
     * Add left padding spaces if preceding character wasn't wildcard
     * meta-character.
     */
    *charlen = 0;
    if (!in_leading_wildcard_meta)
    {
        if (LPADDING > 0)
        {
            *s++ = ' ';
            (*charlen)++;
            if (LPADDING > 1)
            {
                *s++ = ' ';
                (*charlen)++;
            }
        }
    }

    /*
     * Copy data into buf until wildcard meta-character, non-word character or
     * string boundary.  Strip escapes during copy.
     */
    endword = beginword;
    while (endword - str < lenstr)
    {
        clen = pg_mblen(endword);
        if (in_escape)
        {
            if (ISWORDCHR(endword))
            {
                memcpy(s, endword, clen);
                (*charlen)++;
                s += clen;
            }
            else
            {
                /*
                 * Back up endword to the escape character when stopping at
                 * an escaped char, so that subsequent get_wildcard_part will
                 * restart from the escape character.  We assume here that
                 * escape chars are single-byte.
                 */
                endword--;
                break;
            }
            in_escape = false;
        }
        else
        {
            if (ISESCAPECHAR(endword))
                in_escape = true;
            else if (ISWILDCARDCHAR(endword))
            {
                /* run ends at a wildcard: suppress the right padding */
                in_trailing_wildcard_meta = true;
                break;
            }
            else if (ISWORDCHR(endword))
            {
                memcpy(s, endword, clen);
                (*charlen)++;
                s += clen;
            }
            else
                break;
        }
        endword += clen;
    }

    /*
     * Add right padding spaces if next character isn't wildcard
     * meta-character.
     */
    if (!in_trailing_wildcard_meta)
    {
        if (RPADDING > 0)
        {
            *s++ = ' ';
            (*charlen)++;
            if (RPADDING > 1)
            {
                *s++ = ' ';
                (*charlen)++;
            }
        }
    }

    *bytelen = s - buf;
    return endword;
}
static trgm* make_trigrams (trgm *tptr, char *str, int bytelen, int charlen) [static]
Definition at line 137 of file trgm_op.c.
References Assert, compact_trigram(), CPTRGM, and pg_mblen().
Referenced by generate_trgm(), and generate_wildcard_trgm().
{
    /*
     * Append to tptr every trigram of the string str, which is bytelen
     * bytes and charlen characters long, and return the position just past
     * the last trigram written.  Nothing is written when the string has
     * fewer than three characters.
     */
    char       *cur = str;
    int         w_first;
    int         w_middle;
    int         w_last;

    if (charlen < 3)
        return tptr;

    if (bytelen <= charlen)
    {
        /* Fast path when there are no multibyte characters */
        Assert(bytelen == charlen);

        /* number of trigrams = strlen - 2 */
        for (; cur - str < bytelen - 2; cur++, tptr++)
            CPTRGM(tptr, cur);

        return tptr;
    }

    /*
     * Find multibyte character boundaries and apply compact_trigram.  The
     * byte widths of the three characters in the current window are kept in
     * w_first/w_middle/w_last and shifted along as the window advances.
     */
    w_first = pg_mblen(str);
    w_middle = pg_mblen(str + w_first);
    w_last = pg_mblen(str + w_first + w_middle);

    while ((cur - str) + w_first + w_middle + w_last <= bytelen)
    {
        compact_trigram(tptr, cur, w_first + w_middle + w_last);

        cur += w_first;
        tptr++;

        w_first = w_middle;
        w_middle = w_last;
        w_last = pg_mblen(cur + w_first + w_middle);
    }

    return tptr;
}
PG_FUNCTION_INFO_V1 | ( | show_trgm | ) |
PG_FUNCTION_INFO_V1 | ( | similarity_dist | ) |
PG_FUNCTION_INFO_V1 | ( | similarity | ) |
PG_FUNCTION_INFO_V1 | ( | similarity_op | ) |
PG_FUNCTION_INFO_V1 | ( | show_limit | ) |
PG_FUNCTION_INFO_V1 | ( | set_limit | ) |
Datum set_limit | ( | PG_FUNCTION_ARGS | ) |
Definition at line 38 of file trgm_op.c.
References elog, ERROR, PG_GETARG_FLOAT4, PG_RETURN_FLOAT4, and trgm_limit.
{
    /*
     * Set the threshold stored in trgm_limit to argument 0 and return the
     * new value.  Raises an ERROR when the argument is not within [0, 1].
     */
    float4      nlimit = PG_GETARG_FLOAT4(0);

    /*
     * The range test is written as a negated "in range" check so that NaN,
     * for which every ordered comparison is false, is rejected as well.  A
     * NaN threshold would make the ">= trgm_limit" test in similarity_op
     * false for every pair of inputs.
     */
    if (!(nlimit >= 0 && nlimit <= 1.0))
        elog(ERROR, "wrong limit, should be between 0 and 1");

    trgm_limit = nlimit;
    PG_RETURN_FLOAT4(trgm_limit);
}
Datum show_limit | ( | PG_FUNCTION_ARGS | ) |
Definition at line 49 of file trgm_op.c.
References PG_RETURN_FLOAT4, and trgm_limit.
{
    /* Report the threshold currently stored in trgm_limit */
    float4      current = trgm_limit;

    PG_RETURN_FLOAT4(current);
}
Datum show_trgm | ( | PG_FUNCTION_ARGS | ) |
Definition at line 490 of file trgm_op.c.
References ARRNELEM, construct_array(), CPTRGM, DatumGetPointer, generate_trgm(), GETARR, i, ISPRINTABLETRGM, Max, palloc(), pfree(), pg_database_encoding_max_length(), PG_FREE_IF_COPY, PG_GETARG_TEXT_P, PG_RETURN_POINTER, PointerGetDatum, SET_VARSIZE, snprintf(), TEXTOID, trgm2int(), VARDATA, VARHDRSZ, and VARSIZE.
{
    /*
     * Return the trigrams of the text argument as a text array.  A trigram
     * is emitted as its three raw bytes, except that in a multibyte
     * database encoding a trigram failing ISPRINTABLETRGM is emitted as
     * "0x" followed by its trgm2int() value in hexadecimal.
     */
    text       *in = PG_GETARG_TEXT_P(0);
    TRGM       *trg;
    Datum      *elems;
    ArrayType  *result;
    trgm       *cur;
    int         ntrgm;
    int         maxlen;
    int         i;

    trg = generate_trgm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
    ntrgm = ARRNELEM(trg);
    maxlen = pg_database_encoding_max_length();

    elems = (Datum *) palloc(sizeof(Datum) * (1 + ntrgm));

    cur = GETARR(trg);
    for (i = 0; i < ntrgm; i++)
    {
        text       *item = (text *) palloc(VARHDRSZ + Max(12, maxlen * 3));

        if (maxlen <= 1 || ISPRINTABLETRGM(cur))
        {
            /* store the three trigram bytes as they are */
            SET_VARSIZE(item, VARHDRSZ + 3);
            CPTRGM(VARDATA(item), cur);
        }
        else
        {
            /* hexadecimal rendering of the trigram value */
            snprintf(VARDATA(item), 12, "0x%06x", trgm2int(cur));
            SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
        }

        elems[i] = PointerGetDatum(item);
        cur++;
    }

    result = construct_array(elems, ntrgm, TEXTOID, -1, false, 'i');

    /* the per-item texts and the work arrays are released after construct_array */
    for (i = 0; i < ntrgm; i++)
        pfree(DatumGetPointer(elems[i]));

    pfree(elems);
    pfree(trg);
    PG_FREE_IF_COPY(in, 0);

    PG_RETURN_POINTER(result);
}
Datum similarity | ( | PG_FUNCTION_ARGS | ) |
Definition at line 664 of file trgm_op.c.
References cnt_sml(), generate_trgm(), pfree(), PG_FREE_IF_COPY, PG_GETARG_TEXT_P, PG_RETURN_FLOAT4, VARDATA, VARHDRSZ, and VARSIZE.
Referenced by similarity_dist(), and similarity_op().
{
    /*
     * Similarity of the two text arguments: each is turned into a trigram
     * array by generate_trgm() and the pair is scored by cnt_sml().
     */
    text       *in1 = PG_GETARG_TEXT_P(0);
    text       *in2 = PG_GETARG_TEXT_P(1);
    TRGM       *first = generate_trgm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
    TRGM       *second = generate_trgm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
    float4      score = cnt_sml(first, second);

    /* release the trigram arrays and any detoasted argument copies */
    pfree(first);
    pfree(second);
    PG_FREE_IF_COPY(in1, 0);
    PG_FREE_IF_COPY(in2, 1);

    PG_RETURN_FLOAT4(score);
}
Datum similarity_dist | ( | PG_FUNCTION_ARGS | ) |
Definition at line 686 of file trgm_op.c.
References DatumGetFloat4, DirectFunctionCall2, PG_GETARG_DATUM, PG_RETURN_FLOAT4, and similarity().
{
    /* Distance between the two arguments: one minus their similarity() */
    Datum       sml = DirectFunctionCall2(similarity,
                                          PG_GETARG_DATUM(0),
                                          PG_GETARG_DATUM(1));
    float4      score = DatumGetFloat4(sml);

    PG_RETURN_FLOAT4(1.0 - score);
}
Datum similarity_op | ( | PG_FUNCTION_ARGS | ) |
Definition at line 696 of file trgm_op.c.
References DatumGetFloat4, DirectFunctionCall2, PG_GETARG_DATUM, PG_RETURN_BOOL, similarity(), and trgm_limit.
{
    /* True when similarity() of the two arguments reaches trgm_limit */
    Datum       sml = DirectFunctionCall2(similarity,
                                          PG_GETARG_DATUM(0),
                                          PG_GETARG_DATUM(1));
    float4      score = DatumGetFloat4(sml);

    PG_RETURN_BOOL(score >= trgm_limit);
}
uint32 trgm2int (trgm *ptr)
Definition at line 476 of file trgm_op.c.
References val.
Referenced by gin_extract_query_trgm(), gin_extract_value_trgm(), and show_trgm().
bool trgm_contained_by (TRGM *trg1, TRGM *trg2)
Definition at line 586 of file trgm_op.c.
References ARRNELEM, CMPTRGM, and GETARR.
Referenced by gtrgm_consistent().
{
    /*
     * Return true when every trigram of trg1 is matched in trg2.  Both
     * arrays are walked in step using CMPTRGM ordering; the walk gives up
     * as soon as a trg1 trigram compares below the current trg2 trigram,
     * since no match was found for it.
     */
    trgm       *inner = GETARR(trg1);
    trgm       *outer = GETARR(trg2);
    trgm       *inner_end = inner + ARRNELEM(trg1);
    trgm       *outer_end = outer + ARRNELEM(trg2);

    while (inner < inner_end && outer < outer_end)
    {
        int         cmp = CMPTRGM(inner, outer);

        if (cmp < 0)
            return false;

        /* on a match both sides advance; otherwise only trg2 does */
        if (cmp == 0)
            inner++;
        outer++;
    }

    /* contained only if trg1 was consumed completely */
    return !(inner < inner_end);
}
bool* trgm_presence_map (TRGM *query, TRGM *key)
Definition at line 625 of file trgm_op.c.
References ARRNELEM, CMPTRGM, GETARR, i, and palloc0().
Referenced by gtrgm_consistent().
{
    /*
     * Return a palloc0'd array with one bool per query trigram, set to true
     * when that trigram is found in key.
     */
    trgm       *qarr = GETARR(query);
    trgm       *karr = GETARR(key);
    int         nquery = ARRNELEM(query);
    int         nkey = ARRNELEM(key);
    bool       *found;
    int         i;

    found = (bool *) palloc0(nquery * sizeof(bool));

    /* for each query trigram, do a binary search in the key array */
    for (i = 0; i < nquery; i++)
    {
        trgm       *probe = qarr + i;
        int         lo = 0;
        int         hi = nkey;

        while (lo < hi && !found[i])
        {
            int         mid = (lo + hi) / 2;
            int         cmp = CMPTRGM(probe, karr + mid);

            if (cmp == 0)
                found[i] = true;
            else if (cmp < 0)
                hi = mid;
            else
                lo = mid + 1;
        }
    }

    return found;
}
static int unique_array | ( | trgm * | a, | |
int | len | |||
) | [static] |
Definition at line 61 of file trgm_op.c.
References CMPTRGM, and CPTRGM.
Referenced by generate_trgm(), and generate_wildcard_trgm().
float4 trgm_limit = 0.3f |
Definition at line 16 of file trgm_op.c.
Referenced by gin_trgm_consistent(), gtrgm_consistent(), set_limit(), show_limit(), and similarity_op().