00001
00002
00003
00004 #include "postgres.h"
00005
00006 #include <ctype.h>
00007
00008 #include "trgm.h"
00009
00010 #include "catalog/pg_type.h"
00011 #include "tsearch/ts_locale.h"
00012
00013
00014 PG_MODULE_MAGIC;
00015
00016 float4 trgm_limit = 0.3f;
00017
00018 PG_FUNCTION_INFO_V1(set_limit);
00019 Datum set_limit(PG_FUNCTION_ARGS);
00020
00021 PG_FUNCTION_INFO_V1(show_limit);
00022 Datum show_limit(PG_FUNCTION_ARGS);
00023
00024 PG_FUNCTION_INFO_V1(show_trgm);
00025 Datum show_trgm(PG_FUNCTION_ARGS);
00026
00027 PG_FUNCTION_INFO_V1(similarity);
00028 Datum similarity(PG_FUNCTION_ARGS);
00029
00030 PG_FUNCTION_INFO_V1(similarity_dist);
00031 Datum similarity_dist(PG_FUNCTION_ARGS);
00032
00033 PG_FUNCTION_INFO_V1(similarity_op);
00034 Datum similarity_op(PG_FUNCTION_ARGS);
00035
00036
00037 Datum
00038 set_limit(PG_FUNCTION_ARGS)
00039 {
00040 float4 nlimit = PG_GETARG_FLOAT4(0);
00041
00042 if (nlimit < 0 || nlimit > 1.0)
00043 elog(ERROR, "wrong limit, should be between 0 and 1");
00044 trgm_limit = nlimit;
00045 PG_RETURN_FLOAT4(trgm_limit);
00046 }
00047
00048 Datum
00049 show_limit(PG_FUNCTION_ARGS)
00050 {
00051 PG_RETURN_FLOAT4(trgm_limit);
00052 }
00053
00054 static int
00055 comp_trgm(const void *a, const void *b)
00056 {
00057 return CMPTRGM(a, b);
00058 }
00059
00060 static int
00061 unique_array(trgm *a, int len)
00062 {
00063 trgm *curend,
00064 *tmp;
00065
00066 curend = tmp = a;
00067 while (tmp - a < len)
00068 if (CMPTRGM(tmp, curend))
00069 {
00070 curend++;
00071 CPTRGM(curend, tmp);
00072 tmp++;
00073 }
00074 else
00075 tmp++;
00076
00077 return curend + 1 - a;
00078 }
00079
00080
00081
00082
00083
00084 static char *
00085 find_word(char *str, int lenstr, char **endword, int *charlen)
00086 {
00087 char *beginword = str;
00088
00089 while (beginword - str < lenstr && !ISWORDCHR(beginword))
00090 beginword += pg_mblen(beginword);
00091
00092 if (beginword - str >= lenstr)
00093 return NULL;
00094
00095 *endword = beginword;
00096 *charlen = 0;
00097 while (*endword - str < lenstr && ISWORDCHR(*endword))
00098 {
00099 *endword += pg_mblen(*endword);
00100 (*charlen)++;
00101 }
00102
00103 return beginword;
00104 }
00105
00106
00107
00108
00109
00110
00111 void
00112 compact_trigram(trgm *tptr, char *str, int bytelen)
00113 {
00114 if (bytelen == 3)
00115 {
00116 CPTRGM(tptr, str);
00117 }
00118 else
00119 {
00120 pg_crc32 crc;
00121
00122 INIT_CRC32(crc);
00123 COMP_CRC32(crc, str, bytelen);
00124 FIN_CRC32(crc);
00125
00126
00127
00128
00129 CPTRGM(tptr, &crc);
00130 }
00131 }
00132
00133
00134
00135
00136 static trgm *
00137 make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
00138 {
00139 char *ptr = str;
00140
00141 if (charlen < 3)
00142 return tptr;
00143
00144 if (bytelen > charlen)
00145 {
00146
00147 int lenfirst = pg_mblen(str),
00148 lenmiddle = pg_mblen(str + lenfirst),
00149 lenlast = pg_mblen(str + lenfirst + lenmiddle);
00150
00151 while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen)
00152 {
00153 compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
00154
00155 ptr += lenfirst;
00156 tptr++;
00157
00158 lenfirst = lenmiddle;
00159 lenmiddle = lenlast;
00160 lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
00161 }
00162 }
00163 else
00164 {
00165
00166 Assert(bytelen == charlen);
00167
00168 while (ptr - str < bytelen - 2 )
00169 {
00170 CPTRGM(tptr, ptr);
00171 ptr++;
00172 tptr++;
00173 }
00174 }
00175
00176 return tptr;
00177 }
00178
00179 TRGM *
00180 generate_trgm(char *str, int slen)
00181 {
00182 TRGM *trg;
00183 char *buf;
00184 trgm *tptr;
00185 int len,
00186 charlen,
00187 bytelen;
00188 char *bword,
00189 *eword;
00190
00191 trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
00192 trg->flag = ARRKEY;
00193 SET_VARSIZE(trg, TRGMHDRSIZE);
00194
00195 if (slen + LPADDING + RPADDING < 3 || slen == 0)
00196 return trg;
00197
00198 tptr = GETARR(trg);
00199
00200 buf = palloc(sizeof(char) * (slen + 4));
00201
00202 if (LPADDING > 0)
00203 {
00204 *buf = ' ';
00205 if (LPADDING > 1)
00206 *(buf + 1) = ' ';
00207 }
00208
00209 eword = str;
00210 while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
00211 {
00212 #ifdef IGNORECASE
00213 bword = lowerstr_with_len(bword, eword - bword);
00214 bytelen = strlen(bword);
00215 #else
00216 bytelen = eword - bword;
00217 #endif
00218
00219 memcpy(buf + LPADDING, bword, bytelen);
00220
00221 #ifdef IGNORECASE
00222 pfree(bword);
00223 #endif
00224 buf[LPADDING + bytelen] = ' ';
00225 buf[LPADDING + bytelen + 1] = ' ';
00226
00227
00228
00229
00230 tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
00231 charlen + LPADDING + RPADDING);
00232 }
00233
00234 pfree(buf);
00235
00236 if ((len = tptr - GETARR(trg)) == 0)
00237 return trg;
00238
00239 if (len > 0)
00240 {
00241 qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
00242 len = unique_array(GETARR(trg), len);
00243 }
00244
00245 SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
00246
00247 return trg;
00248 }
00249
00250
00251
00252
00253
00254
00255
00256
00257
00258
00259
00260
00261
00262
00263
00264
00265 static const char *
00266 get_wildcard_part(const char *str, int lenstr,
00267 char *buf, int *bytelen, int *charlen)
00268 {
00269 const char *beginword = str;
00270 const char *endword;
00271 char *s = buf;
00272 bool in_leading_wildcard_meta = false;
00273 bool in_trailing_wildcard_meta = false;
00274 bool in_escape = false;
00275 int clen;
00276
00277
00278
00279
00280
00281
00282
00283 while (beginword - str < lenstr)
00284 {
00285 if (in_escape)
00286 {
00287 if (ISWORDCHR(beginword))
00288 break;
00289 in_escape = false;
00290 in_leading_wildcard_meta = false;
00291 }
00292 else
00293 {
00294 if (ISESCAPECHAR(beginword))
00295 in_escape = true;
00296 else if (ISWILDCARDCHAR(beginword))
00297 in_leading_wildcard_meta = true;
00298 else if (ISWORDCHR(beginword))
00299 break;
00300 else
00301 in_leading_wildcard_meta = false;
00302 }
00303 beginword += pg_mblen(beginword);
00304 }
00305
00306
00307
00308
00309 if (beginword - str >= lenstr)
00310 return NULL;
00311
00312
00313
00314
00315
00316 *charlen = 0;
00317 if (!in_leading_wildcard_meta)
00318 {
00319 if (LPADDING > 0)
00320 {
00321 *s++ = ' ';
00322 (*charlen)++;
00323 if (LPADDING > 1)
00324 {
00325 *s++ = ' ';
00326 (*charlen)++;
00327 }
00328 }
00329 }
00330
00331
00332
00333
00334
00335 endword = beginword;
00336 while (endword - str < lenstr)
00337 {
00338 clen = pg_mblen(endword);
00339 if (in_escape)
00340 {
00341 if (ISWORDCHR(endword))
00342 {
00343 memcpy(s, endword, clen);
00344 (*charlen)++;
00345 s += clen;
00346 }
00347 else
00348 {
00349
00350
00351
00352
00353
00354
00355 endword--;
00356 break;
00357 }
00358 in_escape = false;
00359 }
00360 else
00361 {
00362 if (ISESCAPECHAR(endword))
00363 in_escape = true;
00364 else if (ISWILDCARDCHAR(endword))
00365 {
00366 in_trailing_wildcard_meta = true;
00367 break;
00368 }
00369 else if (ISWORDCHR(endword))
00370 {
00371 memcpy(s, endword, clen);
00372 (*charlen)++;
00373 s += clen;
00374 }
00375 else
00376 break;
00377 }
00378 endword += clen;
00379 }
00380
00381
00382
00383
00384
00385 if (!in_trailing_wildcard_meta)
00386 {
00387 if (RPADDING > 0)
00388 {
00389 *s++ = ' ';
00390 (*charlen)++;
00391 if (RPADDING > 1)
00392 {
00393 *s++ = ' ';
00394 (*charlen)++;
00395 }
00396 }
00397 }
00398
00399 *bytelen = s - buf;
00400 return endword;
00401 }
00402
00403
00404
00405
00406
00407
00408
00409
00410 TRGM *
00411 generate_wildcard_trgm(const char *str, int slen)
00412 {
00413 TRGM *trg;
00414 char *buf,
00415 *buf2;
00416 trgm *tptr;
00417 int len,
00418 charlen,
00419 bytelen;
00420 const char *eword;
00421
00422 trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
00423 trg->flag = ARRKEY;
00424 SET_VARSIZE(trg, TRGMHDRSIZE);
00425
00426 if (slen + LPADDING + RPADDING < 3 || slen == 0)
00427 return trg;
00428
00429 tptr = GETARR(trg);
00430
00431 buf = palloc(sizeof(char) * (slen + 4));
00432
00433
00434
00435
00436 eword = str;
00437 while ((eword = get_wildcard_part(eword, slen - (eword - str),
00438 buf, &bytelen, &charlen)) != NULL)
00439 {
00440 #ifdef IGNORECASE
00441 buf2 = lowerstr_with_len(buf, bytelen);
00442 bytelen = strlen(buf2);
00443 #else
00444 buf2 = buf;
00445 #endif
00446
00447
00448
00449
00450 tptr = make_trigrams(tptr, buf2, bytelen, charlen);
00451 #ifdef IGNORECASE
00452 pfree(buf2);
00453 #endif
00454 }
00455
00456 pfree(buf);
00457
00458 if ((len = tptr - GETARR(trg)) == 0)
00459 return trg;
00460
00461
00462
00463
00464 if (len > 0)
00465 {
00466 qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
00467 len = unique_array(GETARR(trg), len);
00468 }
00469
00470 SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
00471
00472 return trg;
00473 }
00474
00475 uint32
00476 trgm2int(trgm *ptr)
00477 {
00478 uint32 val = 0;
00479
00480 val |= *(((unsigned char *) ptr));
00481 val <<= 8;
00482 val |= *(((unsigned char *) ptr) + 1);
00483 val <<= 8;
00484 val |= *(((unsigned char *) ptr) + 2);
00485
00486 return val;
00487 }
00488
00489 Datum
00490 show_trgm(PG_FUNCTION_ARGS)
00491 {
00492 text *in = PG_GETARG_TEXT_P(0);
00493 TRGM *trg;
00494 Datum *d;
00495 ArrayType *a;
00496 trgm *ptr;
00497 int i;
00498
00499 trg = generate_trgm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
00500 d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(trg)));
00501
00502 for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
00503 {
00504 text *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length() * 3));
00505
00506 if (pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr))
00507 {
00508 snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr));
00509 SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
00510 }
00511 else
00512 {
00513 SET_VARSIZE(item, VARHDRSZ + 3);
00514 CPTRGM(VARDATA(item), ptr);
00515 }
00516 d[i] = PointerGetDatum(item);
00517 }
00518
00519 a = construct_array(
00520 d,
00521 ARRNELEM(trg),
00522 TEXTOID,
00523 -1,
00524 false,
00525 'i'
00526 );
00527
00528 for (i = 0; i < ARRNELEM(trg); i++)
00529 pfree(DatumGetPointer(d[i]));
00530
00531 pfree(d);
00532 pfree(trg);
00533 PG_FREE_IF_COPY(in, 0);
00534
00535 PG_RETURN_POINTER(a);
00536 }
00537
00538 float4
00539 cnt_sml(TRGM *trg1, TRGM *trg2)
00540 {
00541 trgm *ptr1,
00542 *ptr2;
00543 int count = 0;
00544 int len1,
00545 len2;
00546
00547 ptr1 = GETARR(trg1);
00548 ptr2 = GETARR(trg2);
00549
00550 len1 = ARRNELEM(trg1);
00551 len2 = ARRNELEM(trg2);
00552
00553
00554 if (len1 <= 0 || len2 <= 0)
00555 return (float4) 0.0;
00556
00557 while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
00558 {
00559 int res = CMPTRGM(ptr1, ptr2);
00560
00561 if (res < 0)
00562 ptr1++;
00563 else if (res > 0)
00564 ptr2++;
00565 else
00566 {
00567 ptr1++;
00568 ptr2++;
00569 count++;
00570 }
00571 }
00572
00573 #ifdef DIVUNION
00574 return ((float4) count) / ((float4) (len1 + len2 - count));
00575 #else
00576 return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2));
00577 #endif
00578
00579 }
00580
00581
00582
00583
00584
00585 bool
00586 trgm_contained_by(TRGM *trg1, TRGM *trg2)
00587 {
00588 trgm *ptr1,
00589 *ptr2;
00590 int len1,
00591 len2;
00592
00593 ptr1 = GETARR(trg1);
00594 ptr2 = GETARR(trg2);
00595
00596 len1 = ARRNELEM(trg1);
00597 len2 = ARRNELEM(trg2);
00598
00599 while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
00600 {
00601 int res = CMPTRGM(ptr1, ptr2);
00602
00603 if (res < 0)
00604 return false;
00605 else if (res > 0)
00606 ptr2++;
00607 else
00608 {
00609 ptr1++;
00610 ptr2++;
00611 }
00612 }
00613 if (ptr1 - GETARR(trg1) < len1)
00614 return false;
00615 else
00616 return true;
00617 }
00618
00619
00620
00621
00622
00623
00624 bool *
00625 trgm_presence_map(TRGM *query, TRGM *key)
00626 {
00627 bool *result;
00628 trgm *ptrq = GETARR(query),
00629 *ptrk = GETARR(key);
00630 int lenq = ARRNELEM(query),
00631 lenk = ARRNELEM(key),
00632 i;
00633
00634 result = (bool *) palloc0(lenq * sizeof(bool));
00635
00636
00637 for (i = 0; i < lenq; i++)
00638 {
00639 int lo = 0;
00640 int hi = lenk;
00641
00642 while (lo < hi)
00643 {
00644 int mid = (lo + hi) / 2;
00645 int res = CMPTRGM(ptrq, ptrk + mid);
00646
00647 if (res < 0)
00648 hi = mid;
00649 else if (res > 0)
00650 lo = mid + 1;
00651 else
00652 {
00653 result[i] = true;
00654 break;
00655 }
00656 }
00657 ptrq++;
00658 }
00659
00660 return result;
00661 }
00662
00663 Datum
00664 similarity(PG_FUNCTION_ARGS)
00665 {
00666 text *in1 = PG_GETARG_TEXT_P(0);
00667 text *in2 = PG_GETARG_TEXT_P(1);
00668 TRGM *trg1,
00669 *trg2;
00670 float4 res;
00671
00672 trg1 = generate_trgm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
00673 trg2 = generate_trgm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
00674
00675 res = cnt_sml(trg1, trg2);
00676
00677 pfree(trg1);
00678 pfree(trg2);
00679 PG_FREE_IF_COPY(in1, 0);
00680 PG_FREE_IF_COPY(in2, 1);
00681
00682 PG_RETURN_FLOAT4(res);
00683 }
00684
00685 Datum
00686 similarity_dist(PG_FUNCTION_ARGS)
00687 {
00688 float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
00689 PG_GETARG_DATUM(0),
00690 PG_GETARG_DATUM(1)));
00691
00692 PG_RETURN_FLOAT4(1.0 - res);
00693 }
00694
00695 Datum
00696 similarity_op(PG_FUNCTION_ARGS)
00697 {
00698 float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
00699 PG_GETARG_DATUM(0),
00700 PG_GETARG_DATUM(1)));
00701
00702 PG_RETURN_BOOL(res >= trgm_limit);
00703 }