Header And Logo

PostgreSQL
| The world's most advanced open source database.

trgm_op.c

Go to the documentation of this file.
00001 /*
00002  * contrib/pg_trgm/trgm_op.c
00003  */
00004 #include "postgres.h"
00005 
00006 #include <ctype.h>
00007 
00008 #include "trgm.h"
00009 
00010 #include "catalog/pg_type.h"
00011 #include "tsearch/ts_locale.h"
00012 
00013 
00014 PG_MODULE_MAGIC;
00015 
00016 float4      trgm_limit = 0.3f;
00017 
00018 PG_FUNCTION_INFO_V1(set_limit);
00019 Datum       set_limit(PG_FUNCTION_ARGS);
00020 
00021 PG_FUNCTION_INFO_V1(show_limit);
00022 Datum       show_limit(PG_FUNCTION_ARGS);
00023 
00024 PG_FUNCTION_INFO_V1(show_trgm);
00025 Datum       show_trgm(PG_FUNCTION_ARGS);
00026 
00027 PG_FUNCTION_INFO_V1(similarity);
00028 Datum       similarity(PG_FUNCTION_ARGS);
00029 
00030 PG_FUNCTION_INFO_V1(similarity_dist);
00031 Datum       similarity_dist(PG_FUNCTION_ARGS);
00032 
00033 PG_FUNCTION_INFO_V1(similarity_op);
00034 Datum       similarity_op(PG_FUNCTION_ARGS);
00035 
00036 
00037 Datum
00038 set_limit(PG_FUNCTION_ARGS)
00039 {
00040     float4      nlimit = PG_GETARG_FLOAT4(0);
00041 
00042     if (nlimit < 0 || nlimit > 1.0)
00043         elog(ERROR, "wrong limit, should be between 0 and 1");
00044     trgm_limit = nlimit;
00045     PG_RETURN_FLOAT4(trgm_limit);
00046 }
00047 
00048 Datum
00049 show_limit(PG_FUNCTION_ARGS)
00050 {
00051     PG_RETURN_FLOAT4(trgm_limit);
00052 }
00053 
00054 static int
00055 comp_trgm(const void *a, const void *b)
00056 {
00057     return CMPTRGM(a, b);
00058 }
00059 
00060 static int
00061 unique_array(trgm *a, int len)
00062 {
00063     trgm       *curend,
00064                *tmp;
00065 
00066     curend = tmp = a;
00067     while (tmp - a < len)
00068         if (CMPTRGM(tmp, curend))
00069         {
00070             curend++;
00071             CPTRGM(curend, tmp);
00072             tmp++;
00073         }
00074         else
00075             tmp++;
00076 
00077     return curend + 1 - a;
00078 }
00079 
00080 /*
00081  * Finds first word in string, returns pointer to the word,
00082  * endword points to the character after word
00083  */
00084 static char *
00085 find_word(char *str, int lenstr, char **endword, int *charlen)
00086 {
00087     char       *beginword = str;
00088 
00089     while (beginword - str < lenstr && !ISWORDCHR(beginword))
00090         beginword += pg_mblen(beginword);
00091 
00092     if (beginword - str >= lenstr)
00093         return NULL;
00094 
00095     *endword = beginword;
00096     *charlen = 0;
00097     while (*endword - str < lenstr && ISWORDCHR(*endword))
00098     {
00099         *endword += pg_mblen(*endword);
00100         (*charlen)++;
00101     }
00102 
00103     return beginword;
00104 }
00105 
00106 /*
00107  * Reduce a trigram (three possibly multi-byte characters) to a trgm,
00108  * which is always exactly three bytes.  If we have three single-byte
00109  * characters, we just use them as-is; otherwise we form a hash value.
00110  */
00111 void
00112 compact_trigram(trgm *tptr, char *str, int bytelen)
00113 {
00114     if (bytelen == 3)
00115     {
00116         CPTRGM(tptr, str);
00117     }
00118     else
00119     {
00120         pg_crc32    crc;
00121 
00122         INIT_CRC32(crc);
00123         COMP_CRC32(crc, str, bytelen);
00124         FIN_CRC32(crc);
00125 
00126         /*
00127          * use only 3 upper bytes from crc, hope, it's good enough hashing
00128          */
00129         CPTRGM(tptr, &crc);
00130     }
00131 }
00132 
00133 /*
00134  * Adds trigrams from words (already padded).
00135  */
00136 static trgm *
00137 make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
00138 {
00139     char       *ptr = str;
00140 
00141     if (charlen < 3)
00142         return tptr;
00143 
00144     if (bytelen > charlen)
00145     {
00146         /* Find multibyte character boundaries and apply compact_trigram */
00147         int         lenfirst = pg_mblen(str),
00148                     lenmiddle = pg_mblen(str + lenfirst),
00149                     lenlast = pg_mblen(str + lenfirst + lenmiddle);
00150 
00151         while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen)
00152         {
00153             compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
00154 
00155             ptr += lenfirst;
00156             tptr++;
00157 
00158             lenfirst = lenmiddle;
00159             lenmiddle = lenlast;
00160             lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
00161         }
00162     }
00163     else
00164     {
00165         /* Fast path when there are no multibyte characters */
00166         Assert(bytelen == charlen);
00167 
00168         while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
00169         {
00170             CPTRGM(tptr, ptr);
00171             ptr++;
00172             tptr++;
00173         }
00174     }
00175 
00176     return tptr;
00177 }
00178 
00179 TRGM *
00180 generate_trgm(char *str, int slen)
00181 {
00182     TRGM       *trg;
00183     char       *buf;
00184     trgm       *tptr;
00185     int         len,
00186                 charlen,
00187                 bytelen;
00188     char       *bword,
00189                *eword;
00190 
00191     trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
00192     trg->flag = ARRKEY;
00193     SET_VARSIZE(trg, TRGMHDRSIZE);
00194 
00195     if (slen + LPADDING + RPADDING < 3 || slen == 0)
00196         return trg;
00197 
00198     tptr = GETARR(trg);
00199 
00200     buf = palloc(sizeof(char) * (slen + 4));
00201 
00202     if (LPADDING > 0)
00203     {
00204         *buf = ' ';
00205         if (LPADDING > 1)
00206             *(buf + 1) = ' ';
00207     }
00208 
00209     eword = str;
00210     while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
00211     {
00212 #ifdef IGNORECASE
00213         bword = lowerstr_with_len(bword, eword - bword);
00214         bytelen = strlen(bword);
00215 #else
00216         bytelen = eword - bword;
00217 #endif
00218 
00219         memcpy(buf + LPADDING, bword, bytelen);
00220 
00221 #ifdef IGNORECASE
00222         pfree(bword);
00223 #endif
00224         buf[LPADDING + bytelen] = ' ';
00225         buf[LPADDING + bytelen + 1] = ' ';
00226 
00227         /*
00228          * count trigrams
00229          */
00230         tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
00231                              charlen + LPADDING + RPADDING);
00232     }
00233 
00234     pfree(buf);
00235 
00236     if ((len = tptr - GETARR(trg)) == 0)
00237         return trg;
00238 
00239     if (len > 0)
00240     {
00241         qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
00242         len = unique_array(GETARR(trg), len);
00243     }
00244 
00245     SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
00246 
00247     return trg;
00248 }
00249 
00250 /*
00251  * Extract the next non-wildcard part of a search string, ie, a word bounded
00252  * by '_' or '%' meta-characters, non-word characters or string end.
00253  *
00254  * str: source string, of length lenstr bytes (need not be null-terminated)
00255  * buf: where to return the substring (must be long enough)
00256  * *bytelen: receives byte length of the found substring
00257  * *charlen: receives character length of the found substring
00258  *
00259  * Returns pointer to end+1 of the found substring in the source string.
00260  * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
00261  *
00262  * If the found word is bounded by non-word characters or string boundaries
00263  * then this function will include corresponding padding spaces into buf.
00264  */
00265 static const char *
00266 get_wildcard_part(const char *str, int lenstr,
00267                   char *buf, int *bytelen, int *charlen)
00268 {
00269     const char *beginword = str;
00270     const char *endword;
00271     char       *s = buf;
00272     bool        in_leading_wildcard_meta = false;
00273     bool        in_trailing_wildcard_meta = false;
00274     bool        in_escape = false;
00275     int         clen;
00276 
00277     /*
00278      * Find the first word character, remembering whether preceding character
00279      * was wildcard meta-character.  Note that the in_escape state persists
00280      * from this loop to the next one, since we may exit at a word character
00281      * that is in_escape.
00282      */
00283     while (beginword - str < lenstr)
00284     {
00285         if (in_escape)
00286         {
00287             if (ISWORDCHR(beginword))
00288                 break;
00289             in_escape = false;
00290             in_leading_wildcard_meta = false;
00291         }
00292         else
00293         {
00294             if (ISESCAPECHAR(beginword))
00295                 in_escape = true;
00296             else if (ISWILDCARDCHAR(beginword))
00297                 in_leading_wildcard_meta = true;
00298             else if (ISWORDCHR(beginword))
00299                 break;
00300             else
00301                 in_leading_wildcard_meta = false;
00302         }
00303         beginword += pg_mblen(beginword);
00304     }
00305 
00306     /*
00307      * Handle string end.
00308      */
00309     if (beginword - str >= lenstr)
00310         return NULL;
00311 
00312     /*
00313      * Add left padding spaces if preceding character wasn't wildcard
00314      * meta-character.
00315      */
00316     *charlen = 0;
00317     if (!in_leading_wildcard_meta)
00318     {
00319         if (LPADDING > 0)
00320         {
00321             *s++ = ' ';
00322             (*charlen)++;
00323             if (LPADDING > 1)
00324             {
00325                 *s++ = ' ';
00326                 (*charlen)++;
00327             }
00328         }
00329     }
00330 
00331     /*
00332      * Copy data into buf until wildcard meta-character, non-word character or
00333      * string boundary.  Strip escapes during copy.
00334      */
00335     endword = beginword;
00336     while (endword - str < lenstr)
00337     {
00338         clen = pg_mblen(endword);
00339         if (in_escape)
00340         {
00341             if (ISWORDCHR(endword))
00342             {
00343                 memcpy(s, endword, clen);
00344                 (*charlen)++;
00345                 s += clen;
00346             }
00347             else
00348             {
00349                 /*
00350                  * Back up endword to the escape character when stopping at
00351                  * an escaped char, so that subsequent get_wildcard_part will
00352                  * restart from the escape character.  We assume here that
00353                  * escape chars are single-byte.
00354                  */
00355                 endword--;
00356                 break;
00357             }
00358             in_escape = false;
00359         }
00360         else
00361         {
00362             if (ISESCAPECHAR(endword))
00363                 in_escape = true;
00364             else if (ISWILDCARDCHAR(endword))
00365             {
00366                 in_trailing_wildcard_meta = true;
00367                 break;
00368             }
00369             else if (ISWORDCHR(endword))
00370             {
00371                 memcpy(s, endword, clen);
00372                 (*charlen)++;
00373                 s += clen;
00374             }
00375             else
00376                 break;
00377         }
00378         endword += clen;
00379     }
00380 
00381     /*
00382      * Add right padding spaces if next character isn't wildcard
00383      * meta-character.
00384      */
00385     if (!in_trailing_wildcard_meta)
00386     {
00387         if (RPADDING > 0)
00388         {
00389             *s++ = ' ';
00390             (*charlen)++;
00391             if (RPADDING > 1)
00392             {
00393                 *s++ = ' ';
00394                 (*charlen)++;
00395             }
00396         }
00397     }
00398 
00399     *bytelen = s - buf;
00400     return endword;
00401 }
00402 
00403 /*
00404  * Generates trigrams for wildcard search string.
00405  *
00406  * Returns array of trigrams that must occur in any string that matches the
00407  * wildcard string.  For example, given pattern "a%bcd%" the trigrams
00408  * " a", "bcd" would be extracted.
00409  */
00410 TRGM *
00411 generate_wildcard_trgm(const char *str, int slen)
00412 {
00413     TRGM       *trg;
00414     char       *buf,
00415                *buf2;
00416     trgm       *tptr;
00417     int         len,
00418                 charlen,
00419                 bytelen;
00420     const char *eword;
00421 
00422     trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
00423     trg->flag = ARRKEY;
00424     SET_VARSIZE(trg, TRGMHDRSIZE);
00425 
00426     if (slen + LPADDING + RPADDING < 3 || slen == 0)
00427         return trg;
00428 
00429     tptr = GETARR(trg);
00430 
00431     buf = palloc(sizeof(char) * (slen + 4));
00432 
00433     /*
00434      * Extract trigrams from each substring extracted by get_wildcard_part.
00435      */
00436     eword = str;
00437     while ((eword = get_wildcard_part(eword, slen - (eword - str),
00438                                       buf, &bytelen, &charlen)) != NULL)
00439     {
00440 #ifdef IGNORECASE
00441         buf2 = lowerstr_with_len(buf, bytelen);
00442         bytelen = strlen(buf2);
00443 #else
00444         buf2 = buf;
00445 #endif
00446 
00447         /*
00448          * count trigrams
00449          */
00450         tptr = make_trigrams(tptr, buf2, bytelen, charlen);
00451 #ifdef IGNORECASE
00452         pfree(buf2);
00453 #endif
00454     }
00455 
00456     pfree(buf);
00457 
00458     if ((len = tptr - GETARR(trg)) == 0)
00459         return trg;
00460 
00461     /*
00462      * Make trigrams unique.
00463      */
00464     if (len > 0)
00465     {
00466         qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
00467         len = unique_array(GETARR(trg), len);
00468     }
00469 
00470     SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
00471 
00472     return trg;
00473 }
00474 
00475 uint32
00476 trgm2int(trgm *ptr)
00477 {
00478     uint32      val = 0;
00479 
00480     val |= *(((unsigned char *) ptr));
00481     val <<= 8;
00482     val |= *(((unsigned char *) ptr) + 1);
00483     val <<= 8;
00484     val |= *(((unsigned char *) ptr) + 2);
00485 
00486     return val;
00487 }
00488 
00489 Datum
00490 show_trgm(PG_FUNCTION_ARGS)
00491 {
00492     text       *in = PG_GETARG_TEXT_P(0);
00493     TRGM       *trg;
00494     Datum      *d;
00495     ArrayType  *a;
00496     trgm       *ptr;
00497     int         i;
00498 
00499     trg = generate_trgm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
00500     d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(trg)));
00501 
00502     for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
00503     {
00504         text       *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length() * 3));
00505 
00506         if (pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr))
00507         {
00508             snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr));
00509             SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
00510         }
00511         else
00512         {
00513             SET_VARSIZE(item, VARHDRSZ + 3);
00514             CPTRGM(VARDATA(item), ptr);
00515         }
00516         d[i] = PointerGetDatum(item);
00517     }
00518 
00519     a = construct_array(
00520                         d,
00521                         ARRNELEM(trg),
00522                         TEXTOID,
00523                         -1,
00524                         false,
00525                         'i'
00526         );
00527 
00528     for (i = 0; i < ARRNELEM(trg); i++)
00529         pfree(DatumGetPointer(d[i]));
00530 
00531     pfree(d);
00532     pfree(trg);
00533     PG_FREE_IF_COPY(in, 0);
00534 
00535     PG_RETURN_POINTER(a);
00536 }
00537 
00538 float4
00539 cnt_sml(TRGM *trg1, TRGM *trg2)
00540 {
00541     trgm       *ptr1,
00542                *ptr2;
00543     int         count = 0;
00544     int         len1,
00545                 len2;
00546 
00547     ptr1 = GETARR(trg1);
00548     ptr2 = GETARR(trg2);
00549 
00550     len1 = ARRNELEM(trg1);
00551     len2 = ARRNELEM(trg2);
00552 
00553     /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
00554     if (len1 <= 0 || len2 <= 0)
00555         return (float4) 0.0;
00556 
00557     while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
00558     {
00559         int         res = CMPTRGM(ptr1, ptr2);
00560 
00561         if (res < 0)
00562             ptr1++;
00563         else if (res > 0)
00564             ptr2++;
00565         else
00566         {
00567             ptr1++;
00568             ptr2++;
00569             count++;
00570         }
00571     }
00572 
00573 #ifdef DIVUNION
00574     return ((float4) count) / ((float4) (len1 + len2 - count));
00575 #else
00576     return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2));
00577 #endif
00578 
00579 }
00580 
00581 /*
00582  * Returns whether trg2 contains all trigrams in trg1.
00583  * This relies on the trigram arrays being sorted.
00584  */
00585 bool
00586 trgm_contained_by(TRGM *trg1, TRGM *trg2)
00587 {
00588     trgm       *ptr1,
00589                *ptr2;
00590     int         len1,
00591                 len2;
00592 
00593     ptr1 = GETARR(trg1);
00594     ptr2 = GETARR(trg2);
00595 
00596     len1 = ARRNELEM(trg1);
00597     len2 = ARRNELEM(trg2);
00598 
00599     while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
00600     {
00601         int         res = CMPTRGM(ptr1, ptr2);
00602 
00603         if (res < 0)
00604             return false;
00605         else if (res > 0)
00606             ptr2++;
00607         else
00608         {
00609             ptr1++;
00610             ptr2++;
00611         }
00612     }
00613     if (ptr1 - GETARR(trg1) < len1)
00614         return false;
00615     else
00616         return true;
00617 }
00618 
00619 /*
00620  * Return a palloc'd boolean array showing, for each trigram in "query",
00621  * whether it is present in the trigram array "key".
00622  * This relies on the "key" array being sorted, but "query" need not be.
00623  */
00624 bool *
00625 trgm_presence_map(TRGM *query, TRGM *key)
00626 {
00627     bool       *result;
00628     trgm       *ptrq = GETARR(query),
00629                *ptrk = GETARR(key);
00630     int         lenq = ARRNELEM(query),
00631                 lenk = ARRNELEM(key),
00632                 i;
00633 
00634     result = (bool *) palloc0(lenq * sizeof(bool));
00635 
00636     /* for each query trigram, do a binary search in the key array */
00637     for (i = 0; i < lenq; i++)
00638     {
00639         int         lo = 0;
00640         int         hi = lenk;
00641 
00642         while (lo < hi)
00643         {
00644             int         mid = (lo + hi) / 2;
00645             int         res = CMPTRGM(ptrq, ptrk + mid);
00646 
00647             if (res < 0)
00648                 hi = mid;
00649             else if (res > 0)
00650                 lo = mid + 1;
00651             else
00652             {
00653                 result[i] = true;
00654                 break;
00655             }
00656         }
00657         ptrq++;
00658     }
00659 
00660     return result;
00661 }
00662 
00663 Datum
00664 similarity(PG_FUNCTION_ARGS)
00665 {
00666     text       *in1 = PG_GETARG_TEXT_P(0);
00667     text       *in2 = PG_GETARG_TEXT_P(1);
00668     TRGM       *trg1,
00669                *trg2;
00670     float4      res;
00671 
00672     trg1 = generate_trgm(VARDATA(in1), VARSIZE(in1) - VARHDRSZ);
00673     trg2 = generate_trgm(VARDATA(in2), VARSIZE(in2) - VARHDRSZ);
00674 
00675     res = cnt_sml(trg1, trg2);
00676 
00677     pfree(trg1);
00678     pfree(trg2);
00679     PG_FREE_IF_COPY(in1, 0);
00680     PG_FREE_IF_COPY(in2, 1);
00681 
00682     PG_RETURN_FLOAT4(res);
00683 }
00684 
00685 Datum
00686 similarity_dist(PG_FUNCTION_ARGS)
00687 {
00688     float4      res = DatumGetFloat4(DirectFunctionCall2(similarity,
00689                                                          PG_GETARG_DATUM(0),
00690                                                          PG_GETARG_DATUM(1)));
00691 
00692     PG_RETURN_FLOAT4(1.0 - res);
00693 }
00694 
00695 Datum
00696 similarity_op(PG_FUNCTION_ARGS)
00697 {
00698     float4      res = DatumGetFloat4(DirectFunctionCall2(similarity,
00699                                                          PG_GETARG_DATUM(0),
00700                                                          PG_GETARG_DATUM(1)));
00701 
00702     PG_RETURN_BOOL(res >= trgm_limit);
00703 }