Header And Logo

PostgreSQL
| The world's most advanced open source database.

Functions | Variables

trgm_op.c File Reference

#include "postgres.h"
#include <ctype.h>
#include "trgm.h"
#include "catalog/pg_type.h"
#include "tsearch/ts_locale.h"
Include dependency graph for trgm_op.c:

Go to the source code of this file.

Functions

 PG_FUNCTION_INFO_V1 (set_limit)
Datum set_limit (PG_FUNCTION_ARGS)
 PG_FUNCTION_INFO_V1 (show_limit)
Datum show_limit (PG_FUNCTION_ARGS)
 PG_FUNCTION_INFO_V1 (show_trgm)
Datum show_trgm (PG_FUNCTION_ARGS)
 PG_FUNCTION_INFO_V1 (similarity)
Datum similarity (PG_FUNCTION_ARGS)
 PG_FUNCTION_INFO_V1 (similarity_dist)
Datum similarity_dist (PG_FUNCTION_ARGS)
 PG_FUNCTION_INFO_V1 (similarity_op)
Datum similarity_op (PG_FUNCTION_ARGS)
static int comp_trgm (const void *a, const void *b)
static int unique_array (trgm *a, int len)
static char * find_word (char *str, int lenstr, char **endword, int *charlen)
void compact_trigram (trgm *tptr, char *str, int bytelen)
static trgm * make_trigrams (trgm *tptr, char *str, int bytelen, int charlen)
TRGM * generate_trgm (char *str, int slen)
static const char * get_wildcard_part (const char *str, int lenstr, char *buf, int *bytelen, int *charlen)
TRGM * generate_wildcard_trgm (const char *str, int slen)
uint32 trgm2int (trgm *ptr)
float4 cnt_sml (TRGM *trg1, TRGM *trg2)
bool trgm_contained_by (TRGM *trg1, TRGM *trg2)
bool * trgm_presence_map (TRGM *query, TRGM *key)

Variables

 PG_MODULE_MAGIC
float4 trgm_limit = 0.3f

Function Documentation

float4 cnt_sml ( TRGM *  trg1,
TRGM *  trg2 
)

Definition at line 539 of file trgm_op.c.

References ARRNELEM, CMPTRGM, and GETARR.

Referenced by gtrgm_consistent(), gtrgm_distance(), and similarity().

{
    /*
     * Similarity of two trigram arrays: the number of trigrams they share,
     * divided by either the size of their union (DIVUNION) or the size of
     * the larger array.  The shared count is obtained by walking both arrays
     * in step, always advancing the side that compares lower, so both are
     * presumably expected to be sorted per CMPTRGM and duplicate-free.
     */
    trgm       *cur1 = GETARR(trg1);
    trgm       *cur2 = GETARR(trg2);
    int         len1 = ARRNELEM(trg1);
    int         len2 = ARRNELEM(trg2);
    trgm       *end1;
    trgm       *end2;
    int         count = 0;

    /* bail out early: with two empty arrays the division below would be 0/0 */
    if (len1 <= 0 || len2 <= 0)
        return (float4) 0.0;

    end1 = cur1 + len1;
    end2 = cur2 + len2;

    while (cur1 < end1 && cur2 < end2)
    {
        int         cmp = CMPTRGM(cur1, cur2);

        if (cmp == 0)
        {
            /* trigram present on both sides */
            count++;
            cur1++;
            cur2++;
        }
        else if (cmp < 0)
            cur1++;
        else
            cur2++;
    }

#ifdef DIVUNION
    return ((float4) count) / ((float4) (len1 + len2 - count));
#else
    return ((float4) count) / ((float4) ((len1 > len2) ? len1 : len2));
#endif

}

static int comp_trgm ( const void *  a,
const void *  b 
) [static]

Definition at line 55 of file trgm_op.c.

References CMPTRGM.

Referenced by generate_trgm(), and generate_wildcard_trgm().

{
    /* comparator handed to qsort: order two trigrams the way CMPTRGM does */
    int         cmp = CMPTRGM(a, b);

    return cmp;
}

void compact_trigram ( trgm *  tptr,
char *  str,
int  bytelen 
)

Definition at line 112 of file trgm_op.c.

References COMP_CRC32, CPTRGM, FIN_CRC32, and INIT_CRC32.

Referenced by fillTrgm(), and make_trigrams().

{
    /*
     * Store the bytelen bytes at str into *tptr as one trigram.  Exactly
     * three bytes are copied verbatim; any other length is reduced to a
     * trigram by hashing.
     */
    pg_crc32    crc;

    if (bytelen == 3)
    {
        CPTRGM(tptr, str);
        return;
    }

    INIT_CRC32(crc);
    COMP_CRC32(crc, str, bytelen);
    FIN_CRC32(crc);

    /*
     * Only three bytes of the CRC are kept as the trigram; that is hoped to
     * be a good enough hash.
     */
    CPTRGM(tptr, &crc);
}

static char* find_word ( char *  str,
int  lenstr,
char **  endword,
int *  charlen 
) [static]

Definition at line 85 of file trgm_op.c.

References ISWORDCHR, and pg_mblen().

Referenced by generate_trgm().

{
    /*
     * Locate the next run of word characters within the first lenstr bytes
     * of str.
     *
     * Returns a pointer to the first character of the run, or NULL if there
     * is none.  On success *endword is set just past the run and *charlen to
     * the number of characters in it; on a NULL return neither output is
     * touched.
     */
    char       *wordstart = str;
    char       *cursor;
    int         nchars = 0;

    /* step over leading non-word characters, one (multibyte) char at a time */
    while (wordstart - str < lenstr)
    {
        if (ISWORDCHR(wordstart))
            break;
        wordstart += pg_mblen(wordstart);
    }

    if (wordstart - str >= lenstr)
        return NULL;

    /* extend over the word itself, counting characters rather than bytes */
    for (cursor = wordstart;
         cursor - str < lenstr && ISWORDCHR(cursor);
         cursor += pg_mblen(cursor))
        nchars++;

    *endword = cursor;
    *charlen = nchars;

    return wordstart;
}

TRGM* generate_trgm ( char *  str,
int  slen 
)

Definition at line 180 of file trgm_op.c.

References ARRKEY, buf, CALCGTSIZE, comp_trgm(), find_word(), TRGM::flag, GETARR, lowerstr_with_len(), LPADDING, make_trigrams(), palloc(), pfree(), qsort, RPADDING, SET_VARSIZE, TRGMHDRSIZE, and unique_array().

Referenced by gin_extract_query_trgm(), gin_extract_value_trgm(), gtrgm_compress(), gtrgm_consistent(), gtrgm_distance(), show_trgm(), and similarity().

{
    /*
     * Build the sorted, duplicate-free trigram array for the slen bytes at
     * str.
     *
     * The input is split into words with find_word(); each word is
     * (optionally) lower-cased, blank-padded, and fed to make_trigrams().
     * The result is a palloc'd TRGM flagged ARRKEY; it contains no trigrams
     * when the input is empty or yields none.
     */
    TRGM       *trg;
    char       *buf;
    trgm       *tptr;
    int         len,
                charlen,
                bytelen;
    char       *bword,
               *eword;

    trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
    trg->flag = ARRKEY;
    SET_VARSIZE(trg, TRGMHDRSIZE);

    if (slen + LPADDING + RPADDING < 3 || slen == 0)
        return trg;

    tptr = GETARR(trg);

    /*
     * Scratch buffer for one case-folded, blank-padded word.  Under
     * IGNORECASE the bytes copied in are the output of lowerstr_with_len(),
     * whose length is not bounded by the input length in a multibyte
     * encoding.  Size the buffer for the worst case of every input byte
     * becoming a maximum-width character, plus the padding blanks.  The
     * arithmetic is done in Size so a large slen cannot overflow int.
     */
    buf = (char *) palloc((Size) slen * pg_database_encoding_max_length() + 4);

    if (LPADDING > 0)
    {
        *buf = ' ';
        if (LPADDING > 1)
            *(buf + 1) = ' ';
    }

    eword = str;
    while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
    {
#ifdef IGNORECASE
        /* bword now points at a separately allocated lower-cased copy */
        bword = lowerstr_with_len(bword, eword - bword);
        bytelen = strlen(bword);
#else
        bytelen = eword - bword;
#endif

        memcpy(buf + LPADDING, bword, bytelen);

#ifdef IGNORECASE
        pfree(bword);
#endif
        /* two blanks are always written; only RPADDING of them are passed on */
        buf[LPADDING + bytelen] = ' ';
        buf[LPADDING + bytelen + 1] = ' ';

        /*
         * count trigrams
         */
        tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
                             charlen + LPADDING + RPADDING);
    }

    pfree(buf);

    if ((len = tptr - GETARR(trg)) == 0)
        return trg;

    /* sort, then squeeze out duplicates */
    if (len > 0)
    {
        qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
        len = unique_array(GETARR(trg), len);
    }

    SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));

    return trg;
}

TRGM* generate_wildcard_trgm ( const char *  str,
int  slen 
)

Definition at line 411 of file trgm_op.c.

References ARRKEY, buf, CALCGTSIZE, comp_trgm(), TRGM::flag, get_wildcard_part(), GETARR, lowerstr_with_len(), LPADDING, make_trigrams(), NULL, palloc(), pfree(), qsort, RPADDING, SET_VARSIZE, TRGMHDRSIZE, and unique_array().

Referenced by gin_extract_query_trgm(), and gtrgm_consistent().

{
    /*
     * Build the sorted, duplicate-free trigram array for the slen bytes at
     * str, where str is split into parts by get_wildcard_part() rather than
     * into plain words.  Padding is decided per part by get_wildcard_part().
     *
     * Returns a palloc'd TRGM flagged ARRKEY; it holds no trigrams when the
     * input is empty or no part yields any.
     */
    TRGM       *trg;
    char       *buf,
               *buf2;
    trgm       *tptr;
    int         len,
                charlen,
                bytelen;
    const char *eword;

    trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) *3);
    trg->flag = ARRKEY;
    SET_VARSIZE(trg, TRGMHDRSIZE);

    /* nothing to extract: hand back the header-only (empty) result */
    if (slen + LPADDING + RPADDING < 3 || slen == 0)
        return trg;

    tptr = GETARR(trg);

    /* scratch area that get_wildcard_part fills with one padded part */
    buf = palloc(sizeof(char) * (slen + 4));

    /*
     * Extract trigrams from each substring extracted by get_wildcard_part.
     */
    eword = str;
    while ((eword = get_wildcard_part(eword, slen - (eword - str),
                                      buf, &bytelen, &charlen)) != NULL)
    {
#ifdef IGNORECASE
        /* lower-casing yields a separate copy, so bytelen is re-measured */
        buf2 = lowerstr_with_len(buf, bytelen);
        bytelen = strlen(buf2);
#else
        buf2 = buf;
#endif

        /*
         * count trigrams
         */
        tptr = make_trigrams(tptr, buf2, bytelen, charlen);
#ifdef IGNORECASE
        pfree(buf2);
#endif
    }

    pfree(buf);

    /* tptr never moved: no trigrams were produced */
    if ((len = tptr - GETARR(trg)) == 0)
        return trg;

    /*
     * Make trigrams unique.
     */
    if (len > 0)
    {
        qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
        len = unique_array(GETARR(trg), len);
    }

    /* record the final size now that duplicates are gone */
    SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));

    return trg;
}

static const char* get_wildcard_part ( const char *  str,
int  lenstr,
char *  buf,
int *  bytelen,
int *  charlen 
) [static]

Definition at line 266 of file trgm_op.c.

References ISESCAPECHAR, ISWILDCARDCHAR, ISWORDCHR, LPADDING, pg_mblen(), and RPADDING.

Referenced by generate_wildcard_trgm().

{
    /*
     * Extract the next run of word characters from the first lenstr bytes of
     * str, treating escape and wildcard meta-characters specially.
     *
     * On success the run is copied into buf with escape characters removed,
     * blank padding is added on each side unless that side is adjacent to a
     * wildcard meta-character, *bytelen receives the number of bytes written
     * to buf and *charlen the number of characters (padding included).  The
     * return value is the position from which the next call should resume.
     *
     * Returns NULL, without setting *bytelen or *charlen, when no word
     * character is found before the end of the string.
     */
    const char *beginword = str;
    const char *endword;
    char       *s = buf;
    bool        in_leading_wildcard_meta = false;
    bool        in_trailing_wildcard_meta = false;
    bool        in_escape = false;
    int         clen;

    /*
     * Find the first word character, remembering whether preceding character
     * was wildcard meta-character.  Note that the in_escape state persists
     * from this loop to the next one, since we may exit at a word character
     * that is in_escape.
     */
    while (beginword - str < lenstr)
    {
        if (in_escape)
        {
            if (ISWORDCHR(beginword))
                break;
            /* an escaped non-word character: it is neither word nor wildcard */
            in_escape = false;
            in_leading_wildcard_meta = false;
        }
        else
        {
            if (ISESCAPECHAR(beginword))
                in_escape = true;
            else if (ISWILDCARDCHAR(beginword))
                in_leading_wildcard_meta = true;
            else if (ISWORDCHR(beginword))
                break;
            else
                in_leading_wildcard_meta = false;
        }
        beginword += pg_mblen(beginword);
    }

    /*
     * Handle string end.
     */
    if (beginword - str >= lenstr)
        return NULL;

    /*
     * Add left padding spaces if preceding character wasn't wildcard
     * meta-character.
     */
    *charlen = 0;
    if (!in_leading_wildcard_meta)
    {
        if (LPADDING > 0)
        {
            *s++ = ' ';
            (*charlen)++;
            if (LPADDING > 1)
            {
                *s++ = ' ';
                (*charlen)++;
            }
        }
    }

    /*
     * Copy data into buf until wildcard meta-character, non-word character or
     * string boundary.  Strip escapes during copy.
     */
    endword = beginword;
    while (endword - str < lenstr)
    {
        clen = pg_mblen(endword);
        if (in_escape)
        {
            if (ISWORDCHR(endword))
            {
                /* escaped word character: copy it, the escape itself is dropped */
                memcpy(s, endword, clen);
                (*charlen)++;
                s += clen;
            }
            else
            {
                /*
                 * Back up endword to the escape character when stopping at
                 * an escaped char, so that subsequent get_wildcard_part will
                 * restart from the escape character.  We assume here that
                 * escape chars are single-byte.
                 */
                endword--;
                break;
            }
            in_escape = false;
        }
        else
        {
            if (ISESCAPECHAR(endword))
                in_escape = true;
            else if (ISWILDCARDCHAR(endword))
            {
                /* part ends at a wildcard: suppresses right padding below */
                in_trailing_wildcard_meta = true;
                break;
            }
            else if (ISWORDCHR(endword))
            {
                memcpy(s, endword, clen);
                (*charlen)++;
                s += clen;
            }
            else
                break;
        }
        endword += clen;
    }

    /*
     * Add right padding spaces if next character isn't wildcard
     * meta-character.
     */
    if (!in_trailing_wildcard_meta)
    {
        if (RPADDING > 0)
        {
            *s++ = ' ';
            (*charlen)++;
            if (RPADDING > 1)
            {
                *s++ = ' ';
                (*charlen)++;
            }
        }
    }

    /* buf is not NUL-terminated here; its length is reported via *bytelen */
    *bytelen = s - buf;
    return endword;
}

static trgm* make_trigrams ( trgm *  tptr,
char *  str,
int  bytelen,
int  charlen 
) [static]

Definition at line 137 of file trgm_op.c.

References Assert, compact_trigram(), CPTRGM, and pg_mblen().

Referenced by generate_trgm(), and generate_wildcard_trgm().

{
    /*
     * Append to the array at tptr every trigram of the string str, which is
     * bytelen bytes and charlen characters long.
     *
     * Returns the position just past the last trigram written; that is tptr
     * itself when the string has fewer than three characters.
     */
    char       *ptr = str;

    if (charlen < 3)
        return tptr;

    if (bytelen > charlen)
    {
        /* Find multibyte character boundaries and apply compact_trigram */
        /* byte lengths of the three characters in the current window */
        int         lenfirst = pg_mblen(str),
                    lenmiddle = pg_mblen(str + lenfirst),
                    lenlast = pg_mblen(str + lenfirst + lenmiddle);

        while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen)
        {
            compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);

            /* slide the window forward by one character */
            ptr += lenfirst;
            tptr++;

            lenfirst = lenmiddle;
            lenmiddle = lenlast;

            /*
             * NOTE(review): once the window has reached the end of the
             * string this inspects the byte at str + bytelen -- confirm that
             * callers' buffers always have a readable byte there.
             */
            lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
        }
    }
    else
    {
        /* Fast path when there are no multibyte characters */
        Assert(bytelen == charlen);

        while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
        {
            CPTRGM(tptr, ptr);
            ptr++;
            tptr++;
        }
    }

    return tptr;
}

PG_FUNCTION_INFO_V1 ( show_trgm   ) 
PG_FUNCTION_INFO_V1 ( similarity_dist   ) 
PG_FUNCTION_INFO_V1 ( similarity   ) 
PG_FUNCTION_INFO_V1 ( similarity_op   ) 
PG_FUNCTION_INFO_V1 ( show_limit   ) 
PG_FUNCTION_INFO_V1 ( set_limit   ) 
Datum set_limit ( PG_FUNCTION_ARGS   ) 

Definition at line 38 of file trgm_op.c.

References elog, ERROR, PG_GETARG_FLOAT4, PG_RETURN_FLOAT4, and trgm_limit.

{
    /*
     * SQL-callable setter for trgm_limit.  Takes one float4 argument,
     * stores it in trgm_limit and returns the stored value.  Raises an error
     * if the argument is not within [0, 1].
     */
    float4      nlimit = PG_GETARG_FLOAT4(0);

    /*
     * The range test is phrased positively on purpose: every comparison
     * involving NaN is false, so the former "nlimit < 0 || nlimit > 1.0"
     * form let NaN through and stored it as the limit.
     */
    if (!(nlimit >= 0 && nlimit <= 1.0))
        elog(ERROR, "wrong limit, should be between 0 and 1");
    trgm_limit = nlimit;
    PG_RETURN_FLOAT4(trgm_limit);
}

Datum show_limit ( PG_FUNCTION_ARGS   ) 

Definition at line 49 of file trgm_op.c.

References PG_RETURN_FLOAT4, and trgm_limit.

Datum show_trgm ( PG_FUNCTION_ARGS   ) 

Definition at line 490 of file trgm_op.c.

References ARRNELEM, construct_array(), CPTRGM, DatumGetPointer, generate_trgm(), GETARR, i, ISPRINTABLETRGM, Max, palloc(), pfree(), pg_database_encoding_max_length(), PG_FREE_IF_COPY, PG_GETARG_TEXT_P, PG_RETURN_POINTER, PointerGetDatum, SET_VARSIZE, snprintf(), TEXTOID, trgm2int(), VARDATA, VARHDRSZ, and VARSIZE.

{
    /*
     * SQL-callable: return, as an array of text, the trigrams that
     * generate_trgm() extracts from the text argument.
     */
    text       *in = PG_GETARG_TEXT_P(0);
    TRGM       *trg;
    Datum      *d;
    ArrayType  *a;
    trgm       *ptr;
    int         i;

    trg = generate_trgm(VARDATA(in), VARSIZE(in) - VARHDRSZ);
    /* one slot more than needed -- presumably so the request is never zero-sized */
    d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(trg)));

    /* turn each trigram into its own text datum */
    for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
    {
        /* large enough for either rendering chosen below */
        text       *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length() * 3));

        if (pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr))
        {
            /* multibyte encoding and not printable: show the value in hex */
            snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr));
            SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
        }
        else
        {
            /* otherwise emit the three trigram bytes as they are */
            SET_VARSIZE(item, VARHDRSZ + 3);
            CPTRGM(VARDATA(item), ptr);
        }
        d[i] = PointerGetDatum(item);
    }

    /*
     * Trailing arguments are presumably the element length, by-value flag
     * and alignment for text -- confirm against construct_array().
     */
    a = construct_array(
                        d,
                        ARRNELEM(trg),
                        TEXTOID,
                        -1,
                        false,
                        'i'
        );

    /* the per-item datums and working storage are released once the array exists */
    for (i = 0; i < ARRNELEM(trg); i++)
        pfree(DatumGetPointer(d[i]));

    pfree(d);
    pfree(trg);
    PG_FREE_IF_COPY(in, 0);

    PG_RETURN_POINTER(a);
}

Datum similarity ( PG_FUNCTION_ARGS   ) 

Definition at line 664 of file trgm_op.c.

References cnt_sml(), generate_trgm(), pfree(), PG_FREE_IF_COPY, PG_GETARG_TEXT_P, PG_RETURN_FLOAT4, VARDATA, VARHDRSZ, and VARSIZE.

Referenced by similarity_dist(), and similarity_op().

{
    /*
     * SQL-callable: similarity of two text arguments, computed by cnt_sml()
     * over the trigram arrays that generate_trgm() builds for each of them.
     */
    text       *left = PG_GETARG_TEXT_P(0);
    text       *right = PG_GETARG_TEXT_P(1);
    TRGM       *left_trg = generate_trgm(VARDATA(left), VARSIZE(left) - VARHDRSZ);
    TRGM       *right_trg = generate_trgm(VARDATA(right), VARSIZE(right) - VARHDRSZ);
    float4      score = cnt_sml(left_trg, right_trg);

    /* release the trigram arrays and any copies made of the arguments */
    pfree(left_trg);
    pfree(right_trg);
    PG_FREE_IF_COPY(left, 0);
    PG_FREE_IF_COPY(right, 1);

    PG_RETURN_FLOAT4(score);
}

Datum similarity_dist ( PG_FUNCTION_ARGS   ) 
Datum similarity_op ( PG_FUNCTION_ARGS   ) 
uint32 trgm2int ( trgm *  ptr  ) 

Definition at line 476 of file trgm_op.c.

References val.

Referenced by gin_extract_query_trgm(), gin_extract_value_trgm(), and show_trgm().

{
    /*
     * Pack the three bytes of a trigram into an integer: the first byte ends
     * up in bits 16-23, the second in bits 8-15 and the third in bits 0-7.
     */
    const unsigned char *bytes = (const unsigned char *) ptr;
    uint32      packed;

    packed = ((uint32) bytes[0] << 16) |
        ((uint32) bytes[1] << 8) |
        (uint32) bytes[2];

    return packed;
}

bool trgm_contained_by ( TRGM *  trg1,
TRGM *  trg2 
)

Definition at line 586 of file trgm_op.c.

References ARRNELEM, CMPTRGM, and GETARR.

Referenced by gtrgm_consistent().

{
    /*
     * Report whether every trigram of trg1 also occurs in trg2.  Both arrays
     * are walked in step, so they are presumably expected to be sorted per
     * CMPTRGM.
     */
    trgm       *need = GETARR(trg1);
    trgm       *have = GETARR(trg2);
    trgm       *need_end = need + ARRNELEM(trg1);
    trgm       *have_end = have + ARRNELEM(trg2);

    while (need < need_end && have < have_end)
    {
        int         cmp = CMPTRGM(need, have);

        /* trg2 has already moved past this trg1 trigram: it is missing */
        if (cmp < 0)
            return false;

        /* on a match both sides advance; otherwise only trg2 does */
        if (cmp == 0)
            need++;
        have++;
    }

    /* contained only if trg1 was consumed completely */
    return need >= need_end;
}

bool* trgm_presence_map ( TRGM *  query,
TRGM *  key 
)

Definition at line 625 of file trgm_op.c.

References ARRNELEM, CMPTRGM, GETARR, i, and palloc0().

Referenced by gtrgm_consistent().

{
    /*
     * Build a palloc0'd array with one bool per query trigram; entry i is
     * true when the i-th query trigram is found in key.
     */
    trgm       *qarr = GETARR(query);
    trgm       *karr = GETARR(key);
    int         nquery = ARRNELEM(query);
    int         nkey = ARRNELEM(key);
    bool       *found = (bool *) palloc0(nquery * sizeof(bool));
    int         qi;

    for (qi = 0; qi < nquery; qi++)
    {
        trgm       *needle = qarr + qi;
        int         lo = 0;
        int         hi = nkey;

        /* binary search for this query trigram in the key array */
        while (lo < hi)
        {
            int         mid = (lo + hi) / 2;
            int         cmp = CMPTRGM(needle, karr + mid);

            if (cmp == 0)
            {
                found[qi] = true;
                break;
            }

            if (cmp < 0)
                hi = mid;
            else
                lo = mid + 1;
        }
    }

    return found;
}

static int unique_array ( trgm *  a,
int  len 
) [static]

Definition at line 61 of file trgm_op.c.

References CMPTRGM, and CPTRGM.

Referenced by generate_trgm(), and generate_wildcard_trgm().

{
    /*
     * Collapse runs of equal adjacent trigrams in a[0..len-1] in place and
     * return the number of trigrams kept.  Only neighbours are compared, so
     * the input is presumably expected to be sorted.  Callers in this file
     * pass len > 0 only.
     */
    trgm       *last = a;       /* most recently kept trigram */
    trgm       *scan;

    for (scan = a; scan - a < len; scan++)
    {
        /* a trigram differing from the last kept one opens a new slot */
        if (CMPTRGM(scan, last))
        {
            last++;
            CPTRGM(last, scan);
        }
    }

    return last + 1 - a;
}


Variable Documentation

PG_MODULE_MAGIC

Definition at line 14 of file trgm_op.c.

float4 trgm_limit = 0.3f

Definition at line 16 of file trgm_op.c.

Referenced by gin_trgm_consistent(), gtrgm_consistent(), set_limit(), show_limit(), and similarity_op().