Header And Logo

PostgreSQL
| The world's most advanced open source database.

Data Structures | Functions

tsvector.c File Reference

#include "postgres.h"
#include "libpq/pqformat.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_utils.h"
#include "utils/memutils.h"
Include dependency graph for tsvector.c:

Go to the source code of this file.

Data Structures

struct  WordEntryIN

Functions

static int comparePos (const void *a, const void *b)
static int uniquePos (WordEntryPos *a, int l)
static int compareentry (const void *va, const void *vb, void *arg)
static int uniqueentry (WordEntryIN *a, int l, char *buf, int *outbuflen)
static int WordEntryCMP (WordEntry *a, WordEntry *b, char *buf)
Datum tsvectorin (PG_FUNCTION_ARGS)
Datum tsvectorout (PG_FUNCTION_ARGS)
Datum tsvectorsend (PG_FUNCTION_ARGS)
Datum tsvectorrecv (PG_FUNCTION_ARGS)

Function Documentation

static int compareentry ( const void *  va,
const void *  vb,
void *  arg 
) [static]

Definition at line 81 of file tsvector.c.

References WordEntryIN::entry, WordEntry::len, WordEntry::pos, and tsCompareString().

Referenced by tsvectorrecv(), uniqueentry(), and WordEntryCMP().

{
    /*
     * qsort_arg-style comparator.  va and vb are treated as WordEntryIN
     * pointers; arg is the base of the character buffer that each entry's
     * pos offset indexes into.  The result is whatever tsCompareString()
     * returns for the two lexemes.
     */
    char       *lexemes = (char *) arg;
    const WordEntryIN *lhs = (const WordEntryIN *) va;
    const WordEntryIN *rhs = (const WordEntryIN *) vb;
    char       *lhs_text = lexemes + lhs->entry.pos;
    char       *rhs_text = lexemes + rhs->entry.pos;

    return tsCompareString(lhs_text, lhs->entry.len,
                           rhs_text, rhs->entry.len,
                           false);
}

static int comparePos ( const void *  a,
const void *  b 
) [static]

Definition at line 32 of file tsvector.c.

References WEP_GETPOS.

Referenced by uniquePos().

{
    /*
     * qsort comparator for WordEntryPos values: orders by WEP_GETPOS()
     * only.  Returns 1, -1 or 0.
     */
    int         lhs = WEP_GETPOS(*(const WordEntryPos *) a);
    int         rhs = WEP_GETPOS(*(const WordEntryPos *) b);

    if (lhs > rhs)
        return 1;
    if (lhs < rhs)
        return -1;
    return 0;
}

Datum tsvectorin ( PG_FUNCTION_ARGS   ) 

Definition at line 175 of file tsvector.c.

References ARRPTR, Assert, buf, CALCDATASIZE, close_tsvector_parser(), cur, elog, WordEntryIN::entry, ereport, errcode(), errmsg(), ERROR, gettoken_tsvector(), WordEntry::haspos, i, init_tsvector_parser(), WordEntry::len, MAXSTRLEN, MAXSTRPOS, NULL, palloc(), palloc0(), pfree(), PG_GETARG_CSTRING, PG_RETURN_TSVECTOR, WordEntryIN::pos, WordEntry::pos, WordEntryIN::poslen, repalloc(), SET_VARSIZE, SHORTALIGN, TSVectorData::size, STRPTR, tmpbuf, and uniqueentry().

{
    /*
     * Text input function: parse the C-string argument into a TSVector.
     *
     * Phase 1 pulls tokens from the tsvector parser, appending each lexeme's
     * bytes to tmpbuf and recording one WordEntryIN per token in arr.
     * Phase 2 hands arr to uniqueentry(), which sorts it, merges duplicate
     * lexemes and reports how many bytes the final data area needs.
     * Phase 3 allocates the result and copies lexemes and position lists
     * into it.
     *
     * Raises ERROR when a token is MAXSTRLEN bytes or longer, when the
     * accumulated lexeme data exceeds MAXSTRPOS, or when a single entry
     * carries more than 0xFFFF positions.
     */
    char       *buf = PG_GETARG_CSTRING(0);
    TSVectorParseState state;
    WordEntryIN *arr;
    int         totallen;
    int         arrlen;         /* allocated size of arr */
    WordEntry  *inarr;
    int         len = 0;        /* number of entries currently used in arr */
    TSVector    in;
    int         i;
    char       *token;
    int         toklen;
    WordEntryPos *pos;
    int         poslen;
    char       *strbuf;
    int         stroff;

    /*
     * Tokens are appended to tmpbuf, cur is a pointer to the end of used
     * space in tmpbuf.
     */
    char       *tmpbuf;
    char       *cur;
    int         buflen = 256;   /* allocated size of tmpbuf */

    state = init_tsvector_parser(buf, false, false);

    arrlen = 64;
    arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
    cur = tmpbuf = (char *) palloc(buflen);

    /* Phase 1: collect every token together with its position list */
    while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
    {
        if (toklen >= MAXSTRLEN)
            ereport(ERROR,
                    (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                     errmsg("word is too long (%ld bytes, max %ld bytes)",
                            (long) toklen,
                            (long) (MAXSTRLEN - 1))));

        /* checked before appending: this is the offset the new entry gets */
        if (cur - tmpbuf > MAXSTRPOS)
            ereport(ERROR,
                    (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                     errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)",
                            (long) (cur - tmpbuf), (long) MAXSTRPOS)));

        /*
         * Enlarge buffers if needed
         */
        if (len >= arrlen)
        {
            arrlen *= 2;
            arr = (WordEntryIN *)
                repalloc((void *) arr, sizeof(WordEntryIN) * arrlen);
        }
        while ((cur - tmpbuf) + toklen >= buflen)
        {
            /* repalloc may move tmpbuf, so remember cur as an offset */
            int         dist = cur - tmpbuf;

            buflen *= 2;
            tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
            cur = tmpbuf + dist;
        }
        /* entry.pos is, for now, an offset into tmpbuf */
        arr[len].entry.len = toklen;
        arr[len].entry.pos = cur - tmpbuf;
        memcpy((void *) cur, (void *) token, toklen);
        cur += toklen;

        if (poslen != 0)
        {
            /* keep the parser's position array; it is pfree'd further down */
            arr[len].entry.haspos = 1;
            arr[len].pos = pos;
            arr[len].poslen = poslen;
        }
        else
        {
            arr[len].entry.haspos = 0;
            arr[len].pos = NULL;
            arr[len].poslen = 0;
        }
        len++;
    }

    close_tsvector_parser(state);

    /*
     * Phase 2: sort and de-duplicate.  From here on buflen no longer means
     * "allocated size of tmpbuf": uniqueentry() overwrites it with the number
     * of bytes needed for lexemes plus position data, and len becomes the
     * number of unique entries.
     */
    if (len > 0)
        len = uniqueentry(arr, len, tmpbuf, &buflen);
    else
        buflen = 0;

    if (buflen > MAXSTRPOS)
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", buflen, MAXSTRPOS)));

    /* Phase 3: build the result */
    totallen = CALCDATASIZE(len, buflen);
    in = (TSVector) palloc0(totallen);
    SET_VARSIZE(in, totallen);
    in->size = len;
    inarr = ARRPTR(in);
    strbuf = STRPTR(in);
    stroff = 0;
    for (i = 0; i < len; i++)
    {
        /* copy the lexeme and retarget entry.pos to its offset in strbuf */
        memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
        arr[i].entry.pos = stroff;
        stroff += arr[i].entry.len;
        if (arr[i].entry.haspos)
        {
            /* the count is stored as a uint16 below, so it must fit */
            if (arr[i].poslen > 0xFFFF)
                elog(ERROR, "positions array too long");

            /* Copy number of positions */
            stroff = SHORTALIGN(stroff);
            *(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
            stroff += sizeof(uint16);

            /* Copy positions */
            memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
            stroff += arr[i].poslen * sizeof(WordEntryPos);

            pfree(arr[i].pos);
        }
        inarr[i] = arr[i].entry;
    }

    /* the bytes written must end exactly at the end of the allocation */
    Assert((strbuf + stroff - (char *) in) == totallen);

    PG_RETURN_TSVECTOR(in);
}

Datum tsvectorout ( PG_FUNCTION_ARGS   ) 

Definition at line 308 of file tsvector.c.

References ARRPTR, i, WordEntry::len, palloc(), pg_database_encoding_max_length(), PG_FREE_IF_COPY, PG_GETARG_TSVECTOR, pg_mblen(), PG_RETURN_CSTRING, WordEntry::pos, POSDATALEN, POSDATAPTR, TSVectorData::size, STRPTR, t_iseq, WEP_GETPOS, and WEP_GETWEIGHT.

{
    /*
     * Text output function: render a TSVector as a palloc'd C string.
     *
     * Each lexeme is written between single quotes, with a quote or a
     * backslash inside the lexeme doubled.  An entry with positions is
     * followed by ':' and a comma-separated list of position numbers, each
     * optionally suffixed with a weight letter (3 -> A, 2 -> B, 1 -> C,
     * anything else -> no letter).  Entries are separated by one space.
     */
    TSVector    vec = PG_GETARG_TSVECTOR(0);
    WordEntry  *entries = ARRPTR(vec);
    char       *result;
    char       *dst;
    int32       idx;
    int32       bufsize;

    /* Pass 1: compute an upper bound on the output size */
    bufsize = vec->size * 2 /* '' */ + vec->size - 1 /* space */ + 2 /* \0 */ ;
    for (idx = 0; idx < vec->size; idx++)
    {
        WordEntry  *entry = &entries[idx];

        /* every character could need an escape character in front of it */
        bufsize += entry->len * 2 * pg_database_encoding_max_length();
        if (entry->haspos)
            bufsize += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(vec, entry);
    }

    /* Pass 2: emit the text */
    dst = result = (char *) palloc(bufsize);
    for (idx = 0; idx < vec->size; idx++)
    {
        WordEntry  *entry = &entries[idx];
        char       *src = STRPTR(vec) + entry->pos;
        char       *src_end = src + entry->len;
        int32       npos;

        if (idx != 0)
            *dst++ = ' ';
        *dst++ = '\'';

        /* copy the lexeme one (possibly multibyte) character at a time */
        while (src < src_end)
        {
            int         nbytes = pg_mblen(src);

            if (t_iseq(src, '\''))
                *dst++ = '\'';
            else if (t_iseq(src, '\\'))
                *dst++ = '\\';

            for (; nbytes != 0; nbytes--)
                *dst++ = *src++;
        }
        *dst++ = '\'';

        npos = POSDATALEN(vec, entry);
        if (npos != 0)
        {
            WordEntryPos *posv = POSDATAPTR(vec, entry);
            int32       k;

            *dst++ = ':';
            for (k = 0; k < npos; k++)
            {
                int         weight;

                dst += sprintf(dst, "%d", WEP_GETPOS(posv[k]));

                weight = WEP_GETWEIGHT(posv[k]);
                if (weight == 3)
                    *dst++ = 'A';
                else if (weight == 2)
                    *dst++ = 'B';
                else if (weight == 1)
                    *dst++ = 'C';

                /* comma between positions, none after the last one */
                if (k + 1 < npos)
                    *dst++ = ',';
            }
        }
    }

    *dst = '\0';
    PG_FREE_IF_COPY(vec, 0);
    PG_RETURN_CSTRING(result);
}

Datum tsvectorrecv ( PG_FUNCTION_ARGS   ) 

Definition at line 440 of file tsvector.c.

References ARRPTR, buf, compareentry(), DATAHDRSIZE, elog, TSVectorData::entries, ERROR, WordEntry::haspos, i, WordEntry::len, MaxAllocSize, MAXNUMPOS, MAXSTRLEN, MAXSTRPOS, palloc0(), PG_GETARG_POINTER, PG_RETURN_TSVECTOR, WordEntry::pos, POSDATAPTR, pq_getmsgint(), pq_getmsgstring(), qsort_arg(), repalloc(), SET_VARSIZE, SHORTALIGN, TSVectorData::size, STRPTR, WEP_GETPOS, and WordEntryCMP().

{
    /*
     * Binary receive function: rebuild a TSVector from the message in the
     * StringInfo passed as argument 0.
     *
     * The message is read as: an int32 entry count, then for each entry a
     * null-terminated lexeme, a uint16 position count and that many
     * positions.  The entry count, lexeme length, total data length,
     * position count and position ordering are sanity-checked and rejected
     * with elog(ERROR) when invalid.  If the lexemes do not arrive in
     * strictly increasing order, the WordEntry array is sorted before
     * returning.
     */
    StringInfo  buf = (StringInfo) PG_GETARG_POINTER(0);
    TSVector    vec;
    int         i;
    int32       nentries;
    int         datalen;        /* number of bytes used in the variable size
                                 * area after fixed size TSVector header and
                                 * WordEntries */
    Size        hdrlen;
    Size        len;            /* allocated size of vec */
    bool        needSort = false;

    nentries = pq_getmsgint(buf, sizeof(int32));
    if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
        elog(ERROR, "invalid size of tsvector");

    hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;

    len = hdrlen * 2;           /* times two to make room for lexemes */
    vec = (TSVector) palloc0(len);
    vec->size = nentries;

    datalen = 0;
    for (i = 0; i < nentries; i++)
    {
        const char *lexeme;
        uint16      npos;
        size_t      lex_len;
        Size        posbytes;   /* bytes taken by this entry's position
                                 * data: uint16 count + npos positions */

        lexeme = pq_getmsgstring(buf);
        npos = (uint16) pq_getmsgint(buf, sizeof(uint16));

        /* sanity checks */

        lex_len = strlen(lexeme);
        if (lex_len > MAXSTRLEN)
            elog(ERROR, "invalid tsvector: lexeme too long");

        if (datalen > MAXSTRPOS)
            elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded");

        if (npos > MAXNUMPOS)
            elog(ERROR, "unexpected number of tsvector positions");

        posbytes = sizeof(uint16) + npos * sizeof(WordEntryPos);

        /*
         * Looks valid. Fill the WordEntry struct, and copy lexeme.
         *
         * But make sure the buffer is large enough first.  The space
         * reserved here is exactly what this entry can write: the lexeme,
         * alignment padding, and the position data.
         */
        while (hdrlen + SHORTALIGN(datalen + lex_len) + posbytes >= len)
        {
            len *= 2;
            vec = (TSVector) repalloc(vec, len);
        }

        vec->entries[i].haspos = (npos > 0) ? 1 : 0;
        vec->entries[i].len = lex_len;
        vec->entries[i].pos = datalen;

        memcpy(STRPTR(vec) + datalen, lexeme, lex_len);

        datalen += lex_len;

        /* entries must be strictly increasing; otherwise sort at the end */
        if (i > 0 && WordEntryCMP(&vec->entries[i],
                                  &vec->entries[i - 1],
                                  STRPTR(vec)) <= 0)
            needSort = true;

        /* Receive positions */
        if (npos > 0)
        {
            uint16      j;
            WordEntryPos *wepptr;

            /*
             * Pad to 2-byte alignment if necessary. Though we used palloc0
             * for the initial allocation, subsequent repalloc'd memory areas
             * are not initialized to zero.
             */
            if (datalen != SHORTALIGN(datalen))
            {
                *(STRPTR(vec) + datalen) = '\0';
                datalen = SHORTALIGN(datalen);
            }

            memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));

            wepptr = POSDATAPTR(vec, &vec->entries[i]);
            for (j = 0; j < npos; j++)
            {
                wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
                if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
                    elog(ERROR, "position information is misordered");
            }

            /*
             * Advance by the bytes actually written (the count plus the
             * positions).  This was previously computed with
             * sizeof(WordEntry), which over-counted relative to the space
             * reserved above, left gaps between entries and could make
             * hdrlen + datalen exceed the allocated size.
             */
            datalen += posbytes;
        }
    }

    SET_VARSIZE(vec, hdrlen + datalen);

    if (needSort)
        qsort_arg((void *) ARRPTR(vec), vec->size, sizeof(WordEntry),
                  compareentry, (void *) STRPTR(vec));

    PG_RETURN_TSVECTOR(vec);
}

Datum tsvectorsend ( PG_FUNCTION_ARGS   ) 

Definition at line 401 of file tsvector.c.

References ARRPTR, buf, i, WordEntry::len, PG_GETARG_TSVECTOR, PG_RETURN_BYTEA_P, WordEntry::pos, POSDATALEN, POSDATAPTR, pq_begintypsend(), pq_endtypsend(), pq_sendbyte(), pq_sendint(), pq_sendtext(), TSVectorData::size, and STRPTR.

{
    /*
     * Binary send function: serialise a TSVector as an int32 entry count
     * followed, per entry, by the lexeme bytes, a terminating zero byte, a
     * uint16 position count and each position value.
     */
    TSVector    vec = PG_GETARG_TSVECTOR(0);
    WordEntry  *entries = ARRPTR(vec);
    StringInfoData out;
    int         idx;

    pq_begintypsend(&out);
    pq_sendint(&out, vec->size, sizeof(int32));

    for (idx = 0; idx < vec->size; idx++)
    {
        WordEntry  *entry = &entries[idx];
        uint16      npos = POSDATALEN(vec, entry);

        /*
         * Lexemes are stored without a null terminator inside the TSVector,
         * so one is sent explicitly after the text.
         */
        pq_sendtext(&out, STRPTR(vec) + entry->pos, entry->len);
        pq_sendbyte(&out, '\0');

        pq_sendint(&out, npos, sizeof(uint16));

        if (npos > 0)
        {
            WordEntryPos *posv = POSDATAPTR(vec, entry);
            int         k;

            for (k = 0; k < npos; k++)
                pq_sendint(&out, posv[k], sizeof(WordEntryPos));
        }
    }

    PG_RETURN_BYTEA_P(pq_endtypsend(&out));
}

static int uniqueentry ( WordEntryIN *  a,
int  l,
char *  buf,
int *  outbuflen 
) [static]

Definition at line 97 of file tsvector.c.

References Assert, compareentry(), WordEntryIN::entry, WordEntry::haspos, WordEntry::len, pfree(), WordEntryIN::pos, WordEntry::pos, WordEntryIN::poslen, qsort_arg(), repalloc(), SHORTALIGN, and uniquePos().

Referenced by tsvectorin().

{
    /*
     * Sort the l entries of a by lexeme (offsets are relative to buf) and
     * collapse runs of identical lexemes into a single entry, in place.
     *
     * When duplicates carry position lists, the lists are concatenated onto
     * the surviving entry; the duplicate's own array is pfree'd once copied,
     * or handed over as-is if the survivor had none.  Each surviving entry's
     * positions are then passed through uniquePos().
     *
     * Returns the number of entries left at the front of a.  *outbuflen
     * receives the number of bytes needed to store all surviving lexemes
     * plus, for entries with positions, a short-aligned uint16 count and the
     * positions themselves.
     *
     * Requires l >= 1.
     */
    int         buflen;
    WordEntryIN *ptr,
               *res;

    Assert(l >= 1);

    if (l > 1)
        qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry,
                  (void *) buf);

    buflen = 0;
    res = a;                    /* last entry kept so far */
    ptr = a + 1;                /* next candidate */
    while (ptr - a < l)
    {
        if (!(ptr->entry.len == res->entry.len &&
              strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
                      res->entry.len) == 0))
        {
            /* done accumulating data into *res, count space needed */
            buflen += res->entry.len;
            if (res->entry.haspos)
            {
                res->poslen = uniquePos(res->pos, res->poslen);
                buflen = SHORTALIGN(buflen);
                buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
            }
            res++;

            /*
             * Until the first duplicate has been dropped, res and ptr point
             * at the same slot.  memcpy() must not be called on overlapping
             * objects, so skip the copy in that case.
             */
            if (res != ptr)
                memcpy(res, ptr, sizeof(WordEntryIN));
        }
        else if (ptr->entry.haspos)
        {
            if (res->entry.haspos)
            {
                /* append ptr's positions to res's positions */
                int         newlen = ptr->poslen + res->poslen;

                res->pos = (WordEntryPos *)
                    repalloc(res->pos, newlen * sizeof(WordEntryPos));
                memcpy(&res->pos[res->poslen], ptr->pos,
                       ptr->poslen * sizeof(WordEntryPos));
                res->poslen = newlen;
                pfree(ptr->pos);
            }
            else
            {
                /* just give ptr's positions to res */
                res->entry.haspos = 1;
                res->pos = ptr->pos;
                res->poslen = ptr->poslen;
            }
        }
        ptr++;
    }

    /* count space needed for last item */
    buflen += res->entry.len;
    if (res->entry.haspos)
    {
        res->poslen = uniquePos(res->pos, res->poslen);
        buflen = SHORTALIGN(buflen);
        buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
    }

    *outbuflen = buflen;
    return res + 1 - a;
}

static int uniquePos ( WordEntryPos *  a,
int  l 
) [static]

Definition at line 49 of file tsvector.c.

References comparePos(), MAXENTRYPOS, MAXNUMPOS, qsort, WEP_GETPOS, WEP_GETWEIGHT, and WEP_SETWEIGHT.

Referenced by uniqueentry().

{
    /*
     * Sort the l positions in a by position number and remove duplicates in
     * place.  When two items share a position number, the surviving item
     * takes the larger of the two weights.  Scanning stops early once
     * MAXNUMPOS items have been kept or a kept item's position equals
     * MAXENTRYPOS - 1.  Returns the number of items kept at the front of a.
     */
    int         last = 0;       /* index of the most recently kept item */
    int         scan;

    if (l <= 1)
        return l;

    qsort(a, l, sizeof(WordEntryPos), comparePos);

    for (scan = 1; scan < l; scan++)
    {
        if (WEP_GETPOS(a[scan]) == WEP_GETPOS(a[last]))
        {
            /* same position: only the weight may need upgrading */
            if (WEP_GETWEIGHT(a[scan]) > WEP_GETWEIGHT(a[last]))
                WEP_SETWEIGHT(a[last], WEP_GETWEIGHT(a[scan]));
            continue;
        }

        /* new position: keep it */
        last++;
        a[last] = a[scan];

        if (last >= MAXNUMPOS - 1 ||
            WEP_GETPOS(a[last]) == MAXENTRYPOS - 1)
            break;
    }

    return last + 1;
}

static int WordEntryCMP ( WordEntry *  a,
WordEntry *  b,
char *  buf 
) [static]

Definition at line 168 of file tsvector.c.

References compareentry().

Referenced by tsvectorrecv().

{
    /*
     * Compare two WordEntry items whose lexemes live in buf by delegating to
     * compareentry(), which only reads the entry.pos and entry.len fields of
     * its arguments.
     * NOTE(review): this presumably relies on entry being the first member
     * of WordEntryIN -- confirm against the struct definition.
     */
    int         cmp = compareentry(a, b, buf);

    return cmp;
}