Header And Logo

PostgreSQL
| The world's most advanced open source database.

tsvector.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * tsvector.c
00004  *    I/O functions for tsvector
00005  *
00006  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00007  *
00008  *
00009  * IDENTIFICATION
00010  *    src/backend/utils/adt/tsvector.c
00011  *
00012  *-------------------------------------------------------------------------
00013  */
00014 
00015 #include "postgres.h"
00016 
00017 #include "libpq/pqformat.h"
00018 #include "tsearch/ts_locale.h"
00019 #include "tsearch/ts_utils.h"
00020 #include "utils/memutils.h"
00021 
00022 typedef struct
00023 {
00024     WordEntry   entry;          /* must be first! */
00025     WordEntryPos *pos;
00026     int         poslen;         /* number of elements in pos */
00027 } WordEntryIN;
00028 
00029 
00030 /* Compare two WordEntryPos values for qsort */
00031 static int
00032 comparePos(const void *a, const void *b)
00033 {
00034     int         apos = WEP_GETPOS(*(const WordEntryPos *) a);
00035     int         bpos = WEP_GETPOS(*(const WordEntryPos *) b);
00036 
00037     if (apos == bpos)
00038         return 0;
00039     return (apos > bpos) ? 1 : -1;
00040 }
00041 
00042 /*
00043  * Removes duplicate pos entries. If there's two entries with same pos
00044  * but different weight, the higher weight is retained.
00045  *
00046  * Returns new length.
00047  */
00048 static int
00049 uniquePos(WordEntryPos *a, int l)
00050 {
00051     WordEntryPos *ptr,
00052                *res;
00053 
00054     if (l <= 1)
00055         return l;
00056 
00057     qsort((void *) a, l, sizeof(WordEntryPos), comparePos);
00058 
00059     res = a;
00060     ptr = a + 1;
00061     while (ptr - a < l)
00062     {
00063         if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
00064         {
00065             res++;
00066             *res = *ptr;
00067             if (res - a >= MAXNUMPOS - 1 ||
00068                 WEP_GETPOS(*res) == MAXENTRYPOS - 1)
00069                 break;
00070         }
00071         else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
00072             WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr));
00073         ptr++;
00074     }
00075 
00076     return res + 1 - a;
00077 }
00078 
00079 /* Compare two WordEntryIN values for qsort */
00080 static int
00081 compareentry(const void *va, const void *vb, void *arg)
00082 {
00083     const WordEntryIN *a = (const WordEntryIN *) va;
00084     const WordEntryIN *b = (const WordEntryIN *) vb;
00085     char       *BufferStr = (char *) arg;
00086 
00087     return tsCompareString(&BufferStr[a->entry.pos], a->entry.len,
00088                            &BufferStr[b->entry.pos], b->entry.len,
00089                            false);
00090 }
00091 
00092 /*
00093  * Sort an array of WordEntryIN, remove duplicates.
00094  * *outbuflen receives the amount of space needed for strings and positions.
00095  */
00096 static int
00097 uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
00098 {
00099     int         buflen;
00100     WordEntryIN *ptr,
00101                *res;
00102 
00103     Assert(l >= 1);
00104 
00105     if (l > 1)
00106         qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry,
00107                   (void *) buf);
00108 
00109     buflen = 0;
00110     res = a;
00111     ptr = a + 1;
00112     while (ptr - a < l)
00113     {
00114         if (!(ptr->entry.len == res->entry.len &&
00115               strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
00116                       res->entry.len) == 0))
00117         {
00118             /* done accumulating data into *res, count space needed */
00119             buflen += res->entry.len;
00120             if (res->entry.haspos)
00121             {
00122                 res->poslen = uniquePos(res->pos, res->poslen);
00123                 buflen = SHORTALIGN(buflen);
00124                 buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
00125             }
00126             res++;
00127             memcpy(res, ptr, sizeof(WordEntryIN));
00128         }
00129         else if (ptr->entry.haspos)
00130         {
00131             if (res->entry.haspos)
00132             {
00133                 /* append ptr's positions to res's positions */
00134                 int         newlen = ptr->poslen + res->poslen;
00135 
00136                 res->pos = (WordEntryPos *)
00137                     repalloc(res->pos, newlen * sizeof(WordEntryPos));
00138                 memcpy(&res->pos[res->poslen], ptr->pos,
00139                        ptr->poslen * sizeof(WordEntryPos));
00140                 res->poslen = newlen;
00141                 pfree(ptr->pos);
00142             }
00143             else
00144             {
00145                 /* just give ptr's positions to pos */
00146                 res->entry.haspos = 1;
00147                 res->pos = ptr->pos;
00148                 res->poslen = ptr->poslen;
00149             }
00150         }
00151         ptr++;
00152     }
00153 
00154     /* count space needed for last item */
00155     buflen += res->entry.len;
00156     if (res->entry.haspos)
00157     {
00158         res->poslen = uniquePos(res->pos, res->poslen);
00159         buflen = SHORTALIGN(buflen);
00160         buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
00161     }
00162 
00163     *outbuflen = buflen;
00164     return res + 1 - a;
00165 }
00166 
00167 static int
00168 WordEntryCMP(WordEntry *a, WordEntry *b, char *buf)
00169 {
00170     return compareentry(a, b, buf);
00171 }
00172 
00173 
00174 Datum
00175 tsvectorin(PG_FUNCTION_ARGS)
00176 {
00177     char       *buf = PG_GETARG_CSTRING(0);
00178     TSVectorParseState state;
00179     WordEntryIN *arr;
00180     int         totallen;
00181     int         arrlen;         /* allocated size of arr */
00182     WordEntry  *inarr;
00183     int         len = 0;
00184     TSVector    in;
00185     int         i;
00186     char       *token;
00187     int         toklen;
00188     WordEntryPos *pos;
00189     int         poslen;
00190     char       *strbuf;
00191     int         stroff;
00192 
00193     /*
00194      * Tokens are appended to tmpbuf, cur is a pointer to the end of used
00195      * space in tmpbuf.
00196      */
00197     char       *tmpbuf;
00198     char       *cur;
00199     int         buflen = 256;   /* allocated size of tmpbuf */
00200 
00201     state = init_tsvector_parser(buf, false, false);
00202 
00203     arrlen = 64;
00204     arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
00205     cur = tmpbuf = (char *) palloc(buflen);
00206 
00207     while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
00208     {
00209         if (toklen >= MAXSTRLEN)
00210             ereport(ERROR,
00211                     (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
00212                      errmsg("word is too long (%ld bytes, max %ld bytes)",
00213                             (long) toklen,
00214                             (long) (MAXSTRLEN - 1))));
00215 
00216         if (cur - tmpbuf > MAXSTRPOS)
00217             ereport(ERROR,
00218                     (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
00219                      errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)",
00220                             (long) (cur - tmpbuf), (long) MAXSTRPOS)));
00221 
00222         /*
00223          * Enlarge buffers if needed
00224          */
00225         if (len >= arrlen)
00226         {
00227             arrlen *= 2;
00228             arr = (WordEntryIN *)
00229                 repalloc((void *) arr, sizeof(WordEntryIN) * arrlen);
00230         }
00231         while ((cur - tmpbuf) + toklen >= buflen)
00232         {
00233             int         dist = cur - tmpbuf;
00234 
00235             buflen *= 2;
00236             tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
00237             cur = tmpbuf + dist;
00238         }
00239         arr[len].entry.len = toklen;
00240         arr[len].entry.pos = cur - tmpbuf;
00241         memcpy((void *) cur, (void *) token, toklen);
00242         cur += toklen;
00243 
00244         if (poslen != 0)
00245         {
00246             arr[len].entry.haspos = 1;
00247             arr[len].pos = pos;
00248             arr[len].poslen = poslen;
00249         }
00250         else
00251         {
00252             arr[len].entry.haspos = 0;
00253             arr[len].pos = NULL;
00254             arr[len].poslen = 0;
00255         }
00256         len++;
00257     }
00258 
00259     close_tsvector_parser(state);
00260 
00261     if (len > 0)
00262         len = uniqueentry(arr, len, tmpbuf, &buflen);
00263     else
00264         buflen = 0;
00265 
00266     if (buflen > MAXSTRPOS)
00267         ereport(ERROR,
00268                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
00269                  errmsg("string is too long for tsvector (%d bytes, max %d bytes)", buflen, MAXSTRPOS)));
00270 
00271     totallen = CALCDATASIZE(len, buflen);
00272     in = (TSVector) palloc0(totallen);
00273     SET_VARSIZE(in, totallen);
00274     in->size = len;
00275     inarr = ARRPTR(in);
00276     strbuf = STRPTR(in);
00277     stroff = 0;
00278     for (i = 0; i < len; i++)
00279     {
00280         memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
00281         arr[i].entry.pos = stroff;
00282         stroff += arr[i].entry.len;
00283         if (arr[i].entry.haspos)
00284         {
00285             if (arr[i].poslen > 0xFFFF)
00286                 elog(ERROR, "positions array too long");
00287 
00288             /* Copy number of positions */
00289             stroff = SHORTALIGN(stroff);
00290             *(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
00291             stroff += sizeof(uint16);
00292 
00293             /* Copy positions */
00294             memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
00295             stroff += arr[i].poslen * sizeof(WordEntryPos);
00296 
00297             pfree(arr[i].pos);
00298         }
00299         inarr[i] = arr[i].entry;
00300     }
00301 
00302     Assert((strbuf + stroff - (char *) in) == totallen);
00303 
00304     PG_RETURN_TSVECTOR(in);
00305 }
00306 
00307 Datum
00308 tsvectorout(PG_FUNCTION_ARGS)
00309 {
00310     TSVector    out = PG_GETARG_TSVECTOR(0);
00311     char       *outbuf;
00312     int32       i,
00313                 lenbuf = 0,
00314                 pp;
00315     WordEntry  *ptr = ARRPTR(out);
00316     char       *curbegin,
00317                *curin,
00318                *curout;
00319 
00320     lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
00321     for (i = 0; i < out->size; i++)
00322     {
00323         lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
00324         if (ptr[i].haspos)
00325             lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
00326     }
00327 
00328     curout = outbuf = (char *) palloc(lenbuf);
00329     for (i = 0; i < out->size; i++)
00330     {
00331         curbegin = curin = STRPTR(out) + ptr->pos;
00332         if (i != 0)
00333             *curout++ = ' ';
00334         *curout++ = '\'';
00335         while (curin - curbegin < ptr->len)
00336         {
00337             int         len = pg_mblen(curin);
00338 
00339             if (t_iseq(curin, '\''))
00340                 *curout++ = '\'';
00341             else if (t_iseq(curin, '\\'))
00342                 *curout++ = '\\';
00343 
00344             while (len--)
00345                 *curout++ = *curin++;
00346         }
00347 
00348         *curout++ = '\'';
00349         if ((pp = POSDATALEN(out, ptr)) != 0)
00350         {
00351             WordEntryPos *wptr;
00352 
00353             *curout++ = ':';
00354             wptr = POSDATAPTR(out, ptr);
00355             while (pp)
00356             {
00357                 curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
00358                 switch (WEP_GETWEIGHT(*wptr))
00359                 {
00360                     case 3:
00361                         *curout++ = 'A';
00362                         break;
00363                     case 2:
00364                         *curout++ = 'B';
00365                         break;
00366                     case 1:
00367                         *curout++ = 'C';
00368                         break;
00369                     case 0:
00370                     default:
00371                         break;
00372                 }
00373 
00374                 if (pp > 1)
00375                     *curout++ = ',';
00376                 pp--;
00377                 wptr++;
00378             }
00379         }
00380         ptr++;
00381     }
00382 
00383     *curout = '\0';
00384     PG_FREE_IF_COPY(out, 0);
00385     PG_RETURN_CSTRING(outbuf);
00386 }
00387 
00388 /*
00389  * Binary Input / Output functions. The binary format is as follows:
00390  *
00391  * uint32   number of lexemes
00392  *
00393  * for each lexeme:
00394  *      lexeme text in client encoding, null-terminated
00395  *      uint16  number of positions
00396  *      for each position:
00397  *          uint16 WordEntryPos
00398  */
00399 
00400 Datum
00401 tsvectorsend(PG_FUNCTION_ARGS)
00402 {
00403     TSVector    vec = PG_GETARG_TSVECTOR(0);
00404     StringInfoData buf;
00405     int         i,
00406                 j;
00407     WordEntry  *weptr = ARRPTR(vec);
00408 
00409     pq_begintypsend(&buf);
00410 
00411     pq_sendint(&buf, vec->size, sizeof(int32));
00412     for (i = 0; i < vec->size; i++)
00413     {
00414         uint16      npos;
00415 
00416         /*
00417          * the strings in the TSVector array are not null-terminated, so we
00418          * have to send the null-terminator separately
00419          */
00420         pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
00421         pq_sendbyte(&buf, '\0');
00422 
00423         npos = POSDATALEN(vec, weptr);
00424         pq_sendint(&buf, npos, sizeof(uint16));
00425 
00426         if (npos > 0)
00427         {
00428             WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
00429 
00430             for (j = 0; j < npos; j++)
00431                 pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos));
00432         }
00433         weptr++;
00434     }
00435 
00436     PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
00437 }
00438 
00439 Datum
00440 tsvectorrecv(PG_FUNCTION_ARGS)
00441 {
00442     StringInfo  buf = (StringInfo) PG_GETARG_POINTER(0);
00443     TSVector    vec;
00444     int         i;
00445     int32       nentries;
00446     int         datalen;        /* number of bytes used in the variable size
00447                                  * area after fixed size TSVector header and
00448                                  * WordEntries */
00449     Size        hdrlen;
00450     Size        len;            /* allocated size of vec */
00451     bool        needSort = false;
00452 
00453     nentries = pq_getmsgint(buf, sizeof(int32));
00454     if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
00455         elog(ERROR, "invalid size of tsvector");
00456 
00457     hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;
00458 
00459     len = hdrlen * 2;           /* times two to make room for lexemes */
00460     vec = (TSVector) palloc0(len);
00461     vec->size = nentries;
00462 
00463     datalen = 0;
00464     for (i = 0; i < nentries; i++)
00465     {
00466         const char *lexeme;
00467         uint16      npos;
00468         size_t      lex_len;
00469 
00470         lexeme = pq_getmsgstring(buf);
00471         npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
00472 
00473         /* sanity checks */
00474 
00475         lex_len = strlen(lexeme);
00476         if (lex_len > MAXSTRLEN)
00477             elog(ERROR, "invalid tsvector: lexeme too long");
00478 
00479         if (datalen > MAXSTRPOS)
00480             elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded");
00481 
00482         if (npos > MAXNUMPOS)
00483             elog(ERROR, "unexpected number of tsvector positions");
00484 
00485         /*
00486          * Looks valid. Fill the WordEntry struct, and copy lexeme.
00487          *
00488          * But make sure the buffer is large enough first.
00489          */
00490         while (hdrlen + SHORTALIGN(datalen + lex_len) +
00491                (npos + 1) * sizeof(WordEntryPos) >= len)
00492         {
00493             len *= 2;
00494             vec = (TSVector) repalloc(vec, len);
00495         }
00496 
00497         vec->entries[i].haspos = (npos > 0) ? 1 : 0;
00498         vec->entries[i].len = lex_len;
00499         vec->entries[i].pos = datalen;
00500 
00501         memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
00502 
00503         datalen += lex_len;
00504 
00505         if (i > 0 && WordEntryCMP(&vec->entries[i],
00506                                   &vec->entries[i - 1],
00507                                   STRPTR(vec)) <= 0)
00508             needSort = true;
00509 
00510         /* Receive positions */
00511         if (npos > 0)
00512         {
00513             uint16      j;
00514             WordEntryPos *wepptr;
00515 
00516             /*
00517              * Pad to 2-byte alignment if necessary. Though we used palloc0
00518              * for the initial allocation, subsequent repalloc'd memory areas
00519              * are not initialized to zero.
00520              */
00521             if (datalen != SHORTALIGN(datalen))
00522             {
00523                 *(STRPTR(vec) + datalen) = '\0';
00524                 datalen = SHORTALIGN(datalen);
00525             }
00526 
00527             memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
00528 
00529             wepptr = POSDATAPTR(vec, &vec->entries[i]);
00530             for (j = 0; j < npos; j++)
00531             {
00532                 wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
00533                 if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
00534                     elog(ERROR, "position information is misordered");
00535             }
00536 
00537             datalen += (npos + 1) * sizeof(WordEntry);
00538         }
00539     }
00540 
00541     SET_VARSIZE(vec, hdrlen + datalen);
00542 
00543     if (needSort)
00544         qsort_arg((void *) ARRPTR(vec), vec->size, sizeof(WordEntry),
00545                   compareentry, (void *) STRPTR(vec));
00546 
00547     PG_RETURN_TSVECTOR(vec);
00548 }