Header And Logo

PostgreSQL
| The world's most advanced open source database.

to_tsany.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * to_tsany.c
00004  *      to_ts* function definitions
00005  *
00006  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00007  *
00008  *
00009  * IDENTIFICATION
00010  *    src/backend/tsearch/to_tsany.c
00011  *
00012  *-------------------------------------------------------------------------
00013  */
00014 #include "postgres.h"
00015 
00016 #include "tsearch/ts_cache.h"
00017 #include "tsearch/ts_utils.h"
00018 #include "utils/builtins.h"
00019 
00020 
00021 Datum
00022 get_current_ts_config(PG_FUNCTION_ARGS)
00023 {
00024     PG_RETURN_OID(getTSCurrentConfig(true));
00025 }
00026 
00027 /*
00028  * to_tsvector
00029  */
00030 static int
00031 compareWORD(const void *a, const void *b)
00032 {
00033     int         res;
00034 
00035     res = tsCompareString(
00036                ((const ParsedWord *) a)->word, ((const ParsedWord *) a)->len,
00037                ((const ParsedWord *) b)->word, ((const ParsedWord *) b)->len,
00038                           false);
00039 
00040     if (res == 0)
00041     {
00042         if (((const ParsedWord *) a)->pos.pos == ((const ParsedWord *) b)->pos.pos)
00043             return 0;
00044 
00045         res = (((const ParsedWord *) a)->pos.pos > ((const ParsedWord *) b)->pos.pos) ? 1 : -1;
00046     }
00047 
00048     return res;
00049 }
00050 
00051 static int
00052 uniqueWORD(ParsedWord *a, int32 l)
00053 {
00054     ParsedWord *ptr,
00055                *res;
00056     int         tmppos;
00057 
00058     if (l == 1)
00059     {
00060         tmppos = LIMITPOS(a->pos.pos);
00061         a->alen = 2;
00062         a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen);
00063         a->pos.apos[0] = 1;
00064         a->pos.apos[1] = tmppos;
00065         return l;
00066     }
00067 
00068     res = a;
00069     ptr = a + 1;
00070 
00071     /*
00072      * Sort words with its positions
00073      */
00074     qsort((void *) a, l, sizeof(ParsedWord), compareWORD);
00075 
00076     /*
00077      * Initialize first word and its first position
00078      */
00079     tmppos = LIMITPOS(a->pos.pos);
00080     a->alen = 2;
00081     a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen);
00082     a->pos.apos[0] = 1;
00083     a->pos.apos[1] = tmppos;
00084 
00085     /*
00086      * Summarize position information for each word
00087      */
00088     while (ptr - a < l)
00089     {
00090         if (!(ptr->len == res->len &&
00091               strncmp(ptr->word, res->word, res->len) == 0))
00092         {
00093             /*
00094              * Got a new word, so put it in result
00095              */
00096             res++;
00097             res->len = ptr->len;
00098             res->word = ptr->word;
00099             tmppos = LIMITPOS(ptr->pos.pos);
00100             res->alen = 2;
00101             res->pos.apos = (uint16 *) palloc(sizeof(uint16) * res->alen);
00102             res->pos.apos[0] = 1;
00103             res->pos.apos[1] = tmppos;
00104         }
00105         else
00106         {
00107             /*
00108              * The word already exists, so adjust position information. But
00109              * before we should check size of position's array, max allowed
00110              * value for position and uniqueness of position
00111              */
00112             pfree(ptr->word);
00113             if (res->pos.apos[0] < MAXNUMPOS - 1 && res->pos.apos[res->pos.apos[0]] != MAXENTRYPOS - 1 &&
00114                 res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos))
00115             {
00116                 if (res->pos.apos[0] + 1 >= res->alen)
00117                 {
00118                     res->alen *= 2;
00119                     res->pos.apos = (uint16 *) repalloc(res->pos.apos, sizeof(uint16) * res->alen);
00120                 }
00121                 if (res->pos.apos[0] == 0 || res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos))
00122                 {
00123                     res->pos.apos[res->pos.apos[0] + 1] = LIMITPOS(ptr->pos.pos);
00124                     res->pos.apos[0]++;
00125                 }
00126             }
00127         }
00128         ptr++;
00129     }
00130 
00131     return res + 1 - a;
00132 }
00133 
00134 /*
00135  * make value of tsvector, given parsed text
00136  */
00137 TSVector
00138 make_tsvector(ParsedText *prs)
00139 {
00140     int         i,
00141                 j,
00142                 lenstr = 0,
00143                 totallen;
00144     TSVector    in;
00145     WordEntry  *ptr;
00146     char       *str;
00147     int         stroff;
00148 
00149     prs->curwords = uniqueWORD(prs->words, prs->curwords);
00150     for (i = 0; i < prs->curwords; i++)
00151     {
00152         lenstr += prs->words[i].len;
00153         if (prs->words[i].alen)
00154         {
00155             lenstr = SHORTALIGN(lenstr);
00156             lenstr += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos);
00157         }
00158     }
00159 
00160     if (lenstr > MAXSTRPOS)
00161         ereport(ERROR,
00162                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
00163                  errmsg("string is too long for tsvector (%d bytes, max %d bytes)", lenstr, MAXSTRPOS)));
00164 
00165     totallen = CALCDATASIZE(prs->curwords, lenstr);
00166     in = (TSVector) palloc0(totallen);
00167     SET_VARSIZE(in, totallen);
00168     in->size = prs->curwords;
00169 
00170     ptr = ARRPTR(in);
00171     str = STRPTR(in);
00172     stroff = 0;
00173     for (i = 0; i < prs->curwords; i++)
00174     {
00175         ptr->len = prs->words[i].len;
00176         ptr->pos = stroff;
00177         memcpy(str + stroff, prs->words[i].word, prs->words[i].len);
00178         stroff += prs->words[i].len;
00179         pfree(prs->words[i].word);
00180         if (prs->words[i].alen)
00181         {
00182             int         k = prs->words[i].pos.apos[0];
00183             WordEntryPos *wptr;
00184 
00185             if (k > 0xFFFF)
00186                 elog(ERROR, "positions array too long");
00187 
00188             ptr->haspos = 1;
00189             stroff = SHORTALIGN(stroff);
00190             *(uint16 *) (str + stroff) = (uint16) k;
00191             wptr = POSDATAPTR(in, ptr);
00192             for (j = 0; j < k; j++)
00193             {
00194                 WEP_SETWEIGHT(wptr[j], 0);
00195                 WEP_SETPOS(wptr[j], prs->words[i].pos.apos[j + 1]);
00196             }
00197             stroff += sizeof(uint16) + k * sizeof(WordEntryPos);
00198             pfree(prs->words[i].pos.apos);
00199         }
00200         else
00201             ptr->haspos = 0;
00202         ptr++;
00203     }
00204     pfree(prs->words);
00205     return in;
00206 }
00207 
00208 Datum
00209 to_tsvector_byid(PG_FUNCTION_ARGS)
00210 {
00211     Oid         cfgId = PG_GETARG_OID(0);
00212     text       *in = PG_GETARG_TEXT_P(1);
00213     ParsedText  prs;
00214     TSVector    out;
00215 
00216     prs.lenwords = (VARSIZE(in) - VARHDRSZ) / 6;        /* just estimation of
00217                                                          * word's number */
00218     if (prs.lenwords == 0)
00219         prs.lenwords = 2;
00220     prs.curwords = 0;
00221     prs.pos = 0;
00222     prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
00223 
00224     parsetext(cfgId, &prs, VARDATA(in), VARSIZE(in) - VARHDRSZ);
00225     PG_FREE_IF_COPY(in, 1);
00226 
00227     if (prs.curwords)
00228         out = make_tsvector(&prs);
00229     else
00230     {
00231         pfree(prs.words);
00232         out = palloc(CALCDATASIZE(0, 0));
00233         SET_VARSIZE(out, CALCDATASIZE(0, 0));
00234         out->size = 0;
00235     }
00236 
00237     PG_RETURN_POINTER(out);
00238 }
00239 
00240 Datum
00241 to_tsvector(PG_FUNCTION_ARGS)
00242 {
00243     text       *in = PG_GETARG_TEXT_P(0);
00244     Oid         cfgId;
00245 
00246     cfgId = getTSCurrentConfig(true);
00247     PG_RETURN_DATUM(DirectFunctionCall2(to_tsvector_byid,
00248                                         ObjectIdGetDatum(cfgId),
00249                                         PointerGetDatum(in)));
00250 }
00251 
00252 /*
00253  * to_tsquery
00254  */
00255 
00256 
00257 /*
00258  * This function is used for morph parsing.
00259  *
00260  * The value is passed to parsetext which will call the right dictionary to
00261  * lexize the word. If it turns out to be a stopword, we push a QI_VALSTOP
00262  * to the stack.
00263  *
00264  * All words belonging to the same variant are pushed as an ANDed list,
00265  * and different variants are ORred together.
00266  */
00267 static void
00268 pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix)
00269 {
00270     int32       count = 0;
00271     ParsedText  prs;
00272     uint32      variant,
00273                 pos,
00274                 cntvar = 0,
00275                 cntpos = 0,
00276                 cnt = 0;
00277     Oid         cfg_id = DatumGetObjectId(opaque);      /* the input is actually
00278                                                          * an Oid, not a pointer */
00279 
00280     prs.lenwords = 4;
00281     prs.curwords = 0;
00282     prs.pos = 0;
00283     prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
00284 
00285     parsetext(cfg_id, &prs, strval, lenval);
00286 
00287     if (prs.curwords > 0)
00288     {
00289 
00290         while (count < prs.curwords)
00291         {
00292             pos = prs.words[count].pos.pos;
00293             cntvar = 0;
00294             while (count < prs.curwords && pos == prs.words[count].pos.pos)
00295             {
00296                 variant = prs.words[count].nvariant;
00297 
00298                 cnt = 0;
00299                 while (count < prs.curwords && pos == prs.words[count].pos.pos && variant == prs.words[count].nvariant)
00300                 {
00301 
00302                     pushValue(state, prs.words[count].word, prs.words[count].len, weight,
00303                               ((prs.words[count].flags & TSL_PREFIX) || prefix) ? true : false);
00304                     pfree(prs.words[count].word);
00305                     if (cnt)
00306                         pushOperator(state, OP_AND);
00307                     cnt++;
00308                     count++;
00309                 }
00310 
00311                 if (cntvar)
00312                     pushOperator(state, OP_OR);
00313                 cntvar++;
00314             }
00315 
00316             if (cntpos)
00317                 pushOperator(state, OP_AND);
00318 
00319             cntpos++;
00320         }
00321 
00322         pfree(prs.words);
00323 
00324     }
00325     else
00326         pushStop(state);
00327 }
00328 
00329 Datum
00330 to_tsquery_byid(PG_FUNCTION_ARGS)
00331 {
00332     Oid         cfgid = PG_GETARG_OID(0);
00333     text       *in = PG_GETARG_TEXT_P(1);
00334     TSQuery     query;
00335     QueryItem  *res;
00336     int32       len;
00337 
00338     query = parse_tsquery(text_to_cstring(in), pushval_morph, ObjectIdGetDatum(cfgid), false);
00339 
00340     if (query->size == 0)
00341         PG_RETURN_TSQUERY(query);
00342 
00343     /* clean out any stopword placeholders from the tree */
00344     res = clean_fakeval(GETQUERY(query), &len);
00345     if (!res)
00346     {
00347         SET_VARSIZE(query, HDRSIZETQ);
00348         query->size = 0;
00349         PG_RETURN_POINTER(query);
00350     }
00351     memcpy((void *) GETQUERY(query), (void *) res, len * sizeof(QueryItem));
00352 
00353     /*
00354      * Removing the stopword placeholders might've resulted in fewer
00355      * QueryItems. If so, move the operands up accordingly.
00356      */
00357     if (len != query->size)
00358     {
00359         char       *oldoperand = GETOPERAND(query);
00360         int32       lenoperand = VARSIZE(query) - (oldoperand - (char *) query);
00361 
00362         Assert(len < query->size);
00363 
00364         query->size = len;
00365         memmove((void *) GETOPERAND(query), oldoperand, VARSIZE(query) - (oldoperand - (char *) query));
00366         SET_VARSIZE(query, COMPUTESIZE(len, lenoperand));
00367     }
00368 
00369     pfree(res);
00370     PG_RETURN_TSQUERY(query);
00371 }
00372 
00373 Datum
00374 to_tsquery(PG_FUNCTION_ARGS)
00375 {
00376     text       *in = PG_GETARG_TEXT_P(0);
00377     Oid         cfgId;
00378 
00379     cfgId = getTSCurrentConfig(true);
00380     PG_RETURN_DATUM(DirectFunctionCall2(to_tsquery_byid,
00381                                         ObjectIdGetDatum(cfgId),
00382                                         PointerGetDatum(in)));
00383 }
00384 
00385 Datum
00386 plainto_tsquery_byid(PG_FUNCTION_ARGS)
00387 {
00388     Oid         cfgid = PG_GETARG_OID(0);
00389     text       *in = PG_GETARG_TEXT_P(1);
00390     TSQuery     query;
00391     QueryItem  *res;
00392     int32       len;
00393 
00394     query = parse_tsquery(text_to_cstring(in), pushval_morph, ObjectIdGetDatum(cfgid), true);
00395 
00396     if (query->size == 0)
00397         PG_RETURN_TSQUERY(query);
00398 
00399     res = clean_fakeval(GETQUERY(query), &len);
00400     if (!res)
00401     {
00402         SET_VARSIZE(query, HDRSIZETQ);
00403         query->size = 0;
00404         PG_RETURN_POINTER(query);
00405     }
00406     memcpy((void *) GETQUERY(query), (void *) res, len * sizeof(QueryItem));
00407 
00408     if (len != query->size)
00409     {
00410         char       *oldoperand = GETOPERAND(query);
00411         int32       lenoperand = VARSIZE(query) - (oldoperand - (char *) query);
00412 
00413         Assert(len < query->size);
00414 
00415         query->size = len;
00416         memcpy((void *) GETOPERAND(query), oldoperand, lenoperand);
00417         SET_VARSIZE(query, COMPUTESIZE(len, lenoperand));
00418     }
00419 
00420     pfree(res);
00421     PG_RETURN_POINTER(query);
00422 }
00423 
00424 Datum
00425 plainto_tsquery(PG_FUNCTION_ARGS)
00426 {
00427     text       *in = PG_GETARG_TEXT_P(0);
00428     Oid         cfgId;
00429 
00430     cfgId = getTSCurrentConfig(true);
00431     PG_RETURN_DATUM(DirectFunctionCall2(plainto_tsquery_byid,
00432                                         ObjectIdGetDatum(cfgId),
00433                                         PointerGetDatum(in)));
00434 }