PostgreSQL Source Code: src/backend/tsearch/ts_parse.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * ts_parse.c
00004  *      main parse functions for tsearch
00005  *
00006  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00007  *
00008  *
00009  * IDENTIFICATION
00010  *    src/backend/tsearch/ts_parse.c
00011  *
00012  *-------------------------------------------------------------------------
00013  */
00014 
00015 #include "postgres.h"
00016 
00017 #include "tsearch/ts_cache.h"
00018 #include "tsearch/ts_utils.h"
00019 
/*
 * When this macro is defined, parsetext() and hlparsetext() report a token
 * whose length is MAXSTRLEN or more with a NOTICE and skip it; when it is
 * not defined they raise an ERROR instead.  It is only tested with #ifdef in
 * the code below, so the value 1 itself is not used there.
 */
00020 #define IGNORE_LONGLEXEME   1
00021 
00022 /*
00023  * Lexize subsystem
00024  */
00025 
/*
 * One token obtained from the parser, kept as a node of a singly linked
 * list (see ListParsedLex).
 */
00026 typedef struct ParsedLex
00027 {
00028     int         type;           /* token type as passed to LexizeAddLemm() */
00029     char       *lemm;           /* token text; stored by pointer, not copied */
00030     int         lenlemm;        /* length of lemm as reported by the parser */
00031     struct ParsedLex *next;     /* next node, or NULL at the tail of a list */
00032 } ParsedLex;
00033 
/*
 * Head/tail pair describing a singly linked list of ParsedLex nodes.
 */
00034 typedef struct ListParsedLex
00035 {
00036     ParsedLex  *head;           /* first node, or NULL when the list is empty */
00037     ParsedLex  *tail;           /* last node, or NULL when the list is empty */
00038 } ListParsedLex;
00039 
/*
 * Working state of the lexize loop (LexizeInit / LexizeAddLemm / LexizeExec).
 */
00040 typedef struct
00041 {
00042     TSConfigCacheEntry *cfg;    /* configuration whose token-type map is used */
00043     Oid         curDictId;      /* dictionary that asked for more words, or
                                 * InvalidOid in single-word mode */
00044     int         posDict;        /* index in the head lexeme's dictionary list
                                 * at which the single-word scan starts */
00045     DictSubState dictState;     /* state block handed to each lexize call */
00046     ParsedLex  *curSub;         /* next lexeme to feed to curDictId */
00047     ListParsedLex towork;       /* lexemes still waiting to be lexized */
00048     ListParsedLex waste;        /* lexemes that have already been lexized */
00049 
00050     /*
00051      * Fields that remember the last intermediate result of a dictionary
00052      * (basically the thesaurus, or something similar) that wants several
00053      * lexemes before it gives its final answer.
00053      */
00054 
00055     ParsedLex  *lastRes;        /* lexeme at which tmpRes was returned */
00056     TSLexeme   *tmpRes;         /* result returned while more was requested */
00057 } LexizeData;
00058 
00059 static void
00060 LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg)
00061 {
00062     ld->cfg = cfg;
00063     ld->curDictId = InvalidOid;
00064     ld->posDict = 0;
00065     ld->towork.head = ld->towork.tail = ld->curSub = NULL;
00066     ld->waste.head = ld->waste.tail = NULL;
00067     ld->lastRes = NULL;
00068     ld->tmpRes = NULL;
00069 }
00070 
00071 static void
00072 LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
00073 {
00074     if (list->tail)
00075     {
00076         list->tail->next = newpl;
00077         list->tail = newpl;
00078     }
00079     else
00080         list->head = list->tail = newpl;
00081     newpl->next = NULL;
00082 }
00083 
00084 static ParsedLex *
00085 LPLRemoveHead(ListParsedLex *list)
00086 {
00087     ParsedLex  *res = list->head;
00088 
00089     if (list->head)
00090         list->head = list->head->next;
00091 
00092     if (list->head == NULL)
00093         list->tail = NULL;
00094 
00095     return res;
00096 }
00097 
00098 static void
00099 LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm)
00100 {
00101     ParsedLex  *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
00102 
00103     newpl->type = type;
00104     newpl->lemm = lemm;
00105     newpl->lenlemm = lenlemm;
00106     LPLAddTail(&ld->towork, newpl);
00107     ld->curSub = ld->towork.tail;
00108 }
00109 
00110 static void
00111 RemoveHead(LexizeData *ld)
00112 {
00113     LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
00114 
00115     ld->posDict = 0;
00116 }
00117 
00118 static void
00119 setCorrLex(LexizeData *ld, ParsedLex **correspondLexem)
00120 {
00121     if (correspondLexem)
00122     {
00123         *correspondLexem = ld->waste.head;
00124     }
00125     else
00126     {
00127         ParsedLex  *tmp,
00128                    *ptr = ld->waste.head;
00129 
00130         while (ptr)
00131         {
00132             tmp = ptr->next;
00133             pfree(ptr);
00134             ptr = tmp;
00135         }
00136     }
00137     ld->waste.head = ld->waste.tail = NULL;
00138 }
00139 
00140 static void
00141 moveToWaste(LexizeData *ld, ParsedLex *stop)
00142 {
00143     bool        go = true;
00144 
00145     while (ld->towork.head && go)
00146     {
00147         if (ld->towork.head == stop)
00148         {
00149             ld->curSub = stop->next;
00150             go = false;
00151         }
00152         RemoveHead(ld);
00153     }
00154 }
00155 
00156 static void
00157 setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res)
00158 {
00159     if (ld->tmpRes)
00160     {
00161         TSLexeme   *ptr;
00162 
00163         for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
00164             pfree(ptr->lexeme);
00165         pfree(ld->tmpRes);
00166     }
00167     ld->tmpRes = res;
00168     ld->lastRes = lex;
00169 }
00170 
00171 static TSLexeme *
00172 LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
00173 {
00174     int         i;
00175     ListDictionary *map;
00176     TSDictionaryCacheEntry *dict;
00177     TSLexeme   *res;
00178 
00179     if (ld->curDictId == InvalidOid)
00180     {
00181         /*
00182          * usial mode: dictionary wants only one word, but we should keep in
00183          * mind that we should go through all stack
00184          */
00185 
00186         while (ld->towork.head)
00187         {
00188             ParsedLex  *curVal = ld->towork.head;
00189             char       *curValLemm = curVal->lemm;
00190             int         curValLenLemm = curVal->lenlemm;
00191 
00192             map = ld->cfg->map + curVal->type;
00193 
00194             if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
00195             {
00196                 /* skip this type of lexeme */
00197                 RemoveHead(ld);
00198                 continue;
00199             }
00200 
00201             for (i = ld->posDict; i < map->len; i++)
00202             {
00203                 dict = lookup_ts_dictionary_cache(map->dictIds[i]);
00204 
00205                 ld->dictState.isend = ld->dictState.getnext = false;
00206                 ld->dictState.private_state = NULL;
00207                 res = (TSLexeme *) DatumGetPointer(FunctionCall4(
00208                                                              &(dict->lexize),
00209                                              PointerGetDatum(dict->dictData),
00210                                                  PointerGetDatum(curValLemm),
00211                                                 Int32GetDatum(curValLenLemm),
00212                                               PointerGetDatum(&ld->dictState)
00213                                                                  ));
00214 
00215                 if (ld->dictState.getnext)
00216                 {
00217                     /*
00218                      * dictionary wants next word, so setup and store current
00219                      * position and go to multiword mode
00220                      */
00221 
00222                     ld->curDictId = DatumGetObjectId(map->dictIds[i]);
00223                     ld->posDict = i + 1;
00224                     ld->curSub = curVal->next;
00225                     if (res)
00226                         setNewTmpRes(ld, curVal, res);
00227                     return LexizeExec(ld, correspondLexem);
00228                 }
00229 
00230                 if (!res)       /* dictionary doesn't know this lexeme */
00231                     continue;
00232 
00233                 if (res->flags & TSL_FILTER)
00234                 {
00235                     curValLemm = res->lexeme;
00236                     curValLenLemm = strlen(res->lexeme);
00237                     continue;
00238                 }
00239 
00240                 RemoveHead(ld);
00241                 setCorrLex(ld, correspondLexem);
00242                 return res;
00243             }
00244 
00245             RemoveHead(ld);
00246         }
00247     }
00248     else
00249     {                           /* curDictId is valid */
00250         dict = lookup_ts_dictionary_cache(ld->curDictId);
00251 
00252         /*
00253          * Dictionary ld->curDictId asks  us about following words
00254          */
00255 
00256         while (ld->curSub)
00257         {
00258             ParsedLex  *curVal = ld->curSub;
00259 
00260             map = ld->cfg->map + curVal->type;
00261 
00262             if (curVal->type != 0)
00263             {
00264                 bool        dictExists = false;
00265 
00266                 if (curVal->type >= ld->cfg->lenmap || map->len == 0)
00267                 {
00268                     /* skip this type of lexeme */
00269                     ld->curSub = curVal->next;
00270                     continue;
00271                 }
00272 
00273                 /*
00274                  * We should be sure that current type of lexeme is recognized
00275                  * by our dictinonary: we just check is it exist in list of
00276                  * dictionaries ?
00277                  */
00278                 for (i = 0; i < map->len && !dictExists; i++)
00279                     if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
00280                         dictExists = true;
00281 
00282                 if (!dictExists)
00283                 {
00284                     /*
00285                      * Dictionary can't work with current tpe of lexeme,
00286                      * return to basic mode and redo all stored lexemes
00287                      */
00288                     ld->curDictId = InvalidOid;
00289                     return LexizeExec(ld, correspondLexem);
00290                 }
00291             }
00292 
00293             ld->dictState.isend = (curVal->type == 0) ? true : false;
00294             ld->dictState.getnext = false;
00295 
00296             res = (TSLexeme *) DatumGetPointer(FunctionCall4(
00297                                                              &(dict->lexize),
00298                                              PointerGetDatum(dict->dictData),
00299                                                PointerGetDatum(curVal->lemm),
00300                                               Int32GetDatum(curVal->lenlemm),
00301                                               PointerGetDatum(&ld->dictState)
00302                                                              ));
00303 
00304             if (ld->dictState.getnext)
00305             {
00306                 /* Dictionary wants one more */
00307                 ld->curSub = curVal->next;
00308                 if (res)
00309                     setNewTmpRes(ld, curVal, res);
00310                 continue;
00311             }
00312 
00313             if (res || ld->tmpRes)
00314             {
00315                 /*
00316                  * Dictionary normalizes lexemes, so we remove from stack all
00317                  * used lexemes, return to basic mode and redo end of stack
00318                  * (if it exists)
00319                  */
00320                 if (res)
00321                 {
00322                     moveToWaste(ld, ld->curSub);
00323                 }
00324                 else
00325                 {
00326                     res = ld->tmpRes;
00327                     moveToWaste(ld, ld->lastRes);
00328                 }
00329 
00330                 /* reset to initial state */
00331                 ld->curDictId = InvalidOid;
00332                 ld->posDict = 0;
00333                 ld->lastRes = NULL;
00334                 ld->tmpRes = NULL;
00335                 setCorrLex(ld, correspondLexem);
00336                 return res;
00337             }
00338 
00339             /*
00340              * Dict don't want next lexem and didn't recognize anything, redo
00341              * from ld->towork.head
00342              */
00343             ld->curDictId = InvalidOid;
00344             return LexizeExec(ld, correspondLexem);
00345         }
00346     }
00347 
00348     setCorrLex(ld, correspondLexem);
00349     return NULL;
00350 }
00351 
00352 /*
00353  * Parse string and lexize words.
00354  *
00355  * prs will be filled in.
00356  */
00357 void
00358 parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
00359 {
00360     int         type,
00361                 lenlemm;
00362     char       *lemm = NULL;
00363     LexizeData  ldata;
00364     TSLexeme   *norms;
00365     TSConfigCacheEntry *cfg;
00366     TSParserCacheEntry *prsobj;
00367     void       *prsdata;
00368 
00369     cfg = lookup_ts_config_cache(cfgId);
00370     prsobj = lookup_ts_parser_cache(cfg->prsId);
00371 
00372     prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
00373                                                      PointerGetDatum(buf),
00374                                                      Int32GetDatum(buflen)));
00375 
00376     LexizeInit(&ldata, cfg);
00377 
00378     do
00379     {
00380         type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
00381                                            PointerGetDatum(prsdata),
00382                                            PointerGetDatum(&lemm),
00383                                            PointerGetDatum(&lenlemm)));
00384 
00385         if (type > 0 && lenlemm >= MAXSTRLEN)
00386         {
00387 #ifdef IGNORE_LONGLEXEME
00388             ereport(NOTICE,
00389                     (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
00390                      errmsg("word is too long to be indexed"),
00391                      errdetail("Words longer than %d characters are ignored.",
00392                                MAXSTRLEN)));
00393             continue;
00394 #else
00395             ereport(ERROR,
00396                     (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
00397                      errmsg("word is too long to be indexed"),
00398                      errdetail("Words longer than %d characters are ignored.",
00399                                MAXSTRLEN)));
00400 #endif
00401         }
00402 
00403         LexizeAddLemm(&ldata, type, lemm, lenlemm);
00404 
00405         while ((norms = LexizeExec(&ldata, NULL)) != NULL)
00406         {
00407             TSLexeme   *ptr = norms;
00408 
00409             prs->pos++;         /* set pos */
00410 
00411             while (ptr->lexeme)
00412             {
00413                 if (prs->curwords == prs->lenwords)
00414                 {
00415                     prs->lenwords *= 2;
00416                     prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord));
00417                 }
00418 
00419                 if (ptr->flags & TSL_ADDPOS)
00420                     prs->pos++;
00421                 prs->words[prs->curwords].len = strlen(ptr->lexeme);
00422                 prs->words[prs->curwords].word = ptr->lexeme;
00423                 prs->words[prs->curwords].nvariant = ptr->nvariant;
00424                 prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX;
00425                 prs->words[prs->curwords].alen = 0;
00426                 prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
00427                 ptr++;
00428                 prs->curwords++;
00429             }
00430             pfree(norms);
00431         }
00432     } while (type > 0);
00433 
00434     FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
00435 }
00436 
00437 /*
00438  * Headline framework
00439  */
00440 static void
00441 hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
00442 {
00443     while (prs->curwords >= prs->lenwords)
00444     {
00445         prs->lenwords *= 2;
00446         prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
00447     }
00448     memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry));
00449     prs->words[prs->curwords].type = (uint8) type;
00450     prs->words[prs->curwords].len = buflen;
00451     prs->words[prs->curwords].word = palloc(buflen);
00452     memcpy(prs->words[prs->curwords].word, buf, buflen);
00453     prs->curwords++;
00454 }
00455 
00456 static void
00457 hlfinditem(HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
00458 {
00459     int         i;
00460     QueryItem  *item = GETQUERY(query);
00461     HeadlineWordEntry *word;
00462 
00463     while (prs->curwords + query->size >= prs->lenwords)
00464     {
00465         prs->lenwords *= 2;
00466         prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
00467     }
00468 
00469     word = &(prs->words[prs->curwords - 1]);
00470     for (i = 0; i < query->size; i++)
00471     {
00472         if (item->type == QI_VAL &&
00473             tsCompareString(GETOPERAND(query) + item->qoperand.distance, item->qoperand.length,
00474                             buf, buflen, item->qoperand.prefix) == 0)
00475         {
00476             if (word->item)
00477             {
00478                 memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry));
00479                 prs->words[prs->curwords].item = &item->qoperand;
00480                 prs->words[prs->curwords].repeated = 1;
00481                 prs->curwords++;
00482             }
00483             else
00484                 word->item = &item->qoperand;
00485         }
00486         item++;
00487     }
00488 }
00489 
00490 static void
00491 addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms)
00492 {
00493     ParsedLex  *tmplexs;
00494     TSLexeme   *ptr;
00495 
00496     while (lexs)
00497     {
00498 
00499         if (lexs->type > 0)
00500             hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
00501 
00502         ptr = norms;
00503         while (ptr && ptr->lexeme)
00504         {
00505             hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme));
00506             ptr++;
00507         }
00508 
00509         tmplexs = lexs->next;
00510         pfree(lexs);
00511         lexs = tmplexs;
00512     }
00513 
00514     if (norms)
00515     {
00516         ptr = norms;
00517         while (ptr->lexeme)
00518         {
00519             pfree(ptr->lexeme);
00520             ptr++;
00521         }
00522         pfree(norms);
00523     }
00524 }
00525 
00526 void
00527 hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
00528 {
00529     int         type,
00530                 lenlemm;
00531     char       *lemm = NULL;
00532     LexizeData  ldata;
00533     TSLexeme   *norms;
00534     ParsedLex  *lexs;
00535     TSConfigCacheEntry *cfg;
00536     TSParserCacheEntry *prsobj;
00537     void       *prsdata;
00538 
00539     cfg = lookup_ts_config_cache(cfgId);
00540     prsobj = lookup_ts_parser_cache(cfg->prsId);
00541 
00542     prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
00543                                                      PointerGetDatum(buf),
00544                                                      Int32GetDatum(buflen)));
00545 
00546     LexizeInit(&ldata, cfg);
00547 
00548     do
00549     {
00550         type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
00551                                            PointerGetDatum(prsdata),
00552                                            PointerGetDatum(&lemm),
00553                                            PointerGetDatum(&lenlemm)));
00554 
00555         if (type > 0 && lenlemm >= MAXSTRLEN)
00556         {
00557 #ifdef IGNORE_LONGLEXEME
00558             ereport(NOTICE,
00559                     (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
00560                      errmsg("word is too long to be indexed"),
00561                      errdetail("Words longer than %d characters are ignored.",
00562                                MAXSTRLEN)));
00563             continue;
00564 #else
00565             ereport(ERROR,
00566                     (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
00567                      errmsg("word is too long to be indexed"),
00568                      errdetail("Words longer than %d characters are ignored.",
00569                                MAXSTRLEN)));
00570 #endif
00571         }
00572 
00573         LexizeAddLemm(&ldata, type, lemm, lenlemm);
00574 
00575         do
00576         {
00577             if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
00578                 addHLParsedLex(prs, query, lexs, norms);
00579             else
00580                 addHLParsedLex(prs, query, lexs, NULL);
00581         } while (norms);
00582 
00583     } while (type > 0);
00584 
00585     FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
00586 }
00587 
00588 text *
00589 generateHeadline(HeadlineParsedText *prs)
00590 {
00591     text       *out;
00592     char       *ptr;
00593     int         len = 128;
00594     int         numfragments = 0;
00595     int16       infrag = 0;
00596 
00597     HeadlineWordEntry *wrd = prs->words;
00598 
00599     out = (text *) palloc(len);
00600     ptr = ((char *) out) + VARHDRSZ;
00601 
00602     while (wrd - prs->words < prs->curwords)
00603     {
00604         while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
00605         {
00606             int         dist = ptr - ((char *) out);
00607 
00608             len *= 2;
00609             out = (text *) repalloc(out, len);
00610             ptr = ((char *) out) + dist;
00611         }
00612 
00613         if (wrd->in && !wrd->repeated)
00614         {
00615             if (!infrag)
00616             {
00617 
00618                 /* start of a new fragment */
00619                 infrag = 1;
00620                 numfragments++;
00621                 /* add a fragment delimitor if this is after the first one */
00622                 if (numfragments > 1)
00623                 {
00624                     memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
00625                     ptr += prs->fragdelimlen;
00626                 }
00627 
00628             }
00629             if (wrd->replace)
00630             {
00631                 *ptr = ' ';
00632                 ptr++;
00633             }
00634             else if (!wrd->skip)
00635             {
00636                 if (wrd->selected)
00637                 {
00638                     memcpy(ptr, prs->startsel, prs->startsellen);
00639                     ptr += prs->startsellen;
00640                 }
00641                 memcpy(ptr, wrd->word, wrd->len);
00642                 ptr += wrd->len;
00643                 if (wrd->selected)
00644                 {
00645                     memcpy(ptr, prs->stopsel, prs->stopsellen);
00646                     ptr += prs->stopsellen;
00647                 }
00648             }
00649         }
00650         else if (!wrd->repeated)
00651         {
00652             if (infrag)
00653                 infrag = 0;
00654             pfree(wrd->word);
00655         }
00656 
00657         wrd++;
00658     }
00659 
00660     SET_VARSIZE(out, ptr - ((char *) out));
00661     return out;
00662 }
Header And Logo

ts_parse.c