Header And Logo

PostgreSQL
| The world's most advanced open source database.

dict_thesaurus.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * dict_thesaurus.c
00004  *      Thesaurus dictionary: phrase to phrase substitution
00005  *
00006  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00007  *
00008  *
00009  * IDENTIFICATION
00010  *    src/backend/tsearch/dict_thesaurus.c
00011  *
00012  *-------------------------------------------------------------------------
00013  */
00014 #include "postgres.h"
00015 
00016 #include "catalog/namespace.h"
00017 #include "commands/defrem.h"
00018 #include "tsearch/ts_cache.h"
00019 #include "tsearch/ts_locale.h"
00020 #include "tsearch/ts_utils.h"
00021 #include "utils/builtins.h"
00022 
00023 
/*
 * Temporarily, TSLexeme.flags is borrowed for internal use while the
 * thesaurus is being loaded: addWrd() sets DT_USEASIS on a substitute word
 * that must be stored as written, and compileTheSubstitute() clears the bit
 * again instead of passing that word through the subdictionary.
 */
#define DT_USEASIS      0x1000
00028 
00029 typedef struct LexemeInfo
00030 {
00031     uint16      idsubst;        /* entry's number in DictThesaurus->subst */
00032     uint16      posinsubst;     /* pos info in entry */
00033     uint16      tnvariant;      /* total num lexemes in one variant */
00034     struct LexemeInfo *nextentry;
00035     struct LexemeInfo *nextvariant;
00036 } LexemeInfo;
00037 
00038 typedef struct
00039 {
00040     char       *lexeme;
00041     LexemeInfo *entries;
00042 } TheLexeme;
00043 
00044 typedef struct
00045 {
00046     uint16      lastlexeme;     /* number lexemes to substitute */
00047     uint16      reslen;
00048     TSLexeme   *res;            /* prepared substituted result */
00049 } TheSubstitute;
00050 
00051 typedef struct
00052 {
00053     /* subdictionary to normalize lexemes */
00054     Oid         subdictOid;
00055     TSDictionaryCacheEntry *subdict;
00056 
00057     /* Array to search lexeme by exact match */
00058     TheLexeme  *wrds;
00059     int         nwrds;          /* current number of words */
00060     int         ntwrds;         /* allocated array length */
00061 
00062     /*
00063      * Storage of substituted result, n-th element is for n-th expression
00064      */
00065     TheSubstitute *subst;
00066     int         nsubst;
00067 } DictThesaurus;
00068 
00069 
00070 static void
00071 newLexeme(DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst)
00072 {
00073     TheLexeme  *ptr;
00074 
00075     if (d->nwrds >= d->ntwrds)
00076     {
00077         if (d->ntwrds == 0)
00078         {
00079             d->ntwrds = 16;
00080             d->wrds = (TheLexeme *) palloc(sizeof(TheLexeme) * d->ntwrds);
00081         }
00082         else
00083         {
00084             d->ntwrds *= 2;
00085             d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->ntwrds);
00086         }
00087     }
00088 
00089     ptr = d->wrds + d->nwrds;
00090     d->nwrds++;
00091 
00092     ptr->lexeme = palloc(e - b + 1);
00093 
00094     memcpy(ptr->lexeme, b, e - b);
00095     ptr->lexeme[e - b] = '\0';
00096 
00097     ptr->entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
00098 
00099     ptr->entries->nextentry = NULL;
00100     ptr->entries->idsubst = idsubst;
00101     ptr->entries->posinsubst = posinsubst;
00102 }
00103 
00104 static void
00105 addWrd(DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis)
00106 {
00107     static int  nres = 0;
00108     static int  ntres = 0;
00109     TheSubstitute *ptr;
00110 
00111     if (nwrd == 0)
00112     {
00113         nres = ntres = 0;
00114 
00115         if (idsubst >= d->nsubst)
00116         {
00117             if (d->nsubst == 0)
00118             {
00119                 d->nsubst = 16;
00120                 d->subst = (TheSubstitute *) palloc(sizeof(TheSubstitute) * d->nsubst);
00121             }
00122             else
00123             {
00124                 d->nsubst *= 2;
00125                 d->subst = (TheSubstitute *) repalloc(d->subst, sizeof(TheSubstitute) * d->nsubst);
00126             }
00127         }
00128     }
00129 
00130     ptr = d->subst + idsubst;
00131 
00132     ptr->lastlexeme = posinsubst - 1;
00133 
00134     if (nres + 1 >= ntres)
00135     {
00136         if (ntres == 0)
00137         {
00138             ntres = 2;
00139             ptr->res = (TSLexeme *) palloc(sizeof(TSLexeme) * ntres);
00140         }
00141         else
00142         {
00143             ntres *= 2;
00144             ptr->res = (TSLexeme *) repalloc(ptr->res, sizeof(TSLexeme) * ntres);
00145         }
00146 
00147     }
00148 
00149     ptr->res[nres].lexeme = palloc(e - b + 1);
00150     memcpy(ptr->res[nres].lexeme, b, e - b);
00151     ptr->res[nres].lexeme[e - b] = '\0';
00152 
00153     ptr->res[nres].nvariant = nwrd;
00154     if (useasis)
00155         ptr->res[nres].flags = DT_USEASIS;
00156     else
00157         ptr->res[nres].flags = 0;
00158 
00159     ptr->res[++nres].lexeme = NULL;
00160 }
00161 
/* Parser states used by thesaurusRead while scanning one line */
#define TR_WAITLEX  1           /* between sample words, before the ':' */
#define TR_INLEX    2           /* inside a sample word */
#define TR_WAITSUBS 3           /* after the ':', between substitute words */
#define TR_INSUBS   4           /* inside a substitute word */
00166 
00167 static void
00168 thesaurusRead(char *filename, DictThesaurus *d)
00169 {
00170     tsearch_readline_state trst;
00171     uint16      idsubst = 0;
00172     bool        useasis = false;
00173     char       *line;
00174 
00175     filename = get_tsearch_config_filename(filename, "ths");
00176     if (!tsearch_readline_begin(&trst, filename))
00177         ereport(ERROR,
00178                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00179                  errmsg("could not open thesaurus file \"%s\": %m",
00180                         filename)));
00181 
00182     while ((line = tsearch_readline(&trst)) != NULL)
00183     {
00184         char       *ptr;
00185         int         state = TR_WAITLEX;
00186         char       *beginwrd = NULL;
00187         uint16      posinsubst = 0;
00188         uint16      nwrd = 0;
00189 
00190         ptr = line;
00191 
00192         /* is it a comment? */
00193         while (*ptr && t_isspace(ptr))
00194             ptr += pg_mblen(ptr);
00195 
00196         if (t_iseq(ptr, '#') || *ptr == '\0' ||
00197             t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
00198         {
00199             pfree(line);
00200             continue;
00201         }
00202 
00203         while (*ptr)
00204         {
00205             if (state == TR_WAITLEX)
00206             {
00207                 if (t_iseq(ptr, ':'))
00208                 {
00209                     if (posinsubst == 0)
00210                         ereport(ERROR,
00211                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00212                                  errmsg("unexpected delimiter")));
00213                     state = TR_WAITSUBS;
00214                 }
00215                 else if (!t_isspace(ptr))
00216                 {
00217                     beginwrd = ptr;
00218                     state = TR_INLEX;
00219                 }
00220             }
00221             else if (state == TR_INLEX)
00222             {
00223                 if (t_iseq(ptr, ':'))
00224                 {
00225                     newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
00226                     state = TR_WAITSUBS;
00227                 }
00228                 else if (t_isspace(ptr))
00229                 {
00230                     newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
00231                     state = TR_WAITLEX;
00232                 }
00233             }
00234             else if (state == TR_WAITSUBS)
00235             {
00236                 if (t_iseq(ptr, '*'))
00237                 {
00238                     useasis = true;
00239                     state = TR_INSUBS;
00240                     beginwrd = ptr + pg_mblen(ptr);
00241                 }
00242                 else if (t_iseq(ptr, '\\'))
00243                 {
00244                     useasis = false;
00245                     state = TR_INSUBS;
00246                     beginwrd = ptr + pg_mblen(ptr);
00247                 }
00248                 else if (!t_isspace(ptr))
00249                 {
00250                     useasis = false;
00251                     beginwrd = ptr;
00252                     state = TR_INSUBS;
00253                 }
00254             }
00255             else if (state == TR_INSUBS)
00256             {
00257                 if (t_isspace(ptr))
00258                 {
00259                     if (ptr == beginwrd)
00260                         ereport(ERROR,
00261                                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00262                                  errmsg("unexpected end of line or lexeme")));
00263                     addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
00264                     state = TR_WAITSUBS;
00265                 }
00266             }
00267             else
00268                 elog(ERROR, "unrecognized thesaurus state: %d", state);
00269 
00270             ptr += pg_mblen(ptr);
00271         }
00272 
00273         if (state == TR_INSUBS)
00274         {
00275             if (ptr == beginwrd)
00276                 ereport(ERROR,
00277                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
00278                          errmsg("unexpected end of line or lexeme")));
00279             addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
00280         }
00281 
00282         idsubst++;
00283 
00284         if (!(nwrd && posinsubst))
00285             ereport(ERROR,
00286                     (errcode(ERRCODE_CONFIG_FILE_ERROR),
00287                      errmsg("unexpected end of line")));
00288 
00289         pfree(line);
00290     }
00291 
00292     d->nsubst = idsubst;
00293 
00294     tsearch_readline_end(&trst);
00295 }
00296 
00297 static TheLexeme *
00298 addCompiledLexeme(TheLexeme *newwrds, int *nnw, int *tnm, TSLexeme *lexeme, LexemeInfo *src, uint16 tnvariant)
00299 {
00300     if (*nnw >= *tnm)
00301     {
00302         *tnm *= 2;
00303         newwrds = (TheLexeme *) repalloc(newwrds, sizeof(TheLexeme) * *tnm);
00304     }
00305 
00306     newwrds[*nnw].entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
00307 
00308     if (lexeme && lexeme->lexeme)
00309     {
00310         newwrds[*nnw].lexeme = pstrdup(lexeme->lexeme);
00311         newwrds[*nnw].entries->tnvariant = tnvariant;
00312     }
00313     else
00314     {
00315         newwrds[*nnw].lexeme = NULL;
00316         newwrds[*nnw].entries->tnvariant = 1;
00317     }
00318 
00319     newwrds[*nnw].entries->idsubst = src->idsubst;
00320     newwrds[*nnw].entries->posinsubst = src->posinsubst;
00321 
00322     newwrds[*nnw].entries->nextentry = NULL;
00323 
00324     (*nnw)++;
00325     return newwrds;
00326 }
00327 
00328 static int
00329 cmpLexemeInfo(LexemeInfo *a, LexemeInfo *b)
00330 {
00331     if (a == NULL || b == NULL)
00332         return 0;
00333 
00334     if (a->idsubst == b->idsubst)
00335     {
00336         if (a->posinsubst == b->posinsubst)
00337         {
00338             if (a->tnvariant == b->tnvariant)
00339                 return 0;
00340 
00341             return (a->tnvariant > b->tnvariant) ? 1 : -1;
00342         }
00343 
00344         return (a->posinsubst > b->posinsubst) ? 1 : -1;
00345     }
00346 
00347     return (a->idsubst > b->idsubst) ? 1 : -1;
00348 }
00349 
00350 static int
00351 cmpLexeme(const TheLexeme *a, const TheLexeme *b)
00352 {
00353     if (a->lexeme == NULL)
00354     {
00355         if (b->lexeme == NULL)
00356             return 0;
00357         else
00358             return 1;
00359     }
00360     else if (b->lexeme == NULL)
00361         return -1;
00362 
00363     return strcmp(a->lexeme, b->lexeme);
00364 }
00365 
00366 static int
00367 cmpLexemeQ(const void *a, const void *b)
00368 {
00369     return cmpLexeme((const TheLexeme *) a, (const TheLexeme *) b);
00370 }
00371 
00372 static int
00373 cmpTheLexeme(const void *a, const void *b)
00374 {
00375     const TheLexeme *la = (const TheLexeme *) a;
00376     const TheLexeme *lb = (const TheLexeme *) b;
00377     int         res;
00378 
00379     if ((res = cmpLexeme(la, lb)) != 0)
00380         return res;
00381 
00382     return -cmpLexemeInfo(la->entries, lb->entries);
00383 }
00384 
00385 static void
00386 compileTheLexeme(DictThesaurus *d)
00387 {
00388     int         i,
00389                 nnw = 0,
00390                 tnm = 16;
00391     TheLexeme  *newwrds = (TheLexeme *) palloc(sizeof(TheLexeme) * tnm),
00392                *ptrwrds;
00393 
00394     for (i = 0; i < d->nwrds; i++)
00395     {
00396         TSLexeme   *ptr;
00397 
00398         if (strcmp(d->wrds[i].lexeme, "?") == 0)        /* Is stop word marker? */
00399             newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
00400         else
00401         {
00402             ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
00403                                        PointerGetDatum(d->subdict->dictData),
00404                                           PointerGetDatum(d->wrds[i].lexeme),
00405                                     Int32GetDatum(strlen(d->wrds[i].lexeme)),
00406                                                      PointerGetDatum(NULL)));
00407 
00408             if (!ptr)
00409                 ereport(ERROR,
00410                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
00411                          errmsg("thesaurus sample word \"%s\" isn't recognized by subdictionary (rule %d)",
00412                                 d->wrds[i].lexeme,
00413                                 d->wrds[i].entries->idsubst + 1)));
00414             else if (!(ptr->lexeme))
00415                 ereport(ERROR,
00416                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
00417                          errmsg("thesaurus sample word \"%s\" is a stop word (rule %d)",
00418                                 d->wrds[i].lexeme,
00419                                 d->wrds[i].entries->idsubst + 1),
00420                          errhint("Use \"?\" to represent a stop word within a sample phrase.")));
00421             else
00422             {
00423                 while (ptr->lexeme)
00424                 {
00425                     TSLexeme   *remptr = ptr + 1;
00426                     int         tnvar = 1;
00427                     int         curvar = ptr->nvariant;
00428 
00429                     /* compute n words in one variant */
00430                     while (remptr->lexeme)
00431                     {
00432                         if (remptr->nvariant != (remptr - 1)->nvariant)
00433                             break;
00434                         tnvar++;
00435                         remptr++;
00436                     }
00437 
00438                     remptr = ptr;
00439                     while (remptr->lexeme && remptr->nvariant == curvar)
00440                     {
00441                         newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
00442                         remptr++;
00443                     }
00444 
00445                     ptr = remptr;
00446                 }
00447             }
00448         }
00449 
00450         pfree(d->wrds[i].lexeme);
00451         pfree(d->wrds[i].entries);
00452     }
00453 
00454     if (d->wrds)
00455         pfree(d->wrds);
00456     d->wrds = newwrds;
00457     d->nwrds = nnw;
00458     d->ntwrds = tnm;
00459 
00460     if (d->nwrds > 1)
00461     {
00462         qsort(d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme);
00463 
00464         /* uniq */
00465         newwrds = d->wrds;
00466         ptrwrds = d->wrds + 1;
00467         while (ptrwrds - d->wrds < d->nwrds)
00468         {
00469             if (cmpLexeme(ptrwrds, newwrds) == 0)
00470             {
00471                 if (cmpLexemeInfo(ptrwrds->entries, newwrds->entries))
00472                 {
00473                     ptrwrds->entries->nextentry = newwrds->entries;
00474                     newwrds->entries = ptrwrds->entries;
00475                 }
00476                 else
00477                     pfree(ptrwrds->entries);
00478 
00479                 if (ptrwrds->lexeme)
00480                     pfree(ptrwrds->lexeme);
00481             }
00482             else
00483             {
00484                 newwrds++;
00485                 *newwrds = *ptrwrds;
00486             }
00487 
00488             ptrwrds++;
00489         }
00490 
00491         d->nwrds = newwrds - d->wrds + 1;
00492         d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->nwrds);
00493     }
00494 }
00495 
00496 static void
00497 compileTheSubstitute(DictThesaurus *d)
00498 {
00499     int         i;
00500 
00501     for (i = 0; i < d->nsubst; i++)
00502     {
00503         TSLexeme   *rem = d->subst[i].res,
00504                    *outptr,
00505                    *inptr;
00506         int         n = 2;
00507 
00508         outptr = d->subst[i].res = (TSLexeme *) palloc(sizeof(TSLexeme) * n);
00509         outptr->lexeme = NULL;
00510         inptr = rem;
00511 
00512         while (inptr && inptr->lexeme)
00513         {
00514             TSLexeme   *lexized,
00515                         tmplex[2];
00516 
00517             if (inptr->flags & DT_USEASIS)
00518             {                   /* do not lexize */
00519                 tmplex[0] = *inptr;
00520                 tmplex[0].flags = 0;
00521                 tmplex[1].lexeme = NULL;
00522                 lexized = tmplex;
00523             }
00524             else
00525             {
00526                 lexized = (TSLexeme *) DatumGetPointer(
00527                                                        FunctionCall4(
00528                                                        &(d->subdict->lexize),
00529                                        PointerGetDatum(d->subdict->dictData),
00530                                               PointerGetDatum(inptr->lexeme),
00531                                         Int32GetDatum(strlen(inptr->lexeme)),
00532                                                         PointerGetDatum(NULL)
00533                                                                      )
00534                     );
00535             }
00536 
00537             if (lexized && lexized->lexeme)
00538             {
00539                 int         toset = (lexized->lexeme && outptr != d->subst[i].res) ? (outptr - d->subst[i].res) : -1;
00540 
00541                 while (lexized->lexeme)
00542                 {
00543                     if (outptr - d->subst[i].res + 1 >= n)
00544                     {
00545                         int         diff = outptr - d->subst[i].res;
00546 
00547                         n *= 2;
00548                         d->subst[i].res = (TSLexeme *) repalloc(d->subst[i].res, sizeof(TSLexeme) * n);
00549                         outptr = d->subst[i].res + diff;
00550                     }
00551 
00552                     *outptr = *lexized;
00553                     outptr->lexeme = pstrdup(lexized->lexeme);
00554 
00555                     outptr++;
00556                     lexized++;
00557                 }
00558 
00559                 if (toset > 0)
00560                     d->subst[i].res[toset].flags |= TSL_ADDPOS;
00561             }
00562             else if (lexized)
00563             {
00564                 ereport(ERROR,
00565                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
00566                          errmsg("thesaurus substitute word \"%s\" is a stop word (rule %d)",
00567                                 inptr->lexeme, i + 1)));
00568             }
00569             else
00570             {
00571                 ereport(ERROR,
00572                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
00573                          errmsg("thesaurus substitute word \"%s\" isn't recognized by subdictionary (rule %d)",
00574                                 inptr->lexeme, i + 1)));
00575             }
00576 
00577             if (inptr->lexeme)
00578                 pfree(inptr->lexeme);
00579             inptr++;
00580         }
00581 
00582         if (outptr == d->subst[i].res)
00583             ereport(ERROR,
00584                     (errcode(ERRCODE_CONFIG_FILE_ERROR),
00585                      errmsg("thesaurus substitute phrase is empty (rule %d)",
00586                             i + 1)));
00587 
00588         d->subst[i].reslen = outptr - d->subst[i].res;
00589 
00590         pfree(rem);
00591     }
00592 }
00593 
00594 Datum
00595 thesaurus_init(PG_FUNCTION_ARGS)
00596 {
00597     List       *dictoptions = (List *) PG_GETARG_POINTER(0);
00598     DictThesaurus *d;
00599     char       *subdictname = NULL;
00600     bool        fileloaded = false;
00601     ListCell   *l;
00602 
00603     d = (DictThesaurus *) palloc0(sizeof(DictThesaurus));
00604 
00605     foreach(l, dictoptions)
00606     {
00607         DefElem    *defel = (DefElem *) lfirst(l);
00608 
00609         if (pg_strcasecmp("DictFile", defel->defname) == 0)
00610         {
00611             if (fileloaded)
00612                 ereport(ERROR,
00613                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00614                          errmsg("multiple DictFile parameters")));
00615             thesaurusRead(defGetString(defel), d);
00616             fileloaded = true;
00617         }
00618         else if (pg_strcasecmp("Dictionary", defel->defname) == 0)
00619         {
00620             if (subdictname)
00621                 ereport(ERROR,
00622                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00623                          errmsg("multiple Dictionary parameters")));
00624             subdictname = pstrdup(defGetString(defel));
00625         }
00626         else
00627         {
00628             ereport(ERROR,
00629                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00630                      errmsg("unrecognized Thesaurus parameter: \"%s\"",
00631                             defel->defname)));
00632         }
00633     }
00634 
00635     if (!fileloaded)
00636         ereport(ERROR,
00637                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00638                  errmsg("missing DictFile parameter")));
00639     if (!subdictname)
00640         ereport(ERROR,
00641                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00642                  errmsg("missing Dictionary parameter")));
00643 
00644     d->subdictOid = get_ts_dict_oid(stringToQualifiedNameList(subdictname), false);
00645     d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
00646 
00647     compileTheLexeme(d);
00648     compileTheSubstitute(d);
00649 
00650     PG_RETURN_POINTER(d);
00651 }
00652 
00653 static LexemeInfo *
00654 findTheLexeme(DictThesaurus *d, char *lexeme)
00655 {
00656     TheLexeme   key,
00657                *res;
00658 
00659     if (d->nwrds == 0)
00660         return NULL;
00661 
00662     key.lexeme = lexeme;
00663     key.entries = NULL;
00664 
00665     res = bsearch(&key, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ);
00666 
00667     if (res == NULL)
00668         return NULL;
00669     return res->entries;
00670 }
00671 
00672 static bool
00673 matchIdSubst(LexemeInfo *stored, uint16 idsubst)
00674 {
00675     bool        res = true;
00676 
00677     if (stored)
00678     {
00679         res = false;
00680 
00681         for (; stored; stored = stored->nextvariant)
00682             if (stored->idsubst == idsubst)
00683             {
00684                 res = true;
00685                 break;
00686             }
00687     }
00688 
00689     return res;
00690 }
00691 
00692 static LexemeInfo *
00693 findVariant(LexemeInfo *in, LexemeInfo *stored, uint16 curpos, LexemeInfo **newin, int newn)
00694 {
00695     for (;;)
00696     {
00697         int         i;
00698         LexemeInfo *ptr = newin[0];
00699 
00700         for (i = 0; i < newn; i++)
00701         {
00702             while (newin[i] && newin[i]->idsubst < ptr->idsubst)
00703                 newin[i] = newin[i]->nextentry;
00704 
00705             if (newin[i] == NULL)
00706                 return in;
00707 
00708             if (newin[i]->idsubst > ptr->idsubst)
00709             {
00710                 ptr = newin[i];
00711                 i = -1;
00712                 continue;
00713             }
00714 
00715             while (newin[i]->idsubst == ptr->idsubst)
00716             {
00717                 if (newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn)
00718                 {
00719                     ptr = newin[i];
00720                     break;
00721                 }
00722 
00723                 newin[i] = newin[i]->nextentry;
00724                 if (newin[i] == NULL)
00725                     return in;
00726             }
00727 
00728             if (newin[i]->idsubst != ptr->idsubst)
00729             {
00730                 ptr = newin[i];
00731                 i = -1;
00732                 continue;
00733             }
00734         }
00735 
00736         if (i == newn && matchIdSubst(stored, ptr->idsubst) && (in == NULL || !matchIdSubst(in, ptr->idsubst)))
00737         {                       /* found */
00738 
00739             ptr->nextvariant = in;
00740             in = ptr;
00741         }
00742 
00743         /* step forward */
00744         for (i = 0; i < newn; i++)
00745             newin[i] = newin[i]->nextentry;
00746     }
00747 }
00748 
00749 static TSLexeme *
00750 copyTSLexeme(TheSubstitute *ts)
00751 {
00752     TSLexeme   *res;
00753     uint16      i;
00754 
00755     res = (TSLexeme *) palloc(sizeof(TSLexeme) * (ts->reslen + 1));
00756     for (i = 0; i < ts->reslen; i++)
00757     {
00758         res[i] = ts->res[i];
00759         res[i].lexeme = pstrdup(ts->res[i].lexeme);
00760     }
00761 
00762     res[ts->reslen].lexeme = NULL;
00763 
00764     return res;
00765 }
00766 
00767 static TSLexeme *
00768 checkMatch(DictThesaurus *d, LexemeInfo *info, uint16 curpos, bool *moreres)
00769 {
00770     *moreres = false;
00771     while (info)
00772     {
00773         Assert(info->idsubst < d->nsubst);
00774         if (info->nextvariant)
00775             *moreres = true;
00776         if (d->subst[info->idsubst].lastlexeme == curpos)
00777             return copyTSLexeme(d->subst + info->idsubst);
00778         info = info->nextvariant;
00779     }
00780 
00781     return NULL;
00782 }
00783 
00784 Datum
00785 thesaurus_lexize(PG_FUNCTION_ARGS)
00786 {
00787     DictThesaurus *d = (DictThesaurus *) PG_GETARG_POINTER(0);
00788     DictSubState *dstate = (DictSubState *) PG_GETARG_POINTER(3);
00789     TSLexeme   *res = NULL;
00790     LexemeInfo *stored,
00791                *info = NULL;
00792     uint16      curpos = 0;
00793     bool        moreres = false;
00794 
00795     if (PG_NARGS() != 4 || dstate == NULL)
00796         elog(ERROR, "forbidden call of thesaurus or nested call");
00797 
00798     if (dstate->isend)
00799         PG_RETURN_POINTER(NULL);
00800     stored = (LexemeInfo *) dstate->private_state;
00801 
00802     if (stored)
00803         curpos = stored->posinsubst + 1;
00804 
00805     if (!d->subdict->isvalid)
00806         d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
00807 
00808     res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
00809                                        PointerGetDatum(d->subdict->dictData),
00810                                                      PG_GETARG_DATUM(1),
00811                                                      PG_GETARG_DATUM(2),
00812                                                      PointerGetDatum(NULL)));
00813 
00814     if (res && res->lexeme)
00815     {
00816         TSLexeme   *ptr = res,
00817                    *basevar;
00818 
00819         while (ptr->lexeme)
00820         {
00821             uint16      nv = ptr->nvariant;
00822             uint16      i,
00823                         nlex = 0;
00824             LexemeInfo **infos;
00825 
00826             basevar = ptr;
00827             while (ptr->lexeme && nv == ptr->nvariant)
00828             {
00829                 nlex++;
00830                 ptr++;
00831             }
00832 
00833             infos = (LexemeInfo **) palloc(sizeof(LexemeInfo *) * nlex);
00834             for (i = 0; i < nlex; i++)
00835                 if ((infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL)
00836                     break;
00837 
00838             if (i < nlex)
00839             {
00840                 /* no chance to find */
00841                 pfree(infos);
00842                 continue;
00843             }
00844 
00845             info = findVariant(info, stored, curpos, infos, nlex);
00846         }
00847     }
00848     else if (res)
00849     {                           /* stop-word */
00850         LexemeInfo *infos = findTheLexeme(d, NULL);
00851 
00852         info = findVariant(NULL, stored, curpos, &infos, 1);
00853     }
00854     else
00855     {
00856         info = NULL;            /* word isn't recognized */
00857     }
00858 
00859     dstate->private_state = (void *) info;
00860 
00861     if (!info)
00862     {
00863         dstate->getnext = false;
00864         PG_RETURN_POINTER(NULL);
00865     }
00866 
00867     if ((res = checkMatch(d, info, curpos, &moreres)) != NULL)
00868     {
00869         dstate->getnext = moreres;
00870         PG_RETURN_POINTER(res);
00871     }
00872 
00873     dstate->getnext = true;
00874 
00875     PG_RETURN_POINTER(NULL);
00876 }