PostgreSQL Source Code: src/backend/tsearch/spell.c Source File

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * spell.c
00004  *      Normalizing word with ISpell
00005  *
00006  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00007  *
00008  *
00009  * IDENTIFICATION
00010  *    src/backend/tsearch/spell.c
00011  *
00012  *-------------------------------------------------------------------------
00013  */
00014 
00015 #include "postgres.h"
00016 
00017 #include "catalog/pg_collation.h"
00018 #include "tsearch/dicts/spell.h"
00019 #include "tsearch/ts_locale.h"
00020 #include "utils/memutils.h"
00021 
00022 
00023 /*
00024  * Initialization requires a lot of memory that's not needed
00025  * after the initialization is done.  During initialization,
00026  * CurrentMemoryContext is the long-lived memory context associated
00027  * with the dictionary cache entry.  We keep the short-lived stuff
00028  * in the Conf->buildCxt context.
00029  */
00030 #define tmpalloc(sz)  MemoryContextAlloc(Conf->buildCxt, (sz))
00031 #define tmpalloc0(sz)  MemoryContextAllocZero(Conf->buildCxt, (sz))
00032 
00033 /*
00034  * Prepare for constructing an ISpell dictionary.
00035  *
00036  * The IspellDict struct is assumed to be zeroed when allocated.
00037  */
00038 void
00039 NIStartBuild(IspellDict *Conf)
00040 {
00041     /*
00042      * The temp context is a child of CurTransactionContext, so that it will
00043      * go away automatically on error.
00044      */
00045     Conf->buildCxt = AllocSetContextCreate(CurTransactionContext,
00046                                            "Ispell dictionary init context",
00047                                            ALLOCSET_DEFAULT_MINSIZE,
00048                                            ALLOCSET_DEFAULT_INITSIZE,
00049                                            ALLOCSET_DEFAULT_MAXSIZE);
00050 }
00051 
00052 /*
00053  * Clean up when dictionary construction is complete.
00054  */
00055 void
00056 NIFinishBuild(IspellDict *Conf)
00057 {
00058     /* Release no-longer-needed temp memory */
00059     MemoryContextDelete(Conf->buildCxt);
00060     /* Just for cleanliness, zero the now-dangling pointers */
00061     Conf->buildCxt = NULL;
00062     Conf->Spell = NULL;
00063     Conf->firstfree = NULL;
00064 }
00065 
00066 
00067 /*
00068  * "Compact" palloc: allocate without extra palloc overhead.
00069  *
00070  * Since we have no need to free the ispell data items individually, there's
00071  * not much value in the per-chunk overhead normally consumed by palloc.
00072  * Getting rid of it is helpful since ispell can allocate a lot of small nodes.
00073  *
00074  * We currently pre-zero all data allocated this way, even though some of it
00075  * doesn't need that.  The cpalloc and cpalloc0 macros are just documentation
00076  * to indicate which allocations actually require zeroing.
00077  */
00078 #define COMPACT_ALLOC_CHUNK 8192    /* amount to get from palloc at once */
00079 #define COMPACT_MAX_REQ     1024    /* must be < COMPACT_ALLOC_CHUNK */
00080 
00081 static void *
00082 compact_palloc0(IspellDict *Conf, size_t size)
00083 {
00084     void       *result;
00085 
00086     /* Should only be called during init */
00087     Assert(Conf->buildCxt != NULL);
00088 
00089     /* No point in this for large chunks */
00090     if (size > COMPACT_MAX_REQ)
00091         return palloc0(size);
00092 
00093     /* Keep everything maxaligned */
00094     size = MAXALIGN(size);
00095 
00096     /* Need more space? */
00097     if (size > Conf->avail)
00098     {
00099         Conf->firstfree = palloc0(COMPACT_ALLOC_CHUNK);
00100         Conf->avail = COMPACT_ALLOC_CHUNK;
00101     }
00102 
00103     result = (void *) Conf->firstfree;
00104     Conf->firstfree += size;
00105     Conf->avail -= size;
00106 
00107     return result;
00108 }
00109 
00110 #define cpalloc(size) compact_palloc0(Conf, size)
00111 #define cpalloc0(size) compact_palloc0(Conf, size)
00112 
00113 static char *
00114 cpstrdup(IspellDict *Conf, const char *str)
00115 {
00116     char       *res = cpalloc(strlen(str) + 1);
00117 
00118     strcpy(res, str);
00119     return res;
00120 }
00121 
00122 
00123 /*
00124  * Apply lowerstr(), producing a temporary result (in the buildCxt).
00125  */
00126 static char *
00127 lowerstr_ctx(IspellDict *Conf, const char *src)
00128 {
00129     MemoryContext saveCtx;
00130     char       *dst;
00131 
00132     saveCtx = MemoryContextSwitchTo(Conf->buildCxt);
00133     dst = lowerstr(src);
00134     MemoryContextSwitchTo(saveCtx);
00135 
00136     return dst;
00137 }
00138 
00139 #define MAX_NORM 1024
00140 #define MAXNORMLEN 256
00141 
00142 #define STRNCMP(s,p)    strncmp( (s), (p), strlen(p) )
00143 #define GETWCHAR(W,L,N,T) ( ((const uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] )
00144 #define GETCHAR(A,N,T)    GETWCHAR( (A)->repl, (A)->replen, N, T )
00145 
00146 static char *VoidString = "";
00147 
00148 static int
00149 cmpspell(const void *s1, const void *s2)
00150 {
00151     return (strcmp((*(SPELL *const *) s1)->word, (*(SPELL *const *) s2)->word));
00152 }
00153 static int
00154 cmpspellaffix(const void *s1, const void *s2)
00155 {
00156     return (strncmp((*(SPELL *const *) s1)->p.flag, (*(SPELL *const *) s2)->p.flag, MAXFLAGLEN));
00157 }
00158 
00159 static char *
00160 findchar(char *str, int c)
00161 {
00162     while (*str)
00163     {
00164         if (t_iseq(str, c))
00165             return str;
00166         str += pg_mblen(str);
00167     }
00168 
00169     return NULL;
00170 }
00171 
00172 
/*
 * strbcmp
 *      Compare two strings byte by byte starting from their ends (used for
 *      suffix tree operations).
 *
 * Returns -1, 0 or 1.  If one string runs out while all compared bytes are
 * equal, the shorter string sorts first.
 */
static int
strbcmp(const unsigned char *s1, const unsigned char *s2)
{
    int         i1 = strlen((const char *) s1) - 1;
    int         i2 = strlen((const char *) s2) - 1;

    /* Walk both strings backwards until one of them is exhausted */
    for (; i1 >= 0 && i2 >= 0; i1--, i2--)
    {
        if (s1[i1] != s2[i2])
            return (s1[i1] < s2[i2]) ? -1 : 1;
    }

    /* All shared trailing bytes matched: decide by what is left over */
    if (i1 == i2)
        return 0;

    return (i1 < i2) ? -1 : 1;
}
00196 
/*
 * strbncmp
 *      Like strbcmp, but looks at no more than "count" trailing bytes.
 *
 * Returns 0 when the last "count" bytes of both strings are equal (or when
 * count is 0).  Otherwise returns -1 or 1 according to the first differing
 * byte found when scanning from the end; if one string runs out before
 * "count" bytes were compared, the shorter string sorts first.
 */
static int
strbncmp(const unsigned char *s1, const unsigned char *s2, size_t count)
{
    int         i1 = strlen((const char *) s1) - 1;
    int         i2 = strlen((const char *) s2) - 1;
    int         remaining = count;

    for (; i1 >= 0 && i2 >= 0 && remaining > 0; i1--, i2--, remaining--)
    {
        if (s1[i1] != s2[i2])
            return (s1[i1] < s2[i2]) ? -1 : 1;
    }

    /* Compared the full quota, or both strings ended together: equal */
    if (remaining == 0 || i1 == i2)
        return 0;

    return (i1 < i2) ? -1 : 1;
}
00222 
00223 static int
00224 cmpaffix(const void *s1, const void *s2)
00225 {
00226     const AFFIX *a1 = (const AFFIX *) s1;
00227     const AFFIX *a2 = (const AFFIX *) s2;
00228 
00229     if (a1->type < a2->type)
00230         return -1;
00231     if (a1->type > a2->type)
00232         return 1;
00233     if (a1->type == FF_PREFIX)
00234         return strcmp(a1->repl, a2->repl);
00235     else
00236         return strbcmp((const unsigned char *) a1->repl,
00237                        (const unsigned char *) a2->repl);
00238 }
00239 
00240 static void
00241 NIAddSpell(IspellDict *Conf, const char *word, const char *flag)
00242 {
00243     if (Conf->nspell >= Conf->mspell)
00244     {
00245         if (Conf->mspell)
00246         {
00247             Conf->mspell *= 2;
00248             Conf->Spell = (SPELL **) repalloc(Conf->Spell, Conf->mspell * sizeof(SPELL *));
00249         }
00250         else
00251         {
00252             Conf->mspell = 1024 * 20;
00253             Conf->Spell = (SPELL **) tmpalloc(Conf->mspell * sizeof(SPELL *));
00254         }
00255     }
00256     Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
00257     strcpy(Conf->Spell[Conf->nspell]->word, word);
00258     strncpy(Conf->Spell[Conf->nspell]->p.flag, flag, MAXFLAGLEN);
00259     Conf->nspell++;
00260 }
00261 
00262 /*
00263  * import dictionary
00264  *
00265  * Note caller must already have applied get_tsearch_config_filename
00266  */
00267 void
00268 NIImportDictionary(IspellDict *Conf, const char *filename)
00269 {
00270     tsearch_readline_state trst;
00271     char       *line;
00272 
00273     if (!tsearch_readline_begin(&trst, filename))
00274         ereport(ERROR,
00275                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00276                  errmsg("could not open dictionary file \"%s\": %m",
00277                         filename)));
00278 
00279     while ((line = tsearch_readline(&trst)) != NULL)
00280     {
00281         char       *s,
00282                    *pstr;
00283         const char *flag;
00284 
00285         /* Extract flag from the line */
00286         flag = NULL;
00287         if ((s = findchar(line, '/')))
00288         {
00289             *s++ = '\0';
00290             flag = s;
00291             while (*s)
00292             {
00293                 /* we allow only single encoded flags for faster works */
00294                 if (pg_mblen(s) == 1 && t_isprint(s) && !t_isspace(s))
00295                     s++;
00296                 else
00297                 {
00298                     *s = '\0';
00299                     break;
00300                 }
00301             }
00302         }
00303         else
00304             flag = "";
00305 
00306         /* Remove trailing spaces */
00307         s = line;
00308         while (*s)
00309         {
00310             if (t_isspace(s))
00311             {
00312                 *s = '\0';
00313                 break;
00314             }
00315             s += pg_mblen(s);
00316         }
00317         pstr = lowerstr_ctx(Conf, line);
00318 
00319         NIAddSpell(Conf, pstr, flag);
00320         pfree(pstr);
00321 
00322         pfree(line);
00323     }
00324     tsearch_readline_end(&trst);
00325 }
00326 
00327 
00328 static int
00329 FindWord(IspellDict *Conf, const char *word, int affixflag, int flag)
00330 {
00331     SPNode     *node = Conf->Dictionary;
00332     SPNodeData *StopLow,
00333                *StopHigh,
00334                *StopMiddle;
00335     const uint8 *ptr = (const uint8 *) word;
00336 
00337     flag &= FF_DICTFLAGMASK;
00338 
00339     while (node && *ptr)
00340     {
00341         StopLow = node->data;
00342         StopHigh = node->data + node->length;
00343         while (StopLow < StopHigh)
00344         {
00345             StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
00346             if (StopMiddle->val == *ptr)
00347             {
00348                 if (*(ptr + 1) == '\0' && StopMiddle->isword)
00349                 {
00350                     if (flag == 0)
00351                     {
00352                         if (StopMiddle->compoundflag & FF_COMPOUNDONLY)
00353                             return 0;
00354                     }
00355                     else if ((flag & StopMiddle->compoundflag) == 0)
00356                         return 0;
00357 
00358                     if ((affixflag == 0) || (strchr(Conf->AffixData[StopMiddle->affix], affixflag) != NULL))
00359                         return 1;
00360                 }
00361                 node = StopMiddle->node;
00362                 ptr++;
00363                 break;
00364             }
00365             else if (StopMiddle->val < *ptr)
00366                 StopLow = StopMiddle + 1;
00367             else
00368                 StopHigh = StopMiddle;
00369         }
00370         if (StopLow >= StopHigh)
00371             break;
00372     }
00373     return 0;
00374 }
00375 
00376 static void
00377 NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const char *find, const char *repl, int type)
00378 {
00379     AFFIX      *Affix;
00380 
00381     if (Conf->naffixes >= Conf->maffixes)
00382     {
00383         if (Conf->maffixes)
00384         {
00385             Conf->maffixes *= 2;
00386             Conf->Affix = (AFFIX *) repalloc((void *) Conf->Affix, Conf->maffixes * sizeof(AFFIX));
00387         }
00388         else
00389         {
00390             Conf->maffixes = 16;
00391             Conf->Affix = (AFFIX *) palloc(Conf->maffixes * sizeof(AFFIX));
00392         }
00393     }
00394 
00395     Affix = Conf->Affix + Conf->naffixes;
00396 
00397     if (strcmp(mask, ".") == 0)
00398     {
00399         Affix->issimple = 1;
00400         Affix->isregis = 0;
00401     }
00402     else if (RS_isRegis(mask))
00403     {
00404         Affix->issimple = 0;
00405         Affix->isregis = 1;
00406         RS_compile(&(Affix->reg.regis), (type == FF_SUFFIX) ? true : false,
00407                    (mask && *mask) ? mask : VoidString);
00408     }
00409     else
00410     {
00411         int         masklen;
00412         int         wmasklen;
00413         int         err;
00414         pg_wchar   *wmask;
00415         char       *tmask;
00416 
00417         Affix->issimple = 0;
00418         Affix->isregis = 0;
00419         tmask = (char *) tmpalloc(strlen(mask) + 3);
00420         if (type == FF_SUFFIX)
00421             sprintf(tmask, "%s$", mask);
00422         else
00423             sprintf(tmask, "^%s", mask);
00424 
00425         masklen = strlen(tmask);
00426         wmask = (pg_wchar *) tmpalloc((masklen + 1) * sizeof(pg_wchar));
00427         wmasklen = pg_mb2wchar_with_len(tmask, wmask, masklen);
00428 
00429         err = pg_regcomp(&(Affix->reg.regex), wmask, wmasklen,
00430                          REG_ADVANCED | REG_NOSUB,
00431                          DEFAULT_COLLATION_OID);
00432         if (err)
00433         {
00434             char        errstr[100];
00435 
00436             pg_regerror(err, &(Affix->reg.regex), errstr, sizeof(errstr));
00437             ereport(ERROR,
00438                     (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
00439                      errmsg("invalid regular expression: %s", errstr)));
00440         }
00441     }
00442 
00443     Affix->flagflags = flagflags;
00444     if ((Affix->flagflags & FF_COMPOUNDONLY) || (Affix->flagflags & FF_COMPOUNDPERMITFLAG))
00445     {
00446         if ((Affix->flagflags & FF_COMPOUNDFLAG) == 0)
00447             Affix->flagflags |= FF_COMPOUNDFLAG;
00448     }
00449     Affix->flag = flag;
00450     Affix->type = type;
00451 
00452     Affix->find = (find && *find) ? cpstrdup(Conf, find) : VoidString;
00453     if ((Affix->replen = strlen(repl)) > 0)
00454         Affix->repl = cpstrdup(Conf, repl);
00455     else
00456         Affix->repl = VoidString;
00457     Conf->naffixes++;
00458 }
00459 
00460 #define PAE_WAIT_MASK   0
00461 #define PAE_INMASK  1
00462 #define PAE_WAIT_FIND   2
00463 #define PAE_INFIND  3
00464 #define PAE_WAIT_REPL   4
00465 #define PAE_INREPL  5
00466 
00467 static bool
00468 parse_affentry(char *str, char *mask, char *find, char *repl)
00469 {
00470     int         state = PAE_WAIT_MASK;
00471     char       *pmask = mask,
00472                *pfind = find,
00473                *prepl = repl;
00474 
00475     *mask = *find = *repl = '\0';
00476 
00477     while (*str)
00478     {
00479         if (state == PAE_WAIT_MASK)
00480         {
00481             if (t_iseq(str, '#'))
00482                 return false;
00483             else if (!t_isspace(str))
00484             {
00485                 COPYCHAR(pmask, str);
00486                 pmask += pg_mblen(str);
00487                 state = PAE_INMASK;
00488             }
00489         }
00490         else if (state == PAE_INMASK)
00491         {
00492             if (t_iseq(str, '>'))
00493             {
00494                 *pmask = '\0';
00495                 state = PAE_WAIT_FIND;
00496             }
00497             else if (!t_isspace(str))
00498             {
00499                 COPYCHAR(pmask, str);
00500                 pmask += pg_mblen(str);
00501             }
00502         }
00503         else if (state == PAE_WAIT_FIND)
00504         {
00505             if (t_iseq(str, '-'))
00506             {
00507                 state = PAE_INFIND;
00508             }
00509             else if (t_isalpha(str) || t_iseq(str, '\'') /* english 's */ )
00510             {
00511                 COPYCHAR(prepl, str);
00512                 prepl += pg_mblen(str);
00513                 state = PAE_INREPL;
00514             }
00515             else if (!t_isspace(str))
00516                 ereport(ERROR,
00517                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
00518                          errmsg("syntax error")));
00519         }
00520         else if (state == PAE_INFIND)
00521         {
00522             if (t_iseq(str, ','))
00523             {
00524                 *pfind = '\0';
00525                 state = PAE_WAIT_REPL;
00526             }
00527             else if (t_isalpha(str))
00528             {
00529                 COPYCHAR(pfind, str);
00530                 pfind += pg_mblen(str);
00531             }
00532             else if (!t_isspace(str))
00533                 ereport(ERROR,
00534                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
00535                          errmsg("syntax error")));
00536         }
00537         else if (state == PAE_WAIT_REPL)
00538         {
00539             if (t_iseq(str, '-'))
00540             {
00541                 break;          /* void repl */
00542             }
00543             else if (t_isalpha(str))
00544             {
00545                 COPYCHAR(prepl, str);
00546                 prepl += pg_mblen(str);
00547                 state = PAE_INREPL;
00548             }
00549             else if (!t_isspace(str))
00550                 ereport(ERROR,
00551                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
00552                          errmsg("syntax error")));
00553         }
00554         else if (state == PAE_INREPL)
00555         {
00556             if (t_iseq(str, '#'))
00557             {
00558                 *prepl = '\0';
00559                 break;
00560             }
00561             else if (t_isalpha(str))
00562             {
00563                 COPYCHAR(prepl, str);
00564                 prepl += pg_mblen(str);
00565             }
00566             else if (!t_isspace(str))
00567                 ereport(ERROR,
00568                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
00569                          errmsg("syntax error")));
00570         }
00571         else
00572             elog(ERROR, "unrecognized state in parse_affentry: %d", state);
00573 
00574         str += pg_mblen(str);
00575     }
00576 
00577     *pmask = *pfind = *prepl = '\0';
00578 
00579     return (*mask && (*find || *repl)) ? true : false;
00580 }
00581 
00582 static void
00583 addFlagValue(IspellDict *Conf, char *s, uint32 val)
00584 {
00585     while (*s && t_isspace(s))
00586         s += pg_mblen(s);
00587 
00588     if (!*s)
00589         ereport(ERROR,
00590                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00591                  errmsg("syntax error")));
00592 
00593     if (pg_mblen(s) != 1)
00594         ereport(ERROR,
00595                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00596                  errmsg("multibyte flag character is not allowed")));
00597 
00598     Conf->flagval[*(unsigned char *) s] = (unsigned char) val;
00599     Conf->usecompound = true;
00600 }
00601 
00602 static void
00603 NIImportOOAffixes(IspellDict *Conf, const char *filename)
00604 {
00605     char        type[BUFSIZ],
00606                *ptype = NULL;
00607     char        sflag[BUFSIZ];
00608     char        mask[BUFSIZ],
00609                *pmask;
00610     char        find[BUFSIZ],
00611                *pfind;
00612     char        repl[BUFSIZ],
00613                *prepl;
00614     bool        isSuffix = false;
00615     int         flag = 0;
00616     char        flagflags = 0;
00617     tsearch_readline_state trst;
00618     int         scanread = 0;
00619     char        scanbuf[BUFSIZ];
00620     char       *recoded;
00621 
00622     /* read file to find any flag */
00623     memset(Conf->flagval, 0, sizeof(Conf->flagval));
00624     Conf->usecompound = false;
00625 
00626     if (!tsearch_readline_begin(&trst, filename))
00627         ereport(ERROR,
00628                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00629                  errmsg("could not open affix file \"%s\": %m",
00630                         filename)));
00631 
00632     while ((recoded = tsearch_readline(&trst)) != NULL)
00633     {
00634         if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
00635         {
00636             pfree(recoded);
00637             continue;
00638         }
00639 
00640         if (STRNCMP(recoded, "COMPOUNDFLAG") == 0)
00641             addFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"),
00642                          FF_COMPOUNDFLAG);
00643         else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0)
00644             addFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"),
00645                          FF_COMPOUNDBEGIN);
00646         else if (STRNCMP(recoded, "COMPOUNDLAST") == 0)
00647             addFlagValue(Conf, recoded + strlen("COMPOUNDLAST"),
00648                          FF_COMPOUNDLAST);
00649         /* COMPOUNDLAST and COMPOUNDEND are synonyms */
00650         else if (STRNCMP(recoded, "COMPOUNDEND") == 0)
00651             addFlagValue(Conf, recoded + strlen("COMPOUNDEND"),
00652                          FF_COMPOUNDLAST);
00653         else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0)
00654             addFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"),
00655                          FF_COMPOUNDMIDDLE);
00656         else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0)
00657             addFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"),
00658                          FF_COMPOUNDONLY);
00659         else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0)
00660             addFlagValue(Conf, recoded + strlen("COMPOUNDPERMITFLAG"),
00661                          FF_COMPOUNDPERMITFLAG);
00662         else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0)
00663             addFlagValue(Conf, recoded + strlen("COMPOUNDFORBIDFLAG"),
00664                          FF_COMPOUNDFORBIDFLAG);
00665         else if (STRNCMP(recoded, "FLAG") == 0)
00666         {
00667             char       *s = recoded + strlen("FLAG");
00668 
00669             while (*s && t_isspace(s))
00670                 s += pg_mblen(s);
00671 
00672             if (*s && STRNCMP(s, "default") != 0)
00673                 ereport(ERROR,
00674                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
00675                          errmsg("Ispell dictionary supports only default flag value")));
00676         }
00677 
00678         pfree(recoded);
00679     }
00680     tsearch_readline_end(&trst);
00681 
00682     sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5);
00683 
00684     if (!tsearch_readline_begin(&trst, filename))
00685         ereport(ERROR,
00686                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00687                  errmsg("could not open affix file \"%s\": %m",
00688                         filename)));
00689 
00690     while ((recoded = tsearch_readline(&trst)) != NULL)
00691     {
00692         if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
00693             goto nextline;
00694 
00695         scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask);
00696 
00697         if (ptype)
00698             pfree(ptype);
00699         ptype = lowerstr_ctx(Conf, type);
00700         if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
00701             goto nextline;
00702 
00703         if (scanread == 4)
00704         {
00705             if (strlen(sflag) != 1)
00706                 goto nextline;
00707             flag = *sflag;
00708             isSuffix = (STRNCMP(ptype, "sfx") == 0) ? true : false;
00709             if (t_iseq(find, 'y') || t_iseq(find, 'Y'))
00710                 flagflags = FF_CROSSPRODUCT;
00711             else
00712                 flagflags = 0;
00713         }
00714         else
00715         {
00716             char       *ptr;
00717             int         aflg = 0;
00718 
00719             if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
00720                 goto nextline;
00721             prepl = lowerstr_ctx(Conf, repl);
00722             /* affix flag */
00723             if ((ptr = strchr(prepl, '/')) != NULL)
00724             {
00725                 *ptr = '\0';
00726                 ptr = repl + (ptr - prepl) + 1;
00727                 while (*ptr)
00728                 {
00729                     aflg |= Conf->flagval[*(unsigned char *) ptr];
00730                     ptr++;
00731                 }
00732             }
00733             pfind = lowerstr_ctx(Conf, find);
00734             pmask = lowerstr_ctx(Conf, mask);
00735             if (t_iseq(find, '0'))
00736                 *pfind = '\0';
00737             if (t_iseq(repl, '0'))
00738                 *prepl = '\0';
00739 
00740             NIAddAffix(Conf, flag, flagflags | aflg, pmask, pfind, prepl,
00741                        isSuffix ? FF_SUFFIX : FF_PREFIX);
00742             pfree(prepl);
00743             pfree(pfind);
00744             pfree(pmask);
00745         }
00746 
00747 nextline:
00748         pfree(recoded);
00749     }
00750 
00751     tsearch_readline_end(&trst);
00752     if (ptype)
00753         pfree(ptype);
00754 }
00755 
00756 /*
00757  * import affixes
00758  *
00759  * Note caller must already have applied get_tsearch_config_filename
00760  */
00761 void
00762 NIImportAffixes(IspellDict *Conf, const char *filename)
00763 {
00764     char       *pstr = NULL;
00765     char        mask[BUFSIZ];
00766     char        find[BUFSIZ];
00767     char        repl[BUFSIZ];
00768     char       *s;
00769     bool        suffixes = false;
00770     bool        prefixes = false;
00771     int         flag = 0;
00772     char        flagflags = 0;
00773     tsearch_readline_state trst;
00774     bool        oldformat = false;
00775     char       *recoded = NULL;
00776 
00777     if (!tsearch_readline_begin(&trst, filename))
00778         ereport(ERROR,
00779                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00780                  errmsg("could not open affix file \"%s\": %m",
00781                         filename)));
00782 
00783     memset(Conf->flagval, 0, sizeof(Conf->flagval));
00784     Conf->usecompound = false;
00785 
00786     while ((recoded = tsearch_readline(&trst)) != NULL)
00787     {
00788         pstr = lowerstr(recoded);
00789 
00790         /* Skip comments and empty lines */
00791         if (*pstr == '#' || *pstr == '\n')
00792             goto nextline;
00793 
00794         if (STRNCMP(pstr, "compoundwords") == 0)
00795         {
00796             s = findchar(pstr, 'l');
00797             if (s)
00798             {
00799                 s = recoded + (s - pstr);       /* we need non-lowercased
00800                                                  * string */
00801                 while (*s && !t_isspace(s))
00802                     s += pg_mblen(s);
00803                 while (*s && t_isspace(s))
00804                     s += pg_mblen(s);
00805 
00806                 if (*s && pg_mblen(s) == 1)
00807                 {
00808                     Conf->flagval[*(unsigned char *) s] = FF_COMPOUNDFLAG;
00809                     Conf->usecompound = true;
00810                 }
00811                 oldformat = true;
00812                 goto nextline;
00813             }
00814         }
00815         if (STRNCMP(pstr, "suffixes") == 0)
00816         {
00817             suffixes = true;
00818             prefixes = false;
00819             oldformat = true;
00820             goto nextline;
00821         }
00822         if (STRNCMP(pstr, "prefixes") == 0)
00823         {
00824             suffixes = false;
00825             prefixes = true;
00826             oldformat = true;
00827             goto nextline;
00828         }
00829         if (STRNCMP(pstr, "flag") == 0)
00830         {
00831             s = recoded + 4;    /* we need non-lowercased string */
00832             flagflags = 0;
00833 
00834             while (*s && t_isspace(s))
00835                 s += pg_mblen(s);
00836             oldformat = true;
00837 
00838             /* allow only single-encoded flags */
00839             if (pg_mblen(s) != 1)
00840                 ereport(ERROR,
00841                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
00842                          errmsg("multibyte flag character is not allowed")));
00843 
00844             if (*s == '*')
00845             {
00846                 flagflags |= FF_CROSSPRODUCT;
00847                 s++;
00848             }
00849             else if (*s == '~')
00850             {
00851                 flagflags |= FF_COMPOUNDONLY;
00852                 s++;
00853             }
00854 
00855             if (*s == '\\')
00856                 s++;
00857 
00858             /* allow only single-encoded flags */
00859             if (pg_mblen(s) != 1)
00860                 ereport(ERROR,
00861                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
00862                          errmsg("multibyte flag character is not allowed")));
00863 
00864             flag = *(unsigned char *) s;
00865             goto nextline;
00866         }
00867         if (STRNCMP(recoded, "COMPOUNDFLAG") == 0 || STRNCMP(recoded, "COMPOUNDMIN") == 0 ||
00868             STRNCMP(recoded, "PFX") == 0 || STRNCMP(recoded, "SFX") == 0)
00869         {
00870             if (oldformat)
00871                 ereport(ERROR,
00872                         (errcode(ERRCODE_CONFIG_FILE_ERROR),
00873                          errmsg("wrong affix file format for flag")));
00874             tsearch_readline_end(&trst);
00875             NIImportOOAffixes(Conf, filename);
00876             return;
00877         }
00878         if ((!suffixes) && (!prefixes))
00879             goto nextline;
00880 
00881         if (!parse_affentry(pstr, mask, find, repl))
00882             goto nextline;
00883 
00884         NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
00885 
00886 nextline:
00887         pfree(recoded);
00888         pfree(pstr);
00889     }
00890     tsearch_readline_end(&trst);
00891 }
00892 
00893 static int
00894 MergeAffix(IspellDict *Conf, int a1, int a2)
00895 {
00896     char      **ptr;
00897 
00898     while (Conf->nAffixData + 1 >= Conf->lenAffixData)
00899     {
00900         Conf->lenAffixData *= 2;
00901         Conf->AffixData = (char **) repalloc(Conf->AffixData,
00902                                         sizeof(char *) * Conf->lenAffixData);
00903     }
00904 
00905     ptr = Conf->AffixData + Conf->nAffixData;
00906     *ptr = cpalloc(strlen(Conf->AffixData[a1]) +
00907                    strlen(Conf->AffixData[a2]) +
00908                    1 /* space */ + 1 /* \0 */ );
00909     sprintf(*ptr, "%s %s", Conf->AffixData[a1], Conf->AffixData[a2]);
00910     ptr++;
00911     *ptr = NULL;
00912     Conf->nAffixData++;
00913 
00914     return Conf->nAffixData - 1;
00915 }
00916 
00917 static uint32
00918 makeCompoundFlags(IspellDict *Conf, int affix)
00919 {
00920     uint32      flag = 0;
00921     char       *str = Conf->AffixData[affix];
00922 
00923     while (str && *str)
00924     {
00925         flag |= Conf->flagval[*(unsigned char *) str];
00926         str++;
00927     }
00928 
00929     return (flag & FF_DICTFLAGMASK);
00930 }
00931 
00932 static SPNode *
00933 mkSPNode(IspellDict *Conf, int low, int high, int level)
00934 {
00935     int         i;
00936     int         nchar = 0;
00937     char        lastchar = '\0';
00938     SPNode     *rs;
00939     SPNodeData *data;
00940     int         lownew = low;
00941 
00942     for (i = low; i < high; i++)
00943         if (Conf->Spell[i]->p.d.len > level && lastchar != Conf->Spell[i]->word[level])
00944         {
00945             nchar++;
00946             lastchar = Conf->Spell[i]->word[level];
00947         }
00948 
00949     if (!nchar)
00950         return NULL;
00951 
00952     rs = (SPNode *) cpalloc0(SPNHDRSZ + nchar * sizeof(SPNodeData));
00953     rs->length = nchar;
00954     data = rs->data;
00955 
00956     lastchar = '\0';
00957     for (i = low; i < high; i++)
00958         if (Conf->Spell[i]->p.d.len > level)
00959         {
00960             if (lastchar != Conf->Spell[i]->word[level])
00961             {
00962                 if (lastchar)
00963                 {
00964                     data->node = mkSPNode(Conf, lownew, i, level + 1);
00965                     lownew = i;
00966                     data++;
00967                 }
00968                 lastchar = Conf->Spell[i]->word[level];
00969             }
00970             data->val = ((uint8 *) (Conf->Spell[i]->word))[level];
00971             if (Conf->Spell[i]->p.d.len == level + 1)
00972             {
00973                 bool        clearCompoundOnly = false;
00974 
00975                 if (data->isword && data->affix != Conf->Spell[i]->p.d.affix)
00976                 {
00977                     /*
00978                      * MergeAffix called a few times. If one of word is
00979                      * allowed to be in compound word and another isn't, then
00980                      * clear FF_COMPOUNDONLY flag.
00981                      */
00982 
00983                     clearCompoundOnly = (FF_COMPOUNDONLY & data->compoundflag
00984                         & makeCompoundFlags(Conf, Conf->Spell[i]->p.d.affix))
00985                         ? false : true;
00986                     data->affix = MergeAffix(Conf, data->affix, Conf->Spell[i]->p.d.affix);
00987                 }
00988                 else
00989                     data->affix = Conf->Spell[i]->p.d.affix;
00990                 data->isword = 1;
00991 
00992                 data->compoundflag = makeCompoundFlags(Conf, data->affix);
00993 
00994                 if ((data->compoundflag & FF_COMPOUNDONLY) &&
00995                     (data->compoundflag & FF_COMPOUNDFLAG) == 0)
00996                     data->compoundflag |= FF_COMPOUNDFLAG;
00997 
00998                 if (clearCompoundOnly)
00999                     data->compoundflag &= ~FF_COMPOUNDONLY;
01000             }
01001         }
01002 
01003     data->node = mkSPNode(Conf, lownew, high, level + 1);
01004 
01005     return rs;
01006 }
01007 
01008 /*
01009  * Builds the Conf->Dictionary tree and AffixData from the imported dictionary
01010  * and affixes.
01011  */
01012 void
01013 NISortDictionary(IspellDict *Conf)
01014 {
01015     int         i;
01016     int         naffix = 0;
01017     int         curaffix;
01018 
01019     /* compress affixes */
01020 
01021     /* Count the number of different flags used in the dictionary */
01022 
01023     qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspellaffix);
01024 
01025     naffix = 0;
01026     for (i = 0; i < Conf->nspell; i++)
01027     {
01028         if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag, MAXFLAGLEN))
01029             naffix++;
01030     }
01031 
01032     /*
01033      * Fill in Conf->AffixData with the affixes that were used in the
01034      * dictionary. Replace textual flag-field of Conf->Spell entries with
01035      * indexes into Conf->AffixData array.
01036      */
01037     Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
01038 
01039     curaffix = -1;
01040     for (i = 0; i < Conf->nspell; i++)
01041     {
01042         if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix], MAXFLAGLEN))
01043         {
01044             curaffix++;
01045             Assert(curaffix < naffix);
01046             Conf->AffixData[curaffix] = cpstrdup(Conf, Conf->Spell[i]->p.flag);
01047         }
01048 
01049         Conf->Spell[i]->p.d.affix = curaffix;
01050         Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
01051     }
01052 
01053     Conf->lenAffixData = Conf->nAffixData = naffix;
01054 
01055     qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
01056     Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
01057 }
01058 
01059 static AffixNode *
01060 mkANode(IspellDict *Conf, int low, int high, int level, int type)
01061 {
01062     int         i;
01063     int         nchar = 0;
01064     uint8       lastchar = '\0';
01065     AffixNode  *rs;
01066     AffixNodeData *data;
01067     int         lownew = low;
01068     int         naff;
01069     AFFIX     **aff;
01070 
01071     for (i = low; i < high; i++)
01072         if (Conf->Affix[i].replen > level && lastchar != GETCHAR(Conf->Affix + i, level, type))
01073         {
01074             nchar++;
01075             lastchar = GETCHAR(Conf->Affix + i, level, type);
01076         }
01077 
01078     if (!nchar)
01079         return NULL;
01080 
01081     aff = (AFFIX **) tmpalloc(sizeof(AFFIX *) * (high - low + 1));
01082     naff = 0;
01083 
01084     rs = (AffixNode *) cpalloc0(ANHRDSZ + nchar * sizeof(AffixNodeData));
01085     rs->length = nchar;
01086     data = rs->data;
01087 
01088     lastchar = '\0';
01089     for (i = low; i < high; i++)
01090         if (Conf->Affix[i].replen > level)
01091         {
01092             if (lastchar != GETCHAR(Conf->Affix + i, level, type))
01093             {
01094                 if (lastchar)
01095                 {
01096                     data->node = mkANode(Conf, lownew, i, level + 1, type);
01097                     if (naff)
01098                     {
01099                         data->naff = naff;
01100                         data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
01101                         memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
01102                         naff = 0;
01103                     }
01104                     data++;
01105                     lownew = i;
01106                 }
01107                 lastchar = GETCHAR(Conf->Affix + i, level, type);
01108             }
01109             data->val = GETCHAR(Conf->Affix + i, level, type);
01110             if (Conf->Affix[i].replen == level + 1)
01111             {                   /* affix stopped */
01112                 aff[naff++] = Conf->Affix + i;
01113             }
01114         }
01115 
01116     data->node = mkANode(Conf, lownew, high, level + 1, type);
01117     if (naff)
01118     {
01119         data->naff = naff;
01120         data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
01121         memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
01122         naff = 0;
01123     }
01124 
01125     pfree(aff);
01126 
01127     return rs;
01128 }
01129 
01130 static void
01131 mkVoidAffix(IspellDict *Conf, bool issuffix, int startsuffix)
01132 {
01133     int         i,
01134                 cnt = 0;
01135     int         start = (issuffix) ? startsuffix : 0;
01136     int         end = (issuffix) ? Conf->naffixes : startsuffix;
01137     AffixNode  *Affix = (AffixNode *) palloc0(ANHRDSZ + sizeof(AffixNodeData));
01138 
01139     Affix->length = 1;
01140     Affix->isvoid = 1;
01141 
01142     if (issuffix)
01143     {
01144         Affix->data->node = Conf->Suffix;
01145         Conf->Suffix = Affix;
01146     }
01147     else
01148     {
01149         Affix->data->node = Conf->Prefix;
01150         Conf->Prefix = Affix;
01151     }
01152 
01153 
01154     for (i = start; i < end; i++)
01155         if (Conf->Affix[i].replen == 0)
01156             cnt++;
01157 
01158     if (cnt == 0)
01159         return;
01160 
01161     Affix->data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * cnt);
01162     Affix->data->naff = (uint32) cnt;
01163 
01164     cnt = 0;
01165     for (i = start; i < end; i++)
01166         if (Conf->Affix[i].replen == 0)
01167         {
01168             Affix->data->aff[cnt] = Conf->Affix + i;
01169             cnt++;
01170         }
01171 }
01172 
01173 static bool
01174 isAffixInUse(IspellDict *Conf, char flag)
01175 {
01176     int         i;
01177 
01178     for (i = 0; i < Conf->nAffixData; i++)
01179         if (strchr(Conf->AffixData[i], flag) != NULL)
01180             return true;
01181 
01182     return false;
01183 }
01184 
01185 void
01186 NISortAffixes(IspellDict *Conf)
01187 {
01188     AFFIX      *Affix;
01189     size_t      i;
01190     CMPDAffix  *ptr;
01191     int         firstsuffix = Conf->naffixes;
01192 
01193     if (Conf->naffixes == 0)
01194         return;
01195 
01196     if (Conf->naffixes > 1)
01197         qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix);
01198     Conf->CompoundAffix = ptr = (CMPDAffix *) palloc(sizeof(CMPDAffix) * Conf->naffixes);
01199     ptr->affix = NULL;
01200 
01201     for (i = 0; i < Conf->naffixes; i++)
01202     {
01203         Affix = &(((AFFIX *) Conf->Affix)[i]);
01204         if (Affix->type == FF_SUFFIX && i < firstsuffix)
01205             firstsuffix = i;
01206 
01207         if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
01208             isAffixInUse(Conf, (char) Affix->flag))
01209         {
01210             if (ptr == Conf->CompoundAffix ||
01211                 ptr->issuffix != (ptr - 1)->issuffix ||
01212                 strbncmp((const unsigned char *) (ptr - 1)->affix,
01213                          (const unsigned char *) Affix->repl,
01214                          (ptr - 1)->len))
01215             {
01216                 /* leave only unique and minimals suffixes */
01217                 ptr->affix = Affix->repl;
01218                 ptr->len = Affix->replen;
01219                 ptr->issuffix = (Affix->type == FF_SUFFIX) ? true : false;
01220                 ptr++;
01221             }
01222         }
01223     }
01224     ptr->affix = NULL;
01225     Conf->CompoundAffix = (CMPDAffix *) repalloc(Conf->CompoundAffix, sizeof(CMPDAffix) * (ptr - Conf->CompoundAffix + 1));
01226 
01227     Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
01228     Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
01229     mkVoidAffix(Conf, true, firstsuffix);
01230     mkVoidAffix(Conf, false, firstsuffix);
01231 }
01232 
01233 static AffixNodeData *
01234 FindAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type)
01235 {
01236     AffixNodeData *StopLow,
01237                *StopHigh,
01238                *StopMiddle;
01239     uint8 symbol;
01240 
01241     if (node->isvoid)
01242     {                           /* search void affixes */
01243         if (node->data->naff)
01244             return node->data;
01245         node = node->data->node;
01246     }
01247 
01248     while (node && *level < wrdlen)
01249     {
01250         StopLow = node->data;
01251         StopHigh = node->data + node->length;
01252         while (StopLow < StopHigh)
01253         {
01254             StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
01255             symbol = GETWCHAR(word, wrdlen, *level, type);
01256 
01257             if (StopMiddle->val == symbol)
01258             {
01259                 (*level)++;
01260                 if (StopMiddle->naff)
01261                     return StopMiddle;
01262                 node = StopMiddle->node;
01263                 break;
01264             }
01265             else if (StopMiddle->val < symbol)
01266                 StopLow = StopMiddle + 1;
01267             else
01268                 StopHigh = StopMiddle;
01269         }
01270         if (StopLow >= StopHigh)
01271             break;
01272     }
01273     return NULL;
01274 }
01275 
01276 static char *
01277 CheckAffix(const char *word, size_t len, AFFIX *Affix, int flagflags, char *newword, int *baselen)
01278 {
01279     /*
01280      * Check compound allow flags
01281      */
01282 
01283     if (flagflags == 0)
01284     {
01285         if (Affix->flagflags & FF_COMPOUNDONLY)
01286             return NULL;
01287     }
01288     else if (flagflags & FF_COMPOUNDBEGIN)
01289     {
01290         if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
01291             return NULL;
01292         if ((Affix->flagflags & FF_COMPOUNDBEGIN) == 0)
01293             if (Affix->type == FF_SUFFIX)
01294                 return NULL;
01295     }
01296     else if (flagflags & FF_COMPOUNDMIDDLE)
01297     {
01298         if ((Affix->flagflags & FF_COMPOUNDMIDDLE) == 0 ||
01299             (Affix->flagflags & FF_COMPOUNDFORBIDFLAG))
01300             return NULL;
01301     }
01302     else if (flagflags & FF_COMPOUNDLAST)
01303     {
01304         if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
01305             return NULL;
01306         if ((Affix->flagflags & FF_COMPOUNDLAST) == 0)
01307             if (Affix->type == FF_PREFIX)
01308                 return NULL;
01309     }
01310 
01311     /*
01312      * make replace pattern of affix
01313      */
01314     if (Affix->type == FF_SUFFIX)
01315     {
01316         strcpy(newword, word);
01317         strcpy(newword + len - Affix->replen, Affix->find);
01318         if (baselen)            /* store length of non-changed part of word */
01319             *baselen = len - Affix->replen;
01320     }
01321     else
01322     {
01323         /*
01324          * if prefix is a all non-chaged part's length then all word contains
01325          * only prefix and suffix, so out
01326          */
01327         if (baselen && *baselen + strlen(Affix->find) <= Affix->replen)
01328             return NULL;
01329         strcpy(newword, Affix->find);
01330         strcat(newword, word + Affix->replen);
01331     }
01332 
01333     /*
01334      * check resulting word
01335      */
01336     if (Affix->issimple)
01337         return newword;
01338     else if (Affix->isregis)
01339     {
01340         if (RS_execute(&(Affix->reg.regis), newword))
01341             return newword;
01342     }
01343     else
01344     {
01345         int         err;
01346         pg_wchar   *data;
01347         size_t      data_len;
01348         int         newword_len;
01349 
01350         /* Convert data string to wide characters */
01351         newword_len = strlen(newword);
01352         data = (pg_wchar *) palloc((newword_len + 1) * sizeof(pg_wchar));
01353         data_len = pg_mb2wchar_with_len(newword, data, newword_len);
01354 
01355         if (!(err = pg_regexec(&(Affix->reg.regex), data, data_len, 0, NULL, 0, NULL, 0)))
01356         {
01357             pfree(data);
01358             return newword;
01359         }
01360         pfree(data);
01361     }
01362 
01363     return NULL;
01364 }
01365 
01366 static int
01367 addToResult(char **forms, char **cur, char *word)
01368 {
01369     if (cur - forms >= MAX_NORM - 1)
01370         return 0;
01371     if (forms == cur || strcmp(word, *(cur - 1)) != 0)
01372     {
01373         *cur = pstrdup(word);
01374         *(cur + 1) = NULL;
01375         return 1;
01376     }
01377 
01378     return 0;
01379 }
01380 
01381 static char **
01382 NormalizeSubWord(IspellDict *Conf, char *word, int flag)
01383 {
01384     AffixNodeData *suffix = NULL,
01385                *prefix = NULL;
01386     int         slevel = 0,
01387                 plevel = 0;
01388     int         wrdlen = strlen(word),
01389                 swrdlen;
01390     char      **forms;
01391     char      **cur;
01392     char        newword[2 * MAXNORMLEN] = "";
01393     char        pnewword[2 * MAXNORMLEN] = "";
01394     AffixNode  *snode = Conf->Suffix,
01395                *pnode;
01396     int         i,
01397                 j;
01398 
01399     if (wrdlen > MAXNORMLEN)
01400         return NULL;
01401     cur = forms = (char **) palloc(MAX_NORM * sizeof(char *));
01402     *cur = NULL;
01403 
01404 
01405     /* Check that the word itself is normal form */
01406     if (FindWord(Conf, word, 0, flag))
01407     {
01408         *cur = pstrdup(word);
01409         cur++;
01410         *cur = NULL;
01411     }
01412 
01413     /* Find all other NORMAL forms of the 'word' (check only prefix) */
01414     pnode = Conf->Prefix;
01415     plevel = 0;
01416     while (pnode)
01417     {
01418         prefix = FindAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
01419         if (!prefix)
01420             break;
01421         for (j = 0; j < prefix->naff; j++)
01422         {
01423             if (CheckAffix(word, wrdlen, prefix->aff[j], flag, newword, NULL))
01424             {
01425                 /* prefix success */
01426                 if (FindWord(Conf, newword, prefix->aff[j]->flag, flag))
01427                     cur += addToResult(forms, cur, newword);
01428             }
01429         }
01430         pnode = prefix->node;
01431     }
01432 
01433     /*
01434      * Find all other NORMAL forms of the 'word' (check suffix and then
01435      * prefix)
01436      */
01437     while (snode)
01438     {
01439         int         baselen = 0;
01440 
01441         /* find possible suffix */
01442         suffix = FindAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
01443         if (!suffix)
01444             break;
01445         /* foreach suffix check affix */
01446         for (i = 0; i < suffix->naff; i++)
01447         {
01448             if (CheckAffix(word, wrdlen, suffix->aff[i], flag, newword, &baselen))
01449             {
01450                 /* suffix success */
01451                 if (FindWord(Conf, newword, suffix->aff[i]->flag, flag))
01452                     cur += addToResult(forms, cur, newword);
01453 
01454                 /* now we will look changed word with prefixes */
01455                 pnode = Conf->Prefix;
01456                 plevel = 0;
01457                 swrdlen = strlen(newword);
01458                 while (pnode)
01459                 {
01460                     prefix = FindAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
01461                     if (!prefix)
01462                         break;
01463                     for (j = 0; j < prefix->naff; j++)
01464                     {
01465                         if (CheckAffix(newword, swrdlen, prefix->aff[j], flag, pnewword, &baselen))
01466                         {
01467                             /* prefix success */
01468                             int         ff = (prefix->aff[j]->flagflags & suffix->aff[i]->flagflags & FF_CROSSPRODUCT) ?
01469                             0 : prefix->aff[j]->flag;
01470 
01471                             if (FindWord(Conf, pnewword, ff, flag))
01472                                 cur += addToResult(forms, cur, pnewword);
01473                         }
01474                     }
01475                     pnode = prefix->node;
01476                 }
01477             }
01478         }
01479 
01480         snode = suffix->node;
01481     }
01482 
01483     if (cur == forms)
01484     {
01485         pfree(forms);
01486         return (NULL);
01487     }
01488     return (forms);
01489 }
01490 
/*
 * One candidate split of a compound word into stems; candidates are chained
 * through 'next'.
 */
typedef struct SplitVar SplitVar;

struct SplitVar
{
    int         nstem;          /* number of entries used in stem[] */
    int         lenstem;        /* number of entries allocated in stem[] */
    char      **stem;           /* the stems collected so far */
    SplitVar   *next;           /* next candidate, or NULL */
};
01498 
01499 static int
01500 CheckCompoundAffixes(CMPDAffix **ptr, char *word, int len, bool CheckInPlace)
01501 {
01502     bool        issuffix;
01503 
01504     if (CheckInPlace)
01505     {
01506         while ((*ptr)->affix)
01507         {
01508             if (len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len) == 0)
01509             {
01510                 len = (*ptr)->len;
01511                 issuffix = (*ptr)->issuffix;
01512                 (*ptr)++;
01513                 return (issuffix) ? len : 0;
01514             }
01515             (*ptr)++;
01516         }
01517     }
01518     else
01519     {
01520         char       *affbegin;
01521 
01522         while ((*ptr)->affix)
01523         {
01524             if (len > (*ptr)->len && (affbegin = strstr(word, (*ptr)->affix)) != NULL)
01525             {
01526                 len = (*ptr)->len + (affbegin - word);
01527                 issuffix = (*ptr)->issuffix;
01528                 (*ptr)++;
01529                 return (issuffix) ? len : 0;
01530             }
01531             (*ptr)++;
01532         }
01533     }
01534     return -1;
01535 }
01536 
01537 static SplitVar *
01538 CopyVar(SplitVar *s, int makedup)
01539 {
01540     SplitVar   *v = (SplitVar *) palloc(sizeof(SplitVar));
01541 
01542     v->next = NULL;
01543     if (s)
01544     {
01545         int         i;
01546 
01547         v->lenstem = s->lenstem;
01548         v->stem = (char **) palloc(sizeof(char *) * v->lenstem);
01549         v->nstem = s->nstem;
01550         for (i = 0; i < s->nstem; i++)
01551             v->stem[i] = (makedup) ? pstrdup(s->stem[i]) : s->stem[i];
01552     }
01553     else
01554     {
01555         v->lenstem = 16;
01556         v->stem = (char **) palloc(sizeof(char *) * v->lenstem);
01557         v->nstem = 0;
01558     }
01559     return v;
01560 }
01561 
01562 static void
01563 AddStem(SplitVar *v, char *word)
01564 {
01565     if (v->nstem >= v->lenstem)
01566     {
01567         v->lenstem *= 2;
01568         v->stem = (char **) repalloc(v->stem, sizeof(char *) * v->lenstem);
01569     }
01570 
01571     v->stem[v->nstem] = word;
01572     v->nstem++;
01573 }
01574 
01575 static SplitVar *
01576 SplitToVariants(IspellDict *Conf, SPNode *snode, SplitVar *orig, char *word, int wordlen, int startpos, int minpos)
01577 {
01578     SplitVar   *var = NULL;
01579     SPNodeData *StopLow,
01580                *StopHigh,
01581                *StopMiddle = NULL;
01582     SPNode     *node = (snode) ? snode : Conf->Dictionary;
01583     int         level = (snode) ? minpos : startpos;    /* recursive
01584                                                          * minpos==level */
01585     int         lenaff;
01586     CMPDAffix  *caff;
01587     char       *notprobed;
01588     int         compoundflag = 0;
01589 
01590     notprobed = (char *) palloc(wordlen);
01591     memset(notprobed, 1, wordlen);
01592     var = CopyVar(orig, 1);
01593 
01594     while (level < wordlen)
01595     {
01596         /* find word with epenthetic or/and compound affix */
01597         caff = Conf->CompoundAffix;
01598         while (level > startpos && (lenaff = CheckCompoundAffixes(&caff, word + level, wordlen - level, (node) ? true : false)) >= 0)
01599         {
01600             /*
01601              * there is one of compound affixes, so check word for existings
01602              */
01603             char        buf[MAXNORMLEN];
01604             char      **subres;
01605 
01606             lenaff = level - startpos + lenaff;
01607 
01608             if (!notprobed[startpos + lenaff - 1])
01609                 continue;
01610 
01611             if (level + lenaff - 1 <= minpos)
01612                 continue;
01613 
01614             if (lenaff >= MAXNORMLEN)
01615                 continue;       /* skip too big value */
01616             if (lenaff > 0)
01617                 memcpy(buf, word + startpos, lenaff);
01618             buf[lenaff] = '\0';
01619 
01620             if (level == 0)
01621                 compoundflag = FF_COMPOUNDBEGIN;
01622             else if (level == wordlen - 1)
01623                 compoundflag = FF_COMPOUNDLAST;
01624             else
01625                 compoundflag = FF_COMPOUNDMIDDLE;
01626             subres = NormalizeSubWord(Conf, buf, compoundflag);
01627             if (subres)
01628             {
01629                 /* Yes, it was a word from dictionary */
01630                 SplitVar   *new = CopyVar(var, 0);
01631                 SplitVar   *ptr = var;
01632                 char      **sptr = subres;
01633 
01634                 notprobed[startpos + lenaff - 1] = 0;
01635 
01636                 while (*sptr)
01637                 {
01638                     AddStem(new, *sptr);
01639                     sptr++;
01640                 }
01641                 pfree(subres);
01642 
01643                 while (ptr->next)
01644                     ptr = ptr->next;
01645                 ptr->next = SplitToVariants(Conf, NULL, new, word, wordlen, startpos + lenaff, startpos + lenaff);
01646 
01647                 pfree(new->stem);
01648                 pfree(new);
01649             }
01650         }
01651 
01652         if (!node)
01653             break;
01654 
01655         StopLow = node->data;
01656         StopHigh = node->data + node->length;
01657         while (StopLow < StopHigh)
01658         {
01659             StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
01660             if (StopMiddle->val == ((uint8 *) (word))[level])
01661                 break;
01662             else if (StopMiddle->val < ((uint8 *) (word))[level])
01663                 StopLow = StopMiddle + 1;
01664             else
01665                 StopHigh = StopMiddle;
01666         }
01667 
01668         if (StopLow < StopHigh)
01669         {
01670             if (level == FF_COMPOUNDBEGIN)
01671                 compoundflag = FF_COMPOUNDBEGIN;
01672             else if (level == wordlen - 1)
01673                 compoundflag = FF_COMPOUNDLAST;
01674             else
01675                 compoundflag = FF_COMPOUNDMIDDLE;
01676 
01677             /* find infinitive */
01678             if (StopMiddle->isword &&
01679                 (StopMiddle->compoundflag & compoundflag) &&
01680                 notprobed[level])
01681             {
01682                 /* ok, we found full compoundallowed word */
01683                 if (level > minpos)
01684                 {
01685                     /* and its length more than minimal */
01686                     if (wordlen == level + 1)
01687                     {
01688                         /* well, it was last word */
01689                         AddStem(var, pnstrdup(word + startpos, wordlen - startpos));
01690                         pfree(notprobed);
01691                         return var;
01692                     }
01693                     else
01694                     {
01695                         /* then we will search more big word at the same point */
01696                         SplitVar   *ptr = var;
01697 
01698                         while (ptr->next)
01699                             ptr = ptr->next;
01700                         ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
01701                         /* we can find next word */
01702                         level++;
01703                         AddStem(var, pnstrdup(word + startpos, level - startpos));
01704                         node = Conf->Dictionary;
01705                         startpos = level;
01706                         continue;
01707                     }
01708                 }
01709             }
01710             node = StopMiddle->node;
01711         }
01712         else
01713             node = NULL;
01714         level++;
01715     }
01716 
01717     AddStem(var, pnstrdup(word + startpos, wordlen - startpos));
01718     pfree(notprobed);
01719     return var;
01720 }
01721 
01722 static void
01723 addNorm(TSLexeme **lres, TSLexeme **lcur, char *word, int flags, uint16 NVariant)
01724 {
01725     if (*lres == NULL)
01726         *lcur = *lres = (TSLexeme *) palloc(MAX_NORM * sizeof(TSLexeme));
01727 
01728     if (*lcur - *lres < MAX_NORM - 1)
01729     {
01730         (*lcur)->lexeme = word;
01731         (*lcur)->flags = flags;
01732         (*lcur)->nvariant = NVariant;
01733         (*lcur)++;
01734         (*lcur)->lexeme = NULL;
01735     }
01736 }
01737 
01738 TSLexeme *
01739 NINormalizeWord(IspellDict *Conf, char *word)
01740 {
01741     char      **res;
01742     TSLexeme   *lcur = NULL,
01743                *lres = NULL;
01744     uint16      NVariant = 1;
01745 
01746     res = NormalizeSubWord(Conf, word, 0);
01747 
01748     if (res)
01749     {
01750         char      **ptr = res;
01751 
01752         while (*ptr && (lcur - lres) < MAX_NORM)
01753         {
01754             addNorm(&lres, &lcur, *ptr, 0, NVariant++);
01755             ptr++;
01756         }
01757         pfree(res);
01758     }
01759 
01760     if (Conf->usecompound)
01761     {
01762         int         wordlen = strlen(word);
01763         SplitVar   *ptr,
01764                    *var = SplitToVariants(Conf, NULL, NULL, word, wordlen, 0, -1);
01765         int         i;
01766 
01767         while (var)
01768         {
01769             if (var->nstem > 1)
01770             {
01771                 char      **subres = NormalizeSubWord(Conf, var->stem[var->nstem - 1], FF_COMPOUNDLAST);
01772 
01773                 if (subres)
01774                 {
01775                     char      **subptr = subres;
01776 
01777                     while (*subptr)
01778                     {
01779                         for (i = 0; i < var->nstem - 1; i++)
01780                         {
01781                             addNorm(&lres, &lcur, (subptr == subres) ? var->stem[i] : pstrdup(var->stem[i]), 0, NVariant);
01782                         }
01783 
01784                         addNorm(&lres, &lcur, *subptr, 0, NVariant);
01785                         subptr++;
01786                         NVariant++;
01787                     }
01788 
01789                     pfree(subres);
01790                     var->stem[0] = NULL;
01791                     pfree(var->stem[var->nstem - 1]);
01792                 }
01793             }
01794 
01795             for (i = 0; i < var->nstem && var->stem[i]; i++)
01796                 pfree(var->stem[i]);
01797             ptr = var->next;
01798             pfree(var->stem);
01799             pfree(var);
01800             var = ptr;
01801         }
01802     }
01803 
01804     return lres;
01805 }
Header And Logo

spell.c