00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 #include "postgres.h"
00016
00017 #include "catalog/pg_collation.h"
00018 #include "tsearch/dicts/spell.h"
00019 #include "tsearch/ts_locale.h"
00020 #include "utils/memutils.h"
00021
00022
00023
00024
00025
00026
00027
00028
00029
/*
 * Allocation helpers for build-time scratch data.  Both expand to an
 * allocation in Conf->buildCxt, so a variable named "Conf" must be in scope
 * at the point of use.  tmpalloc0 is the zero-filling variant.
 */
#define tmpalloc(sz) MemoryContextAlloc(Conf->buildCxt, (sz))
#define tmpalloc0(sz) MemoryContextAllocZero(Conf->buildCxt, (sz))
00032
00033
00034
00035
00036
00037
00038 void
00039 NIStartBuild(IspellDict *Conf)
00040 {
00041
00042
00043
00044
00045 Conf->buildCxt = AllocSetContextCreate(CurTransactionContext,
00046 "Ispell dictionary init context",
00047 ALLOCSET_DEFAULT_MINSIZE,
00048 ALLOCSET_DEFAULT_INITSIZE,
00049 ALLOCSET_DEFAULT_MAXSIZE);
00050 }
00051
00052
00053
00054
00055 void
00056 NIFinishBuild(IspellDict *Conf)
00057 {
00058
00059 MemoryContextDelete(Conf->buildCxt);
00060
00061 Conf->buildCxt = NULL;
00062 Conf->Spell = NULL;
00063 Conf->firstfree = NULL;
00064 }
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
/*
 * Tuning constants for compact_palloc0(): COMPACT_ALLOC_CHUNK is the size of
 * each chunk it requests from palloc0(), and requests larger than
 * COMPACT_MAX_REQ bytes bypass the chunk and go straight to palloc0().
 */
#define COMPACT_ALLOC_CHUNK 8192
#define COMPACT_MAX_REQ 1024
00080
00081 static void *
00082 compact_palloc0(IspellDict *Conf, size_t size)
00083 {
00084 void *result;
00085
00086
00087 Assert(Conf->buildCxt != NULL);
00088
00089
00090 if (size > COMPACT_MAX_REQ)
00091 return palloc0(size);
00092
00093
00094 size = MAXALIGN(size);
00095
00096
00097 if (size > Conf->avail)
00098 {
00099 Conf->firstfree = palloc0(COMPACT_ALLOC_CHUNK);
00100 Conf->avail = COMPACT_ALLOC_CHUNK;
00101 }
00102
00103 result = (void *) Conf->firstfree;
00104 Conf->firstfree += size;
00105 Conf->avail -= size;
00106
00107 return result;
00108 }
00109
/*
 * Shorthands for compact_palloc0().  Both spellings expand to the same call
 * and require a variable named "Conf" in scope.
 */
#define cpalloc(size) compact_palloc0(Conf, size)
#define cpalloc0(size) compact_palloc0(Conf, size)
00112
00113 static char *
00114 cpstrdup(IspellDict *Conf, const char *str)
00115 {
00116 char *res = cpalloc(strlen(str) + 1);
00117
00118 strcpy(res, str);
00119 return res;
00120 }
00121
00122
00123
00124
00125
00126 static char *
00127 lowerstr_ctx(IspellDict *Conf, const char *src)
00128 {
00129 MemoryContext saveCtx;
00130 char *dst;
00131
00132 saveCtx = MemoryContextSwitchTo(Conf->buildCxt);
00133 dst = lowerstr(src);
00134 MemoryContextSwitchTo(saveCtx);
00135
00136 return dst;
00137 }
00138
/*
 * MAX_NORM is the number of slots in the result array built by
 * NormalizeSubWord() (addToResult() refuses to go past it).  MAXNORMLEN is
 * the longest word, in bytes, that NormalizeSubWord() will process.
 */
#define MAX_NORM 1024
#define MAXNORMLEN 256

/*
 * STRNCMP(s,p): compare only the first strlen(p) bytes of s with p, i.e.
 * test whether s starts with p (0 means yes).
 * GETWCHAR(W,L,N,T): byte N of the L-byte string W, counted from the start
 * when T is FF_PREFIX and from the end otherwise.
 * GETCHAR(A,N,T): GETWCHAR applied to the repl string of AFFIX *A.
 */
#define STRNCMP(s,p) strncmp( (s), (p), strlen(p) )
#define GETWCHAR(W,L,N,T) ( ((const uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] )
#define GETCHAR(A,N,T) GETWCHAR( (A)->repl, (A)->replen, N, T )

/* Shared empty string used wherever an affix has no find/repl/mask text. */
static char *VoidString = "";
00147
00148 static int
00149 cmpspell(const void *s1, const void *s2)
00150 {
00151 return (strcmp((*(SPELL *const *) s1)->word, (*(SPELL *const *) s2)->word));
00152 }
00153 static int
00154 cmpspellaffix(const void *s1, const void *s2)
00155 {
00156 return (strncmp((*(SPELL *const *) s1)->p.flag, (*(SPELL *const *) s2)->p.flag, MAXFLAGLEN));
00157 }
00158
00159 static char *
00160 findchar(char *str, int c)
00161 {
00162 while (*str)
00163 {
00164 if (t_iseq(str, c))
00165 return str;
00166 str += pg_mblen(str);
00167 }
00168
00169 return NULL;
00170 }
00171
00172
00173
/*
 * strbcmp
 *
 * Compare two strings byte by byte starting from their ends ("backward
 * strcmp").  Returns -1, 0 or 1.  When one string is a proper tail of the
 * other, the shorter one sorts first.
 */
static int
strbcmp(const unsigned char *s1, const unsigned char *s2)
{
	int			i1 = strlen((const char *) s1) - 1;
	int			i2 = strlen((const char *) s2) - 1;

	/* Walk both strings from the last byte towards the first. */
	for (; i1 >= 0 && i2 >= 0; i1--, i2--)
	{
		if (s1[i1] != s2[i2])
			return (s1[i1] < s2[i2]) ? -1 : 1;
	}

	/* One string ran out: whichever has bytes left over is the greater. */
	if (i1 == i2)
		return 0;

	return (i1 < i2) ? -1 : 1;
}
00196
/*
 * strbncmp
 *
 * Like strbcmp(), but stops after "count" bytes have been compared; if all
 * of them matched the result is 0.
 */
static int
strbncmp(const unsigned char *s1, const unsigned char *s2, size_t count)
{
	int			i1 = strlen((const char *) s1) - 1;
	int			i2 = strlen((const char *) s2) - 1;
	int			remaining = count;

	/* Compare from the tail while both strings and the budget last. */
	for (; i1 >= 0 && i2 >= 0 && remaining > 0; i1--, i2--, remaining--)
	{
		if (s1[i1] != s2[i2])
			return (s1[i1] < s2[i2]) ? -1 : 1;
	}

	/* Budget used up, or both strings ended together: equal. */
	if (remaining == 0 || i1 == i2)
		return 0;

	return (i1 < i2) ? -1 : 1;
}
00222
00223 static int
00224 cmpaffix(const void *s1, const void *s2)
00225 {
00226 const AFFIX *a1 = (const AFFIX *) s1;
00227 const AFFIX *a2 = (const AFFIX *) s2;
00228
00229 if (a1->type < a2->type)
00230 return -1;
00231 if (a1->type > a2->type)
00232 return 1;
00233 if (a1->type == FF_PREFIX)
00234 return strcmp(a1->repl, a2->repl);
00235 else
00236 return strbcmp((const unsigned char *) a1->repl,
00237 (const unsigned char *) a2->repl);
00238 }
00239
00240 static void
00241 NIAddSpell(IspellDict *Conf, const char *word, const char *flag)
00242 {
00243 if (Conf->nspell >= Conf->mspell)
00244 {
00245 if (Conf->mspell)
00246 {
00247 Conf->mspell *= 2;
00248 Conf->Spell = (SPELL **) repalloc(Conf->Spell, Conf->mspell * sizeof(SPELL *));
00249 }
00250 else
00251 {
00252 Conf->mspell = 1024 * 20;
00253 Conf->Spell = (SPELL **) tmpalloc(Conf->mspell * sizeof(SPELL *));
00254 }
00255 }
00256 Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
00257 strcpy(Conf->Spell[Conf->nspell]->word, word);
00258 strncpy(Conf->Spell[Conf->nspell]->p.flag, flag, MAXFLAGLEN);
00259 Conf->nspell++;
00260 }
00261
00262
00263
00264
00265
00266
00267 void
00268 NIImportDictionary(IspellDict *Conf, const char *filename)
00269 {
00270 tsearch_readline_state trst;
00271 char *line;
00272
00273 if (!tsearch_readline_begin(&trst, filename))
00274 ereport(ERROR,
00275 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00276 errmsg("could not open dictionary file \"%s\": %m",
00277 filename)));
00278
00279 while ((line = tsearch_readline(&trst)) != NULL)
00280 {
00281 char *s,
00282 *pstr;
00283 const char *flag;
00284
00285
00286 flag = NULL;
00287 if ((s = findchar(line, '/')))
00288 {
00289 *s++ = '\0';
00290 flag = s;
00291 while (*s)
00292 {
00293
00294 if (pg_mblen(s) == 1 && t_isprint(s) && !t_isspace(s))
00295 s++;
00296 else
00297 {
00298 *s = '\0';
00299 break;
00300 }
00301 }
00302 }
00303 else
00304 flag = "";
00305
00306
00307 s = line;
00308 while (*s)
00309 {
00310 if (t_isspace(s))
00311 {
00312 *s = '\0';
00313 break;
00314 }
00315 s += pg_mblen(s);
00316 }
00317 pstr = lowerstr_ctx(Conf, line);
00318
00319 NIAddSpell(Conf, pstr, flag);
00320 pfree(pstr);
00321
00322 pfree(line);
00323 }
00324 tsearch_readline_end(&trst);
00325 }
00326
00327
00328 static int
00329 FindWord(IspellDict *Conf, const char *word, int affixflag, int flag)
00330 {
00331 SPNode *node = Conf->Dictionary;
00332 SPNodeData *StopLow,
00333 *StopHigh,
00334 *StopMiddle;
00335 const uint8 *ptr = (const uint8 *) word;
00336
00337 flag &= FF_DICTFLAGMASK;
00338
00339 while (node && *ptr)
00340 {
00341 StopLow = node->data;
00342 StopHigh = node->data + node->length;
00343 while (StopLow < StopHigh)
00344 {
00345 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
00346 if (StopMiddle->val == *ptr)
00347 {
00348 if (*(ptr + 1) == '\0' && StopMiddle->isword)
00349 {
00350 if (flag == 0)
00351 {
00352 if (StopMiddle->compoundflag & FF_COMPOUNDONLY)
00353 return 0;
00354 }
00355 else if ((flag & StopMiddle->compoundflag) == 0)
00356 return 0;
00357
00358 if ((affixflag == 0) || (strchr(Conf->AffixData[StopMiddle->affix], affixflag) != NULL))
00359 return 1;
00360 }
00361 node = StopMiddle->node;
00362 ptr++;
00363 break;
00364 }
00365 else if (StopMiddle->val < *ptr)
00366 StopLow = StopMiddle + 1;
00367 else
00368 StopHigh = StopMiddle;
00369 }
00370 if (StopLow >= StopHigh)
00371 break;
00372 }
00373 return 0;
00374 }
00375
00376 static void
00377 NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const char *find, const char *repl, int type)
00378 {
00379 AFFIX *Affix;
00380
00381 if (Conf->naffixes >= Conf->maffixes)
00382 {
00383 if (Conf->maffixes)
00384 {
00385 Conf->maffixes *= 2;
00386 Conf->Affix = (AFFIX *) repalloc((void *) Conf->Affix, Conf->maffixes * sizeof(AFFIX));
00387 }
00388 else
00389 {
00390 Conf->maffixes = 16;
00391 Conf->Affix = (AFFIX *) palloc(Conf->maffixes * sizeof(AFFIX));
00392 }
00393 }
00394
00395 Affix = Conf->Affix + Conf->naffixes;
00396
00397 if (strcmp(mask, ".") == 0)
00398 {
00399 Affix->issimple = 1;
00400 Affix->isregis = 0;
00401 }
00402 else if (RS_isRegis(mask))
00403 {
00404 Affix->issimple = 0;
00405 Affix->isregis = 1;
00406 RS_compile(&(Affix->reg.regis), (type == FF_SUFFIX) ? true : false,
00407 (mask && *mask) ? mask : VoidString);
00408 }
00409 else
00410 {
00411 int masklen;
00412 int wmasklen;
00413 int err;
00414 pg_wchar *wmask;
00415 char *tmask;
00416
00417 Affix->issimple = 0;
00418 Affix->isregis = 0;
00419 tmask = (char *) tmpalloc(strlen(mask) + 3);
00420 if (type == FF_SUFFIX)
00421 sprintf(tmask, "%s$", mask);
00422 else
00423 sprintf(tmask, "^%s", mask);
00424
00425 masklen = strlen(tmask);
00426 wmask = (pg_wchar *) tmpalloc((masklen + 1) * sizeof(pg_wchar));
00427 wmasklen = pg_mb2wchar_with_len(tmask, wmask, masklen);
00428
00429 err = pg_regcomp(&(Affix->reg.regex), wmask, wmasklen,
00430 REG_ADVANCED | REG_NOSUB,
00431 DEFAULT_COLLATION_OID);
00432 if (err)
00433 {
00434 char errstr[100];
00435
00436 pg_regerror(err, &(Affix->reg.regex), errstr, sizeof(errstr));
00437 ereport(ERROR,
00438 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
00439 errmsg("invalid regular expression: %s", errstr)));
00440 }
00441 }
00442
00443 Affix->flagflags = flagflags;
00444 if ((Affix->flagflags & FF_COMPOUNDONLY) || (Affix->flagflags & FF_COMPOUNDPERMITFLAG))
00445 {
00446 if ((Affix->flagflags & FF_COMPOUNDFLAG) == 0)
00447 Affix->flagflags |= FF_COMPOUNDFLAG;
00448 }
00449 Affix->flag = flag;
00450 Affix->type = type;
00451
00452 Affix->find = (find && *find) ? cpstrdup(Conf, find) : VoidString;
00453 if ((Affix->replen = strlen(repl)) > 0)
00454 Affix->repl = cpstrdup(Conf, repl);
00455 else
00456 Affix->repl = VoidString;
00457 Conf->naffixes++;
00458 }
00459
/*
 * States of the parse_affentry() state machine: for each of the three
 * fields (mask, find, repl) there is a "waiting for the field to start"
 * state and an "inside the field" state.
 */
#define PAE_WAIT_MASK 0
#define PAE_INMASK 1
#define PAE_WAIT_FIND 2
#define PAE_INFIND 3
#define PAE_WAIT_REPL 4
#define PAE_INREPL 5
00466
00467 static bool
00468 parse_affentry(char *str, char *mask, char *find, char *repl)
00469 {
00470 int state = PAE_WAIT_MASK;
00471 char *pmask = mask,
00472 *pfind = find,
00473 *prepl = repl;
00474
00475 *mask = *find = *repl = '\0';
00476
00477 while (*str)
00478 {
00479 if (state == PAE_WAIT_MASK)
00480 {
00481 if (t_iseq(str, '#'))
00482 return false;
00483 else if (!t_isspace(str))
00484 {
00485 COPYCHAR(pmask, str);
00486 pmask += pg_mblen(str);
00487 state = PAE_INMASK;
00488 }
00489 }
00490 else if (state == PAE_INMASK)
00491 {
00492 if (t_iseq(str, '>'))
00493 {
00494 *pmask = '\0';
00495 state = PAE_WAIT_FIND;
00496 }
00497 else if (!t_isspace(str))
00498 {
00499 COPYCHAR(pmask, str);
00500 pmask += pg_mblen(str);
00501 }
00502 }
00503 else if (state == PAE_WAIT_FIND)
00504 {
00505 if (t_iseq(str, '-'))
00506 {
00507 state = PAE_INFIND;
00508 }
00509 else if (t_isalpha(str) || t_iseq(str, '\'') )
00510 {
00511 COPYCHAR(prepl, str);
00512 prepl += pg_mblen(str);
00513 state = PAE_INREPL;
00514 }
00515 else if (!t_isspace(str))
00516 ereport(ERROR,
00517 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00518 errmsg("syntax error")));
00519 }
00520 else if (state == PAE_INFIND)
00521 {
00522 if (t_iseq(str, ','))
00523 {
00524 *pfind = '\0';
00525 state = PAE_WAIT_REPL;
00526 }
00527 else if (t_isalpha(str))
00528 {
00529 COPYCHAR(pfind, str);
00530 pfind += pg_mblen(str);
00531 }
00532 else if (!t_isspace(str))
00533 ereport(ERROR,
00534 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00535 errmsg("syntax error")));
00536 }
00537 else if (state == PAE_WAIT_REPL)
00538 {
00539 if (t_iseq(str, '-'))
00540 {
00541 break;
00542 }
00543 else if (t_isalpha(str))
00544 {
00545 COPYCHAR(prepl, str);
00546 prepl += pg_mblen(str);
00547 state = PAE_INREPL;
00548 }
00549 else if (!t_isspace(str))
00550 ereport(ERROR,
00551 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00552 errmsg("syntax error")));
00553 }
00554 else if (state == PAE_INREPL)
00555 {
00556 if (t_iseq(str, '#'))
00557 {
00558 *prepl = '\0';
00559 break;
00560 }
00561 else if (t_isalpha(str))
00562 {
00563 COPYCHAR(prepl, str);
00564 prepl += pg_mblen(str);
00565 }
00566 else if (!t_isspace(str))
00567 ereport(ERROR,
00568 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00569 errmsg("syntax error")));
00570 }
00571 else
00572 elog(ERROR, "unrecognized state in parse_affentry: %d", state);
00573
00574 str += pg_mblen(str);
00575 }
00576
00577 *pmask = *pfind = *prepl = '\0';
00578
00579 return (*mask && (*find || *repl)) ? true : false;
00580 }
00581
00582 static void
00583 addFlagValue(IspellDict *Conf, char *s, uint32 val)
00584 {
00585 while (*s && t_isspace(s))
00586 s += pg_mblen(s);
00587
00588 if (!*s)
00589 ereport(ERROR,
00590 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00591 errmsg("syntax error")));
00592
00593 if (pg_mblen(s) != 1)
00594 ereport(ERROR,
00595 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00596 errmsg("multibyte flag character is not allowed")));
00597
00598 Conf->flagval[*(unsigned char *) s] = (unsigned char) val;
00599 Conf->usecompound = true;
00600 }
00601
00602 static void
00603 NIImportOOAffixes(IspellDict *Conf, const char *filename)
00604 {
00605 char type[BUFSIZ],
00606 *ptype = NULL;
00607 char sflag[BUFSIZ];
00608 char mask[BUFSIZ],
00609 *pmask;
00610 char find[BUFSIZ],
00611 *pfind;
00612 char repl[BUFSIZ],
00613 *prepl;
00614 bool isSuffix = false;
00615 int flag = 0;
00616 char flagflags = 0;
00617 tsearch_readline_state trst;
00618 int scanread = 0;
00619 char scanbuf[BUFSIZ];
00620 char *recoded;
00621
00622
00623 memset(Conf->flagval, 0, sizeof(Conf->flagval));
00624 Conf->usecompound = false;
00625
00626 if (!tsearch_readline_begin(&trst, filename))
00627 ereport(ERROR,
00628 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00629 errmsg("could not open affix file \"%s\": %m",
00630 filename)));
00631
00632 while ((recoded = tsearch_readline(&trst)) != NULL)
00633 {
00634 if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
00635 {
00636 pfree(recoded);
00637 continue;
00638 }
00639
00640 if (STRNCMP(recoded, "COMPOUNDFLAG") == 0)
00641 addFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"),
00642 FF_COMPOUNDFLAG);
00643 else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0)
00644 addFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"),
00645 FF_COMPOUNDBEGIN);
00646 else if (STRNCMP(recoded, "COMPOUNDLAST") == 0)
00647 addFlagValue(Conf, recoded + strlen("COMPOUNDLAST"),
00648 FF_COMPOUNDLAST);
00649
00650 else if (STRNCMP(recoded, "COMPOUNDEND") == 0)
00651 addFlagValue(Conf, recoded + strlen("COMPOUNDEND"),
00652 FF_COMPOUNDLAST);
00653 else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0)
00654 addFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"),
00655 FF_COMPOUNDMIDDLE);
00656 else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0)
00657 addFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"),
00658 FF_COMPOUNDONLY);
00659 else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0)
00660 addFlagValue(Conf, recoded + strlen("COMPOUNDPERMITFLAG"),
00661 FF_COMPOUNDPERMITFLAG);
00662 else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0)
00663 addFlagValue(Conf, recoded + strlen("COMPOUNDFORBIDFLAG"),
00664 FF_COMPOUNDFORBIDFLAG);
00665 else if (STRNCMP(recoded, "FLAG") == 0)
00666 {
00667 char *s = recoded + strlen("FLAG");
00668
00669 while (*s && t_isspace(s))
00670 s += pg_mblen(s);
00671
00672 if (*s && STRNCMP(s, "default") != 0)
00673 ereport(ERROR,
00674 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00675 errmsg("Ispell dictionary supports only default flag value")));
00676 }
00677
00678 pfree(recoded);
00679 }
00680 tsearch_readline_end(&trst);
00681
00682 sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5);
00683
00684 if (!tsearch_readline_begin(&trst, filename))
00685 ereport(ERROR,
00686 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00687 errmsg("could not open affix file \"%s\": %m",
00688 filename)));
00689
00690 while ((recoded = tsearch_readline(&trst)) != NULL)
00691 {
00692 if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
00693 goto nextline;
00694
00695 scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask);
00696
00697 if (ptype)
00698 pfree(ptype);
00699 ptype = lowerstr_ctx(Conf, type);
00700 if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
00701 goto nextline;
00702
00703 if (scanread == 4)
00704 {
00705 if (strlen(sflag) != 1)
00706 goto nextline;
00707 flag = *sflag;
00708 isSuffix = (STRNCMP(ptype, "sfx") == 0) ? true : false;
00709 if (t_iseq(find, 'y') || t_iseq(find, 'Y'))
00710 flagflags = FF_CROSSPRODUCT;
00711 else
00712 flagflags = 0;
00713 }
00714 else
00715 {
00716 char *ptr;
00717 int aflg = 0;
00718
00719 if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
00720 goto nextline;
00721 prepl = lowerstr_ctx(Conf, repl);
00722
00723 if ((ptr = strchr(prepl, '/')) != NULL)
00724 {
00725 *ptr = '\0';
00726 ptr = repl + (ptr - prepl) + 1;
00727 while (*ptr)
00728 {
00729 aflg |= Conf->flagval[*(unsigned char *) ptr];
00730 ptr++;
00731 }
00732 }
00733 pfind = lowerstr_ctx(Conf, find);
00734 pmask = lowerstr_ctx(Conf, mask);
00735 if (t_iseq(find, '0'))
00736 *pfind = '\0';
00737 if (t_iseq(repl, '0'))
00738 *prepl = '\0';
00739
00740 NIAddAffix(Conf, flag, flagflags | aflg, pmask, pfind, prepl,
00741 isSuffix ? FF_SUFFIX : FF_PREFIX);
00742 pfree(prepl);
00743 pfree(pfind);
00744 pfree(pmask);
00745 }
00746
00747 nextline:
00748 pfree(recoded);
00749 }
00750
00751 tsearch_readline_end(&trst);
00752 if (ptype)
00753 pfree(ptype);
00754 }
00755
00756
00757
00758
00759
00760
00761 void
00762 NIImportAffixes(IspellDict *Conf, const char *filename)
00763 {
00764 char *pstr = NULL;
00765 char mask[BUFSIZ];
00766 char find[BUFSIZ];
00767 char repl[BUFSIZ];
00768 char *s;
00769 bool suffixes = false;
00770 bool prefixes = false;
00771 int flag = 0;
00772 char flagflags = 0;
00773 tsearch_readline_state trst;
00774 bool oldformat = false;
00775 char *recoded = NULL;
00776
00777 if (!tsearch_readline_begin(&trst, filename))
00778 ereport(ERROR,
00779 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00780 errmsg("could not open affix file \"%s\": %m",
00781 filename)));
00782
00783 memset(Conf->flagval, 0, sizeof(Conf->flagval));
00784 Conf->usecompound = false;
00785
00786 while ((recoded = tsearch_readline(&trst)) != NULL)
00787 {
00788 pstr = lowerstr(recoded);
00789
00790
00791 if (*pstr == '#' || *pstr == '\n')
00792 goto nextline;
00793
00794 if (STRNCMP(pstr, "compoundwords") == 0)
00795 {
00796 s = findchar(pstr, 'l');
00797 if (s)
00798 {
00799 s = recoded + (s - pstr);
00800
00801 while (*s && !t_isspace(s))
00802 s += pg_mblen(s);
00803 while (*s && t_isspace(s))
00804 s += pg_mblen(s);
00805
00806 if (*s && pg_mblen(s) == 1)
00807 {
00808 Conf->flagval[*(unsigned char *) s] = FF_COMPOUNDFLAG;
00809 Conf->usecompound = true;
00810 }
00811 oldformat = true;
00812 goto nextline;
00813 }
00814 }
00815 if (STRNCMP(pstr, "suffixes") == 0)
00816 {
00817 suffixes = true;
00818 prefixes = false;
00819 oldformat = true;
00820 goto nextline;
00821 }
00822 if (STRNCMP(pstr, "prefixes") == 0)
00823 {
00824 suffixes = false;
00825 prefixes = true;
00826 oldformat = true;
00827 goto nextline;
00828 }
00829 if (STRNCMP(pstr, "flag") == 0)
00830 {
00831 s = recoded + 4;
00832 flagflags = 0;
00833
00834 while (*s && t_isspace(s))
00835 s += pg_mblen(s);
00836 oldformat = true;
00837
00838
00839 if (pg_mblen(s) != 1)
00840 ereport(ERROR,
00841 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00842 errmsg("multibyte flag character is not allowed")));
00843
00844 if (*s == '*')
00845 {
00846 flagflags |= FF_CROSSPRODUCT;
00847 s++;
00848 }
00849 else if (*s == '~')
00850 {
00851 flagflags |= FF_COMPOUNDONLY;
00852 s++;
00853 }
00854
00855 if (*s == '\\')
00856 s++;
00857
00858
00859 if (pg_mblen(s) != 1)
00860 ereport(ERROR,
00861 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00862 errmsg("multibyte flag character is not allowed")));
00863
00864 flag = *(unsigned char *) s;
00865 goto nextline;
00866 }
00867 if (STRNCMP(recoded, "COMPOUNDFLAG") == 0 || STRNCMP(recoded, "COMPOUNDMIN") == 0 ||
00868 STRNCMP(recoded, "PFX") == 0 || STRNCMP(recoded, "SFX") == 0)
00869 {
00870 if (oldformat)
00871 ereport(ERROR,
00872 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00873 errmsg("wrong affix file format for flag")));
00874 tsearch_readline_end(&trst);
00875 NIImportOOAffixes(Conf, filename);
00876 return;
00877 }
00878 if ((!suffixes) && (!prefixes))
00879 goto nextline;
00880
00881 if (!parse_affentry(pstr, mask, find, repl))
00882 goto nextline;
00883
00884 NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
00885
00886 nextline:
00887 pfree(recoded);
00888 pfree(pstr);
00889 }
00890 tsearch_readline_end(&trst);
00891 }
00892
00893 static int
00894 MergeAffix(IspellDict *Conf, int a1, int a2)
00895 {
00896 char **ptr;
00897
00898 while (Conf->nAffixData + 1 >= Conf->lenAffixData)
00899 {
00900 Conf->lenAffixData *= 2;
00901 Conf->AffixData = (char **) repalloc(Conf->AffixData,
00902 sizeof(char *) * Conf->lenAffixData);
00903 }
00904
00905 ptr = Conf->AffixData + Conf->nAffixData;
00906 *ptr = cpalloc(strlen(Conf->AffixData[a1]) +
00907 strlen(Conf->AffixData[a2]) +
00908 1 + 1 );
00909 sprintf(*ptr, "%s %s", Conf->AffixData[a1], Conf->AffixData[a2]);
00910 ptr++;
00911 *ptr = NULL;
00912 Conf->nAffixData++;
00913
00914 return Conf->nAffixData - 1;
00915 }
00916
00917 static uint32
00918 makeCompoundFlags(IspellDict *Conf, int affix)
00919 {
00920 uint32 flag = 0;
00921 char *str = Conf->AffixData[affix];
00922
00923 while (str && *str)
00924 {
00925 flag |= Conf->flagval[*(unsigned char *) str];
00926 str++;
00927 }
00928
00929 return (flag & FF_DICTFLAGMASK);
00930 }
00931
00932 static SPNode *
00933 mkSPNode(IspellDict *Conf, int low, int high, int level)
00934 {
00935 int i;
00936 int nchar = 0;
00937 char lastchar = '\0';
00938 SPNode *rs;
00939 SPNodeData *data;
00940 int lownew = low;
00941
00942 for (i = low; i < high; i++)
00943 if (Conf->Spell[i]->p.d.len > level && lastchar != Conf->Spell[i]->word[level])
00944 {
00945 nchar++;
00946 lastchar = Conf->Spell[i]->word[level];
00947 }
00948
00949 if (!nchar)
00950 return NULL;
00951
00952 rs = (SPNode *) cpalloc0(SPNHDRSZ + nchar * sizeof(SPNodeData));
00953 rs->length = nchar;
00954 data = rs->data;
00955
00956 lastchar = '\0';
00957 for (i = low; i < high; i++)
00958 if (Conf->Spell[i]->p.d.len > level)
00959 {
00960 if (lastchar != Conf->Spell[i]->word[level])
00961 {
00962 if (lastchar)
00963 {
00964 data->node = mkSPNode(Conf, lownew, i, level + 1);
00965 lownew = i;
00966 data++;
00967 }
00968 lastchar = Conf->Spell[i]->word[level];
00969 }
00970 data->val = ((uint8 *) (Conf->Spell[i]->word))[level];
00971 if (Conf->Spell[i]->p.d.len == level + 1)
00972 {
00973 bool clearCompoundOnly = false;
00974
00975 if (data->isword && data->affix != Conf->Spell[i]->p.d.affix)
00976 {
00977
00978
00979
00980
00981
00982
00983 clearCompoundOnly = (FF_COMPOUNDONLY & data->compoundflag
00984 & makeCompoundFlags(Conf, Conf->Spell[i]->p.d.affix))
00985 ? false : true;
00986 data->affix = MergeAffix(Conf, data->affix, Conf->Spell[i]->p.d.affix);
00987 }
00988 else
00989 data->affix = Conf->Spell[i]->p.d.affix;
00990 data->isword = 1;
00991
00992 data->compoundflag = makeCompoundFlags(Conf, data->affix);
00993
00994 if ((data->compoundflag & FF_COMPOUNDONLY) &&
00995 (data->compoundflag & FF_COMPOUNDFLAG) == 0)
00996 data->compoundflag |= FF_COMPOUNDFLAG;
00997
00998 if (clearCompoundOnly)
00999 data->compoundflag &= ~FF_COMPOUNDONLY;
01000 }
01001 }
01002
01003 data->node = mkSPNode(Conf, lownew, high, level + 1);
01004
01005 return rs;
01006 }
01007
01008
01009
01010
01011
01012 void
01013 NISortDictionary(IspellDict *Conf)
01014 {
01015 int i;
01016 int naffix = 0;
01017 int curaffix;
01018
01019
01020
01021
01022
01023 qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspellaffix);
01024
01025 naffix = 0;
01026 for (i = 0; i < Conf->nspell; i++)
01027 {
01028 if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag, MAXFLAGLEN))
01029 naffix++;
01030 }
01031
01032
01033
01034
01035
01036
01037 Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
01038
01039 curaffix = -1;
01040 for (i = 0; i < Conf->nspell; i++)
01041 {
01042 if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix], MAXFLAGLEN))
01043 {
01044 curaffix++;
01045 Assert(curaffix < naffix);
01046 Conf->AffixData[curaffix] = cpstrdup(Conf, Conf->Spell[i]->p.flag);
01047 }
01048
01049 Conf->Spell[i]->p.d.affix = curaffix;
01050 Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
01051 }
01052
01053 Conf->lenAffixData = Conf->nAffixData = naffix;
01054
01055 qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
01056 Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
01057 }
01058
01059 static AffixNode *
01060 mkANode(IspellDict *Conf, int low, int high, int level, int type)
01061 {
01062 int i;
01063 int nchar = 0;
01064 uint8 lastchar = '\0';
01065 AffixNode *rs;
01066 AffixNodeData *data;
01067 int lownew = low;
01068 int naff;
01069 AFFIX **aff;
01070
01071 for (i = low; i < high; i++)
01072 if (Conf->Affix[i].replen > level && lastchar != GETCHAR(Conf->Affix + i, level, type))
01073 {
01074 nchar++;
01075 lastchar = GETCHAR(Conf->Affix + i, level, type);
01076 }
01077
01078 if (!nchar)
01079 return NULL;
01080
01081 aff = (AFFIX **) tmpalloc(sizeof(AFFIX *) * (high - low + 1));
01082 naff = 0;
01083
01084 rs = (AffixNode *) cpalloc0(ANHRDSZ + nchar * sizeof(AffixNodeData));
01085 rs->length = nchar;
01086 data = rs->data;
01087
01088 lastchar = '\0';
01089 for (i = low; i < high; i++)
01090 if (Conf->Affix[i].replen > level)
01091 {
01092 if (lastchar != GETCHAR(Conf->Affix + i, level, type))
01093 {
01094 if (lastchar)
01095 {
01096 data->node = mkANode(Conf, lownew, i, level + 1, type);
01097 if (naff)
01098 {
01099 data->naff = naff;
01100 data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
01101 memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
01102 naff = 0;
01103 }
01104 data++;
01105 lownew = i;
01106 }
01107 lastchar = GETCHAR(Conf->Affix + i, level, type);
01108 }
01109 data->val = GETCHAR(Conf->Affix + i, level, type);
01110 if (Conf->Affix[i].replen == level + 1)
01111 {
01112 aff[naff++] = Conf->Affix + i;
01113 }
01114 }
01115
01116 data->node = mkANode(Conf, lownew, high, level + 1, type);
01117 if (naff)
01118 {
01119 data->naff = naff;
01120 data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
01121 memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
01122 naff = 0;
01123 }
01124
01125 pfree(aff);
01126
01127 return rs;
01128 }
01129
01130 static void
01131 mkVoidAffix(IspellDict *Conf, bool issuffix, int startsuffix)
01132 {
01133 int i,
01134 cnt = 0;
01135 int start = (issuffix) ? startsuffix : 0;
01136 int end = (issuffix) ? Conf->naffixes : startsuffix;
01137 AffixNode *Affix = (AffixNode *) palloc0(ANHRDSZ + sizeof(AffixNodeData));
01138
01139 Affix->length = 1;
01140 Affix->isvoid = 1;
01141
01142 if (issuffix)
01143 {
01144 Affix->data->node = Conf->Suffix;
01145 Conf->Suffix = Affix;
01146 }
01147 else
01148 {
01149 Affix->data->node = Conf->Prefix;
01150 Conf->Prefix = Affix;
01151 }
01152
01153
01154 for (i = start; i < end; i++)
01155 if (Conf->Affix[i].replen == 0)
01156 cnt++;
01157
01158 if (cnt == 0)
01159 return;
01160
01161 Affix->data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * cnt);
01162 Affix->data->naff = (uint32) cnt;
01163
01164 cnt = 0;
01165 for (i = start; i < end; i++)
01166 if (Conf->Affix[i].replen == 0)
01167 {
01168 Affix->data->aff[cnt] = Conf->Affix + i;
01169 cnt++;
01170 }
01171 }
01172
01173 static bool
01174 isAffixInUse(IspellDict *Conf, char flag)
01175 {
01176 int i;
01177
01178 for (i = 0; i < Conf->nAffixData; i++)
01179 if (strchr(Conf->AffixData[i], flag) != NULL)
01180 return true;
01181
01182 return false;
01183 }
01184
01185 void
01186 NISortAffixes(IspellDict *Conf)
01187 {
01188 AFFIX *Affix;
01189 size_t i;
01190 CMPDAffix *ptr;
01191 int firstsuffix = Conf->naffixes;
01192
01193 if (Conf->naffixes == 0)
01194 return;
01195
01196 if (Conf->naffixes > 1)
01197 qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix);
01198 Conf->CompoundAffix = ptr = (CMPDAffix *) palloc(sizeof(CMPDAffix) * Conf->naffixes);
01199 ptr->affix = NULL;
01200
01201 for (i = 0; i < Conf->naffixes; i++)
01202 {
01203 Affix = &(((AFFIX *) Conf->Affix)[i]);
01204 if (Affix->type == FF_SUFFIX && i < firstsuffix)
01205 firstsuffix = i;
01206
01207 if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
01208 isAffixInUse(Conf, (char) Affix->flag))
01209 {
01210 if (ptr == Conf->CompoundAffix ||
01211 ptr->issuffix != (ptr - 1)->issuffix ||
01212 strbncmp((const unsigned char *) (ptr - 1)->affix,
01213 (const unsigned char *) Affix->repl,
01214 (ptr - 1)->len))
01215 {
01216
01217 ptr->affix = Affix->repl;
01218 ptr->len = Affix->replen;
01219 ptr->issuffix = (Affix->type == FF_SUFFIX) ? true : false;
01220 ptr++;
01221 }
01222 }
01223 }
01224 ptr->affix = NULL;
01225 Conf->CompoundAffix = (CMPDAffix *) repalloc(Conf->CompoundAffix, sizeof(CMPDAffix) * (ptr - Conf->CompoundAffix + 1));
01226
01227 Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
01228 Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
01229 mkVoidAffix(Conf, true, firstsuffix);
01230 mkVoidAffix(Conf, false, firstsuffix);
01231 }
01232
01233 static AffixNodeData *
01234 FindAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type)
01235 {
01236 AffixNodeData *StopLow,
01237 *StopHigh,
01238 *StopMiddle;
01239 uint8 symbol;
01240
01241 if (node->isvoid)
01242 {
01243 if (node->data->naff)
01244 return node->data;
01245 node = node->data->node;
01246 }
01247
01248 while (node && *level < wrdlen)
01249 {
01250 StopLow = node->data;
01251 StopHigh = node->data + node->length;
01252 while (StopLow < StopHigh)
01253 {
01254 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
01255 symbol = GETWCHAR(word, wrdlen, *level, type);
01256
01257 if (StopMiddle->val == symbol)
01258 {
01259 (*level)++;
01260 if (StopMiddle->naff)
01261 return StopMiddle;
01262 node = StopMiddle->node;
01263 break;
01264 }
01265 else if (StopMiddle->val < symbol)
01266 StopLow = StopMiddle + 1;
01267 else
01268 StopHigh = StopMiddle;
01269 }
01270 if (StopLow >= StopHigh)
01271 break;
01272 }
01273 return NULL;
01274 }
01275
01276 static char *
01277 CheckAffix(const char *word, size_t len, AFFIX *Affix, int flagflags, char *newword, int *baselen)
01278 {
01279
01280
01281
01282
01283 if (flagflags == 0)
01284 {
01285 if (Affix->flagflags & FF_COMPOUNDONLY)
01286 return NULL;
01287 }
01288 else if (flagflags & FF_COMPOUNDBEGIN)
01289 {
01290 if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
01291 return NULL;
01292 if ((Affix->flagflags & FF_COMPOUNDBEGIN) == 0)
01293 if (Affix->type == FF_SUFFIX)
01294 return NULL;
01295 }
01296 else if (flagflags & FF_COMPOUNDMIDDLE)
01297 {
01298 if ((Affix->flagflags & FF_COMPOUNDMIDDLE) == 0 ||
01299 (Affix->flagflags & FF_COMPOUNDFORBIDFLAG))
01300 return NULL;
01301 }
01302 else if (flagflags & FF_COMPOUNDLAST)
01303 {
01304 if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
01305 return NULL;
01306 if ((Affix->flagflags & FF_COMPOUNDLAST) == 0)
01307 if (Affix->type == FF_PREFIX)
01308 return NULL;
01309 }
01310
01311
01312
01313
01314 if (Affix->type == FF_SUFFIX)
01315 {
01316 strcpy(newword, word);
01317 strcpy(newword + len - Affix->replen, Affix->find);
01318 if (baselen)
01319 *baselen = len - Affix->replen;
01320 }
01321 else
01322 {
01323
01324
01325
01326
01327 if (baselen && *baselen + strlen(Affix->find) <= Affix->replen)
01328 return NULL;
01329 strcpy(newword, Affix->find);
01330 strcat(newword, word + Affix->replen);
01331 }
01332
01333
01334
01335
01336 if (Affix->issimple)
01337 return newword;
01338 else if (Affix->isregis)
01339 {
01340 if (RS_execute(&(Affix->reg.regis), newword))
01341 return newword;
01342 }
01343 else
01344 {
01345 int err;
01346 pg_wchar *data;
01347 size_t data_len;
01348 int newword_len;
01349
01350
01351 newword_len = strlen(newword);
01352 data = (pg_wchar *) palloc((newword_len + 1) * sizeof(pg_wchar));
01353 data_len = pg_mb2wchar_with_len(newword, data, newword_len);
01354
01355 if (!(err = pg_regexec(&(Affix->reg.regex), data, data_len, 0, NULL, 0, NULL, 0)))
01356 {
01357 pfree(data);
01358 return newword;
01359 }
01360 pfree(data);
01361 }
01362
01363 return NULL;
01364 }
01365
01366 static int
01367 addToResult(char **forms, char **cur, char *word)
01368 {
01369 if (cur - forms >= MAX_NORM - 1)
01370 return 0;
01371 if (forms == cur || strcmp(word, *(cur - 1)) != 0)
01372 {
01373 *cur = pstrdup(word);
01374 *(cur + 1) = NULL;
01375 return 1;
01376 }
01377
01378 return 0;
01379 }
01380
01381 static char **
01382 NormalizeSubWord(IspellDict *Conf, char *word, int flag)
01383 {
01384 AffixNodeData *suffix = NULL,
01385 *prefix = NULL;
01386 int slevel = 0,
01387 plevel = 0;
01388 int wrdlen = strlen(word),
01389 swrdlen;
01390 char **forms;
01391 char **cur;
01392 char newword[2 * MAXNORMLEN] = "";
01393 char pnewword[2 * MAXNORMLEN] = "";
01394 AffixNode *snode = Conf->Suffix,
01395 *pnode;
01396 int i,
01397 j;
01398
01399 if (wrdlen > MAXNORMLEN)
01400 return NULL;
01401 cur = forms = (char **) palloc(MAX_NORM * sizeof(char *));
01402 *cur = NULL;
01403
01404
01405
01406 if (FindWord(Conf, word, 0, flag))
01407 {
01408 *cur = pstrdup(word);
01409 cur++;
01410 *cur = NULL;
01411 }
01412
01413
01414 pnode = Conf->Prefix;
01415 plevel = 0;
01416 while (pnode)
01417 {
01418 prefix = FindAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
01419 if (!prefix)
01420 break;
01421 for (j = 0; j < prefix->naff; j++)
01422 {
01423 if (CheckAffix(word, wrdlen, prefix->aff[j], flag, newword, NULL))
01424 {
01425
01426 if (FindWord(Conf, newword, prefix->aff[j]->flag, flag))
01427 cur += addToResult(forms, cur, newword);
01428 }
01429 }
01430 pnode = prefix->node;
01431 }
01432
01433
01434
01435
01436
01437 while (snode)
01438 {
01439 int baselen = 0;
01440
01441
01442 suffix = FindAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
01443 if (!suffix)
01444 break;
01445
01446 for (i = 0; i < suffix->naff; i++)
01447 {
01448 if (CheckAffix(word, wrdlen, suffix->aff[i], flag, newword, &baselen))
01449 {
01450
01451 if (FindWord(Conf, newword, suffix->aff[i]->flag, flag))
01452 cur += addToResult(forms, cur, newword);
01453
01454
01455 pnode = Conf->Prefix;
01456 plevel = 0;
01457 swrdlen = strlen(newword);
01458 while (pnode)
01459 {
01460 prefix = FindAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
01461 if (!prefix)
01462 break;
01463 for (j = 0; j < prefix->naff; j++)
01464 {
01465 if (CheckAffix(newword, swrdlen, prefix->aff[j], flag, pnewword, &baselen))
01466 {
01467
01468 int ff = (prefix->aff[j]->flagflags & suffix->aff[i]->flagflags & FF_CROSSPRODUCT) ?
01469 0 : prefix->aff[j]->flag;
01470
01471 if (FindWord(Conf, pnewword, ff, flag))
01472 cur += addToResult(forms, cur, pnewword);
01473 }
01474 }
01475 pnode = prefix->node;
01476 }
01477 }
01478 }
01479
01480 snode = suffix->node;
01481 }
01482
01483 if (cur == forms)
01484 {
01485 pfree(forms);
01486 return (NULL);
01487 }
01488 return (forms);
01489 }
01490
/*
 * One candidate way of splitting a compound word into stems.  Candidates
 * are chained into a singly linked list through 'next'.
 */
struct SplitVar
{
	int			nstem;			/* number of entries used in stem[] */
	int			lenstem;		/* number of entries allocated for stem[] */
	char	  **stem;			/* the stems, in word order */
	struct SplitVar *next;		/* next candidate, or NULL */
};

typedef struct SplitVar SplitVar;
01498
01499 static int
01500 CheckCompoundAffixes(CMPDAffix **ptr, char *word, int len, bool CheckInPlace)
01501 {
01502 bool issuffix;
01503
01504 if (CheckInPlace)
01505 {
01506 while ((*ptr)->affix)
01507 {
01508 if (len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len) == 0)
01509 {
01510 len = (*ptr)->len;
01511 issuffix = (*ptr)->issuffix;
01512 (*ptr)++;
01513 return (issuffix) ? len : 0;
01514 }
01515 (*ptr)++;
01516 }
01517 }
01518 else
01519 {
01520 char *affbegin;
01521
01522 while ((*ptr)->affix)
01523 {
01524 if (len > (*ptr)->len && (affbegin = strstr(word, (*ptr)->affix)) != NULL)
01525 {
01526 len = (*ptr)->len + (affbegin - word);
01527 issuffix = (*ptr)->issuffix;
01528 (*ptr)++;
01529 return (issuffix) ? len : 0;
01530 }
01531 (*ptr)++;
01532 }
01533 }
01534 return -1;
01535 }
01536
01537 static SplitVar *
01538 CopyVar(SplitVar *s, int makedup)
01539 {
01540 SplitVar *v = (SplitVar *) palloc(sizeof(SplitVar));
01541
01542 v->next = NULL;
01543 if (s)
01544 {
01545 int i;
01546
01547 v->lenstem = s->lenstem;
01548 v->stem = (char **) palloc(sizeof(char *) * v->lenstem);
01549 v->nstem = s->nstem;
01550 for (i = 0; i < s->nstem; i++)
01551 v->stem[i] = (makedup) ? pstrdup(s->stem[i]) : s->stem[i];
01552 }
01553 else
01554 {
01555 v->lenstem = 16;
01556 v->stem = (char **) palloc(sizeof(char *) * v->lenstem);
01557 v->nstem = 0;
01558 }
01559 return v;
01560 }
01561
01562 static void
01563 AddStem(SplitVar *v, char *word)
01564 {
01565 if (v->nstem >= v->lenstem)
01566 {
01567 v->lenstem *= 2;
01568 v->stem = (char **) repalloc(v->stem, sizeof(char *) * v->lenstem);
01569 }
01570
01571 v->stem[v->nstem] = word;
01572 v->nstem++;
01573 }
01574
01575 static SplitVar *
01576 SplitToVariants(IspellDict *Conf, SPNode *snode, SplitVar *orig, char *word, int wordlen, int startpos, int minpos)
01577 {
01578 SplitVar *var = NULL;
01579 SPNodeData *StopLow,
01580 *StopHigh,
01581 *StopMiddle = NULL;
01582 SPNode *node = (snode) ? snode : Conf->Dictionary;
01583 int level = (snode) ? minpos : startpos;
01584
01585 int lenaff;
01586 CMPDAffix *caff;
01587 char *notprobed;
01588 int compoundflag = 0;
01589
01590 notprobed = (char *) palloc(wordlen);
01591 memset(notprobed, 1, wordlen);
01592 var = CopyVar(orig, 1);
01593
01594 while (level < wordlen)
01595 {
01596
01597 caff = Conf->CompoundAffix;
01598 while (level > startpos && (lenaff = CheckCompoundAffixes(&caff, word + level, wordlen - level, (node) ? true : false)) >= 0)
01599 {
01600
01601
01602
01603 char buf[MAXNORMLEN];
01604 char **subres;
01605
01606 lenaff = level - startpos + lenaff;
01607
01608 if (!notprobed[startpos + lenaff - 1])
01609 continue;
01610
01611 if (level + lenaff - 1 <= minpos)
01612 continue;
01613
01614 if (lenaff >= MAXNORMLEN)
01615 continue;
01616 if (lenaff > 0)
01617 memcpy(buf, word + startpos, lenaff);
01618 buf[lenaff] = '\0';
01619
01620 if (level == 0)
01621 compoundflag = FF_COMPOUNDBEGIN;
01622 else if (level == wordlen - 1)
01623 compoundflag = FF_COMPOUNDLAST;
01624 else
01625 compoundflag = FF_COMPOUNDMIDDLE;
01626 subres = NormalizeSubWord(Conf, buf, compoundflag);
01627 if (subres)
01628 {
01629
01630 SplitVar *new = CopyVar(var, 0);
01631 SplitVar *ptr = var;
01632 char **sptr = subres;
01633
01634 notprobed[startpos + lenaff - 1] = 0;
01635
01636 while (*sptr)
01637 {
01638 AddStem(new, *sptr);
01639 sptr++;
01640 }
01641 pfree(subres);
01642
01643 while (ptr->next)
01644 ptr = ptr->next;
01645 ptr->next = SplitToVariants(Conf, NULL, new, word, wordlen, startpos + lenaff, startpos + lenaff);
01646
01647 pfree(new->stem);
01648 pfree(new);
01649 }
01650 }
01651
01652 if (!node)
01653 break;
01654
01655 StopLow = node->data;
01656 StopHigh = node->data + node->length;
01657 while (StopLow < StopHigh)
01658 {
01659 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
01660 if (StopMiddle->val == ((uint8 *) (word))[level])
01661 break;
01662 else if (StopMiddle->val < ((uint8 *) (word))[level])
01663 StopLow = StopMiddle + 1;
01664 else
01665 StopHigh = StopMiddle;
01666 }
01667
01668 if (StopLow < StopHigh)
01669 {
01670 if (level == FF_COMPOUNDBEGIN)
01671 compoundflag = FF_COMPOUNDBEGIN;
01672 else if (level == wordlen - 1)
01673 compoundflag = FF_COMPOUNDLAST;
01674 else
01675 compoundflag = FF_COMPOUNDMIDDLE;
01676
01677
01678 if (StopMiddle->isword &&
01679 (StopMiddle->compoundflag & compoundflag) &&
01680 notprobed[level])
01681 {
01682
01683 if (level > minpos)
01684 {
01685
01686 if (wordlen == level + 1)
01687 {
01688
01689 AddStem(var, pnstrdup(word + startpos, wordlen - startpos));
01690 pfree(notprobed);
01691 return var;
01692 }
01693 else
01694 {
01695
01696 SplitVar *ptr = var;
01697
01698 while (ptr->next)
01699 ptr = ptr->next;
01700 ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
01701
01702 level++;
01703 AddStem(var, pnstrdup(word + startpos, level - startpos));
01704 node = Conf->Dictionary;
01705 startpos = level;
01706 continue;
01707 }
01708 }
01709 }
01710 node = StopMiddle->node;
01711 }
01712 else
01713 node = NULL;
01714 level++;
01715 }
01716
01717 AddStem(var, pnstrdup(word + startpos, wordlen - startpos));
01718 pfree(notprobed);
01719 return var;
01720 }
01721
01722 static void
01723 addNorm(TSLexeme **lres, TSLexeme **lcur, char *word, int flags, uint16 NVariant)
01724 {
01725 if (*lres == NULL)
01726 *lcur = *lres = (TSLexeme *) palloc(MAX_NORM * sizeof(TSLexeme));
01727
01728 if (*lcur - *lres < MAX_NORM - 1)
01729 {
01730 (*lcur)->lexeme = word;
01731 (*lcur)->flags = flags;
01732 (*lcur)->nvariant = NVariant;
01733 (*lcur)++;
01734 (*lcur)->lexeme = NULL;
01735 }
01736 }
01737
01738 TSLexeme *
01739 NINormalizeWord(IspellDict *Conf, char *word)
01740 {
01741 char **res;
01742 TSLexeme *lcur = NULL,
01743 *lres = NULL;
01744 uint16 NVariant = 1;
01745
01746 res = NormalizeSubWord(Conf, word, 0);
01747
01748 if (res)
01749 {
01750 char **ptr = res;
01751
01752 while (*ptr && (lcur - lres) < MAX_NORM)
01753 {
01754 addNorm(&lres, &lcur, *ptr, 0, NVariant++);
01755 ptr++;
01756 }
01757 pfree(res);
01758 }
01759
01760 if (Conf->usecompound)
01761 {
01762 int wordlen = strlen(word);
01763 SplitVar *ptr,
01764 *var = SplitToVariants(Conf, NULL, NULL, word, wordlen, 0, -1);
01765 int i;
01766
01767 while (var)
01768 {
01769 if (var->nstem > 1)
01770 {
01771 char **subres = NormalizeSubWord(Conf, var->stem[var->nstem - 1], FF_COMPOUNDLAST);
01772
01773 if (subres)
01774 {
01775 char **subptr = subres;
01776
01777 while (*subptr)
01778 {
01779 for (i = 0; i < var->nstem - 1; i++)
01780 {
01781 addNorm(&lres, &lcur, (subptr == subres) ? var->stem[i] : pstrdup(var->stem[i]), 0, NVariant);
01782 }
01783
01784 addNorm(&lres, &lcur, *subptr, 0, NVariant);
01785 subptr++;
01786 NVariant++;
01787 }
01788
01789 pfree(subres);
01790 var->stem[0] = NULL;
01791 pfree(var->stem[var->nstem - 1]);
01792 }
01793 }
01794
01795 for (i = 0; i < var->nstem && var->stem[i]; i++)
01796 pfree(var->stem[i]);
01797 ptr = var->next;
01798 pfree(var->stem);
01799 pfree(var);
01800 var = ptr;
01801 }
01802 }
01803
01804 return lres;
01805 }