00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 #include "postgres.h"
00016
00017 #include "tsearch/ts_cache.h"
00018 #include "tsearch/ts_utils.h"
00019
/*
 * When this macro is defined, parsetext() and hlparsetext() report a word
 * whose length is MAXSTRLEN or more with a NOTICE and skip it; when it is
 * not defined they raise an ERROR instead.
 */
#define IGNORE_LONGLEXEME	1
00021
00022
00023
00024
00025
/*
 * A single token handed over by the parser, stored as a node of a singly
 * linked list.
 */
typedef struct ParsedLex
{
	int			type;			/* token type; LexizeExec treats 0 as the
								 * end-of-input marker */
	char	   *lemm;			/* token text, passed to dictionaries */
	int			lenlemm;		/* length passed together with lemm */
	struct ParsedLex *next;		/* following node, NULL at the list end */
} ParsedLex;
00033
00034 typedef struct ListParsedLex
00035 {
00036 ParsedLex *head;
00037 ParsedLex *tail;
00038 } ListParsedLex;
00039
00040 typedef struct
00041 {
00042 TSConfigCacheEntry *cfg;
00043 Oid curDictId;
00044 int posDict;
00045 DictSubState dictState;
00046 ParsedLex *curSub;
00047 ListParsedLex towork;
00048 ListParsedLex waste;
00049
00050
00051
00052
00053
00054
00055 ParsedLex *lastRes;
00056 TSLexeme *tmpRes;
00057 } LexizeData;
00058
00059 static void
00060 LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg)
00061 {
00062 ld->cfg = cfg;
00063 ld->curDictId = InvalidOid;
00064 ld->posDict = 0;
00065 ld->towork.head = ld->towork.tail = ld->curSub = NULL;
00066 ld->waste.head = ld->waste.tail = NULL;
00067 ld->lastRes = NULL;
00068 ld->tmpRes = NULL;
00069 }
00070
00071 static void
00072 LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
00073 {
00074 if (list->tail)
00075 {
00076 list->tail->next = newpl;
00077 list->tail = newpl;
00078 }
00079 else
00080 list->head = list->tail = newpl;
00081 newpl->next = NULL;
00082 }
00083
00084 static ParsedLex *
00085 LPLRemoveHead(ListParsedLex *list)
00086 {
00087 ParsedLex *res = list->head;
00088
00089 if (list->head)
00090 list->head = list->head->next;
00091
00092 if (list->head == NULL)
00093 list->tail = NULL;
00094
00095 return res;
00096 }
00097
00098 static void
00099 LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm)
00100 {
00101 ParsedLex *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
00102
00103 newpl->type = type;
00104 newpl->lemm = lemm;
00105 newpl->lenlemm = lenlemm;
00106 LPLAddTail(&ld->towork, newpl);
00107 ld->curSub = ld->towork.tail;
00108 }
00109
00110 static void
00111 RemoveHead(LexizeData *ld)
00112 {
00113 LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
00114
00115 ld->posDict = 0;
00116 }
00117
00118 static void
00119 setCorrLex(LexizeData *ld, ParsedLex **correspondLexem)
00120 {
00121 if (correspondLexem)
00122 {
00123 *correspondLexem = ld->waste.head;
00124 }
00125 else
00126 {
00127 ParsedLex *tmp,
00128 *ptr = ld->waste.head;
00129
00130 while (ptr)
00131 {
00132 tmp = ptr->next;
00133 pfree(ptr);
00134 ptr = tmp;
00135 }
00136 }
00137 ld->waste.head = ld->waste.tail = NULL;
00138 }
00139
00140 static void
00141 moveToWaste(LexizeData *ld, ParsedLex *stop)
00142 {
00143 bool go = true;
00144
00145 while (ld->towork.head && go)
00146 {
00147 if (ld->towork.head == stop)
00148 {
00149 ld->curSub = stop->next;
00150 go = false;
00151 }
00152 RemoveHead(ld);
00153 }
00154 }
00155
00156 static void
00157 setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res)
00158 {
00159 if (ld->tmpRes)
00160 {
00161 TSLexeme *ptr;
00162
00163 for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
00164 pfree(ptr->lexeme);
00165 pfree(ld->tmpRes);
00166 }
00167 ld->tmpRes = res;
00168 ld->lastRes = lex;
00169 }
00170
00171 static TSLexeme *
00172 LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
00173 {
00174 int i;
00175 ListDictionary *map;
00176 TSDictionaryCacheEntry *dict;
00177 TSLexeme *res;
00178
00179 if (ld->curDictId == InvalidOid)
00180 {
00181
00182
00183
00184
00185
00186 while (ld->towork.head)
00187 {
00188 ParsedLex *curVal = ld->towork.head;
00189 char *curValLemm = curVal->lemm;
00190 int curValLenLemm = curVal->lenlemm;
00191
00192 map = ld->cfg->map + curVal->type;
00193
00194 if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
00195 {
00196
00197 RemoveHead(ld);
00198 continue;
00199 }
00200
00201 for (i = ld->posDict; i < map->len; i++)
00202 {
00203 dict = lookup_ts_dictionary_cache(map->dictIds[i]);
00204
00205 ld->dictState.isend = ld->dictState.getnext = false;
00206 ld->dictState.private_state = NULL;
00207 res = (TSLexeme *) DatumGetPointer(FunctionCall4(
00208 &(dict->lexize),
00209 PointerGetDatum(dict->dictData),
00210 PointerGetDatum(curValLemm),
00211 Int32GetDatum(curValLenLemm),
00212 PointerGetDatum(&ld->dictState)
00213 ));
00214
00215 if (ld->dictState.getnext)
00216 {
00217
00218
00219
00220
00221
00222 ld->curDictId = DatumGetObjectId(map->dictIds[i]);
00223 ld->posDict = i + 1;
00224 ld->curSub = curVal->next;
00225 if (res)
00226 setNewTmpRes(ld, curVal, res);
00227 return LexizeExec(ld, correspondLexem);
00228 }
00229
00230 if (!res)
00231 continue;
00232
00233 if (res->flags & TSL_FILTER)
00234 {
00235 curValLemm = res->lexeme;
00236 curValLenLemm = strlen(res->lexeme);
00237 continue;
00238 }
00239
00240 RemoveHead(ld);
00241 setCorrLex(ld, correspondLexem);
00242 return res;
00243 }
00244
00245 RemoveHead(ld);
00246 }
00247 }
00248 else
00249 {
00250 dict = lookup_ts_dictionary_cache(ld->curDictId);
00251
00252
00253
00254
00255
00256 while (ld->curSub)
00257 {
00258 ParsedLex *curVal = ld->curSub;
00259
00260 map = ld->cfg->map + curVal->type;
00261
00262 if (curVal->type != 0)
00263 {
00264 bool dictExists = false;
00265
00266 if (curVal->type >= ld->cfg->lenmap || map->len == 0)
00267 {
00268
00269 ld->curSub = curVal->next;
00270 continue;
00271 }
00272
00273
00274
00275
00276
00277
00278 for (i = 0; i < map->len && !dictExists; i++)
00279 if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
00280 dictExists = true;
00281
00282 if (!dictExists)
00283 {
00284
00285
00286
00287
00288 ld->curDictId = InvalidOid;
00289 return LexizeExec(ld, correspondLexem);
00290 }
00291 }
00292
00293 ld->dictState.isend = (curVal->type == 0) ? true : false;
00294 ld->dictState.getnext = false;
00295
00296 res = (TSLexeme *) DatumGetPointer(FunctionCall4(
00297 &(dict->lexize),
00298 PointerGetDatum(dict->dictData),
00299 PointerGetDatum(curVal->lemm),
00300 Int32GetDatum(curVal->lenlemm),
00301 PointerGetDatum(&ld->dictState)
00302 ));
00303
00304 if (ld->dictState.getnext)
00305 {
00306
00307 ld->curSub = curVal->next;
00308 if (res)
00309 setNewTmpRes(ld, curVal, res);
00310 continue;
00311 }
00312
00313 if (res || ld->tmpRes)
00314 {
00315
00316
00317
00318
00319
00320 if (res)
00321 {
00322 moveToWaste(ld, ld->curSub);
00323 }
00324 else
00325 {
00326 res = ld->tmpRes;
00327 moveToWaste(ld, ld->lastRes);
00328 }
00329
00330
00331 ld->curDictId = InvalidOid;
00332 ld->posDict = 0;
00333 ld->lastRes = NULL;
00334 ld->tmpRes = NULL;
00335 setCorrLex(ld, correspondLexem);
00336 return res;
00337 }
00338
00339
00340
00341
00342
00343 ld->curDictId = InvalidOid;
00344 return LexizeExec(ld, correspondLexem);
00345 }
00346 }
00347
00348 setCorrLex(ld, correspondLexem);
00349 return NULL;
00350 }
00351
00352
00353
00354
00355
00356
00357 void
00358 parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
00359 {
00360 int type,
00361 lenlemm;
00362 char *lemm = NULL;
00363 LexizeData ldata;
00364 TSLexeme *norms;
00365 TSConfigCacheEntry *cfg;
00366 TSParserCacheEntry *prsobj;
00367 void *prsdata;
00368
00369 cfg = lookup_ts_config_cache(cfgId);
00370 prsobj = lookup_ts_parser_cache(cfg->prsId);
00371
00372 prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
00373 PointerGetDatum(buf),
00374 Int32GetDatum(buflen)));
00375
00376 LexizeInit(&ldata, cfg);
00377
00378 do
00379 {
00380 type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
00381 PointerGetDatum(prsdata),
00382 PointerGetDatum(&lemm),
00383 PointerGetDatum(&lenlemm)));
00384
00385 if (type > 0 && lenlemm >= MAXSTRLEN)
00386 {
00387 #ifdef IGNORE_LONGLEXEME
00388 ereport(NOTICE,
00389 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
00390 errmsg("word is too long to be indexed"),
00391 errdetail("Words longer than %d characters are ignored.",
00392 MAXSTRLEN)));
00393 continue;
00394 #else
00395 ereport(ERROR,
00396 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
00397 errmsg("word is too long to be indexed"),
00398 errdetail("Words longer than %d characters are ignored.",
00399 MAXSTRLEN)));
00400 #endif
00401 }
00402
00403 LexizeAddLemm(&ldata, type, lemm, lenlemm);
00404
00405 while ((norms = LexizeExec(&ldata, NULL)) != NULL)
00406 {
00407 TSLexeme *ptr = norms;
00408
00409 prs->pos++;
00410
00411 while (ptr->lexeme)
00412 {
00413 if (prs->curwords == prs->lenwords)
00414 {
00415 prs->lenwords *= 2;
00416 prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord));
00417 }
00418
00419 if (ptr->flags & TSL_ADDPOS)
00420 prs->pos++;
00421 prs->words[prs->curwords].len = strlen(ptr->lexeme);
00422 prs->words[prs->curwords].word = ptr->lexeme;
00423 prs->words[prs->curwords].nvariant = ptr->nvariant;
00424 prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX;
00425 prs->words[prs->curwords].alen = 0;
00426 prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
00427 ptr++;
00428 prs->curwords++;
00429 }
00430 pfree(norms);
00431 }
00432 } while (type > 0);
00433
00434 FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
00435 }
00436
00437
00438
00439
00440 static void
00441 hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
00442 {
00443 while (prs->curwords >= prs->lenwords)
00444 {
00445 prs->lenwords *= 2;
00446 prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
00447 }
00448 memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry));
00449 prs->words[prs->curwords].type = (uint8) type;
00450 prs->words[prs->curwords].len = buflen;
00451 prs->words[prs->curwords].word = palloc(buflen);
00452 memcpy(prs->words[prs->curwords].word, buf, buflen);
00453 prs->curwords++;
00454 }
00455
00456 static void
00457 hlfinditem(HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
00458 {
00459 int i;
00460 QueryItem *item = GETQUERY(query);
00461 HeadlineWordEntry *word;
00462
00463 while (prs->curwords + query->size >= prs->lenwords)
00464 {
00465 prs->lenwords *= 2;
00466 prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
00467 }
00468
00469 word = &(prs->words[prs->curwords - 1]);
00470 for (i = 0; i < query->size; i++)
00471 {
00472 if (item->type == QI_VAL &&
00473 tsCompareString(GETOPERAND(query) + item->qoperand.distance, item->qoperand.length,
00474 buf, buflen, item->qoperand.prefix) == 0)
00475 {
00476 if (word->item)
00477 {
00478 memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry));
00479 prs->words[prs->curwords].item = &item->qoperand;
00480 prs->words[prs->curwords].repeated = 1;
00481 prs->curwords++;
00482 }
00483 else
00484 word->item = &item->qoperand;
00485 }
00486 item++;
00487 }
00488 }
00489
00490 static void
00491 addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms)
00492 {
00493 ParsedLex *tmplexs;
00494 TSLexeme *ptr;
00495
00496 while (lexs)
00497 {
00498
00499 if (lexs->type > 0)
00500 hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
00501
00502 ptr = norms;
00503 while (ptr && ptr->lexeme)
00504 {
00505 hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme));
00506 ptr++;
00507 }
00508
00509 tmplexs = lexs->next;
00510 pfree(lexs);
00511 lexs = tmplexs;
00512 }
00513
00514 if (norms)
00515 {
00516 ptr = norms;
00517 while (ptr->lexeme)
00518 {
00519 pfree(ptr->lexeme);
00520 ptr++;
00521 }
00522 pfree(norms);
00523 }
00524 }
00525
00526 void
00527 hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
00528 {
00529 int type,
00530 lenlemm;
00531 char *lemm = NULL;
00532 LexizeData ldata;
00533 TSLexeme *norms;
00534 ParsedLex *lexs;
00535 TSConfigCacheEntry *cfg;
00536 TSParserCacheEntry *prsobj;
00537 void *prsdata;
00538
00539 cfg = lookup_ts_config_cache(cfgId);
00540 prsobj = lookup_ts_parser_cache(cfg->prsId);
00541
00542 prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
00543 PointerGetDatum(buf),
00544 Int32GetDatum(buflen)));
00545
00546 LexizeInit(&ldata, cfg);
00547
00548 do
00549 {
00550 type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
00551 PointerGetDatum(prsdata),
00552 PointerGetDatum(&lemm),
00553 PointerGetDatum(&lenlemm)));
00554
00555 if (type > 0 && lenlemm >= MAXSTRLEN)
00556 {
00557 #ifdef IGNORE_LONGLEXEME
00558 ereport(NOTICE,
00559 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
00560 errmsg("word is too long to be indexed"),
00561 errdetail("Words longer than %d characters are ignored.",
00562 MAXSTRLEN)));
00563 continue;
00564 #else
00565 ereport(ERROR,
00566 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
00567 errmsg("word is too long to be indexed"),
00568 errdetail("Words longer than %d characters are ignored.",
00569 MAXSTRLEN)));
00570 #endif
00571 }
00572
00573 LexizeAddLemm(&ldata, type, lemm, lenlemm);
00574
00575 do
00576 {
00577 if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
00578 addHLParsedLex(prs, query, lexs, norms);
00579 else
00580 addHLParsedLex(prs, query, lexs, NULL);
00581 } while (norms);
00582
00583 } while (type > 0);
00584
00585 FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
00586 }
00587
00588 text *
00589 generateHeadline(HeadlineParsedText *prs)
00590 {
00591 text *out;
00592 char *ptr;
00593 int len = 128;
00594 int numfragments = 0;
00595 int16 infrag = 0;
00596
00597 HeadlineWordEntry *wrd = prs->words;
00598
00599 out = (text *) palloc(len);
00600 ptr = ((char *) out) + VARHDRSZ;
00601
00602 while (wrd - prs->words < prs->curwords)
00603 {
00604 while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
00605 {
00606 int dist = ptr - ((char *) out);
00607
00608 len *= 2;
00609 out = (text *) repalloc(out, len);
00610 ptr = ((char *) out) + dist;
00611 }
00612
00613 if (wrd->in && !wrd->repeated)
00614 {
00615 if (!infrag)
00616 {
00617
00618
00619 infrag = 1;
00620 numfragments++;
00621
00622 if (numfragments > 1)
00623 {
00624 memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
00625 ptr += prs->fragdelimlen;
00626 }
00627
00628 }
00629 if (wrd->replace)
00630 {
00631 *ptr = ' ';
00632 ptr++;
00633 }
00634 else if (!wrd->skip)
00635 {
00636 if (wrd->selected)
00637 {
00638 memcpy(ptr, prs->startsel, prs->startsellen);
00639 ptr += prs->startsellen;
00640 }
00641 memcpy(ptr, wrd->word, wrd->len);
00642 ptr += wrd->len;
00643 if (wrd->selected)
00644 {
00645 memcpy(ptr, prs->stopsel, prs->stopsellen);
00646 ptr += prs->stopsellen;
00647 }
00648 }
00649 }
00650 else if (!wrd->repeated)
00651 {
00652 if (infrag)
00653 infrag = 0;
00654 pfree(wrd->word);
00655 }
00656
00657 wrd++;
00658 }
00659
00660 SET_VARSIZE(out, ptr - ((char *) out));
00661 return out;
00662 }