00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 #include "postgres.h"
00016
00017 #include "libpq/pqformat.h"
00018 #include "tsearch/ts_locale.h"
00019 #include "tsearch/ts_utils.h"
00020 #include "utils/memutils.h"
00021
00022 typedef struct
00023 {
00024 WordEntry entry;
00025 WordEntryPos *pos;
00026 int poslen;
00027 } WordEntryIN;
00028
00029
00030
00031 static int
00032 comparePos(const void *a, const void *b)
00033 {
00034 int apos = WEP_GETPOS(*(const WordEntryPos *) a);
00035 int bpos = WEP_GETPOS(*(const WordEntryPos *) b);
00036
00037 if (apos == bpos)
00038 return 0;
00039 return (apos > bpos) ? 1 : -1;
00040 }
00041
00042
00043
00044
00045
00046
00047
00048 static int
00049 uniquePos(WordEntryPos *a, int l)
00050 {
00051 WordEntryPos *ptr,
00052 *res;
00053
00054 if (l <= 1)
00055 return l;
00056
00057 qsort((void *) a, l, sizeof(WordEntryPos), comparePos);
00058
00059 res = a;
00060 ptr = a + 1;
00061 while (ptr - a < l)
00062 {
00063 if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
00064 {
00065 res++;
00066 *res = *ptr;
00067 if (res - a >= MAXNUMPOS - 1 ||
00068 WEP_GETPOS(*res) == MAXENTRYPOS - 1)
00069 break;
00070 }
00071 else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
00072 WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr));
00073 ptr++;
00074 }
00075
00076 return res + 1 - a;
00077 }
00078
00079
00080 static int
00081 compareentry(const void *va, const void *vb, void *arg)
00082 {
00083 const WordEntryIN *a = (const WordEntryIN *) va;
00084 const WordEntryIN *b = (const WordEntryIN *) vb;
00085 char *BufferStr = (char *) arg;
00086
00087 return tsCompareString(&BufferStr[a->entry.pos], a->entry.len,
00088 &BufferStr[b->entry.pos], b->entry.len,
00089 false);
00090 }
00091
00092
00093
00094
00095
00096 static int
00097 uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
00098 {
00099 int buflen;
00100 WordEntryIN *ptr,
00101 *res;
00102
00103 Assert(l >= 1);
00104
00105 if (l > 1)
00106 qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry,
00107 (void *) buf);
00108
00109 buflen = 0;
00110 res = a;
00111 ptr = a + 1;
00112 while (ptr - a < l)
00113 {
00114 if (!(ptr->entry.len == res->entry.len &&
00115 strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
00116 res->entry.len) == 0))
00117 {
00118
00119 buflen += res->entry.len;
00120 if (res->entry.haspos)
00121 {
00122 res->poslen = uniquePos(res->pos, res->poslen);
00123 buflen = SHORTALIGN(buflen);
00124 buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
00125 }
00126 res++;
00127 memcpy(res, ptr, sizeof(WordEntryIN));
00128 }
00129 else if (ptr->entry.haspos)
00130 {
00131 if (res->entry.haspos)
00132 {
00133
00134 int newlen = ptr->poslen + res->poslen;
00135
00136 res->pos = (WordEntryPos *)
00137 repalloc(res->pos, newlen * sizeof(WordEntryPos));
00138 memcpy(&res->pos[res->poslen], ptr->pos,
00139 ptr->poslen * sizeof(WordEntryPos));
00140 res->poslen = newlen;
00141 pfree(ptr->pos);
00142 }
00143 else
00144 {
00145
00146 res->entry.haspos = 1;
00147 res->pos = ptr->pos;
00148 res->poslen = ptr->poslen;
00149 }
00150 }
00151 ptr++;
00152 }
00153
00154
00155 buflen += res->entry.len;
00156 if (res->entry.haspos)
00157 {
00158 res->poslen = uniquePos(res->pos, res->poslen);
00159 buflen = SHORTALIGN(buflen);
00160 buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
00161 }
00162
00163 *outbuflen = buflen;
00164 return res + 1 - a;
00165 }
00166
00167 static int
00168 WordEntryCMP(WordEntry *a, WordEntry *b, char *buf)
00169 {
00170 return compareentry(a, b, buf);
00171 }
00172
00173
00174 Datum
00175 tsvectorin(PG_FUNCTION_ARGS)
00176 {
00177 char *buf = PG_GETARG_CSTRING(0);
00178 TSVectorParseState state;
00179 WordEntryIN *arr;
00180 int totallen;
00181 int arrlen;
00182 WordEntry *inarr;
00183 int len = 0;
00184 TSVector in;
00185 int i;
00186 char *token;
00187 int toklen;
00188 WordEntryPos *pos;
00189 int poslen;
00190 char *strbuf;
00191 int stroff;
00192
00193
00194
00195
00196
00197 char *tmpbuf;
00198 char *cur;
00199 int buflen = 256;
00200
00201 state = init_tsvector_parser(buf, false, false);
00202
00203 arrlen = 64;
00204 arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
00205 cur = tmpbuf = (char *) palloc(buflen);
00206
00207 while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
00208 {
00209 if (toklen >= MAXSTRLEN)
00210 ereport(ERROR,
00211 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
00212 errmsg("word is too long (%ld bytes, max %ld bytes)",
00213 (long) toklen,
00214 (long) (MAXSTRLEN - 1))));
00215
00216 if (cur - tmpbuf > MAXSTRPOS)
00217 ereport(ERROR,
00218 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
00219 errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)",
00220 (long) (cur - tmpbuf), (long) MAXSTRPOS)));
00221
00222
00223
00224
00225 if (len >= arrlen)
00226 {
00227 arrlen *= 2;
00228 arr = (WordEntryIN *)
00229 repalloc((void *) arr, sizeof(WordEntryIN) * arrlen);
00230 }
00231 while ((cur - tmpbuf) + toklen >= buflen)
00232 {
00233 int dist = cur - tmpbuf;
00234
00235 buflen *= 2;
00236 tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
00237 cur = tmpbuf + dist;
00238 }
00239 arr[len].entry.len = toklen;
00240 arr[len].entry.pos = cur - tmpbuf;
00241 memcpy((void *) cur, (void *) token, toklen);
00242 cur += toklen;
00243
00244 if (poslen != 0)
00245 {
00246 arr[len].entry.haspos = 1;
00247 arr[len].pos = pos;
00248 arr[len].poslen = poslen;
00249 }
00250 else
00251 {
00252 arr[len].entry.haspos = 0;
00253 arr[len].pos = NULL;
00254 arr[len].poslen = 0;
00255 }
00256 len++;
00257 }
00258
00259 close_tsvector_parser(state);
00260
00261 if (len > 0)
00262 len = uniqueentry(arr, len, tmpbuf, &buflen);
00263 else
00264 buflen = 0;
00265
00266 if (buflen > MAXSTRPOS)
00267 ereport(ERROR,
00268 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
00269 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", buflen, MAXSTRPOS)));
00270
00271 totallen = CALCDATASIZE(len, buflen);
00272 in = (TSVector) palloc0(totallen);
00273 SET_VARSIZE(in, totallen);
00274 in->size = len;
00275 inarr = ARRPTR(in);
00276 strbuf = STRPTR(in);
00277 stroff = 0;
00278 for (i = 0; i < len; i++)
00279 {
00280 memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
00281 arr[i].entry.pos = stroff;
00282 stroff += arr[i].entry.len;
00283 if (arr[i].entry.haspos)
00284 {
00285 if (arr[i].poslen > 0xFFFF)
00286 elog(ERROR, "positions array too long");
00287
00288
00289 stroff = SHORTALIGN(stroff);
00290 *(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
00291 stroff += sizeof(uint16);
00292
00293
00294 memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
00295 stroff += arr[i].poslen * sizeof(WordEntryPos);
00296
00297 pfree(arr[i].pos);
00298 }
00299 inarr[i] = arr[i].entry;
00300 }
00301
00302 Assert((strbuf + stroff - (char *) in) == totallen);
00303
00304 PG_RETURN_TSVECTOR(in);
00305 }
00306
00307 Datum
00308 tsvectorout(PG_FUNCTION_ARGS)
00309 {
00310 TSVector out = PG_GETARG_TSVECTOR(0);
00311 char *outbuf;
00312 int32 i,
00313 lenbuf = 0,
00314 pp;
00315 WordEntry *ptr = ARRPTR(out);
00316 char *curbegin,
00317 *curin,
00318 *curout;
00319
00320 lenbuf = out->size * 2 + out->size - 1 + 2 ;
00321 for (i = 0; i < out->size; i++)
00322 {
00323 lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() ;
00324 if (ptr[i].haspos)
00325 lenbuf += 1 + 7 * POSDATALEN(out, &(ptr[i]));
00326 }
00327
00328 curout = outbuf = (char *) palloc(lenbuf);
00329 for (i = 0; i < out->size; i++)
00330 {
00331 curbegin = curin = STRPTR(out) + ptr->pos;
00332 if (i != 0)
00333 *curout++ = ' ';
00334 *curout++ = '\'';
00335 while (curin - curbegin < ptr->len)
00336 {
00337 int len = pg_mblen(curin);
00338
00339 if (t_iseq(curin, '\''))
00340 *curout++ = '\'';
00341 else if (t_iseq(curin, '\\'))
00342 *curout++ = '\\';
00343
00344 while (len--)
00345 *curout++ = *curin++;
00346 }
00347
00348 *curout++ = '\'';
00349 if ((pp = POSDATALEN(out, ptr)) != 0)
00350 {
00351 WordEntryPos *wptr;
00352
00353 *curout++ = ':';
00354 wptr = POSDATAPTR(out, ptr);
00355 while (pp)
00356 {
00357 curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
00358 switch (WEP_GETWEIGHT(*wptr))
00359 {
00360 case 3:
00361 *curout++ = 'A';
00362 break;
00363 case 2:
00364 *curout++ = 'B';
00365 break;
00366 case 1:
00367 *curout++ = 'C';
00368 break;
00369 case 0:
00370 default:
00371 break;
00372 }
00373
00374 if (pp > 1)
00375 *curout++ = ',';
00376 pp--;
00377 wptr++;
00378 }
00379 }
00380 ptr++;
00381 }
00382
00383 *curout = '\0';
00384 PG_FREE_IF_COPY(out, 0);
00385 PG_RETURN_CSTRING(outbuf);
00386 }
00387
00388
00389
00390
00391
00392
00393
00394
00395
00396
00397
00398
00399
00400 Datum
00401 tsvectorsend(PG_FUNCTION_ARGS)
00402 {
00403 TSVector vec = PG_GETARG_TSVECTOR(0);
00404 StringInfoData buf;
00405 int i,
00406 j;
00407 WordEntry *weptr = ARRPTR(vec);
00408
00409 pq_begintypsend(&buf);
00410
00411 pq_sendint(&buf, vec->size, sizeof(int32));
00412 for (i = 0; i < vec->size; i++)
00413 {
00414 uint16 npos;
00415
00416
00417
00418
00419
00420 pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
00421 pq_sendbyte(&buf, '\0');
00422
00423 npos = POSDATALEN(vec, weptr);
00424 pq_sendint(&buf, npos, sizeof(uint16));
00425
00426 if (npos > 0)
00427 {
00428 WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
00429
00430 for (j = 0; j < npos; j++)
00431 pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos));
00432 }
00433 weptr++;
00434 }
00435
00436 PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
00437 }
00438
00439 Datum
00440 tsvectorrecv(PG_FUNCTION_ARGS)
00441 {
00442 StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
00443 TSVector vec;
00444 int i;
00445 int32 nentries;
00446 int datalen;
00447
00448
00449 Size hdrlen;
00450 Size len;
00451 bool needSort = false;
00452
00453 nentries = pq_getmsgint(buf, sizeof(int32));
00454 if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
00455 elog(ERROR, "invalid size of tsvector");
00456
00457 hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;
00458
00459 len = hdrlen * 2;
00460 vec = (TSVector) palloc0(len);
00461 vec->size = nentries;
00462
00463 datalen = 0;
00464 for (i = 0; i < nentries; i++)
00465 {
00466 const char *lexeme;
00467 uint16 npos;
00468 size_t lex_len;
00469
00470 lexeme = pq_getmsgstring(buf);
00471 npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
00472
00473
00474
00475 lex_len = strlen(lexeme);
00476 if (lex_len > MAXSTRLEN)
00477 elog(ERROR, "invalid tsvector: lexeme too long");
00478
00479 if (datalen > MAXSTRPOS)
00480 elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded");
00481
00482 if (npos > MAXNUMPOS)
00483 elog(ERROR, "unexpected number of tsvector positions");
00484
00485
00486
00487
00488
00489
00490 while (hdrlen + SHORTALIGN(datalen + lex_len) +
00491 (npos + 1) * sizeof(WordEntryPos) >= len)
00492 {
00493 len *= 2;
00494 vec = (TSVector) repalloc(vec, len);
00495 }
00496
00497 vec->entries[i].haspos = (npos > 0) ? 1 : 0;
00498 vec->entries[i].len = lex_len;
00499 vec->entries[i].pos = datalen;
00500
00501 memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
00502
00503 datalen += lex_len;
00504
00505 if (i > 0 && WordEntryCMP(&vec->entries[i],
00506 &vec->entries[i - 1],
00507 STRPTR(vec)) <= 0)
00508 needSort = true;
00509
00510
00511 if (npos > 0)
00512 {
00513 uint16 j;
00514 WordEntryPos *wepptr;
00515
00516
00517
00518
00519
00520
00521 if (datalen != SHORTALIGN(datalen))
00522 {
00523 *(STRPTR(vec) + datalen) = '\0';
00524 datalen = SHORTALIGN(datalen);
00525 }
00526
00527 memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
00528
00529 wepptr = POSDATAPTR(vec, &vec->entries[i]);
00530 for (j = 0; j < npos; j++)
00531 {
00532 wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
00533 if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
00534 elog(ERROR, "position information is misordered");
00535 }
00536
00537 datalen += (npos + 1) * sizeof(WordEntry);
00538 }
00539 }
00540
00541 SET_VARSIZE(vec, hdrlen + datalen);
00542
00543 if (needSort)
00544 qsort_arg((void *) ARRPTR(vec), vec->size, sizeof(WordEntry),
00545 compareentry, (void *) STRPTR(vec));
00546
00547 PG_RETURN_TSVECTOR(vec);
00548 }