00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014 #include "postgres.h"
00015
00016 #include "catalog/namespace.h"
00017 #include "commands/defrem.h"
00018 #include "tsearch/ts_cache.h"
00019 #include "tsearch/ts_locale.h"
00020 #include "tsearch/ts_public.h"
00021 #include "utils/builtins.h"
00022
00023 PG_MODULE_MAGIC;
00024
00025
00026
00027
00028
00029
00030
/*
 * One cell of the byte-indexed lookup tree built from the rules file.
 *
 * Tree levels are allocated as arrays of 256 cells (one per possible byte
 * value); a cell is selected by indexing the array with the next byte of
 * the source character.
 */
typedef struct SuffixChar SuffixChar;

struct SuffixChar
{
	SuffixChar *nextChar;		/* 256-cell array for the following byte, or
								 * NULL if no longer sequence passes here */
	char	   *replaceTo;		/* replacement bytes (not NUL-terminated), or
								 * NULL if no rule ends at this cell */
	int			replacelen;		/* number of bytes stored in replaceTo */
};
00037
00038
00039
00040
00041 static SuffixChar *
00042 placeChar(SuffixChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
00043 {
00044 SuffixChar *curnode;
00045
00046 if (!node)
00047 {
00048 node = palloc(sizeof(SuffixChar) * 256);
00049 memset(node, 0, sizeof(SuffixChar) * 256);
00050 }
00051
00052 curnode = node + *str;
00053
00054 if (lenstr == 1)
00055 {
00056 if (curnode->replaceTo)
00057 elog(WARNING, "duplicate TO argument, use first one");
00058 else
00059 {
00060 curnode->replacelen = replacelen;
00061 curnode->replaceTo = palloc(replacelen);
00062 memcpy(curnode->replaceTo, replaceTo, replacelen);
00063 }
00064 }
00065 else
00066 {
00067 curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1, replaceTo, replacelen);
00068 }
00069
00070 return node;
00071 }
00072
00073
00074
00075
00076
00077 static SuffixChar *
00078 initSuffixTree(char *filename)
00079 {
00080 SuffixChar *volatile rootSuffixTree = NULL;
00081 MemoryContext ccxt = CurrentMemoryContext;
00082 tsearch_readline_state trst;
00083 volatile bool skip;
00084
00085 filename = get_tsearch_config_filename(filename, "rules");
00086 if (!tsearch_readline_begin(&trst, filename))
00087 ereport(ERROR,
00088 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00089 errmsg("could not open unaccent file \"%s\": %m",
00090 filename)));
00091
00092 do
00093 {
00094
00095
00096
00097
00098
00099 skip = true;
00100
00101 PG_TRY();
00102 {
00103 char *line;
00104
00105 while ((line = tsearch_readline(&trst)) != NULL)
00106 {
00107
00108
00109
00110
00111
00112
00113 int state;
00114 char *ptr;
00115 char *src = NULL;
00116 char *trg = NULL;
00117 int ptrlen;
00118 int srclen = 0;
00119 int trglen = 0;
00120
00121 state = 0;
00122 for (ptr = line; *ptr; ptr += ptrlen)
00123 {
00124 ptrlen = pg_mblen(ptr);
00125
00126 if (t_isspace(ptr))
00127 {
00128 if (state == 1)
00129 state = 2;
00130 else if (state == 3)
00131 state = 4;
00132 continue;
00133 }
00134 switch (state)
00135 {
00136 case 0:
00137
00138 src = ptr;
00139 srclen = ptrlen;
00140 state = 1;
00141 break;
00142 case 1:
00143
00144 srclen += ptrlen;
00145 break;
00146 case 2:
00147
00148 trg = ptr;
00149 trglen = ptrlen;
00150 state = 3;
00151 break;
00152 case 3:
00153
00154 trglen += ptrlen;
00155 break;
00156 default:
00157
00158 state = -1;
00159 break;
00160 }
00161 }
00162
00163 if (state >= 3)
00164 rootSuffixTree = placeChar(rootSuffixTree,
00165 (unsigned char *) src, srclen,
00166 trg, trglen);
00167
00168 pfree(line);
00169 }
00170 skip = false;
00171 }
00172 PG_CATCH();
00173 {
00174 ErrorData *errdata;
00175 MemoryContext ecxt;
00176
00177 ecxt = MemoryContextSwitchTo(ccxt);
00178 errdata = CopyErrorData();
00179 if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
00180 {
00181 FlushErrorState();
00182 }
00183 else
00184 {
00185 MemoryContextSwitchTo(ecxt);
00186 PG_RE_THROW();
00187 }
00188 }
00189 PG_END_TRY();
00190 }
00191 while (skip);
00192
00193 tsearch_readline_end(&trst);
00194
00195 return rootSuffixTree;
00196 }
00197
00198
00199
00200
00201 static SuffixChar *
00202 findReplaceTo(SuffixChar *node, unsigned char *src, int srclen)
00203 {
00204 while (node)
00205 {
00206 node = node + *src;
00207 if (srclen == 1)
00208 return node;
00209
00210 src++;
00211 srclen--;
00212 node = node->nextChar;
00213 }
00214
00215 return NULL;
00216 }
00217
00218 PG_FUNCTION_INFO_V1(unaccent_init);
00219 Datum unaccent_init(PG_FUNCTION_ARGS);
00220 Datum
00221 unaccent_init(PG_FUNCTION_ARGS)
00222 {
00223 List *dictoptions = (List *) PG_GETARG_POINTER(0);
00224 SuffixChar *rootSuffixTree = NULL;
00225 bool fileloaded = false;
00226 ListCell *l;
00227
00228 foreach(l, dictoptions)
00229 {
00230 DefElem *defel = (DefElem *) lfirst(l);
00231
00232 if (pg_strcasecmp("Rules", defel->defname) == 0)
00233 {
00234 if (fileloaded)
00235 ereport(ERROR,
00236 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00237 errmsg("multiple Rules parameters")));
00238 rootSuffixTree = initSuffixTree(defGetString(defel));
00239 fileloaded = true;
00240 }
00241 else
00242 {
00243 ereport(ERROR,
00244 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00245 errmsg("unrecognized Unaccent parameter: \"%s\"",
00246 defel->defname)));
00247 }
00248 }
00249
00250 if (!fileloaded)
00251 {
00252 ereport(ERROR,
00253 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00254 errmsg("missing Rules parameter")));
00255 }
00256
00257 PG_RETURN_POINTER(rootSuffixTree);
00258 }
00259
00260 PG_FUNCTION_INFO_V1(unaccent_lexize);
00261 Datum unaccent_lexize(PG_FUNCTION_ARGS);
00262 Datum
00263 unaccent_lexize(PG_FUNCTION_ARGS)
00264 {
00265 SuffixChar *rootSuffixTree = (SuffixChar *) PG_GETARG_POINTER(0);
00266 char *srcchar = (char *) PG_GETARG_POINTER(1);
00267 int32 len = PG_GETARG_INT32(2);
00268 char *srcstart,
00269 *trgchar = NULL;
00270 int charlen;
00271 TSLexeme *res = NULL;
00272 SuffixChar *node;
00273
00274 srcstart = srcchar;
00275 while (srcchar - srcstart < len)
00276 {
00277 charlen = pg_mblen(srcchar);
00278
00279 node = findReplaceTo(rootSuffixTree, (unsigned char *) srcchar, charlen);
00280 if (node && node->replaceTo)
00281 {
00282 if (!res)
00283 {
00284
00285 res = palloc0(sizeof(TSLexeme) * 2);
00286 res->lexeme = trgchar = palloc(len * pg_database_encoding_max_length() + 1 );
00287 res->flags = TSL_FILTER;
00288 if (srcchar != srcstart)
00289 {
00290 memcpy(trgchar, srcstart, srcchar - srcstart);
00291 trgchar += (srcchar - srcstart);
00292 }
00293 }
00294 memcpy(trgchar, node->replaceTo, node->replacelen);
00295 trgchar += node->replacelen;
00296 }
00297 else if (res)
00298 {
00299 memcpy(trgchar, srcchar, charlen);
00300 trgchar += charlen;
00301 }
00302
00303 srcchar += charlen;
00304 }
00305
00306 if (res)
00307 *trgchar = '\0';
00308
00309 PG_RETURN_POINTER(res);
00310 }
00311
00312
00313
00314
00315 PG_FUNCTION_INFO_V1(unaccent_dict);
00316 Datum unaccent_dict(PG_FUNCTION_ARGS);
00317 Datum
00318 unaccent_dict(PG_FUNCTION_ARGS)
00319 {
00320 text *str;
00321 int strArg;
00322 Oid dictOid;
00323 TSDictionaryCacheEntry *dict;
00324 TSLexeme *res;
00325
00326 if (PG_NARGS() == 1)
00327 {
00328 dictOid = get_ts_dict_oid(stringToQualifiedNameList("unaccent"), false);
00329 strArg = 0;
00330 }
00331 else
00332 {
00333 dictOid = PG_GETARG_OID(0);
00334 strArg = 1;
00335 }
00336 str = PG_GETARG_TEXT_P(strArg);
00337
00338 dict = lookup_ts_dictionary_cache(dictOid);
00339
00340 res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
00341 PointerGetDatum(dict->dictData),
00342 PointerGetDatum(VARDATA(str)),
00343 Int32GetDatum(VARSIZE(str) - VARHDRSZ),
00344 PointerGetDatum(NULL)));
00345
00346 PG_FREE_IF_COPY(str, strArg);
00347
00348 if (res == NULL)
00349 {
00350 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
00351 }
00352 else if (res->lexeme == NULL)
00353 {
00354 pfree(res);
00355 PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
00356 }
00357 else
00358 {
00359 text *txt = cstring_to_text(res->lexeme);
00360
00361 pfree(res->lexeme);
00362 pfree(res);
00363
00364 PG_RETURN_TEXT_P(txt);
00365 }
00366 }