Header And Logo

PostgreSQL
| The world's most advanced open source database.

unaccent.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * unaccent.c
00004  *    Text search unaccent dictionary
00005  *
00006  * Copyright (c) 2009-2013, PostgreSQL Global Development Group
00007  *
00008  * IDENTIFICATION
00009  *    contrib/unaccent/unaccent.c
00010  *
00011  *-------------------------------------------------------------------------
00012  */
00013 
00014 #include "postgres.h"
00015 
00016 #include "catalog/namespace.h"
00017 #include "commands/defrem.h"
00018 #include "tsearch/ts_cache.h"
00019 #include "tsearch/ts_locale.h"
00020 #include "tsearch/ts_public.h"
00021 #include "utils/builtins.h"
00022 
00023 PG_MODULE_MAGIC;
00024 
/*
 * The unaccent dictionary locates the character to replace by walking an
 * uncompressed tree keyed on the successive bytes of the source character.
 * Every tree node is an array of 256 SuffixChar entries; the n-th entry of
 * a node corresponds to byte value n.
 */
typedef struct SuffixChar
{
    /* node (256-entry array) for the following byte, or NULL if none */
    struct SuffixChar *nextChar;
    /* replacement bytes, stored without a trailing NUL; NULL if no rule */
    char       *replaceTo;
    /* number of bytes stored in replaceTo */
    int         replacelen;
} SuffixChar;
00037 
00038 /*
00039  * placeChar - put str into tree's structure, byte by byte.
00040  */
00041 static SuffixChar *
00042 placeChar(SuffixChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
00043 {
00044     SuffixChar *curnode;
00045 
00046     if (!node)
00047     {
00048         node = palloc(sizeof(SuffixChar) * 256);
00049         memset(node, 0, sizeof(SuffixChar) * 256);
00050     }
00051 
00052     curnode = node + *str;
00053 
00054     if (lenstr == 1)
00055     {
00056         if (curnode->replaceTo)
00057             elog(WARNING, "duplicate TO argument, use first one");
00058         else
00059         {
00060             curnode->replacelen = replacelen;
00061             curnode->replaceTo = palloc(replacelen);
00062             memcpy(curnode->replaceTo, replaceTo, replacelen);
00063         }
00064     }
00065     else
00066     {
00067         curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1, replaceTo, replacelen);
00068     }
00069 
00070     return node;
00071 }
00072 
00073 /*
00074  * initSuffixTree  - create suffix tree from file. Function converts
00075  * UTF8-encoded file into current encoding.
00076  */
00077 static SuffixChar *
00078 initSuffixTree(char *filename)
00079 {
00080     SuffixChar *volatile rootSuffixTree = NULL;
00081     MemoryContext ccxt = CurrentMemoryContext;
00082     tsearch_readline_state trst;
00083     volatile bool skip;
00084 
00085     filename = get_tsearch_config_filename(filename, "rules");
00086     if (!tsearch_readline_begin(&trst, filename))
00087         ereport(ERROR,
00088                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00089                  errmsg("could not open unaccent file \"%s\": %m",
00090                         filename)));
00091 
00092     do
00093     {
00094         /*
00095          * pg_do_encoding_conversion() (called by tsearch_readline()) will
00096          * emit exception if it finds untranslatable characters in current
00097          * locale. We just skip such lines, continuing with the next.
00098          */
00099         skip = true;
00100 
00101         PG_TRY();
00102         {
00103             char       *line;
00104 
00105             while ((line = tsearch_readline(&trst)) != NULL)
00106             {
00107                 /*
00108                  * The format of each line must be "src trg" where src and trg
00109                  * are sequences of one or more non-whitespace characters,
00110                  * separated by whitespace.  Whitespace at start or end of
00111                  * line is ignored.
00112                  */
00113                 int         state;
00114                 char       *ptr;
00115                 char       *src = NULL;
00116                 char       *trg = NULL;
00117                 int         ptrlen;
00118                 int         srclen = 0;
00119                 int         trglen = 0;
00120 
00121                 state = 0;
00122                 for (ptr = line; *ptr; ptr += ptrlen)
00123                 {
00124                     ptrlen = pg_mblen(ptr);
00125                     /* ignore whitespace, but end src or trg */
00126                     if (t_isspace(ptr))
00127                     {
00128                         if (state == 1)
00129                             state = 2;
00130                         else if (state == 3)
00131                             state = 4;
00132                         continue;
00133                     }
00134                     switch (state)
00135                     {
00136                         case 0:
00137                             /* start of src */
00138                             src = ptr;
00139                             srclen = ptrlen;
00140                             state = 1;
00141                             break;
00142                         case 1:
00143                             /* continue src */
00144                             srclen += ptrlen;
00145                             break;
00146                         case 2:
00147                             /* start of trg */
00148                             trg = ptr;
00149                             trglen = ptrlen;
00150                             state = 3;
00151                             break;
00152                         case 3:
00153                             /* continue trg */
00154                             trglen += ptrlen;
00155                             break;
00156                         default:
00157                             /* bogus line format */
00158                             state = -1;
00159                             break;
00160                     }
00161                 }
00162 
00163                 if (state >= 3)
00164                     rootSuffixTree = placeChar(rootSuffixTree,
00165                                                (unsigned char *) src, srclen,
00166                                                trg, trglen);
00167 
00168                 pfree(line);
00169             }
00170             skip = false;
00171         }
00172         PG_CATCH();
00173         {
00174             ErrorData  *errdata;
00175             MemoryContext ecxt;
00176 
00177             ecxt = MemoryContextSwitchTo(ccxt);
00178             errdata = CopyErrorData();
00179             if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
00180             {
00181                 FlushErrorState();
00182             }
00183             else
00184             {
00185                 MemoryContextSwitchTo(ecxt);
00186                 PG_RE_THROW();
00187             }
00188         }
00189         PG_END_TRY();
00190     }
00191     while (skip);
00192 
00193     tsearch_readline_end(&trst);
00194 
00195     return rootSuffixTree;
00196 }
00197 
00198 /*
00199  * findReplaceTo - find multibyte character in tree
00200  */
00201 static SuffixChar *
00202 findReplaceTo(SuffixChar *node, unsigned char *src, int srclen)
00203 {
00204     while (node)
00205     {
00206         node = node + *src;
00207         if (srclen == 1)
00208             return node;
00209 
00210         src++;
00211         srclen--;
00212         node = node->nextChar;
00213     }
00214 
00215     return NULL;
00216 }
00217 
00218 PG_FUNCTION_INFO_V1(unaccent_init);
00219 Datum       unaccent_init(PG_FUNCTION_ARGS);
00220 Datum
00221 unaccent_init(PG_FUNCTION_ARGS)
00222 {
00223     List       *dictoptions = (List *) PG_GETARG_POINTER(0);
00224     SuffixChar *rootSuffixTree = NULL;
00225     bool        fileloaded = false;
00226     ListCell   *l;
00227 
00228     foreach(l, dictoptions)
00229     {
00230         DefElem    *defel = (DefElem *) lfirst(l);
00231 
00232         if (pg_strcasecmp("Rules", defel->defname) == 0)
00233         {
00234             if (fileloaded)
00235                 ereport(ERROR,
00236                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00237                          errmsg("multiple Rules parameters")));
00238             rootSuffixTree = initSuffixTree(defGetString(defel));
00239             fileloaded = true;
00240         }
00241         else
00242         {
00243             ereport(ERROR,
00244                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00245                      errmsg("unrecognized Unaccent parameter: \"%s\"",
00246                             defel->defname)));
00247         }
00248     }
00249 
00250     if (!fileloaded)
00251     {
00252         ereport(ERROR,
00253                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00254                  errmsg("missing Rules parameter")));
00255     }
00256 
00257     PG_RETURN_POINTER(rootSuffixTree);
00258 }
00259 
00260 PG_FUNCTION_INFO_V1(unaccent_lexize);
00261 Datum       unaccent_lexize(PG_FUNCTION_ARGS);
00262 Datum
00263 unaccent_lexize(PG_FUNCTION_ARGS)
00264 {
00265     SuffixChar *rootSuffixTree = (SuffixChar *) PG_GETARG_POINTER(0);
00266     char       *srcchar = (char *) PG_GETARG_POINTER(1);
00267     int32       len = PG_GETARG_INT32(2);
00268     char       *srcstart,
00269                *trgchar = NULL;
00270     int         charlen;
00271     TSLexeme   *res = NULL;
00272     SuffixChar *node;
00273 
00274     srcstart = srcchar;
00275     while (srcchar - srcstart < len)
00276     {
00277         charlen = pg_mblen(srcchar);
00278 
00279         node = findReplaceTo(rootSuffixTree, (unsigned char *) srcchar, charlen);
00280         if (node && node->replaceTo)
00281         {
00282             if (!res)
00283             {
00284                 /* allocate res only if it's needed */
00285                 res = palloc0(sizeof(TSLexeme) * 2);
00286                 res->lexeme = trgchar = palloc(len * pg_database_encoding_max_length() + 1 /* \0 */ );
00287                 res->flags = TSL_FILTER;
00288                 if (srcchar != srcstart)
00289                 {
00290                     memcpy(trgchar, srcstart, srcchar - srcstart);
00291                     trgchar += (srcchar - srcstart);
00292                 }
00293             }
00294             memcpy(trgchar, node->replaceTo, node->replacelen);
00295             trgchar += node->replacelen;
00296         }
00297         else if (res)
00298         {
00299             memcpy(trgchar, srcchar, charlen);
00300             trgchar += charlen;
00301         }
00302 
00303         srcchar += charlen;
00304     }
00305 
00306     if (res)
00307         *trgchar = '\0';
00308 
00309     PG_RETURN_POINTER(res);
00310 }
00311 
00312 /*
00313  * Function-like wrapper for dictionary
00314  */
00315 PG_FUNCTION_INFO_V1(unaccent_dict);
00316 Datum       unaccent_dict(PG_FUNCTION_ARGS);
00317 Datum
00318 unaccent_dict(PG_FUNCTION_ARGS)
00319 {
00320     text       *str;
00321     int         strArg;
00322     Oid         dictOid;
00323     TSDictionaryCacheEntry *dict;
00324     TSLexeme   *res;
00325 
00326     if (PG_NARGS() == 1)
00327     {
00328         dictOid = get_ts_dict_oid(stringToQualifiedNameList("unaccent"), false);
00329         strArg = 0;
00330     }
00331     else
00332     {
00333         dictOid = PG_GETARG_OID(0);
00334         strArg = 1;
00335     }
00336     str = PG_GETARG_TEXT_P(strArg);
00337 
00338     dict = lookup_ts_dictionary_cache(dictOid);
00339 
00340     res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
00341                                              PointerGetDatum(dict->dictData),
00342                                                PointerGetDatum(VARDATA(str)),
00343                                       Int32GetDatum(VARSIZE(str) - VARHDRSZ),
00344                                                      PointerGetDatum(NULL)));
00345 
00346     PG_FREE_IF_COPY(str, strArg);
00347 
00348     if (res == NULL)
00349     {
00350         PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
00351     }
00352     else if (res->lexeme == NULL)
00353     {
00354         pfree(res);
00355         PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
00356     }
00357     else
00358     {
00359         text       *txt = cstring_to_text(res->lexeme);
00360 
00361         pfree(res->lexeme);
00362         pfree(res);
00363 
00364         PG_RETURN_TEXT_P(txt);
00365     }
00366 }