PostgreSQL Source Code: src/backend/tsearch/dict

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * dict_synonym.c
00004  *      Synonym dictionary: replace word by its synonym
00005  *
00006  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00007  *
00008  *
00009  * IDENTIFICATION
00010  *    src/backend/tsearch/dict_synonym.c
00011  *
00012  *-------------------------------------------------------------------------
00013  */
00014 #include "postgres.h"
00015 
00016 #include "commands/defrem.h"
00017 #include "tsearch/ts_locale.h"
00018 #include "tsearch/ts_utils.h"
00019 
00020 typedef struct
00021 {
00022     char       *in;
00023     char       *out;
00024     int         outlen;
00025     uint16      flags;
00026 } Syn;
00027 
00028 typedef struct
00029 {
00030     int         len;            /* length of syn array */
00031     Syn        *syn;
00032     bool        case_sensitive;
00033 } DictSyn;
00034 
00035 /*
00036  * Finds the next whitespace-delimited word within the 'in' string.
00037  * Returns a pointer to the first character of the word, and a pointer
00038  * to the next byte after the last character in the word (in *end).
00039  * Character '*' at the end of word will not be threated as word
00040  * charater if flags is not null.
00041  */
00042 static char *
00043 findwrd(char *in, char **end, uint16 *flags)
00044 {
00045     char       *start;
00046     char       *lastchar;
00047 
00048     /* Skip leading spaces */
00049     while (*in && t_isspace(in))
00050         in += pg_mblen(in);
00051 
00052     /* Return NULL on empty lines */
00053     if (*in == '\0')
00054     {
00055         *end = NULL;
00056         return NULL;
00057     }
00058 
00059     lastchar = start = in;
00060 
00061     /* Find end of word */
00062     while (*in && !t_isspace(in))
00063     {
00064         lastchar = in;
00065         in += pg_mblen(in);
00066     }
00067 
00068     if (in - lastchar == 1 && t_iseq(lastchar, '*') && flags)
00069     {
00070         *flags = TSL_PREFIX;
00071         *end = lastchar;
00072     }
00073     else
00074     {
00075         if (flags)
00076             *flags = 0;
00077         *end = in;
00078     }
00079 
00080     return start;
00081 }
00082 
00083 static int
00084 compareSyn(const void *a, const void *b)
00085 {
00086     return strcmp(((const Syn *) a)->in, ((const Syn *) b)->in);
00087 }
00088 
00089 
00090 Datum
00091 dsynonym_init(PG_FUNCTION_ARGS)
00092 {
00093     List       *dictoptions = (List *) PG_GETARG_POINTER(0);
00094     DictSyn    *d;
00095     ListCell   *l;
00096     char       *filename = NULL;
00097     bool        case_sensitive = false;
00098     tsearch_readline_state trst;
00099     char       *starti,
00100                *starto,
00101                *end = NULL;
00102     int         cur = 0;
00103     char       *line = NULL;
00104     uint16      flags = 0;
00105 
00106     foreach(l, dictoptions)
00107     {
00108         DefElem    *defel = (DefElem *) lfirst(l);
00109 
00110         if (pg_strcasecmp("Synonyms", defel->defname) == 0)
00111             filename = defGetString(defel);
00112         else if (pg_strcasecmp("CaseSensitive", defel->defname) == 0)
00113             case_sensitive = defGetBoolean(defel);
00114         else
00115             ereport(ERROR,
00116                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00117                      errmsg("unrecognized synonym parameter: \"%s\"",
00118                             defel->defname)));
00119     }
00120 
00121     if (!filename)
00122         ereport(ERROR,
00123                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00124                  errmsg("missing Synonyms parameter")));
00125 
00126     filename = get_tsearch_config_filename(filename, "syn");
00127 
00128     if (!tsearch_readline_begin(&trst, filename))
00129         ereport(ERROR,
00130                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00131                  errmsg("could not open synonym file \"%s\": %m",
00132                         filename)));
00133 
00134     d = (DictSyn *) palloc0(sizeof(DictSyn));
00135 
00136     while ((line = tsearch_readline(&trst)) != NULL)
00137     {
00138         starti = findwrd(line, &end, NULL);
00139         if (!starti)
00140         {
00141             /* Empty line */
00142             goto skipline;
00143         }
00144         if (*end == '\0')
00145         {
00146             /* A line with only one word. Ignore silently. */
00147             goto skipline;
00148         }
00149         *end = '\0';
00150 
00151         starto = findwrd(end + 1, &end, &flags);
00152         if (!starto)
00153         {
00154             /* A line with only one word (+whitespace). Ignore silently. */
00155             goto skipline;
00156         }
00157         *end = '\0';
00158 
00159         /*
00160          * starti now points to the first word, and starto to the second word
00161          * on the line, with a \0 terminator at the end of both words.
00162          */
00163 
00164         if (cur >= d->len)
00165         {
00166             if (d->len == 0)
00167             {
00168                 d->len = 64;
00169                 d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
00170             }
00171             else
00172             {
00173                 d->len *= 2;
00174                 d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
00175             }
00176         }
00177 
00178         if (case_sensitive)
00179         {
00180             d->syn[cur].in = pstrdup(starti);
00181             d->syn[cur].out = pstrdup(starto);
00182         }
00183         else
00184         {
00185             d->syn[cur].in = lowerstr(starti);
00186             d->syn[cur].out = lowerstr(starto);
00187         }
00188 
00189         d->syn[cur].outlen = strlen(starto);
00190         d->syn[cur].flags = flags;
00191 
00192         cur++;
00193 
00194 skipline:
00195         pfree(line);
00196     }
00197 
00198     tsearch_readline_end(&trst);
00199 
00200     d->len = cur;
00201     qsort(d->syn, d->len, sizeof(Syn), compareSyn);
00202 
00203     d->case_sensitive = case_sensitive;
00204 
00205     PG_RETURN_POINTER(d);
00206 }
00207 
00208 Datum
00209 dsynonym_lexize(PG_FUNCTION_ARGS)
00210 {
00211     DictSyn    *d = (DictSyn *) PG_GETARG_POINTER(0);
00212     char       *in = (char *) PG_GETARG_POINTER(1);
00213     int32       len = PG_GETARG_INT32(2);
00214     Syn         key,
00215                *found;
00216     TSLexeme   *res;
00217 
00218     /* note: d->len test protects against Solaris bsearch-of-no-items bug */
00219     if (len <= 0 || d->len <= 0)
00220         PG_RETURN_POINTER(NULL);
00221 
00222     if (d->case_sensitive)
00223         key.in = pnstrdup(in, len);
00224     else
00225         key.in = lowerstr_with_len(in, len);
00226 
00227     key.out = NULL;
00228 
00229     found = (Syn *) bsearch(&key, d->syn, d->len, sizeof(Syn), compareSyn);
00230     pfree(key.in);
00231 
00232     if (!found)
00233         PG_RETURN_POINTER(NULL);
00234 
00235     res = palloc0(sizeof(TSLexeme) * 2);
00236     res[0].lexeme = pnstrdup(found->out, found->outlen);
00237     res[0].flags = found->flags;
00238 
00239     PG_RETURN_POINTER(res);
00240 }
Header And Logo

dict_synonym.c