Header And Logo

PostgreSQL
| The world's most advanced open source database.

dict_xsyn.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * dict_xsyn.c
00004  *    Extended synonym dictionary
00005  *
00006  * Copyright (c) 2007-2013, PostgreSQL Global Development Group
00007  *
00008  * IDENTIFICATION
00009  *    contrib/dict_xsyn/dict_xsyn.c
00010  *
00011  *-------------------------------------------------------------------------
00012  */
00013 #include "postgres.h"
00014 
00015 #include <ctype.h>
00016 
00017 #include "commands/defrem.h"
00018 #include "tsearch/ts_locale.h"
00019 #include "tsearch/ts_utils.h"
00020 
00021 PG_MODULE_MAGIC;
00022 
/*
 * A single dictionary entry: one matchable word together with the text of
 * the rule it belongs to.
 */
typedef struct Syn
{
    /* The word that is compared against input tokens */
    char       *key;

    /* Unparsed list of synonyms; the word itself is part of this list */
    char       *value;
} Syn;
00029 
00030 typedef struct
00031 {
00032     int         len;
00033     Syn        *syn;
00034 
00035     bool        matchorig;
00036     bool        keeporig;
00037     bool        matchsynonyms;
00038     bool        keepsynonyms;
00039 } DictSyn;
00040 
00041 
00042 PG_FUNCTION_INFO_V1(dxsyn_init);
00043 Datum       dxsyn_init(PG_FUNCTION_ARGS);
00044 
00045 PG_FUNCTION_INFO_V1(dxsyn_lexize);
00046 Datum       dxsyn_lexize(PG_FUNCTION_ARGS);
00047 
00048 static char *
00049 find_word(char *in, char **end)
00050 {
00051     char       *start;
00052 
00053     *end = NULL;
00054     while (*in && t_isspace(in))
00055         in += pg_mblen(in);
00056 
00057     if (!*in || *in == '#')
00058         return NULL;
00059     start = in;
00060 
00061     while (*in && !t_isspace(in))
00062         in += pg_mblen(in);
00063 
00064     *end = in;
00065 
00066     return start;
00067 }
00068 
00069 static int
00070 compare_syn(const void *a, const void *b)
00071 {
00072     return strcmp(((const Syn *) a)->key, ((const Syn *) b)->key);
00073 }
00074 
00075 static void
00076 read_dictionary(DictSyn *d, char *filename)
00077 {
00078     char       *real_filename = get_tsearch_config_filename(filename, "rules");
00079     tsearch_readline_state trst;
00080     char       *line;
00081     int         cur = 0;
00082 
00083     if (!tsearch_readline_begin(&trst, real_filename))
00084         ereport(ERROR,
00085                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
00086                  errmsg("could not open synonym file \"%s\": %m",
00087                         real_filename)));
00088 
00089     while ((line = tsearch_readline(&trst)) != NULL)
00090     {
00091         char       *value;
00092         char       *key;
00093         char       *pos;
00094         char       *end;
00095 
00096         if (*line == '\0')
00097             continue;
00098 
00099         value = lowerstr(line);
00100         pfree(line);
00101 
00102         pos = value;
00103         while ((key = find_word(pos, &end)) != NULL)
00104         {
00105             /* Enlarge syn structure if full */
00106             if (cur == d->len)
00107             {
00108                 d->len = (d->len > 0) ? 2 * d->len : 16;
00109                 if (d->syn)
00110                     d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
00111                 else
00112                     d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
00113             }
00114 
00115             /* Save first word only if we will match it */
00116             if (pos != value || d->matchorig)
00117             {
00118                 d->syn[cur].key = pnstrdup(key, end - key);
00119                 d->syn[cur].value = pstrdup(value);
00120 
00121                 cur++;
00122             }
00123 
00124             pos = end;
00125 
00126             /* Don't bother scanning synonyms if we will not match them */
00127             if (!d->matchsynonyms)
00128                 break;
00129         }
00130 
00131         pfree(value);
00132     }
00133 
00134     tsearch_readline_end(&trst);
00135 
00136     d->len = cur;
00137     if (cur > 1)
00138         qsort(d->syn, d->len, sizeof(Syn), compare_syn);
00139 
00140     pfree(real_filename);
00141 }
00142 
00143 Datum
00144 dxsyn_init(PG_FUNCTION_ARGS)
00145 {
00146     List       *dictoptions = (List *) PG_GETARG_POINTER(0);
00147     DictSyn    *d;
00148     ListCell   *l;
00149     char       *filename = NULL;
00150 
00151     d = (DictSyn *) palloc0(sizeof(DictSyn));
00152     d->len = 0;
00153     d->syn = NULL;
00154     d->matchorig = true;
00155     d->keeporig = true;
00156     d->matchsynonyms = false;
00157     d->keepsynonyms = true;
00158 
00159     foreach(l, dictoptions)
00160     {
00161         DefElem    *defel = (DefElem *) lfirst(l);
00162 
00163         if (pg_strcasecmp(defel->defname, "MATCHORIG") == 0)
00164         {
00165             d->matchorig = defGetBoolean(defel);
00166         }
00167         else if (pg_strcasecmp(defel->defname, "KEEPORIG") == 0)
00168         {
00169             d->keeporig = defGetBoolean(defel);
00170         }
00171         else if (pg_strcasecmp(defel->defname, "MATCHSYNONYMS") == 0)
00172         {
00173             d->matchsynonyms = defGetBoolean(defel);
00174         }
00175         else if (pg_strcasecmp(defel->defname, "KEEPSYNONYMS") == 0)
00176         {
00177             d->keepsynonyms = defGetBoolean(defel);
00178         }
00179         else if (pg_strcasecmp(defel->defname, "RULES") == 0)
00180         {
00181             /* we can't read the rules before parsing all options! */
00182             filename = defGetString(defel);
00183         }
00184         else
00185         {
00186             ereport(ERROR,
00187                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00188                      errmsg("unrecognized xsyn parameter: \"%s\"",
00189                             defel->defname)));
00190         }
00191     }
00192 
00193     if (filename)
00194         read_dictionary(d, filename);
00195 
00196     PG_RETURN_POINTER(d);
00197 }
00198 
00199 Datum
00200 dxsyn_lexize(PG_FUNCTION_ARGS)
00201 {
00202     DictSyn    *d = (DictSyn *) PG_GETARG_POINTER(0);
00203     char       *in = (char *) PG_GETARG_POINTER(1);
00204     int         length = PG_GETARG_INT32(2);
00205     Syn         word;
00206     Syn        *found;
00207     TSLexeme   *res = NULL;
00208 
00209     if (!length || d->len == 0)
00210         PG_RETURN_POINTER(NULL);
00211 
00212     /* Create search pattern */
00213     {
00214         char       *temp = pnstrdup(in, length);
00215 
00216         word.key = lowerstr(temp);
00217         pfree(temp);
00218         word.value = NULL;
00219     }
00220 
00221     /* Look for matching syn */
00222     found = (Syn *) bsearch(&word, d->syn, d->len, sizeof(Syn), compare_syn);
00223     pfree(word.key);
00224 
00225     if (!found)
00226         PG_RETURN_POINTER(NULL);
00227 
00228     /* Parse string of synonyms and return array of words */
00229     {
00230         char       *value = found->value;
00231         char       *syn;
00232         char       *pos;
00233         char       *end;
00234         int         nsyns = 0;
00235 
00236         res = palloc(sizeof(TSLexeme));
00237 
00238         pos = value;
00239         while ((syn = find_word(pos, &end)) != NULL)
00240         {
00241             res = repalloc(res, sizeof(TSLexeme) * (nsyns + 2));
00242 
00243             /* The first word is output only if keeporig=true */
00244             if (pos != value || d->keeporig)
00245             {
00246                 res[nsyns].lexeme = pnstrdup(syn, end - syn);
00247                 res[nsyns].nvariant = 0;
00248                 res[nsyns].flags = 0;
00249                 nsyns++;
00250             }
00251 
00252             pos = end;
00253 
00254             /* Stop if we are not to output the synonyms */
00255             if (!d->keepsynonyms)
00256                 break;
00257         }
00258         res[nsyns].lexeme = NULL;
00259     }
00260 
00261     PG_RETURN_POINTER(res);
00262 }