Header And Logo

PostgreSQL
| The world's most advanced open source database.

dict_snowball.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * dict_snowball.c
00004  *      Snowball dictionary
00005  *
00006  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00007  *
00008  * IDENTIFICATION
00009  *    src/backend/snowball/dict_snowball.c
00010  *
00011  *-------------------------------------------------------------------------
00012  */
00013 #include "postgres.h"
00014 
00015 #include "commands/defrem.h"
00016 #include "tsearch/ts_locale.h"
00017 #include "tsearch/ts_utils.h"
00018 
00019 /* Some platforms define MAXINT and/or MININT, causing conflicts */
00020 #ifdef MAXINT
00021 #undef MAXINT
00022 #endif
00023 #ifdef MININT
00024 #undef MININT
00025 #endif
00026 
00027 /* Now we can include the original Snowball header.h */
00028 #include "snowball/libstemmer/header.h"
00029 #include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
00030 #include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
00031 #include "snowball/libstemmer/stem_ISO_8859_1_english.h"
00032 #include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
00033 #include "snowball/libstemmer/stem_ISO_8859_1_french.h"
00034 #include "snowball/libstemmer/stem_ISO_8859_1_german.h"
00035 #include "snowball/libstemmer/stem_ISO_8859_1_hungarian.h"
00036 #include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
00037 #include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
00038 #include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
00039 #include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
00040 #include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
00041 #include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
00042 #include "snowball/libstemmer/stem_ISO_8859_2_romanian.h"
00043 #include "snowball/libstemmer/stem_KOI8_R_russian.h"
00044 #include "snowball/libstemmer/stem_UTF_8_danish.h"
00045 #include "snowball/libstemmer/stem_UTF_8_dutch.h"
00046 #include "snowball/libstemmer/stem_UTF_8_english.h"
00047 #include "snowball/libstemmer/stem_UTF_8_finnish.h"
00048 #include "snowball/libstemmer/stem_UTF_8_french.h"
00049 #include "snowball/libstemmer/stem_UTF_8_german.h"
00050 #include "snowball/libstemmer/stem_UTF_8_hungarian.h"
00051 #include "snowball/libstemmer/stem_UTF_8_italian.h"
00052 #include "snowball/libstemmer/stem_UTF_8_norwegian.h"
00053 #include "snowball/libstemmer/stem_UTF_8_porter.h"
00054 #include "snowball/libstemmer/stem_UTF_8_portuguese.h"
00055 #include "snowball/libstemmer/stem_UTF_8_romanian.h"
00056 #include "snowball/libstemmer/stem_UTF_8_russian.h"
00057 #include "snowball/libstemmer/stem_UTF_8_spanish.h"
00058 #include "snowball/libstemmer/stem_UTF_8_swedish.h"
00059 #include "snowball/libstemmer/stem_UTF_8_turkish.h"
00060 
00061 
00062 PG_MODULE_MAGIC;
00063 
00064 PG_FUNCTION_INFO_V1(dsnowball_init);
00065 Datum       dsnowball_init(PG_FUNCTION_ARGS);
00066 
00067 PG_FUNCTION_INFO_V1(dsnowball_lexize);
00068 Datum       dsnowball_lexize(PG_FUNCTION_ARGS);
00069 
00070 /* List of supported modules */
00071 typedef struct stemmer_module
00072 {
00073     const char *name;
00074     pg_enc      enc;
00075     struct SN_env *(*create) (void);
00076     void        (*close) (struct SN_env *);
00077     int         (*stem) (struct SN_env *);
00078 } stemmer_module;
00079 
00080 static const stemmer_module stemmer_modules[] =
00081 {
00082     /*
00083      * Stemmers list from Snowball distribution
00084      */
00085     {"danish", PG_LATIN1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
00086     {"dutch", PG_LATIN1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
00087     {"english", PG_LATIN1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
00088     {"finnish", PG_LATIN1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
00089     {"french", PG_LATIN1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
00090     {"german", PG_LATIN1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
00091     {"hungarian", PG_LATIN1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
00092     {"italian", PG_LATIN1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
00093     {"norwegian", PG_LATIN1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
00094     {"porter", PG_LATIN1, porter_ISO_8859_1_create_env, porter_ISO_8859_1_close_env, porter_ISO_8859_1_stem},
00095     {"portuguese", PG_LATIN1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
00096     {"spanish", PG_LATIN1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
00097     {"swedish", PG_LATIN1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
00098     {"romanian", PG_LATIN2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
00099     {"russian", PG_KOI8R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
00100     {"danish", PG_UTF8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
00101     {"dutch", PG_UTF8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
00102     {"english", PG_UTF8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
00103     {"finnish", PG_UTF8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
00104     {"french", PG_UTF8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
00105     {"german", PG_UTF8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
00106     {"hungarian", PG_UTF8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
00107     {"italian", PG_UTF8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
00108     {"norwegian", PG_UTF8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
00109     {"porter", PG_UTF8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem},
00110     {"portuguese", PG_UTF8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
00111     {"romanian", PG_UTF8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
00112     {"russian", PG_UTF8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
00113     {"spanish", PG_UTF8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
00114     {"swedish", PG_UTF8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
00115     {"turkish", PG_UTF8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
00116 
00117     /*
00118      * Stemmer with PG_SQL_ASCII encoding should be valid for any server
00119      * encoding
00120      */
00121     {"english", PG_SQL_ASCII, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
00122 
00123     {NULL, 0, NULL, NULL, NULL} /* list end marker */
00124 };
00125 
00126 
00127 typedef struct DictSnowball
00128 {
00129     struct SN_env *z;
00130     StopList    stoplist;
00131     bool        needrecode;     /* needs recoding before/after call stem */
00132     int         (*stem) (struct SN_env * z);
00133 
00134     /*
00135      * snowball saves alloced memory between calls, so we should run it in our
00136      * private memory context. Note, init function is executed in long lived
00137      * context, so we just remember CurrentMemoryContext
00138      */
00139     MemoryContext dictCtx;
00140 } DictSnowball;
00141 
00142 
00143 static void
00144 locate_stem_module(DictSnowball *d, char *lang)
00145 {
00146     const stemmer_module *m;
00147 
00148     /*
00149      * First, try to find exact match of stemmer module. Stemmer with
00150      * PG_SQL_ASCII encoding is treated as working with any server encoding
00151      */
00152     for (m = stemmer_modules; m->name; m++)
00153     {
00154         if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
00155             pg_strcasecmp(m->name, lang) == 0)
00156         {
00157             d->stem = m->stem;
00158             d->z = m->create();
00159             d->needrecode = false;
00160             return;
00161         }
00162     }
00163 
00164     /*
00165      * Second, try to find stemmer for needed language for UTF8 encoding.
00166      */
00167     for (m = stemmer_modules; m->name; m++)
00168     {
00169         if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
00170         {
00171             d->stem = m->stem;
00172             d->z = m->create();
00173             d->needrecode = true;
00174             return;
00175         }
00176     }
00177 
00178     ereport(ERROR,
00179             (errcode(ERRCODE_UNDEFINED_OBJECT),
00180              errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
00181                     lang, GetDatabaseEncodingName())));
00182 }
00183 
00184 Datum
00185 dsnowball_init(PG_FUNCTION_ARGS)
00186 {
00187     List       *dictoptions = (List *) PG_GETARG_POINTER(0);
00188     DictSnowball *d;
00189     bool        stoploaded = false;
00190     ListCell   *l;
00191 
00192     d = (DictSnowball *) palloc0(sizeof(DictSnowball));
00193 
00194     foreach(l, dictoptions)
00195     {
00196         DefElem    *defel = (DefElem *) lfirst(l);
00197 
00198         if (pg_strcasecmp("StopWords", defel->defname) == 0)
00199         {
00200             if (stoploaded)
00201                 ereport(ERROR,
00202                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00203                          errmsg("multiple StopWords parameters")));
00204             readstoplist(defGetString(defel), &d->stoplist, lowerstr);
00205             stoploaded = true;
00206         }
00207         else if (pg_strcasecmp("Language", defel->defname) == 0)
00208         {
00209             if (d->stem)
00210                 ereport(ERROR,
00211                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00212                          errmsg("multiple Language parameters")));
00213             locate_stem_module(d, defGetString(defel));
00214         }
00215         else
00216         {
00217             ereport(ERROR,
00218                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00219                      errmsg("unrecognized Snowball parameter: \"%s\"",
00220                             defel->defname)));
00221         }
00222     }
00223 
00224     if (!d->stem)
00225         ereport(ERROR,
00226                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00227                  errmsg("missing Language parameter")));
00228 
00229     d->dictCtx = CurrentMemoryContext;
00230 
00231     PG_RETURN_POINTER(d);
00232 }
00233 
00234 Datum
00235 dsnowball_lexize(PG_FUNCTION_ARGS)
00236 {
00237     DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
00238     char       *in = (char *) PG_GETARG_POINTER(1);
00239     int32       len = PG_GETARG_INT32(2);
00240     char       *txt = lowerstr_with_len(in, len);
00241     TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);
00242 
00243     if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
00244     {
00245         pfree(txt);
00246     }
00247     else
00248     {
00249         MemoryContext saveCtx;
00250 
00251         /*
00252          * recode to utf8 if stemmer is utf8 and doesn't match server encoding
00253          */
00254         if (d->needrecode)
00255         {
00256             char       *recoded;
00257 
00258             recoded = (char *) pg_do_encoding_conversion((unsigned char *) txt,
00259                                                          strlen(txt),
00260                                                        GetDatabaseEncoding(),
00261                                                          PG_UTF8);
00262             if (recoded != txt)
00263             {
00264                 pfree(txt);
00265                 txt = recoded;
00266             }
00267         }
00268 
00269         /* see comment about d->dictCtx */
00270         saveCtx = MemoryContextSwitchTo(d->dictCtx);
00271         SN_set_current(d->z, strlen(txt), (symbol *) txt);
00272         d->stem(d->z);
00273         MemoryContextSwitchTo(saveCtx);
00274 
00275         if (d->z->p && d->z->l)
00276         {
00277             txt = repalloc(txt, d->z->l + 1);
00278             memcpy(txt, d->z->p, d->z->l);
00279             txt[d->z->l] = '\0';
00280         }
00281 
00282         /* back recode if needed */
00283         if (d->needrecode)
00284         {
00285             char       *recoded;
00286 
00287             recoded = (char *) pg_do_encoding_conversion((unsigned char *) txt,
00288                                                          strlen(txt),
00289                                                          PG_UTF8,
00290                                                       GetDatabaseEncoding());
00291             if (recoded != txt)
00292             {
00293                 pfree(txt);
00294                 txt = recoded;
00295             }
00296         }
00297 
00298         res->lexeme = txt;
00299     }
00300 
00301     PG_RETURN_POINTER(res);
00302 }