00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #include "postgres.h"
00014
00015 #include "commands/defrem.h"
00016 #include "tsearch/ts_locale.h"
00017 #include "tsearch/ts_utils.h"
00018
00019
00020 #ifdef MAXINT
00021 #undef MAXINT
00022 #endif
00023 #ifdef MININT
00024 #undef MININT
00025 #endif
00026
00027
00028 #include "snowball/libstemmer/header.h"
00029 #include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
00030 #include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
00031 #include "snowball/libstemmer/stem_ISO_8859_1_english.h"
00032 #include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
00033 #include "snowball/libstemmer/stem_ISO_8859_1_french.h"
00034 #include "snowball/libstemmer/stem_ISO_8859_1_german.h"
00035 #include "snowball/libstemmer/stem_ISO_8859_1_hungarian.h"
00036 #include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
00037 #include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
00038 #include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
00039 #include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
00040 #include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
00041 #include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
00042 #include "snowball/libstemmer/stem_ISO_8859_2_romanian.h"
00043 #include "snowball/libstemmer/stem_KOI8_R_russian.h"
00044 #include "snowball/libstemmer/stem_UTF_8_danish.h"
00045 #include "snowball/libstemmer/stem_UTF_8_dutch.h"
00046 #include "snowball/libstemmer/stem_UTF_8_english.h"
00047 #include "snowball/libstemmer/stem_UTF_8_finnish.h"
00048 #include "snowball/libstemmer/stem_UTF_8_french.h"
00049 #include "snowball/libstemmer/stem_UTF_8_german.h"
00050 #include "snowball/libstemmer/stem_UTF_8_hungarian.h"
00051 #include "snowball/libstemmer/stem_UTF_8_italian.h"
00052 #include "snowball/libstemmer/stem_UTF_8_norwegian.h"
00053 #include "snowball/libstemmer/stem_UTF_8_porter.h"
00054 #include "snowball/libstemmer/stem_UTF_8_portuguese.h"
00055 #include "snowball/libstemmer/stem_UTF_8_romanian.h"
00056 #include "snowball/libstemmer/stem_UTF_8_russian.h"
00057 #include "snowball/libstemmer/stem_UTF_8_spanish.h"
00058 #include "snowball/libstemmer/stem_UTF_8_swedish.h"
00059 #include "snowball/libstemmer/stem_UTF_8_turkish.h"
00060
00061
00062 PG_MODULE_MAGIC;
00063
00064 PG_FUNCTION_INFO_V1(dsnowball_init);
00065 Datum dsnowball_init(PG_FUNCTION_ARGS);
00066
00067 PG_FUNCTION_INFO_V1(dsnowball_lexize);
00068 Datum dsnowball_lexize(PG_FUNCTION_ARGS);
00069
00070
00071 typedef struct stemmer_module
00072 {
00073 const char *name;
00074 pg_enc enc;
00075 struct SN_env *(*create) (void);
00076 void (*close) (struct SN_env *);
00077 int (*stem) (struct SN_env *);
00078 } stemmer_module;
00079
00080 static const stemmer_module stemmer_modules[] =
00081 {
00082
00083
00084
00085 {"danish", PG_LATIN1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
00086 {"dutch", PG_LATIN1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
00087 {"english", PG_LATIN1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
00088 {"finnish", PG_LATIN1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
00089 {"french", PG_LATIN1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
00090 {"german", PG_LATIN1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
00091 {"hungarian", PG_LATIN1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
00092 {"italian", PG_LATIN1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
00093 {"norwegian", PG_LATIN1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
00094 {"porter", PG_LATIN1, porter_ISO_8859_1_create_env, porter_ISO_8859_1_close_env, porter_ISO_8859_1_stem},
00095 {"portuguese", PG_LATIN1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
00096 {"spanish", PG_LATIN1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
00097 {"swedish", PG_LATIN1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
00098 {"romanian", PG_LATIN2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
00099 {"russian", PG_KOI8R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
00100 {"danish", PG_UTF8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
00101 {"dutch", PG_UTF8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
00102 {"english", PG_UTF8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
00103 {"finnish", PG_UTF8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
00104 {"french", PG_UTF8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
00105 {"german", PG_UTF8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
00106 {"hungarian", PG_UTF8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
00107 {"italian", PG_UTF8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
00108 {"norwegian", PG_UTF8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
00109 {"porter", PG_UTF8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem},
00110 {"portuguese", PG_UTF8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
00111 {"romanian", PG_UTF8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
00112 {"russian", PG_UTF8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
00113 {"spanish", PG_UTF8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
00114 {"swedish", PG_UTF8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
00115 {"turkish", PG_UTF8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
00116
00117
00118
00119
00120
00121 {"english", PG_SQL_ASCII, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
00122
00123 {NULL, 0, NULL, NULL, NULL}
00124 };
00125
00126
00127 typedef struct DictSnowball
00128 {
00129 struct SN_env *z;
00130 StopList stoplist;
00131 bool needrecode;
00132 int (*stem) (struct SN_env * z);
00133
00134
00135
00136
00137
00138
00139 MemoryContext dictCtx;
00140 } DictSnowball;
00141
00142
00143 static void
00144 locate_stem_module(DictSnowball *d, char *lang)
00145 {
00146 const stemmer_module *m;
00147
00148
00149
00150
00151
00152 for (m = stemmer_modules; m->name; m++)
00153 {
00154 if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
00155 pg_strcasecmp(m->name, lang) == 0)
00156 {
00157 d->stem = m->stem;
00158 d->z = m->create();
00159 d->needrecode = false;
00160 return;
00161 }
00162 }
00163
00164
00165
00166
00167 for (m = stemmer_modules; m->name; m++)
00168 {
00169 if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
00170 {
00171 d->stem = m->stem;
00172 d->z = m->create();
00173 d->needrecode = true;
00174 return;
00175 }
00176 }
00177
00178 ereport(ERROR,
00179 (errcode(ERRCODE_UNDEFINED_OBJECT),
00180 errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
00181 lang, GetDatabaseEncodingName())));
00182 }
00183
00184 Datum
00185 dsnowball_init(PG_FUNCTION_ARGS)
00186 {
00187 List *dictoptions = (List *) PG_GETARG_POINTER(0);
00188 DictSnowball *d;
00189 bool stoploaded = false;
00190 ListCell *l;
00191
00192 d = (DictSnowball *) palloc0(sizeof(DictSnowball));
00193
00194 foreach(l, dictoptions)
00195 {
00196 DefElem *defel = (DefElem *) lfirst(l);
00197
00198 if (pg_strcasecmp("StopWords", defel->defname) == 0)
00199 {
00200 if (stoploaded)
00201 ereport(ERROR,
00202 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00203 errmsg("multiple StopWords parameters")));
00204 readstoplist(defGetString(defel), &d->stoplist, lowerstr);
00205 stoploaded = true;
00206 }
00207 else if (pg_strcasecmp("Language", defel->defname) == 0)
00208 {
00209 if (d->stem)
00210 ereport(ERROR,
00211 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00212 errmsg("multiple Language parameters")));
00213 locate_stem_module(d, defGetString(defel));
00214 }
00215 else
00216 {
00217 ereport(ERROR,
00218 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00219 errmsg("unrecognized Snowball parameter: \"%s\"",
00220 defel->defname)));
00221 }
00222 }
00223
00224 if (!d->stem)
00225 ereport(ERROR,
00226 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00227 errmsg("missing Language parameter")));
00228
00229 d->dictCtx = CurrentMemoryContext;
00230
00231 PG_RETURN_POINTER(d);
00232 }
00233
00234 Datum
00235 dsnowball_lexize(PG_FUNCTION_ARGS)
00236 {
00237 DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
00238 char *in = (char *) PG_GETARG_POINTER(1);
00239 int32 len = PG_GETARG_INT32(2);
00240 char *txt = lowerstr_with_len(in, len);
00241 TSLexeme *res = palloc0(sizeof(TSLexeme) * 2);
00242
00243 if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
00244 {
00245 pfree(txt);
00246 }
00247 else
00248 {
00249 MemoryContext saveCtx;
00250
00251
00252
00253
00254 if (d->needrecode)
00255 {
00256 char *recoded;
00257
00258 recoded = (char *) pg_do_encoding_conversion((unsigned char *) txt,
00259 strlen(txt),
00260 GetDatabaseEncoding(),
00261 PG_UTF8);
00262 if (recoded != txt)
00263 {
00264 pfree(txt);
00265 txt = recoded;
00266 }
00267 }
00268
00269
00270 saveCtx = MemoryContextSwitchTo(d->dictCtx);
00271 SN_set_current(d->z, strlen(txt), (symbol *) txt);
00272 d->stem(d->z);
00273 MemoryContextSwitchTo(saveCtx);
00274
00275 if (d->z->p && d->z->l)
00276 {
00277 txt = repalloc(txt, d->z->l + 1);
00278 memcpy(txt, d->z->p, d->z->l);
00279 txt[d->z->l] = '\0';
00280 }
00281
00282
00283 if (d->needrecode)
00284 {
00285 char *recoded;
00286
00287 recoded = (char *) pg_do_encoding_conversion((unsigned char *) txt,
00288 strlen(txt),
00289 PG_UTF8,
00290 GetDatabaseEncoding());
00291 if (recoded != txt)
00292 {
00293 pfree(txt);
00294 txt = recoded;
00295 }
00296 }
00297
00298 res->lexeme = txt;
00299 }
00300
00301 PG_RETURN_POINTER(res);
00302 }