#include "postgres.h"
#include <ctype.h>
#include "mb/pg_wchar.h"
#include "utils/builtins.h"
#include "levenshtein.c"
Go to the source code of this file.
Defines | |
#define | SOUNDEX_LEN 4 |
#define | MAX_METAPHONE_STRLEN 255 |
#define | META_ERROR FALSE |
#define | META_SUCCESS TRUE |
#define | META_FAILURE FALSE |
#define | SH 'X' |
#define | TH '0' |
#define | isvowel(c) (getcode(c) & 1) |
#define | NOCHANGE(c) (getcode(c) & 2) |
#define | AFFECTH(c) (getcode(c) & 4) |
#define | MAKESOFT(c) (getcode(c) & 8) |
#define | NOGHTOF(c) (getcode(c) & 16) |
#define | LEVENSHTEIN_LESS_EQUAL |
#define | Next_Letter (toupper((unsigned char) word[w_idx+1])) |
#define | Curr_Letter (toupper((unsigned char) word[w_idx])) |
#define | Look_Back_Letter(n) (w_idx >= (n) ? toupper((unsigned char) word[w_idx-(n)]) : '\0') |
#define | Prev_Letter (Look_Back_Letter(1)) |
#define | After_Next_Letter (Next_Letter != '\0' ? toupper((unsigned char) word[w_idx+2]) : '\0') |
#define | Look_Ahead_Letter(n) toupper((unsigned char) Lookahead(word+w_idx, n)) |
#define | Phonize(c) do {(*phoned_word)[p_idx++] = c;} while (0) |
#define | End_Phoned_Word do {(*phoned_word)[p_idx] = '\0';} while (0) |
#define | Phone_Len (p_idx) |
#define | Isbreak(c) (!isalpha((unsigned char) (c))) |
Functions | |
Datum | levenshtein_with_costs (PG_FUNCTION_ARGS) |
Datum | levenshtein (PG_FUNCTION_ARGS) |
Datum | levenshtein_less_equal_with_costs (PG_FUNCTION_ARGS) |
Datum | levenshtein_less_equal (PG_FUNCTION_ARGS) |
Datum | metaphone (PG_FUNCTION_ARGS) |
Datum | soundex (PG_FUNCTION_ARGS) |
Datum | difference (PG_FUNCTION_ARGS) |
static void | _soundex (const char *instr, char *outstr) |
static char | soundex_code (char letter) |
static char | Lookahead (char *word, int how_far) |
static int | _metaphone (char *word, int max_phonemes, char **phoned_word) |
static int | getcode (char c) |
static bool | rest_of_char_same (const char *s1, const char *s2, int len) |
PG_FUNCTION_INFO_V1 (levenshtein_with_costs) | |
PG_FUNCTION_INFO_V1 (levenshtein) | |
PG_FUNCTION_INFO_V1 (levenshtein_less_equal_with_costs) | |
PG_FUNCTION_INFO_V1 (levenshtein_less_equal) | |
PG_FUNCTION_INFO_V1 (metaphone) | |
PG_FUNCTION_INFO_V1 (soundex) | |
PG_FUNCTION_INFO_V1 (difference) | |
Variables | |
PG_MODULE_MAGIC | |
static const char * | soundex_table = "01230120022455012623010202" |
static const char | _codes [26] |
#define AFFECTH(c) (getcode(c) & 4)
Definition at line 161 of file fuzzystrmatch.c.
Referenced by _metaphone().
#define After_Next_Letter (Next_Letter != '\0' ? toupper((unsigned char) word[w_idx+2]) : '\0') |
Definition at line 313 of file fuzzystrmatch.c.
Referenced by _metaphone().
#define Curr_Letter (toupper((unsigned char) word[w_idx])) |
Definition at line 306 of file fuzzystrmatch.c.
Referenced by _metaphone().
#define End_Phoned_Word do {(*phoned_word)[p_idx] = '\0';} while (0) |
Definition at line 338 of file fuzzystrmatch.c.
#define Isbreak(c) (!isalpha((unsigned char) (c)))
Definition at line 343 of file fuzzystrmatch.c.
Referenced by _metaphone().
#define isvowel(c) (getcode(c) & 1)
Definition at line 155 of file fuzzystrmatch.c.
Referenced by _metaphone().
#define LEVENSHTEIN_LESS_EQUAL |
Definition at line 183 of file fuzzystrmatch.c.
#define Look_Ahead_Letter | ( | n | ) | toupper((unsigned char) Lookahead(word+w_idx, n)) |
Definition at line 315 of file fuzzystrmatch.c.
Referenced by _metaphone().
#define Look_Back_Letter | ( | n | ) | (w_idx >= (n) ? toupper((unsigned char) word[w_idx-(n)]) : '\0') |
Definition at line 308 of file fuzzystrmatch.c.
Referenced by _metaphone().
#define MAKESOFT(c) (getcode(c) & 8)
Definition at line 164 of file fuzzystrmatch.c.
Referenced by _metaphone().
#define MAX_METAPHONE_STRLEN 255 |
Definition at line 83 of file fuzzystrmatch.c.
Referenced by metaphone().
#define META_ERROR FALSE |
Definition at line 115 of file fuzzystrmatch.c.
#define META_FAILURE FALSE |
Definition at line 117 of file fuzzystrmatch.c.
#define META_SUCCESS TRUE |
Definition at line 116 of file fuzzystrmatch.c.
Referenced by _metaphone(), and metaphone().
#define Next_Letter (toupper((unsigned char) word[w_idx+1])) |
Definition at line 304 of file fuzzystrmatch.c.
Referenced by _metaphone().
#define NOCHANGE(c) (getcode(c) & 2)
Definition at line 158 of file fuzzystrmatch.c.
#define NOGHTOF(c) (getcode(c) & 16)
Definition at line 167 of file fuzzystrmatch.c.
Referenced by _metaphone().
#define Phone_Len (p_idx) |
Definition at line 340 of file fuzzystrmatch.c.
Referenced by _metaphone().
#define Phonize(c) do {(*phoned_word)[p_idx++] = c;} while (0)
Definition at line 336 of file fuzzystrmatch.c.
Referenced by _metaphone().
#define Prev_Letter (Look_Back_Letter(1)) |
Definition at line 311 of file fuzzystrmatch.c.
Referenced by _metaphone().
#define SH 'X' |
Definition at line 126 of file fuzzystrmatch.c.
Referenced by _metaphone().
#define SOUNDEX_LEN 4 |
Definition at line 65 of file fuzzystrmatch.c.
Referenced by _soundex(), difference(), and soundex().
#define TH '0' |
Definition at line 127 of file fuzzystrmatch.c.
Referenced by _metaphone().
static int _metaphone | ( | char * | word, | |
int | max_phonemes, | |||
char ** | phoned_word | |||
) | [static] |
Definition at line 347 of file fuzzystrmatch.c.
References AFFECTH, After_Next_Letter, Curr_Letter, elog, ERROR, Isbreak, isvowel, Look_Ahead_Letter, Look_Back_Letter, MAKESOFT, META_SUCCESS, Next_Letter, NOGHTOF, NULL, palloc(), Phone_Len, Phonize, Prev_Letter, SH, and TH.
Referenced by metaphone().
/*
 * Body of _metaphone(): encode `word` phonetically into a newly palloc'd,
 * NUL-terminated buffer that is handed back through *phoned_word.
 *
 *	word		 - input C string; NULL or empty input is reported via elog.
 *	max_phonemes - upper bound on emitted code characters; values <= 0 are
 *				   reported via elog.
 *	phoned_word	 - out parameter that receives the allocated buffer.
 *
 * Every path that returns yields META_SUCCESS.
 *
 * The locals `w_idx` and `p_idx` (and the parameters `word` and
 * `phoned_word`) are referenced by name from the Curr_Letter, Next_Letter,
 * Look_Back_Letter, Phonize, End_Phoned_Word and Phone_Len macros, so they
 * must keep these exact names.
 */
{
	int			w_idx = 0;		/* point in the phonization we're at. */
	int			p_idx = 0;		/* end of the phoned phrase */

	/*-- Parameter checks --*/

	/*
	 * Shouldn't be necessary, but left these here anyway jec Aug 3, 2001
	 */

	/* Negative phoneme length is meaningless */
	if (!(max_phonemes > 0))
		/* internal error */
		elog(ERROR, "metaphone: Requested output length must be > 0");

	/* Empty/null string is meaningless */
	if ((word == NULL) || !(strlen(word) > 0))
		/* internal error */
		elog(ERROR, "metaphone: Input string length must be > 0");

	/*-- Allocate memory for our phoned_phrase --*/

	/*
	 * NOTE(review): max_phonemes <= 0 was already rejected above, so the
	 * "== 0" branch can only run if elog(ERROR) returns -- presumably it
	 * does not; confirm before relying on it.
	 */
	if (max_phonemes == 0)
	{							/* Assume largest possible */
		*phoned_word = palloc(sizeof(char) * strlen(word) +1);
	}
	else
	{
		/* one extra byte for the terminating NUL written by End_Phoned_Word */
		*phoned_word = palloc(sizeof(char) * max_phonemes + 1);
	}

	/*-- The first phoneme has to be processed specially. --*/
	/* Find our first letter */
	for (; !isalpha((unsigned char) (Curr_Letter)); w_idx++)
	{
		/* On the off chance we were given nothing but crap... */
		if (Curr_Letter == '\0')
		{
			/* no letters at all: the result is the empty string */
			End_Phoned_Word;
			return META_SUCCESS;	/* For testing */
		}
	}

	/*
	 * Word-initial rules.  A case that emits a code also advances w_idx past
	 * the letters it consumed; a case that emits nothing leaves w_idx alone
	 * so the main loop below handles the same letter with the general rules.
	 */
	switch (Curr_Letter)
	{
			/* AE becomes E */
		case 'A':
			if (Next_Letter == 'E')
			{
				Phonize('E');
				w_idx += 2;
			}
			/* Remember, preserve vowels at the beginning */
			else
			{
				Phonize('A');
				w_idx++;
			}
			break;
			/* [GKP]N becomes N */
		case 'G':
		case 'K':
		case 'P':
			if (Next_Letter == 'N')
			{
				Phonize('N');
				w_idx += 2;
			}
			break;

			/*
			 * WH becomes H, WR becomes R W if followed by a vowel
			 */
		case 'W':
			if (Next_Letter == 'H' ||
				Next_Letter == 'R')
			{
				Phonize(Next_Letter);
				w_idx += 2;
			}
			else if (isvowel(Next_Letter))
			{
				/* the vowel after the W is consumed here as well */
				Phonize('W');
				w_idx += 2;
			}
			/* else ignore */
			break;
			/* X becomes S */
		case 'X':
			Phonize('S');
			w_idx++;
			break;
			/* Vowels are kept */

			/*
			 * We did A already case 'A': case 'a':
			 */
		case 'E':
		case 'I':
		case 'O':
		case 'U':
			Phonize(Curr_Letter);
			w_idx++;
			break;
		default:
			/* do nothing */
			break;
	}

	/*
	 * On to the metaphoning.  The loop stops at the end of the input or once
	 * max_phonemes code characters have been written.
	 */
	for (; Curr_Letter != '\0' &&
		 (max_phonemes == 0 || Phone_Len < max_phonemes);
		 w_idx++)
	{
		/*
		 * How many letters to skip because an earlier encoding handled
		 * multiple letters
		 */
		unsigned short int skip_letter = 0;

		/*
		 * THOUGHT:  It would be nice if, rather than having things like...
		 * well, SCI.  For SCI you encode the S, then have to remember to skip
		 * the C.  So the phonome SCI invades both S and C.  It would be
		 * better, IMHO, to skip the C from the S part of the encoding. Hell,
		 * I'm trying it.
		 */

		/* Ignore non-alphas */
		if (!isalpha((unsigned char) (Curr_Letter)))
			continue;

		/* Drop duplicates, except CC */
		if (Curr_Letter == Prev_Letter && Curr_Letter != 'C')
			continue;

		switch (Curr_Letter)
		{
				/* B -> B unless in MB */
			case 'B':
				if (Prev_Letter != 'M')
					Phonize('B');
				break;

				/*
				 * 'sh' if -CIA- or -CH, but not SCH, except SCHW. (SCHW is
				 * handled in S) S if -CI-, -CE- or -CY- dropped if -SCI-,
				 * SCE-, -SCY- (handed in S) else K
				 */
			case 'C':
				if (MAKESOFT(Next_Letter))
				{				/* C[IEY] */
					if (After_Next_Letter == 'A' &&
						Next_Letter == 'I')
					{			/* CIA */
						Phonize(SH);
					}
					/* SC[IEY] */
					else if (Prev_Letter == 'S')
					{
						/* Dropped */
					}
					else
						Phonize('S');
				}
				else if (Next_Letter == 'H')
				{
#ifndef USE_TRADITIONAL_METAPHONE
					if (After_Next_Letter == 'R' ||
						Prev_Letter == 'S')
					{			/* Christ, School */
						Phonize('K');
					}
					else
						Phonize(SH);
#else
					Phonize(SH);
#endif
					/* the H has been accounted for */
					skip_letter++;
				}
				else
					Phonize('K');
				break;

				/*
				 * J if in -DGE-, -DGI- or -DGY- else T
				 */
			case 'D':
				if (Next_Letter == 'G' &&
					MAKESOFT(After_Next_Letter))
				{
					Phonize('J');
					skip_letter++;
				}
				else
					Phonize('T');
				break;

				/*
				 * F if in -GH and not B--GH, D--GH, -H--GH, -H---GH else
				 * dropped if -GNED, -GN, else dropped if -DGE-, -DGI- or
				 * -DGY- (handled in D) else J if in -GE-, -GI, -GY and not GG
				 * else K
				 */
			case 'G':
				if (Next_Letter == 'H')
				{
					if (!(NOGHTOF(Look_Back_Letter(3)) ||
						  Look_Back_Letter(4) == 'H'))
					{
						Phonize('F');
						skip_letter++;
					}
					else
					{
						/* silent */
					}
				}
				else if (Next_Letter == 'N')
				{
					if (Isbreak(After_Next_Letter) ||
						(After_Next_Letter == 'E' &&
						 Look_Ahead_Letter(3) == 'D'))
					{
						/* dropped */
					}
					else
						Phonize('K');
				}
				else if (MAKESOFT(Next_Letter) &&
						 Prev_Letter != 'G')
					Phonize('J');
				else
					Phonize('K');
				break;
				/* H if before a vowel and not after C,G,P,S,T */
			case 'H':
				if (isvowel(Next_Letter) &&
					!AFFECTH(Prev_Letter))
					Phonize('H');
				break;

				/*
				 * dropped if after C else K
				 */
			case 'K':
				if (Prev_Letter != 'C')
					Phonize('K');
				break;

				/*
				 * F if before H else P
				 */
			case 'P':
				if (Next_Letter == 'H')
					Phonize('F');
				else
					Phonize('P');
				break;

				/*
				 * K
				 */
			case 'Q':
				Phonize('K');
				break;

				/*
				 * 'sh' in -SH-, -SIO- or -SIA- or -SCHW- else S
				 */
			case 'S':
				if (Next_Letter == 'I' &&
					(After_Next_Letter == 'O' ||
					 After_Next_Letter == 'A'))
					Phonize(SH);
				else if (Next_Letter == 'H')
				{
					Phonize(SH);
					skip_letter++;
				}
#ifndef USE_TRADITIONAL_METAPHONE
				else if (Next_Letter == 'C' &&
						 Look_Ahead_Letter(2) == 'H' &&
						 Look_Ahead_Letter(3) == 'W')
				{
					/* skip the C and the H; the W is handled next time round */
					Phonize(SH);
					skip_letter += 2;
				}
#endif
				else
					Phonize('S');
				break;

				/*
				 * 'sh' in -TIA- or -TIO- else 'th' before H else T
				 */
			case 'T':
				if (Next_Letter == 'I' &&
					(After_Next_Letter == 'O' ||
					 After_Next_Letter == 'A'))
					Phonize(SH);
				else if (Next_Letter == 'H')
				{
					Phonize(TH);
					skip_letter++;
				}
				else
					Phonize('T');
				break;
				/* F */
			case 'V':
				Phonize('F');
				break;
				/* W before a vowel, else dropped */
			case 'W':
				if (isvowel(Next_Letter))
					Phonize('W');
				break;
				/* KS */
			case 'X':
				Phonize('K');
				/* re-check the limit: X is the only letter emitting two codes */
				if (max_phonemes == 0 || Phone_Len < max_phonemes)
					Phonize('S');
				break;
				/* Y if followed by a vowel */
			case 'Y':
				if (isvowel(Next_Letter))
					Phonize('Y');
				break;
				/* S */
			case 'Z':
				Phonize('S');
				break;
				/* No transformation */
			case 'F':
			case 'J':
			case 'L':
			case 'M':
			case 'N':
			case 'R':
				Phonize(Curr_Letter);
				break;
			default:
				/* nothing */
				break;
		}						/* END SWITCH */

		/* step over letters already consumed by a multi-letter rule */
		w_idx += skip_letter;
	}							/* END FOR */

	End_Phoned_Word;

	return (META_SUCCESS);
}	/* END metaphone */
static void _soundex | ( | const char * | instr, | |
char * | outstr | |||
) | [static] |
Definition at line 728 of file fuzzystrmatch.c.
References AssertArg, soundex_code(), and SOUNDEX_LEN.
Referenced by difference(), and soundex().
/*
 * Body of _soundex(): write the Soundex code of `instr` into `outstr`.
 *
 *	instr  - NUL-terminated input string.
 *	outstr - caller-supplied buffer of at least SOUNDEX_LEN + 1 bytes.
 *
 * For input containing at least one letter the result is the upper-cased
 * first letter followed by code digits, padded with '0' to SOUNDEX_LEN
 * characters and NUL-terminated.  For input with no letters the whole
 * buffer is filled with NUL bytes.
 */
{
	int			count;

	AssertArg(instr);
	AssertArg(outstr);

	outstr[SOUNDEX_LEN] = '\0';

	/* Skip leading non-alphabetic characters */
	while (instr[0] && !isalpha((unsigned char) instr[0]))
		++instr;

	/*
	 * No string left.  Clear every byte, not just the first: difference()
	 * compares all SOUNDEX_LEN positions of two such buffers, so leaving
	 * bytes 1 .. SOUNDEX_LEN-1 unset would make it read indeterminate stack
	 * contents and return an unpredictable score.
	 */
	if (!instr[0])
	{
		for (count = 0; count <= SOUNDEX_LEN; count++)
			outstr[count] = '\0';
		return;
	}

	/* Take the first letter as is */
	*outstr++ = (char) toupper((unsigned char) *instr++);

	count = 1;
	while (*instr && count < SOUNDEX_LEN)
	{
		/*
		 * Emit a digit only for letters whose code differs from that of the
		 * immediately preceding input character.
		 */
		if (isalpha((unsigned char) *instr) &&
			soundex_code(*instr) != soundex_code(*(instr - 1)))
		{
			*outstr = soundex_code(instr[0]);
			/* code '0' is not kept: the slot is reused by the next write */
			if (*outstr != '0')
			{
				++outstr;
				++count;
			}
		}
		++instr;
	}

	/* Fill with 0's */
	while (count < SOUNDEX_LEN)
	{
		*outstr = '0';
		++outstr;
		++count;
	}
}
Datum difference | ( | PG_FUNCTION_ARGS | ) |
Definition at line 779 of file fuzzystrmatch.c.
References _soundex(), i, PG_GETARG_TEXT_P, PG_RETURN_INT32, SOUNDEX_LEN, and text_to_cstring().
Referenced by checkcondition_str().
/*
 * Body of difference(): count how many of the SOUNDEX_LEN positions hold
 * the same character in the Soundex codes of the two text arguments, and
 * return that count as an int32.
 */
{
	char		code_a[SOUNDEX_LEN + 1];
	char		code_b[SOUNDEX_LEN + 1];
	int			matches = 0;
	int			pos = 0;

	/* Encode both arguments into their own fixed-size buffers */
	_soundex(text_to_cstring(PG_GETARG_TEXT_P(0)), code_a);
	_soundex(text_to_cstring(PG_GETARG_TEXT_P(1)), code_b);

	/* Position-by-position comparison of the two codes */
	while (pos < SOUNDEX_LEN)
	{
		if (code_a[pos] == code_b[pos])
			matches++;
		pos++;
	}

	PG_RETURN_INT32(matches);
}
static int getcode | ( | char | c | ) | [static] |
Definition at line 143 of file fuzzystrmatch.c.
References _codes.
Datum levenshtein | ( | PG_FUNCTION_ARGS | ) |
Definition at line 202 of file fuzzystrmatch.c.
References levenshtein_internal(), PG_GETARG_TEXT_PP, and PG_RETURN_INT32.
/*
 * Body of levenshtein(): pass the two text arguments to
 * levenshtein_internal() with the insert, delete and substitute costs all
 * set to 1, and return the result as an int32.
 */
{
	text	   *source = PG_GETARG_TEXT_PP(0);
	text	   *target = PG_GETARG_TEXT_PP(1);
	int			distance;

	distance = levenshtein_internal(source, target, 1, 1, 1);

	PG_RETURN_INT32(distance);
}
Datum levenshtein_less_equal | ( | PG_FUNCTION_ARGS | ) |
Definition at line 228 of file fuzzystrmatch.c.
References PG_GETARG_INT32, PG_GETARG_TEXT_PP, and PG_RETURN_INT32.
/*
 * Body of levenshtein_less_equal(): pass the two text arguments and the
 * int32 third argument (max_d) to levenshtein_less_equal_internal() with
 * the insert, delete and substitute costs all set to 1, and return the
 * result as an int32.
 */
{
	text	   *source = PG_GETARG_TEXT_PP(0);
	text	   *target = PG_GETARG_TEXT_PP(1);
	int			limit = PG_GETARG_INT32(2);
	int			distance;

	distance = levenshtein_less_equal_internal(source, target, 1, 1, 1, limit);

	PG_RETURN_INT32(distance);
}
Datum levenshtein_less_equal_with_costs | ( | PG_FUNCTION_ARGS | ) |
Definition at line 213 of file fuzzystrmatch.c.
References PG_GETARG_INT32, PG_GETARG_TEXT_PP, and PG_RETURN_INT32.
/*
 * Body of levenshtein_less_equal_with_costs(): fetch two text arguments
 * followed by four int32 arguments (insert cost, delete cost, substitute
 * cost, max_d), forward them in that order to
 * levenshtein_less_equal_internal(), and return the result as an int32.
 */
{
	text	   *source = PG_GETARG_TEXT_PP(0);
	text	   *target = PG_GETARG_TEXT_PP(1);
	int			insert_cost = PG_GETARG_INT32(2);
	int			delete_cost = PG_GETARG_INT32(3);
	int			substitute_cost = PG_GETARG_INT32(4);
	int			limit = PG_GETARG_INT32(5);
	int			distance;

	distance = levenshtein_less_equal_internal(source, target,
											   insert_cost, delete_cost,
											   substitute_cost, limit);

	PG_RETURN_INT32(distance);
}
Datum levenshtein_with_costs | ( | PG_FUNCTION_ARGS | ) |
Definition at line 188 of file fuzzystrmatch.c.
References levenshtein_internal(), PG_GETARG_INT32, PG_GETARG_TEXT_PP, and PG_RETURN_INT32.
/*
 * Body of levenshtein_with_costs(): fetch two text arguments followed by
 * three int32 arguments (insert cost, delete cost, substitute cost),
 * forward them in that order to levenshtein_internal(), and return the
 * result as an int32.
 */
{
	text	   *source = PG_GETARG_TEXT_PP(0);
	text	   *target = PG_GETARG_TEXT_PP(1);
	int			insert_cost = PG_GETARG_INT32(2);
	int			delete_cost = PG_GETARG_INT32(3);
	int			substitute_cost = PG_GETARG_INT32(4);
	int			distance;

	distance = levenshtein_internal(source, target,
									insert_cost, delete_cost,
									substitute_cost);

	PG_RETURN_INT32(distance);
}
static char Lookahead | ( | char * | word, | |
int | how_far | |||
) | [static] |
Definition at line 321 of file fuzzystrmatch.c.
Datum metaphone | ( | PG_FUNCTION_ARGS | ) |
Definition at line 245 of file fuzzystrmatch.c.
References _metaphone(), cstring_to_text(), elog, ereport, errcode(), errmsg(), ERROR, MAX_METAPHONE_STRLEN, META_SUCCESS, PG_GETARG_DATUM, PG_GETARG_INT32, PG_RETURN_NULL, PG_RETURN_TEXT_P, and TextDatumGetCString.
/*
 * Body of metaphone(): SQL-callable wrapper around _metaphone().
 *
 * Argument 0 is the text to encode, argument 1 the requested maximum
 * output length (int32).
 *
 *	- Empty input yields an empty text result.
 *	- Input longer than MAX_METAPHONE_STRLEN bytes, or a requested length
 *	  above MAX_METAPHONE_STRLEN, is rejected with
 *	  ERRCODE_INVALID_PARAMETER_VALUE.
 *	- A requested length <= 0 is rejected with
 *	  ERRCODE_ZERO_LENGTH_CHARACTER_STRING.
 *
 * Otherwise the string produced by _metaphone() is returned as text.
 */
{
	char	   *str_i = TextDatumGetCString(PG_GETARG_DATUM(0));
	size_t		str_i_len = strlen(str_i);
	int			reqlen;
	char	   *metaph;
	int			retval;

	/* return an empty string if we receive one */
	if (str_i_len == 0)
		PG_RETURN_TEXT_P(cstring_to_text(""));

	/*
	 * The input is known to be non-empty from here on, so the former
	 * "argument is empty string" error check could never fire and has been
	 * dropped.
	 */
	if (str_i_len > MAX_METAPHONE_STRLEN)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("argument exceeds the maximum length of %d bytes",
						MAX_METAPHONE_STRLEN)));

	reqlen = PG_GETARG_INT32(1);
	if (reqlen > MAX_METAPHONE_STRLEN)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("output exceeds the maximum length of %d bytes",
						MAX_METAPHONE_STRLEN)));

	if (!(reqlen > 0))
		ereport(ERROR,
				(errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING),
				 errmsg("output cannot be empty string")));

	/* _metaphone() allocates the result buffer and stores it in metaph */
	retval = _metaphone(str_i, reqlen, &metaph);
	if (retval == META_SUCCESS)
		PG_RETURN_TEXT_P(cstring_to_text(metaph));
	else
	{
		/* internal error */
		elog(ERROR, "metaphone: failure");
		/* keep the compiler quiet */
		PG_RETURN_NULL();
	}
}
PG_FUNCTION_INFO_V1 | ( | difference | ) |
PG_FUNCTION_INFO_V1 | ( | levenshtein_less_equal_with_costs | ) |
PG_FUNCTION_INFO_V1 | ( | soundex | ) |
PG_FUNCTION_INFO_V1 | ( | metaphone | ) |
PG_FUNCTION_INFO_V1 | ( | levenshtein_less_equal | ) |
PG_FUNCTION_INFO_V1 | ( | levenshtein_with_costs | ) |
PG_FUNCTION_INFO_V1 | ( | levenshtein | ) |
static bool rest_of_char_same | ( | const char * | s1, | |
const char * | s2, | |||
int | len | |||
) | [inline, static] |
Definition at line 171 of file fuzzystrmatch.c.
Referenced by levenshtein_internal().
Datum soundex | ( | PG_FUNCTION_ARGS | ) |
Definition at line 715 of file fuzzystrmatch.c.
References _soundex(), arg, cstring_to_text(), PG_GETARG_TEXT_P, PG_RETURN_TEXT_P, SOUNDEX_LEN, and text_to_cstring().
/*
 * Body of soundex(): convert the text argument to a C string, encode it
 * with _soundex() into a local SOUNDEX_LEN + 1 byte buffer, and return that
 * buffer as text.
 */
{
	char		code[SOUNDEX_LEN + 1];
	char	   *input = text_to_cstring(PG_GETARG_TEXT_P(0));

	_soundex(input, code);

	PG_RETURN_TEXT_P(cstring_to_text(code));
}
static char soundex_code | ( | char | letter | ) | [static] |
Definition at line 71 of file fuzzystrmatch.c.
References soundex_table.
Referenced by _soundex().
/*
 * Body of soundex_code(): map a character to its entry in soundex_table.
 *
 * The character is upper-cased first.  If the result lies in 'A'..'Z' the
 * table entry at offset (character - 'A') is returned; any other character
 * is returned upper-cased but otherwise unchanged.
 */
{
	char		upper = toupper((unsigned char) letter);

	/* Defend against non-ASCII letters: they have no table entry */
	if (upper < 'A' || upper > 'Z')
		return upper;

	return soundex_table[upper - 'A'];
}
const char _codes[26] [static] |
{
	/*
	 * Per-entry flag bits, as tested by the macros of this file through
	 * getcode():
	 *	 1 = isvowel, 2 = NOCHANGE, 4 = AFFECTH, 8 = MAKESOFT, 16 = NOGHTOF
	 *
	 * NOTE(review): the 26 entries are presumably indexed by letter - 'A';
	 * the letter key below relies on that -- confirm against getcode().
	 */
	1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
	/* A  B  C  D  E  F  G  H  I  J  K  L  M  N  O  P  Q  R  S  T  U  V  W  X  Y  Z */
}
Definition at line 137 of file fuzzystrmatch.c.
Referenced by getcode().
PG_MODULE_MAGIC
Definition at line 46 of file fuzzystrmatch.c.
const char* soundex_table = "01230120022455012623010202" [static] |
Definition at line 68 of file fuzzystrmatch.c.
Referenced by soundex_code().