#include "postgres.h"#include <ctype.h>#include "mb/pg_wchar.h"#include "utils/builtins.h"#include "levenshtein.c"
Go to the source code of this file.
Defines | |
| #define | SOUNDEX_LEN 4 |
| #define | MAX_METAPHONE_STRLEN 255 |
| #define | META_ERROR FALSE |
| #define | META_SUCCESS TRUE |
| #define | META_FAILURE FALSE |
| #define | SH 'X' |
| #define | TH '0' |
| #define | isvowel(c) (getcode(c) & 1) |
| #define | NOCHANGE(c) (getcode(c) & 2) |
| #define | AFFECTH(c) (getcode(c) & 4) |
| #define | MAKESOFT(c) (getcode(c) & 8) |
| #define | NOGHTOF(c) (getcode(c) & 16) |
| #define | LEVENSHTEIN_LESS_EQUAL |
| #define | Next_Letter (toupper((unsigned char) word[w_idx+1])) |
| #define | Curr_Letter (toupper((unsigned char) word[w_idx])) |
| #define | Look_Back_Letter(n) (w_idx >= (n) ? toupper((unsigned char) word[w_idx-(n)]) : '\0') |
| #define | Prev_Letter (Look_Back_Letter(1)) |
| #define | After_Next_Letter (Next_Letter != '\0' ? toupper((unsigned char) word[w_idx+2]) : '\0') |
| #define | Look_Ahead_Letter(n) toupper((unsigned char) Lookahead(word+w_idx, n)) |
| #define | Phonize(c) do {(*phoned_word)[p_idx++] = c;} while (0) |
| #define | End_Phoned_Word do {(*phoned_word)[p_idx] = '\0';} while (0) |
| #define | Phone_Len (p_idx) |
| #define | Isbreak(c) (!isalpha((unsigned char) (c))) |
Functions | |
| Datum | levenshtein_with_costs (PG_FUNCTION_ARGS) |
| Datum | levenshtein (PG_FUNCTION_ARGS) |
| Datum | levenshtein_less_equal_with_costs (PG_FUNCTION_ARGS) |
| Datum | levenshtein_less_equal (PG_FUNCTION_ARGS) |
| Datum | metaphone (PG_FUNCTION_ARGS) |
| Datum | soundex (PG_FUNCTION_ARGS) |
| Datum | difference (PG_FUNCTION_ARGS) |
| static void | _soundex (const char *instr, char *outstr) |
| static char | soundex_code (char letter) |
| static char | Lookahead (char *word, int how_far) |
| static int | _metaphone (char *word, int max_phonemes, char **phoned_word) |
| static int | getcode (char c) |
| static bool | rest_of_char_same (const char *s1, const char *s2, int len) |
| PG_FUNCTION_INFO_V1 (levenshtein_with_costs) | |
| PG_FUNCTION_INFO_V1 (levenshtein) | |
| PG_FUNCTION_INFO_V1 (levenshtein_less_equal_with_costs) | |
| PG_FUNCTION_INFO_V1 (levenshtein_less_equal) | |
| PG_FUNCTION_INFO_V1 (metaphone) | |
| PG_FUNCTION_INFO_V1 (soundex) | |
| PG_FUNCTION_INFO_V1 (difference) | |
Variables | |
| PG_MODULE_MAGIC | |
| static const char * | soundex_table = "01230120022455012623010202" |
| static const char | _codes [26] |
| #define AFFECTH | ( | c | ) | (getcode(c) & 4) |
Definition at line 161 of file fuzzystrmatch.c.
Referenced by _metaphone().
| #define After_Next_Letter (Next_Letter != '\0' ? toupper((unsigned char) word[w_idx+2]) : '\0') |
Definition at line 313 of file fuzzystrmatch.c.
Referenced by _metaphone().
| #define Curr_Letter (toupper((unsigned char) word[w_idx])) |
Definition at line 306 of file fuzzystrmatch.c.
Referenced by _metaphone().
| #define End_Phoned_Word do {(*phoned_word)[p_idx] = '\0';} while (0) |
Definition at line 338 of file fuzzystrmatch.c.
| #define Isbreak | ( | c | ) | (!isalpha((unsigned char) (c))) |
Definition at line 343 of file fuzzystrmatch.c.
Referenced by _metaphone().
| #define isvowel | ( | c | ) | (getcode(c) & 1) |
Definition at line 155 of file fuzzystrmatch.c.
Referenced by _metaphone().
| #define LEVENSHTEIN_LESS_EQUAL |
Definition at line 183 of file fuzzystrmatch.c.
| #define Look_Ahead_Letter | ( | n | ) | toupper((unsigned char) Lookahead(word+w_idx, n)) |
Definition at line 315 of file fuzzystrmatch.c.
Referenced by _metaphone().
| #define Look_Back_Letter | ( | n | ) | (w_idx >= (n) ? toupper((unsigned char) word[w_idx-(n)]) : '\0') |
Definition at line 308 of file fuzzystrmatch.c.
Referenced by _metaphone().
| #define MAKESOFT | ( | c | ) | (getcode(c) & 8) |
Definition at line 164 of file fuzzystrmatch.c.
Referenced by _metaphone().
| #define MAX_METAPHONE_STRLEN 255 |
Definition at line 83 of file fuzzystrmatch.c.
Referenced by metaphone().
| #define META_ERROR FALSE |
Definition at line 115 of file fuzzystrmatch.c.
| #define META_FAILURE FALSE |
Definition at line 117 of file fuzzystrmatch.c.
| #define META_SUCCESS TRUE |
Definition at line 116 of file fuzzystrmatch.c.
Referenced by _metaphone(), and metaphone().
| #define Next_Letter (toupper((unsigned char) word[w_idx+1])) |
Definition at line 304 of file fuzzystrmatch.c.
Referenced by _metaphone().
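| #define NOCHANGE | ( | c | ) | (getcode(c) & 2) |
Definition at line 158 of file fuzzystrmatch.c.
| #define NOGHTOF | ( | c | ) | (getcode(c) & 16) |
Definition at line 167 of file fuzzystrmatch.c.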
Referenced by _metaphone().
| #define Phone_Len (p_idx) |
Definition at line 340 of file fuzzystrmatch.c.
Referenced by _metaphone().
| #define Phonize | ( | c | ) | do {(*phoned_word)[p_idx++] = c;} while (0) |
Definition at line 336 of file fuzzystrmatch.c.
Referenced by _metaphone().
| #define Prev_Letter (Look_Back_Letter(1)) |
Definition at line 311 of file fuzzystrmatch.c.
Referenced by _metaphone().
| #define SH 'X' |
Definition at line 126 of file fuzzystrmatch.c.
Referenced by _metaphone().
| #define SOUNDEX_LEN 4 |
Definition at line 65 of file fuzzystrmatch.c.
Referenced by _soundex(), difference(), and soundex().
| #define TH '0' |
Definition at line 127 of file fuzzystrmatch.c.
Referenced by _metaphone().
| static int _metaphone | ( | char * | word, | |
| int | max_phonemes, | |||
| char ** | phoned_word | |||
| ) | [static] |
Definition at line 347 of file fuzzystrmatch.c.
References AFFECTH, After_Next_Letter, Curr_Letter, elog, ERROR, Isbreak, isvowel, Look_Ahead_Letter, Look_Back_Letter, MAKESOFT, META_SUCCESS, Next_Letter, NOGHTOF, NULL, palloc(), Phone_Len, Phonize, Prev_Letter, SH, and TH.
Referenced by metaphone().
{
/*
 * Body of _metaphone(): writes a phonetic encoding of `word` into a buffer
 * that is palloc'd here and handed back through *phoned_word.
 *
 * word         - NUL-terminated input; must be non-NULL and non-empty.
 * max_phonemes - upper bound on the number of code characters produced;
 *                must be > 0 (anything else is rejected below).
 * phoned_word  - out parameter receiving the NUL-terminated result.
 *
 * Every path that returns yields META_SUCCESS; invalid parameters are
 * reported through elog(ERROR) instead of a return code.
 *
 * The letter-access macros used below (Curr_Letter, Next_Letter,
 * Prev_Letter, ...) all read `word` relative to w_idx and upper-case the
 * character, so the case labels only need to list capital letters.
 */
int w_idx = 0; /* point in the phonization we're at. */
int p_idx = 0; /* end of the phoned phrase */
/*-- Parameter checks --*/
/*
 * Shouldn't be necessary, but left these here anyway jec Aug 3, 2001
 */
/* Negative phoneme length is meaningless */
if (!(max_phonemes > 0))
/* internal error */
elog(ERROR, "metaphone: Requested output length must be > 0");
/* Empty/null string is meaningless */
if ((word == NULL) || !(strlen(word) > 0))
/* internal error */
elog(ERROR, "metaphone: Input string length must be > 0");
/*-- Allocate memory for our phoned_phrase --*/
/*
 * NOTE(review): the check above already rejects max_phonemes <= 0, so
 * unless elog(ERROR) can return, the "== 0" branch here (and the matching
 * "max_phonemes == 0" tests further down) looks unreachable -- confirm
 * before relying on or removing it.
 */
if (max_phonemes == 0)
{ /* Assume largest possible */
*phoned_word = palloc(sizeof(char) * strlen(word) +1);
}
else
{
*phoned_word = palloc(sizeof(char) * max_phonemes + 1);
}
/*-- The first phoneme has to be processed specially. --*/
/* Find our first letter */
for (; !isalpha((unsigned char) (Curr_Letter)); w_idx++)
{
/* On the off chance we were given nothing but crap... */
if (Curr_Letter == '\0')
{
End_Phoned_Word;
return META_SUCCESS; /* For testing */
}
}
/*
 * Word-initial rules.  Cases that emit a code also advance w_idx past the
 * letters they consumed; cases that emit nothing leave w_idx alone so the
 * main loop below handles the same letter with the general rules.
 */
switch (Curr_Letter)
{
/* AE becomes E */
case 'A':
if (Next_Letter == 'E')
{
Phonize('E');
w_idx += 2;
}
/* Remember, preserve vowels at the beginning */
else
{
Phonize('A');
w_idx++;
}
break;
/* [GKP]N becomes N */
case 'G':
case 'K':
case 'P':
if (Next_Letter == 'N')
{
Phonize('N');
w_idx += 2;
}
break;
/*
 * WH becomes H, WR becomes R W if followed by a vowel
 */
case 'W':
if (Next_Letter == 'H' ||
Next_Letter == 'R')
{
Phonize(Next_Letter);
w_idx += 2;
}
else if (isvowel(Next_Letter))
{
Phonize('W');
w_idx += 2;
}
/* else ignore */
break;
/* X becomes S */
case 'X':
Phonize('S');
w_idx++;
break;
/* Vowels are kept */
/*
 * We did A already case 'A': case 'a':
 */
case 'E':
case 'I':
case 'O':
case 'U':
Phonize(Curr_Letter);
w_idx++;
break;
default:
/* do nothing */
break;
}
/* On to the metaphoning */
/*
 * Main loop: one input letter per iteration (plus any extra letters
 * skipped via skip_letter), stopping at the end of the input or once
 * Phone_Len reaches max_phonemes.
 */
for (; Curr_Letter != '\0' &&
(max_phonemes == 0 || Phone_Len < max_phonemes);
w_idx++)
{
/*
 * How many letters to skip because an earlier encoding handled
 * multiple letters
 */
unsigned short int skip_letter = 0;
/*
 * THOUGHT: It would be nice if, rather than having things like...
 * well, SCI. For SCI you encode the S, then have to remember to skip
 * the C. So the phonome SCI invades both S and C. It would be
 * better, IMHO, to skip the C from the S part of the encoding. Hell,
 * I'm trying it.
 */
/* Ignore non-alphas */
if (!isalpha((unsigned char) (Curr_Letter)))
continue;
/* Drop duplicates, except CC */
if (Curr_Letter == Prev_Letter &&
Curr_Letter != 'C')
continue;
switch (Curr_Letter)
{
/* B -> B unless in MB */
case 'B':
if (Prev_Letter != 'M')
Phonize('B');
break;
/*
 * 'sh' if -CIA- or -CH, but not SCH, except SCHW. (SCHW is
 * handled in S) S if -CI-, -CE- or -CY- dropped if -SCI-,
 * SCE-, -SCY- (handed in S) else K
 */
case 'C':
if (MAKESOFT(Next_Letter))
{ /* C[IEY] */
if (After_Next_Letter == 'A' &&
Next_Letter == 'I')
{ /* CIA */
Phonize(SH);
}
/* SC[IEY] */
else if (Prev_Letter == 'S')
{
/* Dropped */
}
else
Phonize('S');
}
else if (Next_Letter == 'H')
{
#ifndef USE_TRADITIONAL_METAPHONE
if (After_Next_Letter == 'R' ||
Prev_Letter == 'S')
{ /* Christ, School */
Phonize('K');
}
else
Phonize(SH);
#else
Phonize(SH);
#endif
/* the H has been consumed together with the C */
skip_letter++;
}
else
Phonize('K');
break;
/*
 * J if in -DGE-, -DGI- or -DGY- else T
 */
case 'D':
if (Next_Letter == 'G' &&
MAKESOFT(After_Next_Letter))
{
Phonize('J');
skip_letter++;
}
else
Phonize('T');
break;
/*
 * F if in -GH and not B--GH, D--GH, -H--GH, -H---GH else
 * dropped if -GNED, -GN, else dropped if -DGE-, -DGI- or
 * -DGY- (handled in D) else J if in -GE-, -GI, -GY and not GG
 * else K
 */
case 'G':
if (Next_Letter == 'H')
{
if (!(NOGHTOF(Look_Back_Letter(3)) ||
Look_Back_Letter(4) == 'H'))
{
Phonize('F');
skip_letter++;
}
else
{
/* silent */
}
}
else if (Next_Letter == 'N')
{
if (Isbreak(After_Next_Letter) ||
(After_Next_Letter == 'E' &&
Look_Ahead_Letter(3) == 'D'))
{
/* dropped */
}
else
Phonize('K');
}
else if (MAKESOFT(Next_Letter) &&
Prev_Letter != 'G')
Phonize('J');
else
Phonize('K');
break;
/* H if before a vowel and not after C,G,P,S,T */
case 'H':
if (isvowel(Next_Letter) &&
!AFFECTH(Prev_Letter))
Phonize('H');
break;
/*
 * dropped if after C else K
 */
case 'K':
if (Prev_Letter != 'C')
Phonize('K');
break;
/*
 * F if before H else P
 */
case 'P':
if (Next_Letter == 'H')
Phonize('F');
else
Phonize('P');
break;
/*
 * K
 */
case 'Q':
Phonize('K');
break;
/*
 * 'sh' in -SH-, -SIO- or -SIA- or -SCHW- else S
 */
case 'S':
if (Next_Letter == 'I' &&
(After_Next_Letter == 'O' ||
After_Next_Letter == 'A'))
Phonize(SH);
else if (Next_Letter == 'H')
{
Phonize(SH);
skip_letter++;
}
#ifndef USE_TRADITIONAL_METAPHONE
else if (Next_Letter == 'C' &&
Look_Ahead_Letter(2) == 'H' &&
Look_Ahead_Letter(3) == 'W')
{
Phonize(SH);
skip_letter += 2;
}
#endif
else
Phonize('S');
break;
/*
 * 'sh' in -TIA- or -TIO- else 'th' before H else T
 */
case 'T':
if (Next_Letter == 'I' &&
(After_Next_Letter == 'O' ||
After_Next_Letter == 'A'))
Phonize(SH);
else if (Next_Letter == 'H')
{
Phonize(TH);
skip_letter++;
}
else
Phonize('T');
break;
/* F */
case 'V':
Phonize('F');
break;
/* W before a vowel, else dropped */
case 'W':
if (isvowel(Next_Letter))
Phonize('W');
break;
/* KS */
case 'X':
Phonize('K');
/*
 * X is the only letter that emits two codes, so the length limit
 * is re-checked before writing the second one.
 */
if (max_phonemes == 0 || Phone_Len < max_phonemes)
Phonize('S');
break;
/* Y if followed by a vowel */
case 'Y':
if (isvowel(Next_Letter))
Phonize('Y');
break;
/* S */
case 'Z':
Phonize('S');
break;
/* No transformation */
case 'F':
case 'J':
case 'L':
case 'M':
case 'N':
case 'R':
Phonize(Curr_Letter);
break;
default:
/* nothing */
break;
} /* END SWITCH */
/* step over letters already consumed by a multi-letter rule */
w_idx += skip_letter;
} /* END FOR */
End_Phoned_Word;
return (META_SUCCESS);
} /* END metaphone */
| static void _soundex | ( | const char * | instr, | |
| char * | outstr | |||
| ) | [static] |
Definition at line 728 of file fuzzystrmatch.c.
References AssertArg, soundex_code(), and SOUNDEX_LEN.
Referenced by difference(), and soundex().
{
	/*
	 * Body of _soundex(): writes the Soundex code of instr into outstr.
	 *
	 * instr  - NUL-terminated input string.
	 * outstr - caller-supplied buffer of at least SOUNDEX_LEN + 1 bytes.
	 *
	 * On return every byte of outstr[0 .. SOUNDEX_LEN] has been written:
	 * either the first letter (upper-cased) followed by code characters and
	 * '0' padding, or all NUL bytes when instr contains no letter at all.
	 */
	int			count;

	AssertArg(instr);
	AssertArg(outstr);

	outstr[SOUNDEX_LEN] = '\0';

	/* Skip leading non-alphabetic characters */
	while (!isalpha((unsigned char) instr[0]) && instr[0])
		++instr;

	/*
	 * No string left: clear the whole buffer rather than just the first
	 * byte.  difference() compares all SOUNDEX_LEN positions of two such
	 * buffers, so leaving positions 1 .. SOUNDEX_LEN-1 unwritten would make
	 * it read uninitialized memory.
	 */
	if (!instr[0])
	{
		for (count = 0; count < SOUNDEX_LEN; count++)
			outstr[count] = '\0';
		return;
	}

	/* Take the first letter as is */
	*outstr++ = (char) toupper((unsigned char) *instr++);

	count = 1;
	while (*instr && count < SOUNDEX_LEN)
	{
		/* Only letters whose code differs from the previous character count */
		if (isalpha((unsigned char) *instr) &&
			soundex_code(*instr) != soundex_code(*(instr - 1)))
		{
			*outstr = soundex_code(instr[0]);
			/* Code '0' marks letters that are not encoded; skip those */
			if (*outstr != '0')
			{
				++outstr;
				++count;
			}
		}
		++instr;
	}

	/* Fill with 0's */
	while (count < SOUNDEX_LEN)
	{
		*outstr = '0';
		++outstr;
		++count;
	}
}
| Datum difference | ( | PG_FUNCTION_ARGS | ) |
Definition at line 779 of file fuzzystrmatch.c.
References _soundex(), i, PG_GETARG_TEXT_P, PG_RETURN_INT32, SOUNDEX_LEN, and text_to_cstring().
Referenced by checkcondition_str().
{
	/*
	 * Body of difference(): computes the Soundex code of both text
	 * arguments and returns how many of the SOUNDEX_LEN code positions
	 * hold the same character (0 .. SOUNDEX_LEN).
	 */
	char		code_a[SOUNDEX_LEN + 1];
	char		code_b[SOUNDEX_LEN + 1];
	int			matches = 0;
	int			pos;

	_soundex(text_to_cstring(PG_GETARG_TEXT_P(0)), code_a);
	_soundex(text_to_cstring(PG_GETARG_TEXT_P(1)), code_b);

	/* Position-by-position comparison of the two codes */
	for (pos = 0; pos < SOUNDEX_LEN; pos++)
		matches += (code_a[pos] == code_b[pos]) ? 1 : 0;

	PG_RETURN_INT32(matches);
}
| static int getcode | ( | char | c | ) | [static] |
Definition at line 143 of file fuzzystrmatch.c.
References _codes.
| Datum levenshtein | ( | PG_FUNCTION_ARGS | ) |
Definition at line 202 of file fuzzystrmatch.c.
References levenshtein_internal(), PG_GETARG_TEXT_PP, and PG_RETURN_INT32.
{
	/*
	 * Body of levenshtein(): edit distance between the two text arguments
	 * with a cost of 1 for each insertion, deletion and substitution.
	 */
	text	   *source = PG_GETARG_TEXT_PP(0);
	text	   *target = PG_GETARG_TEXT_PP(1);

	PG_RETURN_INT32(levenshtein_internal(source, target, 1, 1, 1));
}
| Datum levenshtein_less_equal | ( | PG_FUNCTION_ARGS | ) |
Definition at line 228 of file fuzzystrmatch.c.
References PG_GETARG_INT32, PG_GETARG_TEXT_PP, and PG_RETURN_INT32.
{
	/*
	 * Body of levenshtein_less_equal(): unit-cost variant that forwards the
	 * third argument as the distance bound to
	 * levenshtein_less_equal_internal().
	 */
	text	   *source = PG_GETARG_TEXT_PP(0);
	text	   *target = PG_GETARG_TEXT_PP(1);
	int			bound = PG_GETARG_INT32(2);

	PG_RETURN_INT32(levenshtein_less_equal_internal(source, target,
													1, 1, 1,
													bound));
}
| Datum levenshtein_less_equal_with_costs | ( | PG_FUNCTION_ARGS | ) |
Definition at line 213 of file fuzzystrmatch.c.
References PG_GETARG_INT32, PG_GETARG_TEXT_PP, and PG_RETURN_INT32.
{
	/*
	 * Body of levenshtein_less_equal_with_costs(): arguments are, in order,
	 * source text, target text, insertion cost, deletion cost, substitution
	 * cost and the distance bound; all are passed straight through to
	 * levenshtein_less_equal_internal().
	 */
	text	   *source = PG_GETARG_TEXT_PP(0);
	text	   *target = PG_GETARG_TEXT_PP(1);
	int			insert_cost = PG_GETARG_INT32(2);
	int			delete_cost = PG_GETARG_INT32(3);
	int			subst_cost = PG_GETARG_INT32(4);
	int			bound = PG_GETARG_INT32(5);

	PG_RETURN_INT32(levenshtein_less_equal_internal(source, target,
													insert_cost,
													delete_cost,
													subst_cost,
													bound));
}
| Datum levenshtein_with_costs | ( | PG_FUNCTION_ARGS | ) |
Definition at line 188 of file fuzzystrmatch.c.
References levenshtein_internal(), PG_GETARG_INT32, PG_GETARG_TEXT_PP, and PG_RETURN_INT32.
{
	/*
	 * Body of levenshtein_with_costs(): arguments are, in order, source
	 * text, target text, insertion cost, deletion cost and substitution
	 * cost; all are passed straight through to levenshtein_internal().
	 */
	text	   *source = PG_GETARG_TEXT_PP(0);
	text	   *target = PG_GETARG_TEXT_PP(1);
	int			insert_cost = PG_GETARG_INT32(2);
	int			delete_cost = PG_GETARG_INT32(3);
	int			subst_cost = PG_GETARG_INT32(4);

	PG_RETURN_INT32(levenshtein_internal(source, target,
										 insert_cost,
										 delete_cost,
										 subst_cost));
}
| static char Lookahead | ( | char * | word, | |
| int | how_far | |||
| ) | [static] |
Definition at line 321 of file fuzzystrmatch.c.
| Datum metaphone | ( | PG_FUNCTION_ARGS | ) |
Definition at line 245 of file fuzzystrmatch.c.
References _metaphone(), cstring_to_text(), elog, ereport, errcode(), errmsg(), ERROR, MAX_METAPHONE_STRLEN, META_SUCCESS, PG_GETARG_DATUM, PG_GETARG_INT32, PG_RETURN_NULL, PG_RETURN_TEXT_P, and TextDatumGetCString.
{
	/*
	 * Body of metaphone(): SQL-callable wrapper around _metaphone().
	 *
	 * Argument 0 is the input text, argument 1 the maximum output length.
	 * Returns the metaphone code as text.  An empty input yields an empty
	 * result.  Errors are raised when the input is longer than
	 * MAX_METAPHONE_STRLEN bytes, or when the requested output length is
	 * not in the range 1 .. MAX_METAPHONE_STRLEN.
	 */
	char	   *str_i = TextDatumGetCString(PG_GETARG_DATUM(0));
	size_t		str_i_len = strlen(str_i);
	int			reqlen;
	char	   *metaph;
	int			retval;

	/*
	 * Return an empty string if we receive one.  Because this returns
	 * before anything else is examined, no further zero-length check on the
	 * input is needed below.
	 */
	if (!(str_i_len > 0))
		PG_RETURN_TEXT_P(cstring_to_text(""));

	if (str_i_len > MAX_METAPHONE_STRLEN)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("argument exceeds the maximum length of %d bytes",
						MAX_METAPHONE_STRLEN)));

	/* Validate the requested output length before calling the encoder */
	reqlen = PG_GETARG_INT32(1);
	if (reqlen > MAX_METAPHONE_STRLEN)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("output exceeds the maximum length of %d bytes",
						MAX_METAPHONE_STRLEN)));

	if (!(reqlen > 0))
		ereport(ERROR,
				(errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING),
				 errmsg("output cannot be empty string")));

	retval = _metaphone(str_i, reqlen, &metaph);
	if (retval == META_SUCCESS)
		PG_RETURN_TEXT_P(cstring_to_text(metaph));
	else
	{
		/* internal error */
		elog(ERROR, "metaphone: failure");
		/* keep the compiler quiet */
		PG_RETURN_NULL();
	}
}
| PG_FUNCTION_INFO_V1 | ( | difference | ) |
| PG_FUNCTION_INFO_V1 | ( | levenshtein_less_equal_with_costs | ) |
| PG_FUNCTION_INFO_V1 | ( | soundex | ) |
| PG_FUNCTION_INFO_V1 | ( | metaphone | ) |
| PG_FUNCTION_INFO_V1 | ( | levenshtein_less_equal | ) |
| PG_FUNCTION_INFO_V1 | ( | levenshtein_with_costs | ) |
| PG_FUNCTION_INFO_V1 | ( | levenshtein | ) |
| static bool rest_of_char_same | ( | const char * | s1, | |
| const char * | s2, | |||
| int | len | |||
| ) | [inline, static] |
Definition at line 171 of file fuzzystrmatch.c.
Referenced by levenshtein_internal().
| Datum soundex | ( | PG_FUNCTION_ARGS | ) |
Definition at line 715 of file fuzzystrmatch.c.
References _soundex(), arg, cstring_to_text(), PG_GETARG_TEXT_P, PG_RETURN_TEXT_P, SOUNDEX_LEN, and text_to_cstring().
{
	/*
	 * Body of soundex(): converts the text argument to a C string, runs
	 * _soundex() on it and returns the resulting code as text.
	 */
	char		code[SOUNDEX_LEN + 1];
	char	   *input = text_to_cstring(PG_GETARG_TEXT_P(0));

	_soundex(input, code);

	PG_RETURN_TEXT_P(cstring_to_text(code));
}
| static char soundex_code | ( | char | letter | ) | [static] |
Definition at line 71 of file fuzzystrmatch.c.
References soundex_table.
Referenced by _soundex().
{
	/*
	 * Body of soundex_code(): maps a letter to its entry in soundex_table.
	 * The letter is upper-cased first; anything that is then outside
	 * 'A' .. 'Z' has no table entry and is returned (upper-cased) as is.
	 */
	char		upper = toupper((unsigned char) letter);

	/* Defend against non-ASCII letters */
	if (upper < 'A' || upper > 'Z')
		return upper;

	return soundex_table[upper - 'A'];
}
const char _codes[26] [static] |
{
	/*
	 * Per-letter flag bits tested by the metaphone macros:
	 *   1 = isvowel, 2 = NOCHANGE, 4 = AFFECTH, 8 = MAKESOFT, 16 = NOGHTOF
	 * Presumably indexed by (letter - 'A') inside getcode() -- its body is
	 * not visible here, so confirm against that function.
	 */
	/* A   B   C   D   E   F   G   H   I   J   K   L   M */
	   1, 16,  4, 16,  9,  2,  4, 16,  9,  2,  0,  2,  2,
	/* N   O   P   Q   R   S   T   U   V   W   X   Y   Z */
	   2,  1,  4,  0,  2,  4,  4,  1,  0,  0,  0,  8,  0
}
Definition at line 137 of file fuzzystrmatch.c.
Referenced by getcode().
PG_MODULE_MAGIC
Definition at line 46 of file fuzzystrmatch.c.
const char* soundex_table = "01230120022455012623010202" [static] |
Definition at line 68 of file fuzzystrmatch.c.
Referenced by soundex_code().
1.7.1