00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039 #include "postgres.h"
00040
00041 #include <ctype.h>
00042
00043 #include "mb/pg_wchar.h"
00044 #include "utils/builtins.h"
00045
00046 PG_MODULE_MAGIC;
00047
00048
00049
00050
00051
00052 extern Datum levenshtein_with_costs(PG_FUNCTION_ARGS);
00053 extern Datum levenshtein(PG_FUNCTION_ARGS);
00054 extern Datum levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS);
00055 extern Datum levenshtein_less_equal(PG_FUNCTION_ARGS);
00056 extern Datum metaphone(PG_FUNCTION_ARGS);
00057 extern Datum soundex(PG_FUNCTION_ARGS);
00058 extern Datum difference(PG_FUNCTION_ARGS);
00059
00060
00061
00062
00063 static void _soundex(const char *instr, char *outstr);
00064
00065 #define SOUNDEX_LEN 4
00066
00067
00068 static const char *soundex_table = "01230120022455012623010202";
00069
00070 static char
00071 soundex_code(char letter)
00072 {
00073 letter = toupper((unsigned char) letter);
00074
00075 if (letter >= 'A' && letter <= 'Z')
00076 return soundex_table[letter - 'A'];
00077 return letter;
00078 }
00079
00080
00081
00082
00083 #define MAX_METAPHONE_STRLEN 255
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108
00109
00110
00111
00112
00113
00114
00115 #define META_ERROR FALSE
00116 #define META_SUCCESS TRUE
00117 #define META_FAILURE FALSE
00118
00119
00120
00121
00122
00123 #undef USE_TRADITIONAL_METAPHONE
00124
00125
00126 #define SH 'X'
00127 #define TH '0'
00128
00129 static char Lookahead(char *word, int how_far);
00130 static int _metaphone(char *word, int max_phonemes, char **phoned_word);
00131
00132
00133
00134
00135
00136
00137 static const char _codes[26] = {
00138 1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
00139
00140 };
00141
00142 static int
00143 getcode(char c)
00144 {
00145 if (isalpha((unsigned char) c))
00146 {
00147 c = toupper((unsigned char) c);
00148
00149 if (c >= 'A' && c <= 'Z')
00150 return _codes[c - 'A'];
00151 }
00152 return 0;
00153 }
00154
00155 #define isvowel(c) (getcode(c) & 1)
00156
00157
00158 #define NOCHANGE(c) (getcode(c) & 2)
00159
00160
00161 #define AFFECTH(c) (getcode(c) & 4)
00162
00163
00164 #define MAKESOFT(c) (getcode(c) & 8)
00165
00166
00167 #define NOGHTOF(c) (getcode(c) & 16)
00168
00169
00170 static inline bool
00171 rest_of_char_same(const char *s1, const char *s2, int len)
00172 {
00173 while (len > 0)
00174 {
00175 len--;
00176 if (s1[len] != s2[len])
00177 return false;
00178 }
00179 return true;
00180 }
00181
00182 #include "levenshtein.c"
00183 #define LEVENSHTEIN_LESS_EQUAL
00184 #include "levenshtein.c"
00185
00186 PG_FUNCTION_INFO_V1(levenshtein_with_costs);
00187 Datum
00188 levenshtein_with_costs(PG_FUNCTION_ARGS)
00189 {
00190 text *src = PG_GETARG_TEXT_PP(0);
00191 text *dst = PG_GETARG_TEXT_PP(1);
00192 int ins_c = PG_GETARG_INT32(2);
00193 int del_c = PG_GETARG_INT32(3);
00194 int sub_c = PG_GETARG_INT32(4);
00195
00196 PG_RETURN_INT32(levenshtein_internal(src, dst, ins_c, del_c, sub_c));
00197 }
00198
00199
00200 PG_FUNCTION_INFO_V1(levenshtein);
00201 Datum
00202 levenshtein(PG_FUNCTION_ARGS)
00203 {
00204 text *src = PG_GETARG_TEXT_PP(0);
00205 text *dst = PG_GETARG_TEXT_PP(1);
00206
00207 PG_RETURN_INT32(levenshtein_internal(src, dst, 1, 1, 1));
00208 }
00209
00210
00211 PG_FUNCTION_INFO_V1(levenshtein_less_equal_with_costs);
00212 Datum
00213 levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS)
00214 {
00215 text *src = PG_GETARG_TEXT_PP(0);
00216 text *dst = PG_GETARG_TEXT_PP(1);
00217 int ins_c = PG_GETARG_INT32(2);
00218 int del_c = PG_GETARG_INT32(3);
00219 int sub_c = PG_GETARG_INT32(4);
00220 int max_d = PG_GETARG_INT32(5);
00221
00222 PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, ins_c, del_c, sub_c, max_d));
00223 }
00224
00225
00226 PG_FUNCTION_INFO_V1(levenshtein_less_equal);
00227 Datum
00228 levenshtein_less_equal(PG_FUNCTION_ARGS)
00229 {
00230 text *src = PG_GETARG_TEXT_PP(0);
00231 text *dst = PG_GETARG_TEXT_PP(1);
00232 int max_d = PG_GETARG_INT32(2);
00233
00234 PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, 1, 1, 1, max_d));
00235 }
00236
00237
00238
00239
00240
00241
00242
00243 PG_FUNCTION_INFO_V1(metaphone);
00244 Datum
00245 metaphone(PG_FUNCTION_ARGS)
00246 {
00247 char *str_i = TextDatumGetCString(PG_GETARG_DATUM(0));
00248 size_t str_i_len = strlen(str_i);
00249 int reqlen;
00250 char *metaph;
00251 int retval;
00252
00253
00254 if (!(str_i_len > 0))
00255 PG_RETURN_TEXT_P(cstring_to_text(""));
00256
00257 if (str_i_len > MAX_METAPHONE_STRLEN)
00258 ereport(ERROR,
00259 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00260 errmsg("argument exceeds the maximum length of %d bytes",
00261 MAX_METAPHONE_STRLEN)));
00262
00263 if (!(str_i_len > 0))
00264 ereport(ERROR,
00265 (errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING),
00266 errmsg("argument is empty string")));
00267
00268 reqlen = PG_GETARG_INT32(1);
00269 if (reqlen > MAX_METAPHONE_STRLEN)
00270 ereport(ERROR,
00271 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00272 errmsg("output exceeds the maximum length of %d bytes",
00273 MAX_METAPHONE_STRLEN)));
00274
00275 if (!(reqlen > 0))
00276 ereport(ERROR,
00277 (errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING),
00278 errmsg("output cannot be empty string")));
00279
00280
00281 retval = _metaphone(str_i, reqlen, &metaph);
00282 if (retval == META_SUCCESS)
00283 PG_RETURN_TEXT_P(cstring_to_text(metaph));
00284 else
00285 {
00286
00287 elog(ERROR, "metaphone: failure");
00288
00289 PG_RETURN_NULL();
00290 }
00291 }
00292
00293
00294
00295
00296
00297
00298
00299
00300
00301
00302
00303
00304 #define Next_Letter (toupper((unsigned char) word[w_idx+1]))
00305
00306 #define Curr_Letter (toupper((unsigned char) word[w_idx]))
00307
00308 #define Look_Back_Letter(n) \
00309 (w_idx >= (n) ? toupper((unsigned char) word[w_idx-(n)]) : '\0')
00310
00311 #define Prev_Letter (Look_Back_Letter(1))
00312
00313 #define After_Next_Letter \
00314 (Next_Letter != '\0' ? toupper((unsigned char) word[w_idx+2]) : '\0')
00315 #define Look_Ahead_Letter(n) toupper((unsigned char) Lookahead(word+w_idx, n))
00316
00317
00318
00319
00320 static char
00321 Lookahead(char *word, int how_far)
00322 {
00323 char letter_ahead = '\0';
00324 int idx;
00325
00326 for (idx = 0; word[idx] != '\0' && idx < how_far; idx++);
00327
00328
00329 letter_ahead = word[idx];
00330
00331 return letter_ahead;
00332 }
00333
00334
00335
00336 #define Phonize(c) do {(*phoned_word)[p_idx++] = c;} while (0)
00337
00338 #define End_Phoned_Word do {(*phoned_word)[p_idx] = '\0';} while (0)
00339
00340 #define Phone_Len (p_idx)
00341
00342
00343 #define Isbreak(c) (!isalpha((unsigned char) (c)))
00344
00345
00346 static int
00347 _metaphone(char *word,
00348 int max_phonemes,
00349 char **phoned_word)
00350 {
00351 int w_idx = 0;
00352 int p_idx = 0;
00353
00354
00355
00356
00357
00358
00359
00360
00361 if (!(max_phonemes > 0))
00362
00363 elog(ERROR, "metaphone: Requested output length must be > 0");
00364
00365
00366 if ((word == NULL) || !(strlen(word) > 0))
00367
00368 elog(ERROR, "metaphone: Input string length must be > 0");
00369
00370
00371 if (max_phonemes == 0)
00372 {
00373 *phoned_word = palloc(sizeof(char) * strlen(word) +1);
00374 }
00375 else
00376 {
00377 *phoned_word = palloc(sizeof(char) * max_phonemes + 1);
00378 }
00379
00380
00381
00382 for (; !isalpha((unsigned char) (Curr_Letter)); w_idx++)
00383 {
00384
00385 if (Curr_Letter == '\0')
00386 {
00387 End_Phoned_Word;
00388 return META_SUCCESS;
00389 }
00390 }
00391
00392 switch (Curr_Letter)
00393 {
00394
00395 case 'A':
00396 if (Next_Letter == 'E')
00397 {
00398 Phonize('E');
00399 w_idx += 2;
00400 }
00401
00402 else
00403 {
00404 Phonize('A');
00405 w_idx++;
00406 }
00407 break;
00408
00409 case 'G':
00410 case 'K':
00411 case 'P':
00412 if (Next_Letter == 'N')
00413 {
00414 Phonize('N');
00415 w_idx += 2;
00416 }
00417 break;
00418
00419
00420
00421
00422 case 'W':
00423 if (Next_Letter == 'H' ||
00424 Next_Letter == 'R')
00425 {
00426 Phonize(Next_Letter);
00427 w_idx += 2;
00428 }
00429 else if (isvowel(Next_Letter))
00430 {
00431 Phonize('W');
00432 w_idx += 2;
00433 }
00434
00435 break;
00436
00437 case 'X':
00438 Phonize('S');
00439 w_idx++;
00440 break;
00441
00442
00443
00444
00445
00446 case 'E':
00447 case 'I':
00448 case 'O':
00449 case 'U':
00450 Phonize(Curr_Letter);
00451 w_idx++;
00452 break;
00453 default:
00454
00455 break;
00456 }
00457
00458
00459
00460
00461 for (; Curr_Letter != '\0' &&
00462 (max_phonemes == 0 || Phone_Len < max_phonemes);
00463 w_idx++)
00464 {
00465
00466
00467
00468
00469 unsigned short int skip_letter = 0;
00470
00471
00472
00473
00474
00475
00476
00477
00478
00479
00480
00481 if (!isalpha((unsigned char) (Curr_Letter)))
00482 continue;
00483
00484
00485 if (Curr_Letter == Prev_Letter &&
00486 Curr_Letter != 'C')
00487 continue;
00488
00489 switch (Curr_Letter)
00490 {
00491
00492 case 'B':
00493 if (Prev_Letter != 'M')
00494 Phonize('B');
00495 break;
00496
00497
00498
00499
00500
00501
00502 case 'C':
00503 if (MAKESOFT(Next_Letter))
00504 {
00505 if (After_Next_Letter == 'A' &&
00506 Next_Letter == 'I')
00507 {
00508 Phonize(SH);
00509 }
00510
00511 else if (Prev_Letter == 'S')
00512 {
00513
00514 }
00515 else
00516 Phonize('S');
00517 }
00518 else if (Next_Letter == 'H')
00519 {
00520 #ifndef USE_TRADITIONAL_METAPHONE
00521 if (After_Next_Letter == 'R' ||
00522 Prev_Letter == 'S')
00523 {
00524 Phonize('K');
00525 }
00526 else
00527 Phonize(SH);
00528 #else
00529 Phonize(SH);
00530 #endif
00531 skip_letter++;
00532 }
00533 else
00534 Phonize('K');
00535 break;
00536
00537
00538
00539
00540 case 'D':
00541 if (Next_Letter == 'G' &&
00542 MAKESOFT(After_Next_Letter))
00543 {
00544 Phonize('J');
00545 skip_letter++;
00546 }
00547 else
00548 Phonize('T');
00549 break;
00550
00551
00552
00553
00554
00555
00556
00557 case 'G':
00558 if (Next_Letter == 'H')
00559 {
00560 if (!(NOGHTOF(Look_Back_Letter(3)) ||
00561 Look_Back_Letter(4) == 'H'))
00562 {
00563 Phonize('F');
00564 skip_letter++;
00565 }
00566 else
00567 {
00568
00569 }
00570 }
00571 else if (Next_Letter == 'N')
00572 {
00573 if (Isbreak(After_Next_Letter) ||
00574 (After_Next_Letter == 'E' &&
00575 Look_Ahead_Letter(3) == 'D'))
00576 {
00577
00578 }
00579 else
00580 Phonize('K');
00581 }
00582 else if (MAKESOFT(Next_Letter) &&
00583 Prev_Letter != 'G')
00584 Phonize('J');
00585 else
00586 Phonize('K');
00587 break;
00588
00589 case 'H':
00590 if (isvowel(Next_Letter) &&
00591 !AFFECTH(Prev_Letter))
00592 Phonize('H');
00593 break;
00594
00595
00596
00597
00598 case 'K':
00599 if (Prev_Letter != 'C')
00600 Phonize('K');
00601 break;
00602
00603
00604
00605
00606 case 'P':
00607 if (Next_Letter == 'H')
00608 Phonize('F');
00609 else
00610 Phonize('P');
00611 break;
00612
00613
00614
00615
00616 case 'Q':
00617 Phonize('K');
00618 break;
00619
00620
00621
00622
00623 case 'S':
00624 if (Next_Letter == 'I' &&
00625 (After_Next_Letter == 'O' ||
00626 After_Next_Letter == 'A'))
00627 Phonize(SH);
00628 else if (Next_Letter == 'H')
00629 {
00630 Phonize(SH);
00631 skip_letter++;
00632 }
00633 #ifndef USE_TRADITIONAL_METAPHONE
00634 else if (Next_Letter == 'C' &&
00635 Look_Ahead_Letter(2) == 'H' &&
00636 Look_Ahead_Letter(3) == 'W')
00637 {
00638 Phonize(SH);
00639 skip_letter += 2;
00640 }
00641 #endif
00642 else
00643 Phonize('S');
00644 break;
00645
00646
00647
00648
00649 case 'T':
00650 if (Next_Letter == 'I' &&
00651 (After_Next_Letter == 'O' ||
00652 After_Next_Letter == 'A'))
00653 Phonize(SH);
00654 else if (Next_Letter == 'H')
00655 {
00656 Phonize(TH);
00657 skip_letter++;
00658 }
00659 else
00660 Phonize('T');
00661 break;
00662
00663 case 'V':
00664 Phonize('F');
00665 break;
00666
00667 case 'W':
00668 if (isvowel(Next_Letter))
00669 Phonize('W');
00670 break;
00671
00672 case 'X':
00673 Phonize('K');
00674 if (max_phonemes == 0 || Phone_Len < max_phonemes)
00675 Phonize('S');
00676 break;
00677
00678 case 'Y':
00679 if (isvowel(Next_Letter))
00680 Phonize('Y');
00681 break;
00682
00683 case 'Z':
00684 Phonize('S');
00685 break;
00686
00687 case 'F':
00688 case 'J':
00689 case 'L':
00690 case 'M':
00691 case 'N':
00692 case 'R':
00693 Phonize(Curr_Letter);
00694 break;
00695 default:
00696
00697 break;
00698 }
00699
00700 w_idx += skip_letter;
00701 }
00702
00703 End_Phoned_Word;
00704
00705 return (META_SUCCESS);
00706 }
00707
00708
00709
00710
00711
00712 PG_FUNCTION_INFO_V1(soundex);
00713
00714 Datum
00715 soundex(PG_FUNCTION_ARGS)
00716 {
00717 char outstr[SOUNDEX_LEN + 1];
00718 char *arg;
00719
00720 arg = text_to_cstring(PG_GETARG_TEXT_P(0));
00721
00722 _soundex(arg, outstr);
00723
00724 PG_RETURN_TEXT_P(cstring_to_text(outstr));
00725 }
00726
00727 static void
00728 _soundex(const char *instr, char *outstr)
00729 {
00730 int count;
00731
00732 AssertArg(instr);
00733 AssertArg(outstr);
00734
00735 outstr[SOUNDEX_LEN] = '\0';
00736
00737
00738 while (!isalpha((unsigned char) instr[0]) && instr[0])
00739 ++instr;
00740
00741
00742 if (!instr[0])
00743 {
00744 outstr[0] = (char) 0;
00745 return;
00746 }
00747
00748
00749 *outstr++ = (char) toupper((unsigned char) *instr++);
00750
00751 count = 1;
00752 while (*instr && count < SOUNDEX_LEN)
00753 {
00754 if (isalpha((unsigned char) *instr) &&
00755 soundex_code(*instr) != soundex_code(*(instr - 1)))
00756 {
00757 *outstr = soundex_code(instr[0]);
00758 if (*outstr != '0')
00759 {
00760 ++outstr;
00761 ++count;
00762 }
00763 }
00764 ++instr;
00765 }
00766
00767
00768 while (count < SOUNDEX_LEN)
00769 {
00770 *outstr = '0';
00771 ++outstr;
00772 ++count;
00773 }
00774 }
00775
00776 PG_FUNCTION_INFO_V1(difference);
00777
00778 Datum
00779 difference(PG_FUNCTION_ARGS)
00780 {
00781 char sndx1[SOUNDEX_LEN + 1],
00782 sndx2[SOUNDEX_LEN + 1];
00783 int i,
00784 result;
00785
00786 _soundex(text_to_cstring(PG_GETARG_TEXT_P(0)), sndx1);
00787 _soundex(text_to_cstring(PG_GETARG_TEXT_P(1)), sndx2);
00788
00789 result = 0;
00790 for (i = 0; i < SOUNDEX_LEN; i++)
00791 {
00792 if (sndx1[i] == sndx2[i])
00793 result++;
00794 }
00795
00796 PG_RETURN_INT32(result);
00797 }