PostgreSQL Source Code: contrib/fuzzystrmatch/fuzzystrmatch.c Source File

Go to the documentation of this file.
00001 /*
00002  * fuzzystrmatch.c
00003  *
00004  * Functions for "fuzzy" comparison of strings
00005  *
00006  * Joe Conway <[email protected]>
00007  *
00008  * contrib/fuzzystrmatch/fuzzystrmatch.c
00009  * Copyright (c) 2001-2013, PostgreSQL Global Development Group
00010  * ALL RIGHTS RESERVED;
00011  *
00012  * metaphone()
00013  * -----------
00014  * Modified for PostgreSQL by Joe Conway.
00015  * Based on CPAN's "Text-Metaphone-1.96" by Michael G Schwern <[email protected]>
00016  * Code slightly modified for use as PostgreSQL function (palloc, elog, etc).
00017  * Metaphone was originally created by Lawrence Philips and presented in article
00018  * in "Computer Language" December 1990 issue.
00019  *
00020  * Permission to use, copy, modify, and distribute this software and its
00021  * documentation for any purpose, without fee, and without a written agreement
00022  * is hereby granted, provided that the above copyright notice and this
00023  * paragraph and the following two paragraphs appear in all copies.
00024  *
00025  * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR
00026  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
00027  * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
00028  * DOCUMENTATION, EVEN IF THE AUTHOR OR DISTRIBUTORS HAVE BEEN ADVISED OF THE
00029  * POSSIBILITY OF SUCH DAMAGE.
00030  *
00031  * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
00032  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
00033  * AND FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
00034  * ON AN "AS IS" BASIS, AND THE AUTHOR AND DISTRIBUTORS HAS NO OBLIGATIONS TO
00035  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
00036  *
00037  */
00038 
00039 #include "postgres.h"
00040 
00041 #include <ctype.h>
00042 
00043 #include "mb/pg_wchar.h"
00044 #include "utils/builtins.h"
00045 
00046 PG_MODULE_MAGIC;
00047 
00048 
00049 /*
00050  * External declarations for exported functions
00051  */
00052 extern Datum levenshtein_with_costs(PG_FUNCTION_ARGS);
00053 extern Datum levenshtein(PG_FUNCTION_ARGS);
00054 extern Datum levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS);
00055 extern Datum levenshtein_less_equal(PG_FUNCTION_ARGS);
00056 extern Datum metaphone(PG_FUNCTION_ARGS);
00057 extern Datum soundex(PG_FUNCTION_ARGS);
00058 extern Datum difference(PG_FUNCTION_ARGS);
00059 
00060 /*
00061  * Soundex
00062  */
00063 static void _soundex(const char *instr, char *outstr);
00064 
00065 #define SOUNDEX_LEN 4
00066 
00067 /*                                  ABCDEFGHIJKLMNOPQRSTUVWXYZ */
00068 static const char *soundex_table = "01230120022455012623010202";
00069 
00070 static char
00071 soundex_code(char letter)
00072 {
00073     letter = toupper((unsigned char) letter);
00074     /* Defend against non-ASCII letters */
00075     if (letter >= 'A' && letter <= 'Z')
00076         return soundex_table[letter - 'A'];
00077     return letter;
00078 }
00079 
00080 /*
00081  * Metaphone
00082  */
00083 #define MAX_METAPHONE_STRLEN        255
00084 
00085 /*
00086  * Original code by Michael G Schwern starts here.
00087  * Code slightly modified for use as PostgreSQL function.
00088  */
00089 
00090 
00091 /**************************************************************************
00092     metaphone -- Breaks english phrases down into their phonemes.
00093 
00094     Input
00095         word            --  An english word to be phonized
00096         max_phonemes    --  How many phonemes to calculate.  If 0, then it
00097                             will phonize the entire phrase.
00098         phoned_word     --  The final phonized word.  (We'll allocate the
00099                             memory.)
00100     Output
00101         error   --  A simple error flag, returns TRUE or FALSE
00102 
00103     NOTES:  ALL non-alpha characters are ignored, this includes whitespace,
00104     although non-alpha characters will break up phonemes.
00105 ****************************************************************************/
00106 
00107 
00108 /**************************************************************************
00109     my constants -- constants I like
00110 
00111     Probably redundant.
00112 
00113 ***************************************************************************/
00114 
00115 #define META_ERROR          FALSE
00116 #define META_SUCCESS        TRUE
00117 #define META_FAILURE        FALSE
00118 
00119 
00120 /*  I add modifications to the traditional metaphone algorithm that you
00121     might find in books.  Define this if you want metaphone to behave
00122     traditionally */
00123 #undef USE_TRADITIONAL_METAPHONE
00124 
00125 /* Special encodings */
00126 #define  SH     'X'
00127 #define  TH     '0'
00128 
00129 static char Lookahead(char *word, int how_far);
00130 static int  _metaphone(char *word, int max_phonemes, char **phoned_word);
00131 
00132 /* Metachar.h ... little bits about characters for metaphone */
00133 
00134 
00135 /*-- Character encoding array & accessing macros --*/
00136 /* Stolen directly out of the book... */
00137 static const char _codes[26] = {
00138     1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
00139 /*  a  b c  d e f g  h i j k l m n o p q r s t u v w x y z */
00140 };
00141 
00142 static int
00143 getcode(char c)
00144 {
00145     if (isalpha((unsigned char) c))
00146     {
00147         c = toupper((unsigned char) c);
00148         /* Defend against non-ASCII letters */
00149         if (c >= 'A' && c <= 'Z')
00150             return _codes[c - 'A'];
00151     }
00152     return 0;
00153 }
00154 
00155 #define isvowel(c)  (getcode(c) & 1)    /* AEIOU */
00156 
00157 /* These letters are passed through unchanged */
00158 #define NOCHANGE(c) (getcode(c) & 2)    /* FJMNR */
00159 
00160 /* These form diphthongs when preceding H */
00161 #define AFFECTH(c)  (getcode(c) & 4)    /* CGPST */
00162 
00163 /* These make C and G soft */
00164 #define MAKESOFT(c) (getcode(c) & 8)    /* EIY */
00165 
00166 /* These prevent GH from becoming F */
00167 #define NOGHTOF(c)  (getcode(c) & 16)   /* BDH */
00168 
00169 /* Faster than memcmp(), for this use case. */
00170 static inline bool
00171 rest_of_char_same(const char *s1, const char *s2, int len)
00172 {
00173     while (len > 0)
00174     {
00175         len--;
00176         if (s1[len] != s2[len])
00177             return false;
00178     }
00179     return true;
00180 }
00181 
00182 #include "levenshtein.c"
00183 #define LEVENSHTEIN_LESS_EQUAL
00184 #include "levenshtein.c"
00185 
00186 PG_FUNCTION_INFO_V1(levenshtein_with_costs);
00187 Datum
00188 levenshtein_with_costs(PG_FUNCTION_ARGS)
00189 {
00190     text       *src = PG_GETARG_TEXT_PP(0);
00191     text       *dst = PG_GETARG_TEXT_PP(1);
00192     int         ins_c = PG_GETARG_INT32(2);
00193     int         del_c = PG_GETARG_INT32(3);
00194     int         sub_c = PG_GETARG_INT32(4);
00195 
00196     PG_RETURN_INT32(levenshtein_internal(src, dst, ins_c, del_c, sub_c));
00197 }
00198 
00199 
00200 PG_FUNCTION_INFO_V1(levenshtein);
00201 Datum
00202 levenshtein(PG_FUNCTION_ARGS)
00203 {
00204     text       *src = PG_GETARG_TEXT_PP(0);
00205     text       *dst = PG_GETARG_TEXT_PP(1);
00206 
00207     PG_RETURN_INT32(levenshtein_internal(src, dst, 1, 1, 1));
00208 }
00209 
00210 
00211 PG_FUNCTION_INFO_V1(levenshtein_less_equal_with_costs);
00212 Datum
00213 levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS)
00214 {
00215     text       *src = PG_GETARG_TEXT_PP(0);
00216     text       *dst = PG_GETARG_TEXT_PP(1);
00217     int         ins_c = PG_GETARG_INT32(2);
00218     int         del_c = PG_GETARG_INT32(3);
00219     int         sub_c = PG_GETARG_INT32(4);
00220     int         max_d = PG_GETARG_INT32(5);
00221 
00222     PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, ins_c, del_c, sub_c, max_d));
00223 }
00224 
00225 
00226 PG_FUNCTION_INFO_V1(levenshtein_less_equal);
00227 Datum
00228 levenshtein_less_equal(PG_FUNCTION_ARGS)
00229 {
00230     text       *src = PG_GETARG_TEXT_PP(0);
00231     text       *dst = PG_GETARG_TEXT_PP(1);
00232     int         max_d = PG_GETARG_INT32(2);
00233 
00234     PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, 1, 1, 1, max_d));
00235 }
00236 
00237 
00238 /*
00239  * Calculates the metaphone of an input string.
00240  * Returns number of characters requested
00241  * (suggested value is 4)
00242  */
00243 PG_FUNCTION_INFO_V1(metaphone);
00244 Datum
00245 metaphone(PG_FUNCTION_ARGS)
00246 {
00247     char       *str_i = TextDatumGetCString(PG_GETARG_DATUM(0));
00248     size_t      str_i_len = strlen(str_i);
00249     int         reqlen;
00250     char       *metaph;
00251     int         retval;
00252 
00253     /* return an empty string if we receive one */
00254     if (!(str_i_len > 0))
00255         PG_RETURN_TEXT_P(cstring_to_text(""));
00256 
00257     if (str_i_len > MAX_METAPHONE_STRLEN)
00258         ereport(ERROR,
00259                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00260                  errmsg("argument exceeds the maximum length of %d bytes",
00261                         MAX_METAPHONE_STRLEN)));
00262 
00263     if (!(str_i_len > 0))
00264         ereport(ERROR,
00265                 (errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING),
00266                  errmsg("argument is empty string")));
00267 
00268     reqlen = PG_GETARG_INT32(1);
00269     if (reqlen > MAX_METAPHONE_STRLEN)
00270         ereport(ERROR,
00271                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00272                  errmsg("output exceeds the maximum length of %d bytes",
00273                         MAX_METAPHONE_STRLEN)));
00274 
00275     if (!(reqlen > 0))
00276         ereport(ERROR,
00277                 (errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING),
00278                  errmsg("output cannot be empty string")));
00279 
00280 
00281     retval = _metaphone(str_i, reqlen, &metaph);
00282     if (retval == META_SUCCESS)
00283         PG_RETURN_TEXT_P(cstring_to_text(metaph));
00284     else
00285     {
00286         /* internal error */
00287         elog(ERROR, "metaphone: failure");
00288         /* keep the compiler quiet */
00289         PG_RETURN_NULL();
00290     }
00291 }
00292 
00293 
00294 /*
00295  * Original code by Michael G Schwern starts here.
00296  * Code slightly modified for use as PostgreSQL
00297  * function (palloc, etc).
00298  */
00299 
00300 /* I suppose I could have been using a character pointer instead of
00301  * accessing the array directly... */
00302 
00303 /* Look at the next letter in the word */
00304 #define Next_Letter (toupper((unsigned char) word[w_idx+1]))
00305 /* Look at the current letter in the word */
00306 #define Curr_Letter (toupper((unsigned char) word[w_idx]))
00307 /* Go N letters back. */
00308 #define Look_Back_Letter(n) \
00309     (w_idx >= (n) ? toupper((unsigned char) word[w_idx-(n)]) : '\0')
00310 /* Previous letter.  I dunno, should this return null on failure? */
00311 #define Prev_Letter (Look_Back_Letter(1))
00312 /* Look two letters down.  It makes sure you don't walk off the string. */
00313 #define After_Next_Letter \
00314     (Next_Letter != '\0' ? toupper((unsigned char) word[w_idx+2]) : '\0')
00315 #define Look_Ahead_Letter(n) toupper((unsigned char) Lookahead(word+w_idx, n))
00316 
00317 
00318 /* Allows us to safely look ahead an arbitrary # of letters */
00319 /* I probably could have just used strlen... */
00320 static char
00321 Lookahead(char *word, int how_far)
00322 {
00323     char        letter_ahead = '\0';    /* null by default */
00324     int         idx;
00325 
00326     for (idx = 0; word[idx] != '\0' && idx < how_far; idx++);
00327     /* Edge forward in the string... */
00328 
00329     letter_ahead = word[idx];   /* idx will be either == to how_far or at the
00330                                  * end of the string */
00331     return letter_ahead;
00332 }
00333 
00334 
00335 /* phonize one letter */
00336 #define Phonize(c)  do {(*phoned_word)[p_idx++] = c;} while (0)
00337 /* Slap a null character on the end of the phoned word */
00338 #define End_Phoned_Word do {(*phoned_word)[p_idx] = '\0';} while (0)
00339 /* How long is the phoned word? */
00340 #define Phone_Len   (p_idx)
00341 
00342 /* Note is a letter is a 'break' in the word */
00343 #define Isbreak(c)  (!isalpha((unsigned char) (c)))
00344 
00345 
00346 static int
00347 _metaphone(char *word,          /* IN */
00348            int max_phonemes,
00349            char **phoned_word)  /* OUT */
00350 {
00351     int         w_idx = 0;      /* point in the phonization we're at. */
00352     int         p_idx = 0;      /* end of the phoned phrase */
00353 
00354     /*-- Parameter checks --*/
00355 
00356     /*
00357      * Shouldn't be necessary, but left these here anyway jec Aug 3, 2001
00358      */
00359 
00360     /* Negative phoneme length is meaningless */
00361     if (!(max_phonemes > 0))
00362         /* internal error */
00363         elog(ERROR, "metaphone: Requested output length must be > 0");
00364 
00365     /* Empty/null string is meaningless */
00366     if ((word == NULL) || !(strlen(word) > 0))
00367         /* internal error */
00368         elog(ERROR, "metaphone: Input string length must be > 0");
00369 
00370     /*-- Allocate memory for our phoned_phrase --*/
00371     if (max_phonemes == 0)
00372     {                           /* Assume largest possible */
00373         *phoned_word = palloc(sizeof(char) * strlen(word) +1);
00374     }
00375     else
00376     {
00377         *phoned_word = palloc(sizeof(char) * max_phonemes + 1);
00378     }
00379 
00380     /*-- The first phoneme has to be processed specially. --*/
00381     /* Find our first letter */
00382     for (; !isalpha((unsigned char) (Curr_Letter)); w_idx++)
00383     {
00384         /* On the off chance we were given nothing but crap... */
00385         if (Curr_Letter == '\0')
00386         {
00387             End_Phoned_Word;
00388             return META_SUCCESS;    /* For testing */
00389         }
00390     }
00391 
00392     switch (Curr_Letter)
00393     {
00394             /* AE becomes E */
00395         case 'A':
00396             if (Next_Letter == 'E')
00397             {
00398                 Phonize('E');
00399                 w_idx += 2;
00400             }
00401             /* Remember, preserve vowels at the beginning */
00402             else
00403             {
00404                 Phonize('A');
00405                 w_idx++;
00406             }
00407             break;
00408             /* [GKP]N becomes N */
00409         case 'G':
00410         case 'K':
00411         case 'P':
00412             if (Next_Letter == 'N')
00413             {
00414                 Phonize('N');
00415                 w_idx += 2;
00416             }
00417             break;
00418 
00419             /*
00420              * WH becomes H, WR becomes R W if followed by a vowel
00421              */
00422         case 'W':
00423             if (Next_Letter == 'H' ||
00424                 Next_Letter == 'R')
00425             {
00426                 Phonize(Next_Letter);
00427                 w_idx += 2;
00428             }
00429             else if (isvowel(Next_Letter))
00430             {
00431                 Phonize('W');
00432                 w_idx += 2;
00433             }
00434             /* else ignore */
00435             break;
00436             /* X becomes S */
00437         case 'X':
00438             Phonize('S');
00439             w_idx++;
00440             break;
00441             /* Vowels are kept */
00442 
00443             /*
00444              * We did A already case 'A': case 'a':
00445              */
00446         case 'E':
00447         case 'I':
00448         case 'O':
00449         case 'U':
00450             Phonize(Curr_Letter);
00451             w_idx++;
00452             break;
00453         default:
00454             /* do nothing */
00455             break;
00456     }
00457 
00458 
00459 
00460     /* On to the metaphoning */
00461     for (; Curr_Letter != '\0' &&
00462          (max_phonemes == 0 || Phone_Len < max_phonemes);
00463          w_idx++)
00464     {
00465         /*
00466          * How many letters to skip because an earlier encoding handled
00467          * multiple letters
00468          */
00469         unsigned short int skip_letter = 0;
00470 
00471 
00472         /*
00473          * THOUGHT:  It would be nice if, rather than having things like...
00474          * well, SCI.  For SCI you encode the S, then have to remember to skip
00475          * the C.  So the phonome SCI invades both S and C.  It would be
00476          * better, IMHO, to skip the C from the S part of the encoding. Hell,
00477          * I'm trying it.
00478          */
00479 
00480         /* Ignore non-alphas */
00481         if (!isalpha((unsigned char) (Curr_Letter)))
00482             continue;
00483 
00484         /* Drop duplicates, except CC */
00485         if (Curr_Letter == Prev_Letter &&
00486             Curr_Letter != 'C')
00487             continue;
00488 
00489         switch (Curr_Letter)
00490         {
00491                 /* B -> B unless in MB */
00492             case 'B':
00493                 if (Prev_Letter != 'M')
00494                     Phonize('B');
00495                 break;
00496 
00497                 /*
00498                  * 'sh' if -CIA- or -CH, but not SCH, except SCHW. (SCHW is
00499                  * handled in S) S if -CI-, -CE- or -CY- dropped if -SCI-,
00500                  * SCE-, -SCY- (handed in S) else K
00501                  */
00502             case 'C':
00503                 if (MAKESOFT(Next_Letter))
00504                 {               /* C[IEY] */
00505                     if (After_Next_Letter == 'A' &&
00506                         Next_Letter == 'I')
00507                     {           /* CIA */
00508                         Phonize(SH);
00509                     }
00510                     /* SC[IEY] */
00511                     else if (Prev_Letter == 'S')
00512                     {
00513                         /* Dropped */
00514                     }
00515                     else
00516                         Phonize('S');
00517                 }
00518                 else if (Next_Letter == 'H')
00519                 {
00520 #ifndef USE_TRADITIONAL_METAPHONE
00521                     if (After_Next_Letter == 'R' ||
00522                         Prev_Letter == 'S')
00523                     {           /* Christ, School */
00524                         Phonize('K');
00525                     }
00526                     else
00527                         Phonize(SH);
00528 #else
00529                     Phonize(SH);
00530 #endif
00531                     skip_letter++;
00532                 }
00533                 else
00534                     Phonize('K');
00535                 break;
00536 
00537                 /*
00538                  * J if in -DGE-, -DGI- or -DGY- else T
00539                  */
00540             case 'D':
00541                 if (Next_Letter == 'G' &&
00542                     MAKESOFT(After_Next_Letter))
00543                 {
00544                     Phonize('J');
00545                     skip_letter++;
00546                 }
00547                 else
00548                     Phonize('T');
00549                 break;
00550 
00551                 /*
00552                  * F if in -GH and not B--GH, D--GH, -H--GH, -H---GH else
00553                  * dropped if -GNED, -GN, else dropped if -DGE-, -DGI- or
00554                  * -DGY- (handled in D) else J if in -GE-, -GI, -GY and not GG
00555                  * else K
00556                  */
00557             case 'G':
00558                 if (Next_Letter == 'H')
00559                 {
00560                     if (!(NOGHTOF(Look_Back_Letter(3)) ||
00561                           Look_Back_Letter(4) == 'H'))
00562                     {
00563                         Phonize('F');
00564                         skip_letter++;
00565                     }
00566                     else
00567                     {
00568                         /* silent */
00569                     }
00570                 }
00571                 else if (Next_Letter == 'N')
00572                 {
00573                     if (Isbreak(After_Next_Letter) ||
00574                         (After_Next_Letter == 'E' &&
00575                          Look_Ahead_Letter(3) == 'D'))
00576                     {
00577                         /* dropped */
00578                     }
00579                     else
00580                         Phonize('K');
00581                 }
00582                 else if (MAKESOFT(Next_Letter) &&
00583                          Prev_Letter != 'G')
00584                     Phonize('J');
00585                 else
00586                     Phonize('K');
00587                 break;
00588                 /* H if before a vowel and not after C,G,P,S,T */
00589             case 'H':
00590                 if (isvowel(Next_Letter) &&
00591                     !AFFECTH(Prev_Letter))
00592                     Phonize('H');
00593                 break;
00594 
00595                 /*
00596                  * dropped if after C else K
00597                  */
00598             case 'K':
00599                 if (Prev_Letter != 'C')
00600                     Phonize('K');
00601                 break;
00602 
00603                 /*
00604                  * F if before H else P
00605                  */
00606             case 'P':
00607                 if (Next_Letter == 'H')
00608                     Phonize('F');
00609                 else
00610                     Phonize('P');
00611                 break;
00612 
00613                 /*
00614                  * K
00615                  */
00616             case 'Q':
00617                 Phonize('K');
00618                 break;
00619 
00620                 /*
00621                  * 'sh' in -SH-, -SIO- or -SIA- or -SCHW- else S
00622                  */
00623             case 'S':
00624                 if (Next_Letter == 'I' &&
00625                     (After_Next_Letter == 'O' ||
00626                      After_Next_Letter == 'A'))
00627                     Phonize(SH);
00628                 else if (Next_Letter == 'H')
00629                 {
00630                     Phonize(SH);
00631                     skip_letter++;
00632                 }
00633 #ifndef USE_TRADITIONAL_METAPHONE
00634                 else if (Next_Letter == 'C' &&
00635                          Look_Ahead_Letter(2) == 'H' &&
00636                          Look_Ahead_Letter(3) == 'W')
00637                 {
00638                     Phonize(SH);
00639                     skip_letter += 2;
00640                 }
00641 #endif
00642                 else
00643                     Phonize('S');
00644                 break;
00645 
00646                 /*
00647                  * 'sh' in -TIA- or -TIO- else 'th' before H else T
00648                  */
00649             case 'T':
00650                 if (Next_Letter == 'I' &&
00651                     (After_Next_Letter == 'O' ||
00652                      After_Next_Letter == 'A'))
00653                     Phonize(SH);
00654                 else if (Next_Letter == 'H')
00655                 {
00656                     Phonize(TH);
00657                     skip_letter++;
00658                 }
00659                 else
00660                     Phonize('T');
00661                 break;
00662                 /* F */
00663             case 'V':
00664                 Phonize('F');
00665                 break;
00666                 /* W before a vowel, else dropped */
00667             case 'W':
00668                 if (isvowel(Next_Letter))
00669                     Phonize('W');
00670                 break;
00671                 /* KS */
00672             case 'X':
00673                 Phonize('K');
00674                 if (max_phonemes == 0 || Phone_Len < max_phonemes)
00675                     Phonize('S');
00676                 break;
00677                 /* Y if followed by a vowel */
00678             case 'Y':
00679                 if (isvowel(Next_Letter))
00680                     Phonize('Y');
00681                 break;
00682                 /* S */
00683             case 'Z':
00684                 Phonize('S');
00685                 break;
00686                 /* No transformation */
00687             case 'F':
00688             case 'J':
00689             case 'L':
00690             case 'M':
00691             case 'N':
00692             case 'R':
00693                 Phonize(Curr_Letter);
00694                 break;
00695             default:
00696                 /* nothing */
00697                 break;
00698         }                       /* END SWITCH */
00699 
00700         w_idx += skip_letter;
00701     }                           /* END FOR */
00702 
00703     End_Phoned_Word;
00704 
00705     return (META_SUCCESS);
00706 }   /* END metaphone */
00707 
00708 
00709 /*
00710  * SQL function: soundex(text) returns text
00711  */
00712 PG_FUNCTION_INFO_V1(soundex);
00713 
00714 Datum
00715 soundex(PG_FUNCTION_ARGS)
00716 {
00717     char        outstr[SOUNDEX_LEN + 1];
00718     char       *arg;
00719 
00720     arg = text_to_cstring(PG_GETARG_TEXT_P(0));
00721 
00722     _soundex(arg, outstr);
00723 
00724     PG_RETURN_TEXT_P(cstring_to_text(outstr));
00725 }
00726 
00727 static void
00728 _soundex(const char *instr, char *outstr)
00729 {
00730     int         count;
00731 
00732     AssertArg(instr);
00733     AssertArg(outstr);
00734 
00735     outstr[SOUNDEX_LEN] = '\0';
00736 
00737     /* Skip leading non-alphabetic characters */
00738     while (!isalpha((unsigned char) instr[0]) && instr[0])
00739         ++instr;
00740 
00741     /* No string left */
00742     if (!instr[0])
00743     {
00744         outstr[0] = (char) 0;
00745         return;
00746     }
00747 
00748     /* Take the first letter as is */
00749     *outstr++ = (char) toupper((unsigned char) *instr++);
00750 
00751     count = 1;
00752     while (*instr && count < SOUNDEX_LEN)
00753     {
00754         if (isalpha((unsigned char) *instr) &&
00755             soundex_code(*instr) != soundex_code(*(instr - 1)))
00756         {
00757             *outstr = soundex_code(instr[0]);
00758             if (*outstr != '0')
00759             {
00760                 ++outstr;
00761                 ++count;
00762             }
00763         }
00764         ++instr;
00765     }
00766 
00767     /* Fill with 0's */
00768     while (count < SOUNDEX_LEN)
00769     {
00770         *outstr = '0';
00771         ++outstr;
00772         ++count;
00773     }
00774 }
00775 
00776 PG_FUNCTION_INFO_V1(difference);
00777 
00778 Datum
00779 difference(PG_FUNCTION_ARGS)
00780 {
00781     char        sndx1[SOUNDEX_LEN + 1],
00782                 sndx2[SOUNDEX_LEN + 1];
00783     int         i,
00784                 result;
00785 
00786     _soundex(text_to_cstring(PG_GETARG_TEXT_P(0)), sndx1);
00787     _soundex(text_to_cstring(PG_GETARG_TEXT_P(1)), sndx2);
00788 
00789     result = 0;
00790     for (i = 0; i < SOUNDEX_LEN; i++)
00791     {
00792         if (sndx1[i] == sndx2[i])
00793             result++;
00794     }
00795 
00796     PG_RETURN_INT32(result);
00797 }
Header And Logo

fuzzystrmatch.c