Header And Logo

PostgreSQL
| The world's most advanced open source database.

dmetaphone.c

Go to the documentation of this file.
00001 /*
00002  * This is a port of the Double Metaphone algorithm for use in PostgreSQL.
00003  *
00004  * contrib/fuzzystrmatch/dmetaphone.c
00005  *
00006  * Double Metaphone computes 2 "sounds like" strings - a primary and an
00007  * alternate. In most cases they are the same, but for foreign names
00008  * especially they can be a bit different, depending on pronunciation.
00009  *
00010  * Information on using Double Metaphone can be found at
00011  *   http://www.codeproject.com/string/dmetaphone1.asp
00012  * and the original article describing it can be found at
00013  *   http://www.cuj.com/documents/s=8038/cuj0006philips/
00014  *
00015  * For PostgreSQL we provide 2 functions - one for the primary and one for
00016  * the alternate. That way the functions are pure text->text mappings that
00017  * are useful in functional indexes. These are 'dmetaphone' for the
00018  * primary and 'dmetaphone_alt' for the alternate.
00019  *
00020  * Assuming that dmetaphone.so is in $libdir, the SQL to set up the
00021  * functions looks like this:
00022  *
00023  * CREATE FUNCTION dmetaphone (text) RETURNS text
00024  *    LANGUAGE C IMMUTABLE STRICT
00025  *    AS '$libdir/dmetaphone', 'dmetaphone';
00026  *
00027  * CREATE FUNCTION dmetaphone_alt (text) RETURNS text
00028  *    LANGUAGE C IMMUTABLE STRICT
00029  *    AS '$libdir/dmetaphone', 'dmetaphone_alt';
00030  *
00031  * Note that you have to declare the functions IMMUTABLE if you want to
00032  * use them in functional indexes, and you have to declare them as STRICT
00033  * as they do not check for NULL input, and will segfault if given NULL input.
00034  * (See below for an alternative.) Declaring them as STRICT means PostgreSQL
00035  * will never call them with NULL, but instead assume the result is NULL,
00036  * which is what we (I) want.
00037  *
00038  * Alternatively, compile with -DDMETAPHONE_NOSTRICT and the functions
00039  * will detect NULL input and return NULL. Then you don't have to declare them
00040  * as STRICT.
00041  *
00042  * There is a small inefficiency here - each function call actually computes
00043  * both the primary and the alternate and then throws away the one it doesn't
00044  * need. That's the way the perl module was written, because perl can handle
00045  * a list return more easily than we can in PostgreSQL. The result has been
00046  * fast enough for my needs, but it could maybe be optimized a bit to remove
00047  * that behaviour.
00048  *
00049  */
00050 
00051 
00052 /***************************** COPYRIGHT NOTICES ***********************
00053 
00054 Most of this code is directly from the Text::DoubleMetaphone perl module
00055 version 0.05 available from http://www.cpan.org.
00056 It bears this copyright notice:
00057 
00058 
00059   Copyright 2000, Maurice Aubrey <[email protected]>.
00060   All rights reserved.
00061 
00062   This code is based heavily on the C++ implementation by
00063   Lawrence Philips and incorporates several bug fixes courtesy
00064   of Kevin Atkinson <[email protected]>.
00065 
00066   This module is free software; you may redistribute it and/or
00067   modify it under the same terms as Perl itself.
00068 
00069 The remaining code is authored by Andrew Dunstan <[email protected]> and
00070 <[email protected]> and is covered this copyright:
00071 
00072   Copyright 2003, North Carolina State Highway Patrol.
00073   All rights reserved.
00074 
00075   Permission to use, copy, modify, and distribute this software and its
00076   documentation for any purpose, without fee, and without a written agreement
00077   is hereby granted, provided that the above copyright notice and this
00078   paragraph and the following two paragraphs appear in all copies.
00079 
00080   IN NO EVENT SHALL THE NORTH CAROLINA STATE HIGHWAY PATROL BE LIABLE TO ANY
00081   PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES,
00082   INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
00083   DOCUMENTATION, EVEN IF THE NORTH CAROLINA STATE HIGHWAY PATROL HAS BEEN
00084   ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00085 
00086   THE NORTH CAROLINA STATE HIGHWAY PATROL SPECIFICALLY DISCLAIMS ANY
00087   WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
00088   MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED
00089   HEREUNDER IS ON AN "AS IS" BASIS, AND THE NORTH CAROLINA STATE HIGHWAY PATROL
00090   HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
00091   MODIFICATIONS.
00092 
00093 ***********************************************************************/
00094 
00095 
00096 
00097 
00098 
00099 /* include these first, according to the docs */
00100 #ifndef DMETAPHONE_MAIN
00101 
00102 #include "postgres.h"
00103 
00104 #include "utils/builtins.h"
00105 
00106 /* turn off assertions for embedded function */
00107 #define NDEBUG
00108 #endif
00109 
00110 #include <stdio.h>
00111 #include <ctype.h>
00112 #include <stdlib.h>
00113 #include <string.h>
00114 #include <stdarg.h>
00115 #include <assert.h>
00116 
00117 extern Datum dmetaphone(PG_FUNCTION_ARGS);
00118 extern Datum dmetaphone_alt(PG_FUNCTION_ARGS);
00119 
00120 /* prototype for the main function we got from the perl module */
00121 static void DoubleMetaphone(char *, char **);
00122 
00123 #ifndef DMETAPHONE_MAIN
00124 
00125 /*
00126  * The PostgreSQL visible dmetaphone function.
00127  */
00128 
00129 PG_FUNCTION_INFO_V1(dmetaphone);
00130 
00131 Datum
00132 dmetaphone(PG_FUNCTION_ARGS)
00133 {
00134     text       *arg;
00135     char       *aptr,
00136                *codes[2],
00137                *code;
00138 
00139 #ifdef DMETAPHONE_NOSTRICT
00140     if (PG_ARGISNULL(0))
00141         PG_RETURN_NULL();
00142 #endif
00143     arg = PG_GETARG_TEXT_P(0);
00144     aptr = text_to_cstring(arg);
00145 
00146     DoubleMetaphone(aptr, codes);
00147     code = codes[0];
00148     if (!code)
00149         code = "";
00150 
00151     PG_RETURN_TEXT_P(cstring_to_text(code));
00152 }
00153 
00154 /*
00155  * The PostgreSQL visible dmetaphone_alt function.
00156  */
00157 
00158 PG_FUNCTION_INFO_V1(dmetaphone_alt);
00159 
00160 Datum
00161 dmetaphone_alt(PG_FUNCTION_ARGS)
00162 {
00163     text       *arg;
00164     char       *aptr,
00165                *codes[2],
00166                *code;
00167 
00168 #ifdef DMETAPHONE_NOSTRICT
00169     if (PG_ARGISNULL(0))
00170         PG_RETURN_NULL();
00171 #endif
00172     arg = PG_GETARG_TEXT_P(0);
00173     aptr = text_to_cstring(arg);
00174 
00175     DoubleMetaphone(aptr, codes);
00176     code = codes[1];
00177     if (!code)
00178         code = "";
00179 
00180     PG_RETURN_TEXT_P(cstring_to_text(code));
00181 }
00182 
00183 
00184 /* here is where we start the code imported from the perl module */
00185 
00186 /* all memory handling is done with these macros */
00187 
00188 #define META_MALLOC(v,n,t) \
00189           (v = (t*)palloc(((n)*sizeof(t))))
00190 
00191 #define META_REALLOC(v,n,t) \
00192                       (v = (t*)repalloc((v),((n)*sizeof(t))))
00193 
00194 /*
00195  * Don't do pfree - it seems to cause a segv sometimes - which might have just
00196  * been caused by reloading the module in development.
00197  * So we rely on context cleanup - Tom Lane says pfree shouldn't be necessary
00198  * in a case like this.
00199  */
00200 
00201 #define META_FREE(x)            /* pfree((x)) */
00202 #else                           /* not defined DMETAPHONE_MAIN */
00203 
00204 /* use the standard malloc library when not running in PostgreSQL */
00205 
00206 #define META_MALLOC(v,n,t) \
00207           (v = (t*)malloc(((n)*sizeof(t))))
00208 
00209 #define META_REALLOC(v,n,t) \
00210                       (v = (t*)realloc((v),((n)*sizeof(t))))
00211 
00212 #define META_FREE(x) free((x))
00213 #endif   /* defined DMETAPHONE_MAIN */
00214 
00215 
00216 
00217 /* this typedef was originally in the perl module's .h file */
00218 
/*
 * metastring - a growable character buffer used for both the input word
 * and the codes being built.
 */
typedef struct
{
    char       *str;            /* NUL-terminated character data */
    int         length;         /* characters in str, not counting the NUL */
    int         bufsize;        /* number of chars allocated for str */
    int         free_string_on_destroy; /* nonzero: DestroyMetaString
                                         * releases str as well */
} metastring;
00228 
00229 /*
00230  * remaining perl module funcs unchanged except for declaring them static
00231  * and reformatting to PostgreSQL indentation and to fit in 80 cols.
00232  *
00233  */
00234 
00235 static metastring *
00236 NewMetaString(char *init_str)
00237 {
00238     metastring *s;
00239     char        empty_string[] = "";
00240 
00241     META_MALLOC(s, 1, metastring);
00242     assert(s != NULL);
00243 
00244     if (init_str == NULL)
00245         init_str = empty_string;
00246     s->length = strlen(init_str);
00247     /* preallocate a bit more for potential growth */
00248     s->bufsize = s->length + 7;
00249 
00250     META_MALLOC(s->str, s->bufsize, char);
00251     assert(s->str != NULL);
00252 
00253     strncpy(s->str, init_str, s->length + 1);
00254     s->free_string_on_destroy = 1;
00255 
00256     return s;
00257 }
00258 
00259 
00260 static void
00261 DestroyMetaString(metastring *s)
00262 {
00263     if (s == NULL)
00264         return;
00265 
00266     if (s->free_string_on_destroy && (s->str != NULL))
00267         META_FREE(s->str);
00268 
00269     META_FREE(s);
00270 }
00271 
00272 
00273 static void
00274 IncreaseBuffer(metastring *s, int chars_needed)
00275 {
00276     META_REALLOC(s->str, (s->bufsize + chars_needed + 10), char);
00277     assert(s->str != NULL);
00278     s->bufsize = s->bufsize + chars_needed + 10;
00279 }
00280 
00281 
00282 static void
00283 MakeUpper(metastring *s)
00284 {
00285     char       *i;
00286 
00287     for (i = s->str; *i; i++)
00288         *i = toupper((unsigned char) *i);
00289 }
00290 
00291 
00292 static int
00293 IsVowel(metastring *s, int pos)
00294 {
00295     char        c;
00296 
00297     if ((pos < 0) || (pos >= s->length))
00298         return 0;
00299 
00300     c = *(s->str + pos);
00301     if ((c == 'A') || (c == 'E') || (c == 'I') || (c == 'O') ||
00302         (c == 'U') || (c == 'Y'))
00303         return 1;
00304 
00305     return 0;
00306 }
00307 
00308 
00309 static int
00310 SlavoGermanic(metastring *s)
00311 {
00312     if ((char *) strstr(s->str, "W"))
00313         return 1;
00314     else if ((char *) strstr(s->str, "K"))
00315         return 1;
00316     else if ((char *) strstr(s->str, "CZ"))
00317         return 1;
00318     else if ((char *) strstr(s->str, "WITZ"))
00319         return 1;
00320     else
00321         return 0;
00322 }
00323 
00324 
00325 static char
00326 GetAt(metastring *s, int pos)
00327 {
00328     if ((pos < 0) || (pos >= s->length))
00329         return '\0';
00330 
00331     return ((char) *(s->str + pos));
00332 }
00333 
00334 
00335 static void
00336 SetAt(metastring *s, int pos, char c)
00337 {
00338     if ((pos < 0) || (pos >= s->length))
00339         return;
00340 
00341     *(s->str + pos) = c;
00342 }
00343 
00344 
00345 /*
00346    Caveats: the START value is 0 based
00347 */
00348 static int
00349 StringAt(metastring *s, int start, int length,...)
00350 {
00351     char       *test;
00352     char       *pos;
00353     va_list     ap;
00354 
00355     if ((start < 0) || (start >= s->length))
00356         return 0;
00357 
00358     pos = (s->str + start);
00359     va_start(ap, length);
00360 
00361     do
00362     {
00363         test = va_arg(ap, char *);
00364         if (*test && (strncmp(pos, test, length) == 0))
00365             return 1;
00366     }
00367     while (strcmp(test, "") != 0);
00368 
00369     va_end(ap);
00370 
00371     return 0;
00372 }
00373 
00374 
00375 static void
00376 MetaphAdd(metastring *s, char *new_str)
00377 {
00378     int         add_length;
00379 
00380     if (new_str == NULL)
00381         return;
00382 
00383     add_length = strlen(new_str);
00384     if ((s->length + add_length) > (s->bufsize - 1))
00385         IncreaseBuffer(s, add_length);
00386 
00387     strcat(s->str, new_str);
00388     s->length += add_length;
00389 }
00390 
00391 
00392 static void
00393 DoubleMetaphone(char *str, char **codes)
00394 {
00395     int         length;
00396     metastring *original;
00397     metastring *primary;
00398     metastring *secondary;
00399     int         current;
00400     int         last;
00401 
00402     current = 0;
00403     /* we need the real length and last prior to padding */
00404     length = strlen(str);
00405     last = length - 1;
00406     original = NewMetaString(str);
00407     /* Pad original so we can index beyond end */
00408     MetaphAdd(original, "     ");
00409 
00410     primary = NewMetaString("");
00411     secondary = NewMetaString("");
00412     primary->free_string_on_destroy = 0;
00413     secondary->free_string_on_destroy = 0;
00414 
00415     MakeUpper(original);
00416 
00417     /* skip these when at start of word */
00418     if (StringAt(original, 0, 2, "GN", "KN", "PN", "WR", "PS", ""))
00419         current += 1;
00420 
00421     /* Initial 'X' is pronounced 'Z' e.g. 'Xavier' */
00422     if (GetAt(original, 0) == 'X')
00423     {
00424         MetaphAdd(primary, "S");    /* 'Z' maps to 'S' */
00425         MetaphAdd(secondary, "S");
00426         current += 1;
00427     }
00428 
00429     /* main loop */
00430     while ((primary->length < 4) || (secondary->length < 4))
00431     {
00432         if (current >= length)
00433             break;
00434 
00435         switch (GetAt(original, current))
00436         {
00437             case 'A':
00438             case 'E':
00439             case 'I':
00440             case 'O':
00441             case 'U':
00442             case 'Y':
00443                 if (current == 0)
00444                 {
00445                     /* all init vowels now map to 'A' */
00446                     MetaphAdd(primary, "A");
00447                     MetaphAdd(secondary, "A");
00448                 }
00449                 current += 1;
00450                 break;
00451 
00452             case 'B':
00453 
00454                 /* "-mb", e.g", "dumb", already skipped over... */
00455                 MetaphAdd(primary, "P");
00456                 MetaphAdd(secondary, "P");
00457 
00458                 if (GetAt(original, current + 1) == 'B')
00459                     current += 2;
00460                 else
00461                     current += 1;
00462                 break;
00463 
00464             case '\xc7':        /* C with cedilla */
00465                 MetaphAdd(primary, "S");
00466                 MetaphAdd(secondary, "S");
00467                 current += 1;
00468                 break;
00469 
00470             case 'C':
00471                 /* various germanic */
00472                 if ((current > 1)
00473                     && !IsVowel(original, current - 2)
00474                     && StringAt(original, (current - 1), 3, "ACH", "")
00475                     && ((GetAt(original, current + 2) != 'I')
00476                         && ((GetAt(original, current + 2) != 'E')
00477                             || StringAt(original, (current - 2), 6, "BACHER",
00478                                         "MACHER", ""))))
00479                 {
00480                     MetaphAdd(primary, "K");
00481                     MetaphAdd(secondary, "K");
00482                     current += 2;
00483                     break;
00484                 }
00485 
00486                 /* special case 'caesar' */
00487                 if ((current == 0)
00488                     && StringAt(original, current, 6, "CAESAR", ""))
00489                 {
00490                     MetaphAdd(primary, "S");
00491                     MetaphAdd(secondary, "S");
00492                     current += 2;
00493                     break;
00494                 }
00495 
00496                 /* italian 'chianti' */
00497                 if (StringAt(original, current, 4, "CHIA", ""))
00498                 {
00499                     MetaphAdd(primary, "K");
00500                     MetaphAdd(secondary, "K");
00501                     current += 2;
00502                     break;
00503                 }
00504 
00505                 if (StringAt(original, current, 2, "CH", ""))
00506                 {
00507                     /* find 'michael' */
00508                     if ((current > 0)
00509                         && StringAt(original, current, 4, "CHAE", ""))
00510                     {
00511                         MetaphAdd(primary, "K");
00512                         MetaphAdd(secondary, "X");
00513                         current += 2;
00514                         break;
00515                     }
00516 
00517                     /* greek roots e.g. 'chemistry', 'chorus' */
00518                     if ((current == 0)
00519                         && (StringAt(original, (current + 1), 5,
00520                                      "HARAC", "HARIS", "")
00521                             || StringAt(original, (current + 1), 3, "HOR",
00522                                         "HYM", "HIA", "HEM", ""))
00523                         && !StringAt(original, 0, 5, "CHORE", ""))
00524                     {
00525                         MetaphAdd(primary, "K");
00526                         MetaphAdd(secondary, "K");
00527                         current += 2;
00528                         break;
00529                     }
00530 
00531                     /* germanic, greek, or otherwise 'ch' for 'kh' sound */
00532                     if (
00533                         (StringAt(original, 0, 4, "VAN ", "VON ", "")
00534                          || StringAt(original, 0, 3, "SCH", ""))
00535                     /* 'architect but not 'arch', 'orchestra', 'orchid' */
00536                         || StringAt(original, (current - 2), 6, "ORCHES",
00537                                     "ARCHIT", "ORCHID", "")
00538                         || StringAt(original, (current + 2), 1, "T", "S",
00539                                     "")
00540                         || ((StringAt(original, (current - 1), 1,
00541                                       "A", "O", "U", "E", "")
00542                              || (current == 0))
00543 
00544                     /*
00545                      * e.g., 'wachtler', 'wechsler', but not 'tichner'
00546                      */
00547                             && StringAt(original, (current + 2), 1, "L", "R",
00548                                         "N", "M", "B", "H", "F", "V", "W",
00549                                         " ", "")))
00550                     {
00551                         MetaphAdd(primary, "K");
00552                         MetaphAdd(secondary, "K");
00553                     }
00554                     else
00555                     {
00556                         if (current > 0)
00557                         {
00558                             if (StringAt(original, 0, 2, "MC", ""))
00559                             {
00560                                 /* e.g., "McHugh" */
00561                                 MetaphAdd(primary, "K");
00562                                 MetaphAdd(secondary, "K");
00563                             }
00564                             else
00565                             {
00566                                 MetaphAdd(primary, "X");
00567                                 MetaphAdd(secondary, "K");
00568                             }
00569                         }
00570                         else
00571                         {
00572                             MetaphAdd(primary, "X");
00573                             MetaphAdd(secondary, "X");
00574                         }
00575                     }
00576                     current += 2;
00577                     break;
00578                 }
00579                 /* e.g, 'czerny' */
00580                 if (StringAt(original, current, 2, "CZ", "")
00581                     && !StringAt(original, (current - 2), 4, "WICZ", ""))
00582                 {
00583                     MetaphAdd(primary, "S");
00584                     MetaphAdd(secondary, "X");
00585                     current += 2;
00586                     break;
00587                 }
00588 
00589                 /* e.g., 'focaccia' */
00590                 if (StringAt(original, (current + 1), 3, "CIA", ""))
00591                 {
00592                     MetaphAdd(primary, "X");
00593                     MetaphAdd(secondary, "X");
00594                     current += 3;
00595                     break;
00596                 }
00597 
00598                 /* double 'C', but not if e.g. 'McClellan' */
00599                 if (StringAt(original, current, 2, "CC", "")
00600                     && !((current == 1) && (GetAt(original, 0) == 'M')))
00601                 {
00602                     /* 'bellocchio' but not 'bacchus' */
00603                     if (StringAt(original, (current + 2), 1, "I", "E", "H", "")
00604                         && !StringAt(original, (current + 2), 2, "HU", ""))
00605                     {
00606                         /* 'accident', 'accede' 'succeed' */
00607                         if (
00608                             ((current == 1)
00609                              && (GetAt(original, current - 1) == 'A'))
00610                             || StringAt(original, (current - 1), 5, "UCCEE",
00611                                         "UCCES", ""))
00612                         {
00613                             MetaphAdd(primary, "KS");
00614                             MetaphAdd(secondary, "KS");
00615                             /* 'bacci', 'bertucci', other italian */
00616                         }
00617                         else
00618                         {
00619                             MetaphAdd(primary, "X");
00620                             MetaphAdd(secondary, "X");
00621                         }
00622                         current += 3;
00623                         break;
00624                     }
00625                     else
00626                     {           /* Pierce's rule */
00627                         MetaphAdd(primary, "K");
00628                         MetaphAdd(secondary, "K");
00629                         current += 2;
00630                         break;
00631                     }
00632                 }
00633 
00634                 if (StringAt(original, current, 2, "CK", "CG", "CQ", ""))
00635                 {
00636                     MetaphAdd(primary, "K");
00637                     MetaphAdd(secondary, "K");
00638                     current += 2;
00639                     break;
00640                 }
00641 
00642                 if (StringAt(original, current, 2, "CI", "CE", "CY", ""))
00643                 {
00644                     /* italian vs. english */
00645                     if (StringAt
00646                         (original, current, 3, "CIO", "CIE", "CIA", ""))
00647                     {
00648                         MetaphAdd(primary, "S");
00649                         MetaphAdd(secondary, "X");
00650                     }
00651                     else
00652                     {
00653                         MetaphAdd(primary, "S");
00654                         MetaphAdd(secondary, "S");
00655                     }
00656                     current += 2;
00657                     break;
00658                 }
00659 
00660                 /* else */
00661                 MetaphAdd(primary, "K");
00662                 MetaphAdd(secondary, "K");
00663 
00664                 /* name sent in 'mac caffrey', 'mac gregor */
00665                 if (StringAt(original, (current + 1), 2, " C", " Q", " G", ""))
00666                     current += 3;
00667                 else if (StringAt(original, (current + 1), 1, "C", "K", "Q", "")
00668                          && !StringAt(original, (current + 1), 2,
00669                                       "CE", "CI", ""))
00670                     current += 2;
00671                 else
00672                     current += 1;
00673                 break;
00674 
00675             case 'D':
00676                 if (StringAt(original, current, 2, "DG", ""))
00677                 {
00678                     if (StringAt(original, (current + 2), 1,
00679                                  "I", "E", "Y", ""))
00680                     {
00681                         /* e.g. 'edge' */
00682                         MetaphAdd(primary, "J");
00683                         MetaphAdd(secondary, "J");
00684                         current += 3;
00685                         break;
00686                     }
00687                     else
00688                     {
00689                         /* e.g. 'edgar' */
00690                         MetaphAdd(primary, "TK");
00691                         MetaphAdd(secondary, "TK");
00692                         current += 2;
00693                         break;
00694                     }
00695                 }
00696 
00697                 if (StringAt(original, current, 2, "DT", "DD", ""))
00698                 {
00699                     MetaphAdd(primary, "T");
00700                     MetaphAdd(secondary, "T");
00701                     current += 2;
00702                     break;
00703                 }
00704 
00705                 /* else */
00706                 MetaphAdd(primary, "T");
00707                 MetaphAdd(secondary, "T");
00708                 current += 1;
00709                 break;
00710 
00711             case 'F':
00712                 if (GetAt(original, current + 1) == 'F')
00713                     current += 2;
00714                 else
00715                     current += 1;
00716                 MetaphAdd(primary, "F");
00717                 MetaphAdd(secondary, "F");
00718                 break;
00719 
00720             case 'G':
00721                 if (GetAt(original, current + 1) == 'H')
00722                 {
00723                     if ((current > 0) && !IsVowel(original, current - 1))
00724                     {
00725                         MetaphAdd(primary, "K");
00726                         MetaphAdd(secondary, "K");
00727                         current += 2;
00728                         break;
00729                     }
00730 
00731                     if (current < 3)
00732                     {
00733                         /* 'ghislane', ghiradelli */
00734                         if (current == 0)
00735                         {
00736                             if (GetAt(original, current + 2) == 'I')
00737                             {
00738                                 MetaphAdd(primary, "J");
00739                                 MetaphAdd(secondary, "J");
00740                             }
00741                             else
00742                             {
00743                                 MetaphAdd(primary, "K");
00744                                 MetaphAdd(secondary, "K");
00745                             }
00746                             current += 2;
00747                             break;
00748                         }
00749                     }
00750 
00751                     /*
00752                      * Parker's rule (with some further refinements) - e.g.,
00753                      * 'hugh'
00754                      */
00755                     if (
00756                         ((current > 1)
00757                          && StringAt(original, (current - 2), 1,
00758                                      "B", "H", "D", ""))
00759                     /* e.g., 'bough' */
00760                         || ((current > 2)
00761                             && StringAt(original, (current - 3), 1,
00762                                         "B", "H", "D", ""))
00763                     /* e.g., 'broughton' */
00764                         || ((current > 3)
00765                             && StringAt(original, (current - 4), 1,
00766                                         "B", "H", "")))
00767                     {
00768                         current += 2;
00769                         break;
00770                     }
00771                     else
00772                     {
00773                         /*
00774                          * e.g., 'laugh', 'McLaughlin', 'cough', 'gough',
00775                          * 'rough', 'tough'
00776                          */
00777                         if ((current > 2)
00778                             && (GetAt(original, current - 1) == 'U')
00779                             && StringAt(original, (current - 3), 1, "C",
00780                                         "G", "L", "R", "T", ""))
00781                         {
00782                             MetaphAdd(primary, "F");
00783                             MetaphAdd(secondary, "F");
00784                         }
00785                         else if ((current > 0)
00786                                  && GetAt(original, current - 1) != 'I')
00787                         {
00788 
00789 
00790                             MetaphAdd(primary, "K");
00791                             MetaphAdd(secondary, "K");
00792                         }
00793 
00794                         current += 2;
00795                         break;
00796                     }
00797                 }
00798 
00799                 if (GetAt(original, current + 1) == 'N')
00800                 {
00801                     if ((current == 1) && IsVowel(original, 0)
00802                         && !SlavoGermanic(original))
00803                     {
00804                         MetaphAdd(primary, "KN");
00805                         MetaphAdd(secondary, "N");
00806                     }
00807                     else
00808                         /* not e.g. 'cagney' */
00809                         if (!StringAt(original, (current + 2), 2, "EY", "")
00810                             && (GetAt(original, current + 1) != 'Y')
00811                             && !SlavoGermanic(original))
00812                     {
00813                         MetaphAdd(primary, "N");
00814                         MetaphAdd(secondary, "KN");
00815                     }
00816                     else
00817                     {
00818                         MetaphAdd(primary, "KN");
00819                         MetaphAdd(secondary, "KN");
00820                     }
00821                     current += 2;
00822                     break;
00823                 }
00824 
00825                 /* 'tagliaro' */
00826                 if (StringAt(original, (current + 1), 2, "LI", "")
00827                     && !SlavoGermanic(original))
00828                 {
00829                     MetaphAdd(primary, "KL");
00830                     MetaphAdd(secondary, "L");
00831                     current += 2;
00832                     break;
00833                 }
00834 
00835                 /* -ges-,-gep-,-gel-, -gie- at beginning */
00836                 if ((current == 0)
00837                     && ((GetAt(original, current + 1) == 'Y')
00838                         || StringAt(original, (current + 1), 2, "ES", "EP",
00839                                     "EB", "EL", "EY", "IB", "IL", "IN", "IE",
00840                                     "EI", "ER", "")))
00841                 {
00842                     MetaphAdd(primary, "K");
00843                     MetaphAdd(secondary, "J");
00844                     current += 2;
00845                     break;
00846                 }
00847 
00848                 /* -ger-,  -gy- */
00849                 if (
00850                     (StringAt(original, (current + 1), 2, "ER", "")
00851                      || (GetAt(original, current + 1) == 'Y'))
00852                     && !StringAt(original, 0, 6,
00853                                  "DANGER", "RANGER", "MANGER", "")
00854                     && !StringAt(original, (current - 1), 1, "E", "I", "")
00855                     && !StringAt(original, (current - 1), 3, "RGY", "OGY",
00856                                  ""))
00857                 {
00858                     MetaphAdd(primary, "K");
00859                     MetaphAdd(secondary, "J");
00860                     current += 2;
00861                     break;
00862                 }
00863 
00864                 /* italian e.g, 'biaggi' */
00865                 if (StringAt(original, (current + 1), 1, "E", "I", "Y", "")
00866                     || StringAt(original, (current - 1), 4,
00867                                 "AGGI", "OGGI", ""))
00868                 {
00869                     /* obvious germanic */
00870                     if (
00871                         (StringAt(original, 0, 4, "VAN ", "VON ", "")
00872                          || StringAt(original, 0, 3, "SCH", ""))
00873                         || StringAt(original, (current + 1), 2, "ET", ""))
00874                     {
00875                         MetaphAdd(primary, "K");
00876                         MetaphAdd(secondary, "K");
00877                     }
00878                     else
00879                     {
00880                         /* always soft if french ending */
00881                         if (StringAt
00882                             (original, (current + 1), 4, "IER ", ""))
00883                         {
00884                             MetaphAdd(primary, "J");
00885                             MetaphAdd(secondary, "J");
00886                         }
00887                         else
00888                         {
00889                             MetaphAdd(primary, "J");
00890                             MetaphAdd(secondary, "K");
00891                         }
00892                     }
00893                     current += 2;
00894                     break;
00895                 }
00896 
00897                 if (GetAt(original, current + 1) == 'G')
00898                     current += 2;
00899                 else
00900                     current += 1;
00901                 MetaphAdd(primary, "K");
00902                 MetaphAdd(secondary, "K");
00903                 break;
00904 
00905             case 'H':
00906                 /* only keep if first & before vowel or btw. 2 vowels */
00907                 if (((current == 0) || IsVowel(original, current - 1))
00908                     && IsVowel(original, current + 1))
00909                 {
00910                     MetaphAdd(primary, "H");
00911                     MetaphAdd(secondary, "H");
00912                     current += 2;
00913                 }
00914                 else
00915                     /* also takes care of 'HH' */
00916                     current += 1;
00917                 break;
00918 
00919             case 'J':
00920                 /* obvious spanish, 'jose', 'san jacinto' */
00921                 if (StringAt(original, current, 4, "JOSE", "")
00922                     || StringAt(original, 0, 4, "SAN ", ""))
00923                 {
00924                     if (((current == 0)
00925                          && (GetAt(original, current + 4) == ' '))
00926                         || StringAt(original, 0, 4, "SAN ", ""))
00927                     {
00928                         MetaphAdd(primary, "H");
00929                         MetaphAdd(secondary, "H");
00930                     }
00931                     else
00932                     {
00933                         MetaphAdd(primary, "J");
00934                         MetaphAdd(secondary, "H");
00935                     }
00936                     current += 1;
00937                     break;
00938                 }
00939 
00940                 if ((current == 0)
00941                     && !StringAt(original, current, 4, "JOSE", ""))
00942                 {
00943                     MetaphAdd(primary, "J");    /* Yankelovich/Jankelowicz */
00944                     MetaphAdd(secondary, "A");
00945                 }
00946                 else
00947                 {
00948                     /* spanish pron. of e.g. 'bajador' */
00949                     if (IsVowel(original, current - 1)
00950                         && !SlavoGermanic(original)
00951                         && ((GetAt(original, current + 1) == 'A')
00952                             || (GetAt(original, current + 1) == 'O')))
00953                     {
00954                         MetaphAdd(primary, "J");
00955                         MetaphAdd(secondary, "H");
00956                     }
00957                     else
00958                     {
00959                         if (current == last)
00960                         {
00961                             MetaphAdd(primary, "J");
00962                             MetaphAdd(secondary, "");
00963                         }
00964                         else
00965                         {
00966                             if (!StringAt(original, (current + 1), 1, "L", "T",
00967                                           "K", "S", "N", "M", "B", "Z", "")
00968                                 && !StringAt(original, (current - 1), 1,
00969                                              "S", "K", "L", ""))
00970                             {
00971                                 MetaphAdd(primary, "J");
00972                                 MetaphAdd(secondary, "J");
00973                             }
00974                         }
00975                     }
00976                 }
00977 
00978                 if (GetAt(original, current + 1) == 'J')        /* it could happen! */
00979                     current += 2;
00980                 else
00981                     current += 1;
00982                 break;
00983 
00984             case 'K':
00985                 if (GetAt(original, current + 1) == 'K')
00986                     current += 2;
00987                 else
00988                     current += 1;
00989                 MetaphAdd(primary, "K");
00990                 MetaphAdd(secondary, "K");
00991                 break;
00992 
00993             case 'L':
00994                 if (GetAt(original, current + 1) == 'L')
00995                 {
00996                     /* spanish e.g. 'cabrillo', 'gallegos' */
00997                     if (((current == (length - 3))
00998                          && StringAt(original, (current - 1), 4, "ILLO",
00999                                      "ILLA", "ALLE", ""))
01000                         || ((StringAt(original, (last - 1), 2, "AS", "OS", "")
01001                              || StringAt(original, last, 1, "A", "O", ""))
01002                             && StringAt(original, (current - 1), 4,
01003                                         "ALLE", "")))
01004                     {
01005                         MetaphAdd(primary, "L");
01006                         MetaphAdd(secondary, "");
01007                         current += 2;
01008                         break;
01009                     }
01010                     current += 2;
01011                 }
01012                 else
01013                     current += 1;
01014                 MetaphAdd(primary, "L");
01015                 MetaphAdd(secondary, "L");
01016                 break;
01017 
01018             case 'M':
01019                 if ((StringAt(original, (current - 1), 3, "UMB", "")
01020                      && (((current + 1) == last)
01021                          || StringAt(original, (current + 2), 2, "ER", "")))
01022                 /* 'dumb','thumb' */
01023                     || (GetAt(original, current + 1) == 'M'))
01024                     current += 2;
01025                 else
01026                     current += 1;
01027                 MetaphAdd(primary, "M");
01028                 MetaphAdd(secondary, "M");
01029                 break;
01030 
01031             case 'N':
01032                 if (GetAt(original, current + 1) == 'N')
01033                     current += 2;
01034                 else
01035                     current += 1;
01036                 MetaphAdd(primary, "N");
01037                 MetaphAdd(secondary, "N");
01038                 break;
01039 
01040             case '\xd1':        /* N with tilde */
01041                 current += 1;
01042                 MetaphAdd(primary, "N");
01043                 MetaphAdd(secondary, "N");
01044                 break;
01045 
01046             case 'P':
01047                 if (GetAt(original, current + 1) == 'H')
01048                 {
01049                     MetaphAdd(primary, "F");
01050                     MetaphAdd(secondary, "F");
01051                     current += 2;
01052                     break;
01053                 }
01054 
01055                 /* also account for "campbell", "raspberry" */
01056                 if (StringAt(original, (current + 1), 1, "P", "B", ""))
01057                     current += 2;
01058                 else
01059                     current += 1;
01060                 MetaphAdd(primary, "P");
01061                 MetaphAdd(secondary, "P");
01062                 break;
01063 
01064             case 'Q':
01065                 if (GetAt(original, current + 1) == 'Q')
01066                     current += 2;
01067                 else
01068                     current += 1;
01069                 MetaphAdd(primary, "K");
01070                 MetaphAdd(secondary, "K");
01071                 break;
01072 
01073             case 'R':
01074                 /* french e.g. 'rogier', but exclude 'hochmeier' */
01075                 if ((current == last)
01076                     && !SlavoGermanic(original)
01077                     && StringAt(original, (current - 2), 2, "IE", "")
01078                     && !StringAt(original, (current - 4), 2, "ME", "MA", ""))
01079                 {
01080                     MetaphAdd(primary, "");
01081                     MetaphAdd(secondary, "R");
01082                 }
01083                 else
01084                 {
01085                     MetaphAdd(primary, "R");
01086                     MetaphAdd(secondary, "R");
01087                 }
01088 
01089                 if (GetAt(original, current + 1) == 'R')
01090                     current += 2;
01091                 else
01092                     current += 1;
01093                 break;
01094 
01095             case 'S':
01096                 /* special cases 'island', 'isle', 'carlisle', 'carlysle' */
01097                 if (StringAt(original, (current - 1), 3, "ISL", "YSL", ""))
01098                 {
01099                     current += 1;
01100                     break;
01101                 }
01102 
01103                 /* special case 'sugar-' */
01104                 if ((current == 0)
01105                     && StringAt(original, current, 5, "SUGAR", ""))
01106                 {
01107                     MetaphAdd(primary, "X");
01108                     MetaphAdd(secondary, "S");
01109                     current += 1;
01110                     break;
01111                 }
01112 
01113                 if (StringAt(original, current, 2, "SH", ""))
01114                 {
01115                     /* germanic */
01116                     if (StringAt
01117                         (original, (current + 1), 4, "HEIM", "HOEK", "HOLM",
01118                          "HOLZ", ""))
01119                     {
01120                         MetaphAdd(primary, "S");
01121                         MetaphAdd(secondary, "S");
01122                     }
01123                     else
01124                     {
01125                         MetaphAdd(primary, "X");
01126                         MetaphAdd(secondary, "X");
01127                     }
01128                     current += 2;
01129                     break;
01130                 }
01131 
01132                 /* italian & armenian */
01133                 if (StringAt(original, current, 3, "SIO", "SIA", "")
01134                     || StringAt(original, current, 4, "SIAN", ""))
01135                 {
01136                     if (!SlavoGermanic(original))
01137                     {
01138                         MetaphAdd(primary, "S");
01139                         MetaphAdd(secondary, "X");
01140                     }
01141                     else
01142                     {
01143                         MetaphAdd(primary, "S");
01144                         MetaphAdd(secondary, "S");
01145                     }
01146                     current += 3;
01147                     break;
01148                 }
01149 
01150                 /*
01151                  * german & anglicisations, e.g. 'smith' match 'schmidt',
01152                  * 'snider' match 'schneider' also, -sz- in slavic language
01153                  * although in hungarian it is pronounced 's'
01154                  */
01155                 if (((current == 0)
01156                      && StringAt(original, (current + 1), 1,
01157                                  "M", "N", "L", "W", ""))
01158                     || StringAt(original, (current + 1), 1, "Z", ""))
01159                 {
01160                     MetaphAdd(primary, "S");
01161                     MetaphAdd(secondary, "X");
01162                     if (StringAt(original, (current + 1), 1, "Z", ""))
01163                         current += 2;
01164                     else
01165                         current += 1;
01166                     break;
01167                 }
01168 
01169                 if (StringAt(original, current, 2, "SC", ""))
01170                 {
01171                     /* Schlesinger's rule */
01172                     if (GetAt(original, current + 2) == 'H')
01173                     {
01174                         /* dutch origin, e.g. 'school', 'schooner' */
01175                         if (StringAt(original, (current + 3), 2,
01176                                      "OO", "ER", "EN",
01177                                      "UY", "ED", "EM", ""))
01178                         {
01179                             /* 'schermerhorn', 'schenker' */
01180                             if (StringAt(original, (current + 3), 2,
01181                                          "ER", "EN", ""))
01182                             {
01183                                 MetaphAdd(primary, "X");
01184                                 MetaphAdd(secondary, "SK");
01185                             }
01186                             else
01187                             {
01188                                 MetaphAdd(primary, "SK");
01189                                 MetaphAdd(secondary, "SK");
01190                             }
01191                             current += 3;
01192                             break;
01193                         }
01194                         else
01195                         {
01196                             if ((current == 0) && !IsVowel(original, 3)
01197                                 && (GetAt(original, 3) != 'W'))
01198                             {
01199                                 MetaphAdd(primary, "X");
01200                                 MetaphAdd(secondary, "S");
01201                             }
01202                             else
01203                             {
01204                                 MetaphAdd(primary, "X");
01205                                 MetaphAdd(secondary, "X");
01206                             }
01207                             current += 3;
01208                             break;
01209                         }
01210                     }
01211 
01212                     if (StringAt(original, (current + 2), 1,
01213                                  "I", "E", "Y", ""))
01214                     {
01215                         MetaphAdd(primary, "S");
01216                         MetaphAdd(secondary, "S");
01217                         current += 3;
01218                         break;
01219                     }
01220                     /* else */
01221                     MetaphAdd(primary, "SK");
01222                     MetaphAdd(secondary, "SK");
01223                     current += 3;
01224                     break;
01225                 }
01226 
01227                 /* french e.g. 'resnais', 'artois' */
01228                 if ((current == last)
01229                     && StringAt(original, (current - 2), 2, "AI", "OI", ""))
01230                 {
01231                     MetaphAdd(primary, "");
01232                     MetaphAdd(secondary, "S");
01233                 }
01234                 else
01235                 {
01236                     MetaphAdd(primary, "S");
01237                     MetaphAdd(secondary, "S");
01238                 }
01239 
01240                 if (StringAt(original, (current + 1), 1, "S", "Z", ""))
01241                     current += 2;
01242                 else
01243                     current += 1;
01244                 break;
01245 
01246             case 'T':
01247                 if (StringAt(original, current, 4, "TION", ""))
01248                 {
01249                     MetaphAdd(primary, "X");
01250                     MetaphAdd(secondary, "X");
01251                     current += 3;
01252                     break;
01253                 }
01254 
01255                 if (StringAt(original, current, 3, "TIA", "TCH", ""))
01256                 {
01257                     MetaphAdd(primary, "X");
01258                     MetaphAdd(secondary, "X");
01259                     current += 3;
01260                     break;
01261                 }
01262 
01263                 if (StringAt(original, current, 2, "TH", "")
01264                     || StringAt(original, current, 3, "TTH", ""))
01265                 {
01266                     /* special case 'thomas', 'thames' or germanic */
01267                     if (StringAt(original, (current + 2), 2, "OM", "AM", "")
01268                         || StringAt(original, 0, 4, "VAN ", "VON ", "")
01269                         || StringAt(original, 0, 3, "SCH", ""))
01270                     {
01271                         MetaphAdd(primary, "T");
01272                         MetaphAdd(secondary, "T");
01273                     }
01274                     else
01275                     {
01276                         MetaphAdd(primary, "0");
01277                         MetaphAdd(secondary, "T");
01278                     }
01279                     current += 2;
01280                     break;
01281                 }
01282 
01283                 if (StringAt(original, (current + 1), 1, "T", "D", ""))
01284                     current += 2;
01285                 else
01286                     current += 1;
01287                 MetaphAdd(primary, "T");
01288                 MetaphAdd(secondary, "T");
01289                 break;
01290 
01291             case 'V':
01292                 if (GetAt(original, current + 1) == 'V')
01293                     current += 2;
01294                 else
01295                     current += 1;
01296                 MetaphAdd(primary, "F");
01297                 MetaphAdd(secondary, "F");
01298                 break;
01299 
01300             case 'W':
01301                 /* can also be in middle of word */
01302                 if (StringAt(original, current, 2, "WR", ""))
01303                 {
01304                     MetaphAdd(primary, "R");
01305                     MetaphAdd(secondary, "R");
01306                     current += 2;
01307                     break;
01308                 }
01309 
01310                 if ((current == 0)
01311                     && (IsVowel(original, current + 1)
01312                         || StringAt(original, current, 2, "WH", "")))
01313                 {
01314                     /* Wasserman should match Vasserman */
01315                     if (IsVowel(original, current + 1))
01316                     {
01317                         MetaphAdd(primary, "A");
01318                         MetaphAdd(secondary, "F");
01319                     }
01320                     else
01321                     {
01322                         /* need Uomo to match Womo */
01323                         MetaphAdd(primary, "A");
01324                         MetaphAdd(secondary, "A");
01325                     }
01326                 }
01327 
01328                 /* Arnow should match Arnoff */
01329                 if (((current == last) && IsVowel(original, current - 1))
01330                     || StringAt(original, (current - 1), 5, "EWSKI", "EWSKY",
01331                                 "OWSKI", "OWSKY", "")
01332                     || StringAt(original, 0, 3, "SCH", ""))
01333                 {
01334                     MetaphAdd(primary, "");
01335                     MetaphAdd(secondary, "F");
01336                     current += 1;
01337                     break;
01338                 }
01339 
01340                 /* polish e.g. 'filipowicz' */
01341                 if (StringAt(original, current, 4, "WICZ", "WITZ", ""))
01342                 {
01343                     MetaphAdd(primary, "TS");
01344                     MetaphAdd(secondary, "FX");
01345                     current += 4;
01346                     break;
01347                 }
01348 
01349                 /* else skip it */
01350                 current += 1;
01351                 break;
01352 
01353             case 'X':
01354                 /* french e.g. breaux */
01355                 if (!((current == last)
01356                       && (StringAt(original, (current - 3), 3,
01357                                    "IAU", "EAU", "")
01358                           || StringAt(original, (current - 2), 2,
01359                                       "AU", "OU", ""))))
01360                 {
01361                     MetaphAdd(primary, "KS");
01362                     MetaphAdd(secondary, "KS");
01363                 }
01364 
01365 
01366                 if (StringAt(original, (current + 1), 1, "C", "X", ""))
01367                     current += 2;
01368                 else
01369                     current += 1;
01370                 break;
01371 
01372             case 'Z':
01373                 /* chinese pinyin e.g. 'zhao' */
01374                 if (GetAt(original, current + 1) == 'H')
01375                 {
01376                     MetaphAdd(primary, "J");
01377                     MetaphAdd(secondary, "J");
01378                     current += 2;
01379                     break;
01380                 }
01381                 else if (StringAt(original, (current + 1), 2,
01382                                   "ZO", "ZI", "ZA", "")
01383                          || (SlavoGermanic(original)
01384                              && ((current > 0)
01385                                  && GetAt(original, current - 1) != 'T')))
01386                 {
01387                     MetaphAdd(primary, "S");
01388                     MetaphAdd(secondary, "TS");
01389                 }
01390                 else
01391                 {
01392                     MetaphAdd(primary, "S");
01393                     MetaphAdd(secondary, "S");
01394                 }
01395 
01396                 if (GetAt(original, current + 1) == 'Z')
01397                     current += 2;
01398                 else
01399                     current += 1;
01400                 break;
01401 
01402             default:
01403                 current += 1;
01404         }
01405 
01406         /*
01407          * printf("PRIMARY: %s\n", primary->str); printf("SECONDARY: %s\n",
01408          * secondary->str);
01409          */
01410     }
01411 
01412 
01413     if (primary->length > 4)
01414         SetAt(primary, 4, '\0');
01415 
01416     if (secondary->length > 4)
01417         SetAt(secondary, 4, '\0');
01418 
01419     *codes = primary->str;
01420     *++codes = secondary->str;
01421 
01422     DestroyMetaString(original);
01423     DestroyMetaString(primary);
01424     DestroyMetaString(secondary);
01425 }
01426 
01427 #ifdef DMETAPHONE_MAIN
01428 
01429 /* just for testing - not part of the perl code */
01430 
/*
 * Stand-alone test driver, compiled only when DMETAPHONE_MAIN is defined.
 *
 * Runs DoubleMetaphone() on the first command-line argument and prints the
 * two resulting codes to stdout as "primary|secondary".  When no argument
 * is given, nothing is printed.
 *
 * argc, argv: the usual program arguments; only argv[1] is consulted.
 *
 * Returns 0 in all cases.
 */
int
main(int argc, char **argv)
{
    /* codes[0] receives the primary code, codes[1] the secondary one */
    char       *codes[2];

    if (argc > 1)
    {
        DoubleMetaphone(argv[1], codes);
        printf("%s|%s\n", codes[0], codes[1]);
    }

    /* explicit status: do not depend on implicit int / fall-off-the-end */
    return 0;
}
01441 
01442 #endif