Header And Logo

PostgreSQL
| The world's most advanced open source database.

levenshtein.c

Go to the documentation of this file.
00001 /*
00002  * levenshtein.c
00003  *
00004  * Functions for "fuzzy" comparison of strings
00005  *
00006  * Joe Conway <[email protected]>
00007  *
00008  * Copyright (c) 2001-2013, PostgreSQL Global Development Group
00009  * ALL RIGHTS RESERVED;
00010  *
00011  * levenshtein()
00012  * -------------
00013  * Written based on a description of the algorithm by Michael Gilleland
00014  * found at http://www.merriampark.com/ld.htm
00015  * Also looked at levenshtein.c in the PHP 4.0.6 distribution for
00016  * inspiration.
00017  * Configurable penalty costs extension is introduced by Volkan
00018  * YAZICI <[email protected]>.
00019  */
00020 
00021 /*
00022  * External declarations for exported functions
00023  */
00024 #ifdef LEVENSHTEIN_LESS_EQUAL
00025 static int levenshtein_less_equal_internal(text *s, text *t,
00026                                 int ins_c, int del_c, int sub_c, int max_d);
00027 #else
00028 static int levenshtein_internal(text *s, text *t,
00029                      int ins_c, int del_c, int sub_c);
00030 #endif
00031 
00032 #define MAX_LEVENSHTEIN_STRLEN      255
00033 
00034 
00035 /*
00036  * Calculates Levenshtein distance metric between supplied strings. Generally
00037  * (1, 1, 1) penalty costs suffices for common cases, but your mileage may
00038  * vary.
00039  *
00040  * One way to compute Levenshtein distance is to incrementally construct
00041  * an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number
00042  * of operations required to transform the first i characters of s into
00043  * the first j characters of t.  The last column of the final row is the
00044  * answer.
00045  *
00046  * We use that algorithm here with some modification.  In lieu of holding
00047  * the entire array in memory at once, we'll just use two arrays of size
00048  * m+1 for storing accumulated values. At each step one array represents
00049  * the "previous" row and one is the "current" row of the notional large
00050  * array.
00051  *
00052  * If max_d >= 0, we only need to provide an accurate answer when that answer
00053  * is less than or equal to the bound.  From any cell in the matrix, there is
00054  * theoretical "minimum residual distance" from that cell to the last column
00055  * of the final row.  This minimum residual distance is zero when the
00056  * untransformed portions of the strings are of equal length (because we might
00057  * get lucky and find all the remaining characters matching) and is otherwise
00058  * based on the minimum number of insertions or deletions needed to make them
00059  * equal length.  The residual distance grows as we move toward the upper
00060  * right or lower left corners of the matrix.  When the max_d bound is
00061  * usefully tight, we can use this property to avoid computing the entirety
00062  * of each row; instead, we maintain a start_column and stop_column that
00063  * identify the portion of the matrix close to the diagonal which can still
00064  * affect the final answer.
00065  */
00066 static int
00067 #ifdef LEVENSHTEIN_LESS_EQUAL
00068 levenshtein_less_equal_internal(text *s, text *t,
00069                                 int ins_c, int del_c, int sub_c, int max_d)
00070 #else
00071 levenshtein_internal(text *s, text *t,
00072                      int ins_c, int del_c, int sub_c)
00073 #endif
00074 {
00075     int         m,
00076                 n,
00077                 s_bytes,
00078                 t_bytes;
00079     int        *prev;
00080     int        *curr;
00081     int        *s_char_len = NULL;
00082     int         i,
00083                 j;
00084     const char *s_data;
00085     const char *t_data;
00086     const char *y;
00087 
00088     /*
00089      * For levenshtein_less_equal_internal, we have real variables called
00090      * start_column and stop_column; otherwise it's just short-hand for 0 and
00091      * m.
00092      */
00093 #ifdef LEVENSHTEIN_LESS_EQUAL
00094     int         start_column,
00095                 stop_column;
00096 
00097 #undef START_COLUMN
00098 #undef STOP_COLUMN
00099 #define START_COLUMN start_column
00100 #define STOP_COLUMN stop_column
00101 #else
00102 #undef START_COLUMN
00103 #undef STOP_COLUMN
00104 #define START_COLUMN 0
00105 #define STOP_COLUMN m
00106 #endif
00107 
00108     /* Extract a pointer to the actual character data. */
00109     s_data = VARDATA_ANY(s);
00110     t_data = VARDATA_ANY(t);
00111 
00112     /* Determine length of each string in bytes and characters. */
00113     s_bytes = VARSIZE_ANY_EXHDR(s);
00114     t_bytes = VARSIZE_ANY_EXHDR(t);
00115     m = pg_mbstrlen_with_len(s_data, s_bytes);
00116     n = pg_mbstrlen_with_len(t_data, t_bytes);
00117 
00118     /*
00119      * We can transform an empty s into t with n insertions, or a non-empty t
00120      * into an empty s with m deletions.
00121      */
00122     if (!m)
00123         return n * ins_c;
00124     if (!n)
00125         return m * del_c;
00126 
00127     /*
00128      * For security concerns, restrict excessive CPU+RAM usage. (This
00129      * implementation uses O(m) memory and has O(mn) complexity.)
00130      */
00131     if (m > MAX_LEVENSHTEIN_STRLEN ||
00132         n > MAX_LEVENSHTEIN_STRLEN)
00133         ereport(ERROR,
00134                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00135                  errmsg("argument exceeds the maximum length of %d bytes",
00136                         MAX_LEVENSHTEIN_STRLEN)));
00137 
00138 #ifdef LEVENSHTEIN_LESS_EQUAL
00139     /* Initialize start and stop columns. */
00140     start_column = 0;
00141     stop_column = m + 1;
00142 
00143     /*
00144      * If max_d >= 0, determine whether the bound is impossibly tight.  If so,
00145      * return max_d + 1 immediately.  Otherwise, determine whether it's tight
00146      * enough to limit the computation we must perform.  If so, figure out
00147      * initial stop column.
00148      */
00149     if (max_d >= 0)
00150     {
00151         int         min_theo_d; /* Theoretical minimum distance. */
00152         int         max_theo_d; /* Theoretical maximum distance. */
00153         int         net_inserts = n - m;
00154 
00155         min_theo_d = net_inserts < 0 ?
00156             -net_inserts * del_c : net_inserts * ins_c;
00157         if (min_theo_d > max_d)
00158             return max_d + 1;
00159         if (ins_c + del_c < sub_c)
00160             sub_c = ins_c + del_c;
00161         max_theo_d = min_theo_d + sub_c * Min(m, n);
00162         if (max_d >= max_theo_d)
00163             max_d = -1;
00164         else if (ins_c + del_c > 0)
00165         {
00166             /*
00167              * Figure out how much of the first row of the notional matrix we
00168              * need to fill in.  If the string is growing, the theoretical
00169              * minimum distance already incorporates the cost of deleting the
00170              * number of characters necessary to make the two strings equal in
00171              * length.  Each additional deletion forces another insertion, so
00172              * the best-case total cost increases by ins_c + del_c. If the
00173              * string is shrinking, the minimum theoretical cost assumes no
00174              * excess deletions; that is, we're starting no further right than
00175              * column n - m.  If we do start further right, the best-case
00176              * total cost increases by ins_c + del_c for each move right.
00177              */
00178             int         slack_d = max_d - min_theo_d;
00179             int         best_column = net_inserts < 0 ? -net_inserts : 0;
00180 
00181             stop_column = best_column + (slack_d / (ins_c + del_c)) + 1;
00182             if (stop_column > m)
00183                 stop_column = m + 1;
00184         }
00185     }
00186 #endif
00187 
00188     /*
00189      * In order to avoid calling pg_mblen() repeatedly on each character in s,
00190      * we cache all the lengths before starting the main loop -- but if all
00191      * the characters in both strings are single byte, then we skip this and
00192      * use a fast-path in the main loop.  If only one string contains
00193      * multi-byte characters, we still build the array, so that the fast-path
00194      * needn't deal with the case where the array hasn't been initialized.
00195      */
00196     if (m != s_bytes || n != t_bytes)
00197     {
00198         int         i;
00199         const char *cp = s_data;
00200 
00201         s_char_len = (int *) palloc((m + 1) * sizeof(int));
00202         for (i = 0; i < m; ++i)
00203         {
00204             s_char_len[i] = pg_mblen(cp);
00205             cp += s_char_len[i];
00206         }
00207         s_char_len[i] = 0;
00208     }
00209 
00210     /* One more cell for initialization column and row. */
00211     ++m;
00212     ++n;
00213 
00214     /* Previous and current rows of notional array. */
00215     prev = (int *) palloc(2 * m * sizeof(int));
00216     curr = prev + m;
00217 
00218     /*
00219      * To transform the first i characters of s into the first 0 characters of
00220      * t, we must perform i deletions.
00221      */
00222     for (i = START_COLUMN; i < STOP_COLUMN; i++)
00223         prev[i] = i * del_c;
00224 
00225     /* Loop through rows of the notional array */
00226     for (y = t_data, j = 1; j < n; j++)
00227     {
00228         int        *temp;
00229         const char *x = s_data;
00230         int         y_char_len = n != t_bytes + 1 ? pg_mblen(y) : 1;
00231 
00232 #ifdef LEVENSHTEIN_LESS_EQUAL
00233 
00234         /*
00235          * In the best case, values percolate down the diagonal unchanged, so
00236          * we must increment stop_column unless it's already on the right end
00237          * of the array.  The inner loop will read prev[stop_column], so we
00238          * have to initialize it even though it shouldn't affect the result.
00239          */
00240         if (stop_column < m)
00241         {
00242             prev[stop_column] = max_d + 1;
00243             ++stop_column;
00244         }
00245 
00246         /*
00247          * The main loop fills in curr, but curr[0] needs a special case: to
00248          * transform the first 0 characters of s into the first j characters
00249          * of t, we must perform j insertions.  However, if start_column > 0,
00250          * this special case does not apply.
00251          */
00252         if (start_column == 0)
00253         {
00254             curr[0] = j * ins_c;
00255             i = 1;
00256         }
00257         else
00258             i = start_column;
00259 #else
00260         curr[0] = j * ins_c;
00261         i = 1;
00262 #endif
00263 
00264         /*
00265          * This inner loop is critical to performance, so we include a
00266          * fast-path to handle the (fairly common) case where no multibyte
00267          * characters are in the mix.  The fast-path is entitled to assume
00268          * that if s_char_len is not initialized then BOTH strings contain
00269          * only single-byte characters.
00270          */
00271         if (s_char_len != NULL)
00272         {
00273             for (; i < STOP_COLUMN; i++)
00274             {
00275                 int         ins;
00276                 int         del;
00277                 int         sub;
00278                 int         x_char_len = s_char_len[i - 1];
00279 
00280                 /*
00281                  * Calculate costs for insertion, deletion, and substitution.
00282                  *
00283                  * When calculating cost for substitution, we compare the last
00284                  * character of each possibly-multibyte character first,
00285                  * because that's enough to rule out most mis-matches.  If we
00286                  * get past that test, then we compare the lengths and the
00287                  * remaining bytes.
00288                  */
00289                 ins = prev[i] + ins_c;
00290                 del = curr[i - 1] + del_c;
00291                 if (x[x_char_len - 1] == y[y_char_len - 1]
00292                     && x_char_len == y_char_len &&
00293                     (x_char_len == 1 || rest_of_char_same(x, y, x_char_len)))
00294                     sub = prev[i - 1];
00295                 else
00296                     sub = prev[i - 1] + sub_c;
00297 
00298                 /* Take the one with minimum cost. */
00299                 curr[i] = Min(ins, del);
00300                 curr[i] = Min(curr[i], sub);
00301 
00302                 /* Point to next character. */
00303                 x += x_char_len;
00304             }
00305         }
00306         else
00307         {
00308             for (; i < STOP_COLUMN; i++)
00309             {
00310                 int         ins;
00311                 int         del;
00312                 int         sub;
00313 
00314                 /* Calculate costs for insertion, deletion, and substitution. */
00315                 ins = prev[i] + ins_c;
00316                 del = curr[i - 1] + del_c;
00317                 sub = prev[i - 1] + ((*x == *y) ? 0 : sub_c);
00318 
00319                 /* Take the one with minimum cost. */
00320                 curr[i] = Min(ins, del);
00321                 curr[i] = Min(curr[i], sub);
00322 
00323                 /* Point to next character. */
00324                 x++;
00325             }
00326         }
00327 
00328         /* Swap current row with previous row. */
00329         temp = curr;
00330         curr = prev;
00331         prev = temp;
00332 
00333         /* Point to next character. */
00334         y += y_char_len;
00335 
00336 #ifdef LEVENSHTEIN_LESS_EQUAL
00337 
00338         /*
00339          * This chunk of code represents a significant performance hit if used
00340          * in the case where there is no max_d bound.  This is probably not
00341          * because the max_d >= 0 test itself is expensive, but rather because
00342          * the possibility of needing to execute this code prevents tight
00343          * optimization of the loop as a whole.
00344          */
00345         if (max_d >= 0)
00346         {
00347             /*
00348              * The "zero point" is the column of the current row where the
00349              * remaining portions of the strings are of equal length.  There
00350              * are (n - 1) characters in the target string, of which j have
00351              * been transformed.  There are (m - 1) characters in the source
00352              * string, so we want to find the value for zp where (n - 1) - j =
00353              * (m - 1) - zp.
00354              */
00355             int         zp = j - (n - m);
00356 
00357             /* Check whether the stop column can slide left. */
00358             while (stop_column > 0)
00359             {
00360                 int         ii = stop_column - 1;
00361                 int         net_inserts = ii - zp;
00362 
00363                 if (prev[ii] + (net_inserts > 0 ? net_inserts * ins_c :
00364                                 -net_inserts * del_c) <= max_d)
00365                     break;
00366                 stop_column--;
00367             }
00368 
00369             /* Check whether the start column can slide right. */
00370             while (start_column < stop_column)
00371             {
00372                 int         net_inserts = start_column - zp;
00373 
00374                 if (prev[start_column] +
00375                     (net_inserts > 0 ? net_inserts * ins_c :
00376                      -net_inserts * del_c) <= max_d)
00377                     break;
00378 
00379                 /*
00380                  * We'll never again update these values, so we must make sure
00381                  * there's nothing here that could confuse any future
00382                  * iteration of the outer loop.
00383                  */
00384                 prev[start_column] = max_d + 1;
00385                 curr[start_column] = max_d + 1;
00386                 if (start_column != 0)
00387                     s_data += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1;
00388                 start_column++;
00389             }
00390 
00391             /* If they cross, we're going to exceed the bound. */
00392             if (start_column >= stop_column)
00393                 return max_d + 1;
00394         }
00395 #endif
00396     }
00397 
00398     /*
00399      * Because the final value was swapped from the previous row to the
00400      * current row, that's where we'll find it.
00401      */
00402     return prev[m - 1];
00403 }