00001 /* 00002 * levenshtein.c 00003 * 00004 * Functions for "fuzzy" comparison of strings 00005 * 00006 * Joe Conway <[email protected]> 00007 * 00008 * Copyright (c) 2001-2013, PostgreSQL Global Development Group 00009 * ALL RIGHTS RESERVED; 00010 * 00011 * levenshtein() 00012 * ------------- 00013 * Written based on a description of the algorithm by Michael Gilleland 00014 * found at http://www.merriampark.com/ld.htm 00015 * Also looked at levenshtein.c in the PHP 4.0.6 distribution for 00016 * inspiration. 00017 * Configurable penalty costs extension is introduced by Volkan 00018 * YAZICI <[email protected]>. 00019 */ 00020 00021 /* 00022 * External declarations for exported functions 00023 */ 00024 #ifdef LEVENSHTEIN_LESS_EQUAL 00025 static int levenshtein_less_equal_internal(text *s, text *t, 00026 int ins_c, int del_c, int sub_c, int max_d); 00027 #else 00028 static int levenshtein_internal(text *s, text *t, 00029 int ins_c, int del_c, int sub_c); 00030 #endif 00031 00032 #define MAX_LEVENSHTEIN_STRLEN 255 00033 00034 00035 /* 00036 * Calculates Levenshtein distance metric between supplied strings. Generally 00037 * (1, 1, 1) penalty costs suffices for common cases, but your mileage may 00038 * vary. 00039 * 00040 * One way to compute Levenshtein distance is to incrementally construct 00041 * an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number 00042 * of operations required to transform the first i characters of s into 00043 * the first j characters of t. The last column of the final row is the 00044 * answer. 00045 * 00046 * We use that algorithm here with some modification. In lieu of holding 00047 * the entire array in memory at once, we'll just use two arrays of size 00048 * m+1 for storing accumulated values. At each step one array represents 00049 * the "previous" row and one is the "current" row of the notional large 00050 * array. 00051 * 00052 * If max_d >= 0, we only need to provide an accurate answer when that answer 00053 * is less than or equal to the bound. From any cell in the matrix, there is 00054 * theoretical "minimum residual distance" from that cell to the last column 00055 * of the final row. This minimum residual distance is zero when the 00056 * untransformed portions of the strings are of equal length (because we might 00057 * get lucky and find all the remaining characters matching) and is otherwise 00058 * based on the minimum number of insertions or deletions needed to make them 00059 * equal length. The residual distance grows as we move toward the upper 00060 * right or lower left corners of the matrix. When the max_d bound is 00061 * usefully tight, we can use this property to avoid computing the entirety 00062 * of each row; instead, we maintain a start_column and stop_column that 00063 * identify the portion of the matrix close to the diagonal which can still 00064 * affect the final answer. 00065 */ 00066 static int 00067 #ifdef LEVENSHTEIN_LESS_EQUAL 00068 levenshtein_less_equal_internal(text *s, text *t, 00069 int ins_c, int del_c, int sub_c, int max_d) 00070 #else 00071 levenshtein_internal(text *s, text *t, 00072 int ins_c, int del_c, int sub_c) 00073 #endif 00074 { 00075 int m, 00076 n, 00077 s_bytes, 00078 t_bytes; 00079 int *prev; 00080 int *curr; 00081 int *s_char_len = NULL; 00082 int i, 00083 j; 00084 const char *s_data; 00085 const char *t_data; 00086 const char *y; 00087 00088 /* 00089 * For levenshtein_less_equal_internal, we have real variables called 00090 * start_column and stop_column; otherwise it's just short-hand for 0 and 00091 * m. 00092 */ 00093 #ifdef LEVENSHTEIN_LESS_EQUAL 00094 int start_column, 00095 stop_column; 00096 00097 #undef START_COLUMN 00098 #undef STOP_COLUMN 00099 #define START_COLUMN start_column 00100 #define STOP_COLUMN stop_column 00101 #else 00102 #undef START_COLUMN 00103 #undef STOP_COLUMN 00104 #define START_COLUMN 0 00105 #define STOP_COLUMN m 00106 #endif 00107 00108 /* Extract a pointer to the actual character data. */ 00109 s_data = VARDATA_ANY(s); 00110 t_data = VARDATA_ANY(t); 00111 00112 /* Determine length of each string in bytes and characters. */ 00113 s_bytes = VARSIZE_ANY_EXHDR(s); 00114 t_bytes = VARSIZE_ANY_EXHDR(t); 00115 m = pg_mbstrlen_with_len(s_data, s_bytes); 00116 n = pg_mbstrlen_with_len(t_data, t_bytes); 00117 00118 /* 00119 * We can transform an empty s into t with n insertions, or a non-empty t 00120 * into an empty s with m deletions. 00121 */ 00122 if (!m) 00123 return n * ins_c; 00124 if (!n) 00125 return m * del_c; 00126 00127 /* 00128 * For security concerns, restrict excessive CPU+RAM usage. (This 00129 * implementation uses O(m) memory and has O(mn) complexity.) 00130 */ 00131 if (m > MAX_LEVENSHTEIN_STRLEN || 00132 n > MAX_LEVENSHTEIN_STRLEN) 00133 ereport(ERROR, 00134 (errcode(ERRCODE_INVALID_PARAMETER_VALUE), 00135 errmsg("argument exceeds the maximum length of %d bytes", 00136 MAX_LEVENSHTEIN_STRLEN))); 00137 00138 #ifdef LEVENSHTEIN_LESS_EQUAL 00139 /* Initialize start and stop columns. */ 00140 start_column = 0; 00141 stop_column = m + 1; 00142 00143 /* 00144 * If max_d >= 0, determine whether the bound is impossibly tight. If so, 00145 * return max_d + 1 immediately. Otherwise, determine whether it's tight 00146 * enough to limit the computation we must perform. If so, figure out 00147 * initial stop column. 00148 */ 00149 if (max_d >= 0) 00150 { 00151 int min_theo_d; /* Theoretical minimum distance. */ 00152 int max_theo_d; /* Theoretical maximum distance. */ 00153 int net_inserts = n - m; 00154 00155 min_theo_d = net_inserts < 0 ? 00156 -net_inserts * del_c : net_inserts * ins_c; 00157 if (min_theo_d > max_d) 00158 return max_d + 1; 00159 if (ins_c + del_c < sub_c) 00160 sub_c = ins_c + del_c; 00161 max_theo_d = min_theo_d + sub_c * Min(m, n); 00162 if (max_d >= max_theo_d) 00163 max_d = -1; 00164 else if (ins_c + del_c > 0) 00165 { 00166 /* 00167 * Figure out how much of the first row of the notional matrix we 00168 * need to fill in. If the string is growing, the theoretical 00169 * minimum distance already incorporates the cost of deleting the 00170 * number of characters necessary to make the two strings equal in 00171 * length. Each additional deletion forces another insertion, so 00172 * the best-case total cost increases by ins_c + del_c. If the 00173 * string is shrinking, the minimum theoretical cost assumes no 00174 * excess deletions; that is, we're starting no further right than 00175 * column n - m. If we do start further right, the best-case 00176 * total cost increases by ins_c + del_c for each move right. 00177 */ 00178 int slack_d = max_d - min_theo_d; 00179 int best_column = net_inserts < 0 ? -net_inserts : 0; 00180 00181 stop_column = best_column + (slack_d / (ins_c + del_c)) + 1; 00182 if (stop_column > m) 00183 stop_column = m + 1; 00184 } 00185 } 00186 #endif 00187 00188 /* 00189 * In order to avoid calling pg_mblen() repeatedly on each character in s, 00190 * we cache all the lengths before starting the main loop -- but if all 00191 * the characters in both strings are single byte, then we skip this and 00192 * use a fast-path in the main loop. If only one string contains 00193 * multi-byte characters, we still build the array, so that the fast-path 00194 * needn't deal with the case where the array hasn't been initialized. 00195 */ 00196 if (m != s_bytes || n != t_bytes) 00197 { 00198 int i; 00199 const char *cp = s_data; 00200 00201 s_char_len = (int *) palloc((m + 1) * sizeof(int)); 00202 for (i = 0; i < m; ++i) 00203 { 00204 s_char_len[i] = pg_mblen(cp); 00205 cp += s_char_len[i]; 00206 } 00207 s_char_len[i] = 0; 00208 } 00209 00210 /* One more cell for initialization column and row. */ 00211 ++m; 00212 ++n; 00213 00214 /* Previous and current rows of notional array. */ 00215 prev = (int *) palloc(2 * m * sizeof(int)); 00216 curr = prev + m; 00217 00218 /* 00219 * To transform the first i characters of s into the first 0 characters of 00220 * t, we must perform i deletions. 00221 */ 00222 for (i = START_COLUMN; i < STOP_COLUMN; i++) 00223 prev[i] = i * del_c; 00224 00225 /* Loop through rows of the notional array */ 00226 for (y = t_data, j = 1; j < n; j++) 00227 { 00228 int *temp; 00229 const char *x = s_data; 00230 int y_char_len = n != t_bytes + 1 ? pg_mblen(y) : 1; 00231 00232 #ifdef LEVENSHTEIN_LESS_EQUAL 00233 00234 /* 00235 * In the best case, values percolate down the diagonal unchanged, so 00236 * we must increment stop_column unless it's already on the right end 00237 * of the array. The inner loop will read prev[stop_column], so we 00238 * have to initialize it even though it shouldn't affect the result. 00239 */ 00240 if (stop_column < m) 00241 { 00242 prev[stop_column] = max_d + 1; 00243 ++stop_column; 00244 } 00245 00246 /* 00247 * The main loop fills in curr, but curr[0] needs a special case: to 00248 * transform the first 0 characters of s into the first j characters 00249 * of t, we must perform j insertions. However, if start_column > 0, 00250 * this special case does not apply. 00251 */ 00252 if (start_column == 0) 00253 { 00254 curr[0] = j * ins_c; 00255 i = 1; 00256 } 00257 else 00258 i = start_column; 00259 #else 00260 curr[0] = j * ins_c; 00261 i = 1; 00262 #endif 00263 00264 /* 00265 * This inner loop is critical to performance, so we include a 00266 * fast-path to handle the (fairly common) case where no multibyte 00267 * characters are in the mix. The fast-path is entitled to assume 00268 * that if s_char_len is not initialized then BOTH strings contain 00269 * only single-byte characters. 00270 */ 00271 if (s_char_len != NULL) 00272 { 00273 for (; i < STOP_COLUMN; i++) 00274 { 00275 int ins; 00276 int del; 00277 int sub; 00278 int x_char_len = s_char_len[i - 1]; 00279 00280 /* 00281 * Calculate costs for insertion, deletion, and substitution. 00282 * 00283 * When calculating cost for substitution, we compare the last 00284 * character of each possibly-multibyte character first, 00285 * because that's enough to rule out most mis-matches. If we 00286 * get past that test, then we compare the lengths and the 00287 * remaining bytes. 00288 */ 00289 ins = prev[i] + ins_c; 00290 del = curr[i - 1] + del_c; 00291 if (x[x_char_len - 1] == y[y_char_len - 1] 00292 && x_char_len == y_char_len && 00293 (x_char_len == 1 || rest_of_char_same(x, y, x_char_len))) 00294 sub = prev[i - 1]; 00295 else 00296 sub = prev[i - 1] + sub_c; 00297 00298 /* Take the one with minimum cost. */ 00299 curr[i] = Min(ins, del); 00300 curr[i] = Min(curr[i], sub); 00301 00302 /* Point to next character. */ 00303 x += x_char_len; 00304 } 00305 } 00306 else 00307 { 00308 for (; i < STOP_COLUMN; i++) 00309 { 00310 int ins; 00311 int del; 00312 int sub; 00313 00314 /* Calculate costs for insertion, deletion, and substitution. */ 00315 ins = prev[i] + ins_c; 00316 del = curr[i - 1] + del_c; 00317 sub = prev[i - 1] + ((*x == *y) ? 0 : sub_c); 00318 00319 /* Take the one with minimum cost. */ 00320 curr[i] = Min(ins, del); 00321 curr[i] = Min(curr[i], sub); 00322 00323 /* Point to next character. */ 00324 x++; 00325 } 00326 } 00327 00328 /* Swap current row with previous row. */ 00329 temp = curr; 00330 curr = prev; 00331 prev = temp; 00332 00333 /* Point to next character. */ 00334 y += y_char_len; 00335 00336 #ifdef LEVENSHTEIN_LESS_EQUAL 00337 00338 /* 00339 * This chunk of code represents a significant performance hit if used 00340 * in the case where there is no max_d bound. This is probably not 00341 * because the max_d >= 0 test itself is expensive, but rather because 00342 * the possibility of needing to execute this code prevents tight 00343 * optimization of the loop as a whole. 00344 */ 00345 if (max_d >= 0) 00346 { 00347 /* 00348 * The "zero point" is the column of the current row where the 00349 * remaining portions of the strings are of equal length. There 00350 * are (n - 1) characters in the target string, of which j have 00351 * been transformed. There are (m - 1) characters in the source 00352 * string, so we want to find the value for zp where (n - 1) - j = 00353 * (m - 1) - zp. 00354 */ 00355 int zp = j - (n - m); 00356 00357 /* Check whether the stop column can slide left. */ 00358 while (stop_column > 0) 00359 { 00360 int ii = stop_column - 1; 00361 int net_inserts = ii - zp; 00362 00363 if (prev[ii] + (net_inserts > 0 ? net_inserts * ins_c : 00364 -net_inserts * del_c) <= max_d) 00365 break; 00366 stop_column--; 00367 } 00368 00369 /* Check whether the start column can slide right. */ 00370 while (start_column < stop_column) 00371 { 00372 int net_inserts = start_column - zp; 00373 00374 if (prev[start_column] + 00375 (net_inserts > 0 ? net_inserts * ins_c : 00376 -net_inserts * del_c) <= max_d) 00377 break; 00378 00379 /* 00380 * We'll never again update these values, so we must make sure 00381 * there's nothing here that could confuse any future 00382 * iteration of the outer loop. 00383 */ 00384 prev[start_column] = max_d + 1; 00385 curr[start_column] = max_d + 1; 00386 if (start_column != 0) 00387 s_data += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1; 00388 start_column++; 00389 } 00390 00391 /* If they cross, we're going to exceed the bound. */ 00392 if (start_column >= stop_column) 00393 return max_d + 1; 00394 } 00395 #endif 00396 } 00397 00398 /* 00399 * Because the final value was swapped from the previous row to the 00400 * current row, that's where we'll find it. 00401 */ 00402 return prev[m - 1]; 00403 }