api/editdistance.cc

Go to the documentation of this file.
00001 
00012 /* Copyright (C) 2003 Richard Boulton
00013  * Copyright (C) 2007,2008 Olly Betts
00014  *
00015  * This program is free software; you can redistribute it and/or modify
00016  * it under the terms of the GNU General Public License as published by
00017  * the Free Software Foundation; either version 2 of the License, or
00018  * (at your option) any later version.
00019  *
00020  * This program is distributed in the hope that it will be useful,
00021  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00022  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00023  * GNU General Public License for more details.
00024  *
00025  * You should have received a copy of the GNU General Public License
00026  * along with this program; if not, write to the Free Software
00027  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
00028  */
00029 
00030 #include <config.h>
00031 
00032 #include "editdistance.h"
00033 
00034 #include "omassert.h"
00035 
00036 #include <algorithm>
00037 #include <stdlib.h>
00038 
00039 using namespace std;
00040 
00041 template<class CHR>
00042 struct edist_seq {
00043     edist_seq(const CHR * ptr_, int len_) : ptr(ptr_), len(len_) { }
00044     const CHR * ptr;
00045     int len;
00046 };
00047 
00048 template<class CHR>
00049 class edist_state {
00050     edist_seq<CHR> seq1;
00051     edist_seq<CHR> seq2;
00052 
00053     /* Array of f(k,p) values, where f(k,p) = the largest index i such that
00054      * d(i,j) = p and d(i,j) is on diagonal k.
00055      * ie: f(k,p) = largest i s.t. d(i,k+i) = p
00056      * Where: d(i,j) = edit distance between substrings of length i and j.
00057      */
00058     int * fkp;
00059     int fkp_cols;
00060 
00061     /* Maximum possible edit distance (this is referred to as ZERO_K in
00062      * the algorithm description by Berghel and Roach). */
00063     int maxdist;
00064 
00065     int calc_index(int k, int p) const {
00066         return (k + maxdist) * fkp_cols + p + 1;
00067     }
00068 
00069   public:
00070 
00071     edist_state(const CHR * ptr1, int len1, const CHR * ptr2, int len2);
00072 
00073     ~edist_state();
00074 
00075     int get_f_kp(int k, int p) const {
00076         return fkp[calc_index(k, p)];
00077     }
00078 
00079     void set_f_kp(int k, int p, int val) {
00080         fkp[calc_index(k, p)] = val;
00081     }
00082 
00083     bool is_transposed(int pos1, int pos2) {
00084         if (pos1 <= 0 || pos2 <= 0 || pos1 >= seq1.len || pos2 >= seq2.len) return false;
00085         return (seq1.ptr[pos1 - 1] == seq2.ptr[pos2] &&
00086                 seq1.ptr[pos1] == seq2.ptr[pos2 - 1]);
00087     }
00088 
00089     void edist_calc_f_kp(int k, int p);
00090 };
00091 
00092 template<class CHR>
00093 void edist_state<CHR>::edist_calc_f_kp(int k, int p)
00094 {
00095     int maxlen = get_f_kp(k, p - 1) + 1; /* dist if do substitute */
00096     int maxlen2 = get_f_kp(k - 1, p - 1); /* dist if do insert */
00097     int maxlen3 = get_f_kp(k + 1, p - 1) + 1; /* dist if delete */
00098 
00099     if (is_transposed(maxlen, maxlen + k)) {
00100         // Transposition.
00101         ++maxlen;
00102     }
00103 
00104     if (maxlen >= maxlen2) {
00105         if (maxlen >= maxlen3) {
00106             // Transposition or Substitution.
00107         } else {
00108             // Deletion.
00109             maxlen = maxlen3;
00110         }
00111     } else {
00112         if (maxlen2 >= maxlen3) {
00113             // Insertion.
00114             maxlen = maxlen2;
00115         } else {
00116             // Deletion.
00117             maxlen = maxlen3;
00118         }
00119     }
00120 
00121     /* Check for exact matches, and increase the length until we don't have
00122      * one. */
00123     while (maxlen < seq1.len &&
00124            maxlen + k < seq2.len &&
00125            seq1.ptr[maxlen] == seq2.ptr[maxlen + k]) {
00126         ++maxlen;
00127     }
00128     set_f_kp(k, p, maxlen);
00129 }
00130 
00131 #define INF 1000000
00132 template<class CHR>
00133 edist_state<CHR>::edist_state(const CHR * ptr1, int len1,
00134                               const CHR * ptr2, int len2)
00135     : seq1(ptr1, len1), seq2(ptr2, len2)
00136 {
00137     Assert(len2 >= len1);
00138     maxdist = len2;
00139     /* Each row represents a value of k, from -maxdist to maxdist. */
00140     int fkp_rows = maxdist * 2 + 1;
00141     /* Each column represents a value of p, from -1 to maxdist. */
00142     fkp_cols = maxdist + 2;
00143     /* fkp is stored as a rectangular array, row by row. */
00144     fkp = new int[fkp_rows * fkp_cols];
00145 
00146     for (int k = -maxdist; k <= maxdist; k++) {
00147         for (int p = -1; p <= maxdist; p++) {
00148             if (p == abs(k) - 1) {
00149                 if (k < 0) {
00150                     set_f_kp(k, p, abs(k) - 1);
00151                 } else {
00152                     set_f_kp(k, p, -1);
00153                 }
00154             } else if (p < abs(k)) {
00155                 set_f_kp(k, p, -INF);
00156             }
00157         }
00158     }
00159 }
00160 
00161 template<class CHR>
00162 edist_state<CHR>::~edist_state() {
00163     delete [] fkp;
00164 }
00165 
00166 template<class CHR>
00167 static int
00168 seqcmp_editdist(const CHR * ptr1, int len1, const CHR * ptr2, int len2,
00169                 int max_distance)
00170 {
00171     int lendiff = len2 - len1;
00172     /* Make sure second sequence is longer (or same length). */
00173     if (lendiff < 0) {
00174         lendiff = -lendiff;
00175         swap(ptr1, ptr2);
00176         swap(len1, len2);
00177     }
00178 
00179     /* Special case for if one or both sequences are empty. */
00180     if (len1 == 0) return len2;
00181 
00182     edist_state<CHR> state(ptr1, len1, ptr2, len2);
00183 
00184     int p = lendiff; /* This is the minimum possible edit distance. */
00185     while (p <= max_distance) {
00186         for (int temp_p = 0; temp_p != p; ++temp_p) {
00187             int inc = p - temp_p;
00188             if (abs(lendiff - inc) <= temp_p) {
00189                 state.edist_calc_f_kp(lendiff - inc, temp_p);
00190             }
00191             if (abs(lendiff + inc) <= temp_p) {
00192                 state.edist_calc_f_kp(lendiff + inc, temp_p);
00193             }
00194         }
00195         state.edist_calc_f_kp(lendiff, p);
00196 
00197         if (state.get_f_kp(lendiff, p) == len1) break;
00198         ++p;
00199     }
00200 
00201     return p;
00202 }
00203 
00204 int
00205 edit_distance_unsigned(const unsigned * ptr1, int len1,
00206                        const unsigned * ptr2, int len2,
00207                        int max_distance)
00208 {
00209     return seqcmp_editdist<unsigned>(ptr1, len1, ptr2, len2, max_distance);
00210 }

Documentation for Xapian (version 1.0.10).
Generated on 24 Dec 2008 by Doxygen 1.5.2.