00001
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030 #include <config.h>
00031
00032 #include "editdistance.h"
00033
00034 #include "omassert.h"
00035
00036 #include <algorithm>
00037 #include <stdlib.h>
00038
00039 using namespace std;
00040
00041 template<class CHR>
00042 struct edist_seq {
00043 edist_seq(const CHR * ptr_, int len_) : ptr(ptr_), len(len_) { }
00044 const CHR * ptr;
00045 int len;
00046 };
00047
00048 template<class CHR>
00049 class edist_state {
00050 edist_seq<CHR> seq1;
00051 edist_seq<CHR> seq2;
00052
00053
00054
00055
00056
00057
00058 int * fkp;
00059 int fkp_cols;
00060
00061
00062
00063 int maxdist;
00064
00065 int calc_index(int k, int p) const {
00066 return (k + maxdist) * fkp_cols + p + 1;
00067 }
00068
00069 public:
00070
00071 edist_state(const CHR * ptr1, int len1, const CHR * ptr2, int len2);
00072
00073 ~edist_state();
00074
00075 int get_f_kp(int k, int p) const {
00076 return fkp[calc_index(k, p)];
00077 }
00078
00079 void set_f_kp(int k, int p, int val) {
00080 fkp[calc_index(k, p)] = val;
00081 }
00082
00083 bool is_transposed(int pos1, int pos2) {
00084 if (pos1 <= 0 || pos2 <= 0 || pos1 >= seq1.len || pos2 >= seq2.len) return false;
00085 return (seq1.ptr[pos1 - 1] == seq2.ptr[pos2] &&
00086 seq1.ptr[pos1] == seq2.ptr[pos2 - 1]);
00087 }
00088
00089 void edist_calc_f_kp(int k, int p);
00090 };
00091
00092 template<class CHR>
00093 void edist_state<CHR>::edist_calc_f_kp(int k, int p)
00094 {
00095 int maxlen = get_f_kp(k, p - 1) + 1;
00096 int maxlen2 = get_f_kp(k - 1, p - 1);
00097 int maxlen3 = get_f_kp(k + 1, p - 1) + 1;
00098
00099 if (is_transposed(maxlen, maxlen + k)) {
00100
00101 ++maxlen;
00102 }
00103
00104 if (maxlen >= maxlen2) {
00105 if (maxlen >= maxlen3) {
00106
00107 } else {
00108
00109 maxlen = maxlen3;
00110 }
00111 } else {
00112 if (maxlen2 >= maxlen3) {
00113
00114 maxlen = maxlen2;
00115 } else {
00116
00117 maxlen = maxlen3;
00118 }
00119 }
00120
00121
00122
00123 while (maxlen < seq1.len &&
00124 maxlen + k < seq2.len &&
00125 seq1.ptr[maxlen] == seq2.ptr[maxlen + k]) {
00126 ++maxlen;
00127 }
00128 set_f_kp(k, p, maxlen);
00129 }
00130
00131 #define INF 1000000
00132 template<class CHR>
00133 edist_state<CHR>::edist_state(const CHR * ptr1, int len1,
00134 const CHR * ptr2, int len2)
00135 : seq1(ptr1, len1), seq2(ptr2, len2)
00136 {
00137 Assert(len2 >= len1);
00138 maxdist = len2;
00139
00140 int fkp_rows = maxdist * 2 + 1;
00141
00142 fkp_cols = maxdist + 2;
00143
00144 fkp = new int[fkp_rows * fkp_cols];
00145
00146 for (int k = -maxdist; k <= maxdist; k++) {
00147 for (int p = -1; p <= maxdist; p++) {
00148 if (p == abs(k) - 1) {
00149 if (k < 0) {
00150 set_f_kp(k, p, abs(k) - 1);
00151 } else {
00152 set_f_kp(k, p, -1);
00153 }
00154 } else if (p < abs(k)) {
00155 set_f_kp(k, p, -INF);
00156 }
00157 }
00158 }
00159 }
00160
00161 template<class CHR>
00162 edist_state<CHR>::~edist_state() {
00163 delete [] fkp;
00164 }
00165
00166 template<class CHR>
00167 static int
00168 seqcmp_editdist(const CHR * ptr1, int len1, const CHR * ptr2, int len2,
00169 int max_distance)
00170 {
00171 int lendiff = len2 - len1;
00172
00173 if (lendiff < 0) {
00174 lendiff = -lendiff;
00175 swap(ptr1, ptr2);
00176 swap(len1, len2);
00177 }
00178
00179
00180 if (len1 == 0) return len2;
00181
00182 edist_state<CHR> state(ptr1, len1, ptr2, len2);
00183
00184 int p = lendiff;
00185 while (p <= max_distance) {
00186 for (int temp_p = 0; temp_p != p; ++temp_p) {
00187 int inc = p - temp_p;
00188 if (abs(lendiff - inc) <= temp_p) {
00189 state.edist_calc_f_kp(lendiff - inc, temp_p);
00190 }
00191 if (abs(lendiff + inc) <= temp_p) {
00192 state.edist_calc_f_kp(lendiff + inc, temp_p);
00193 }
00194 }
00195 state.edist_calc_f_kp(lendiff, p);
00196
00197 if (state.get_f_kp(lendiff, p) == len1) break;
00198 ++p;
00199 }
00200
00201 return p;
00202 }
00203
00204 int
00205 edit_distance_unsigned(const unsigned * ptr1, int len1,
00206 const unsigned * ptr2, int len2,
00207 int max_distance)
00208 {
00209 return seqcmp_editdist<unsigned>(ptr1, len1, ptr2, len2, max_distance);
00210 }