MediaWiki
REL1_19
|
00001 <?php 00039 class WikiDiff3 { 00040 00041 // Input variables 00042 private $from; 00043 private $to; 00044 private $m; 00045 private $n; 00046 00047 private $tooLong; 00048 private $powLimit; 00049 00050 // State variables 00051 private $maxDifferences; 00052 private $lcsLengthCorrectedForHeuristic = false; 00053 00054 // Output variables 00055 public $length; 00056 public $removed; 00057 public $added; 00058 public $heuristicUsed; 00059 00060 function __construct( $tooLong = 2000000, $powLimit = 1.45 ) { 00061 $this->tooLong = $tooLong; 00062 $this->powLimit = $powLimit; 00063 } 00064 00065 public function diff( /*array*/ $from, /*array*/ $to ) { 00066 // remember initial lengths 00067 $m = sizeof( $from ); 00068 $n = count( $to ); 00069 00070 $this->heuristicUsed = false; 00071 00072 // output 00073 $removed = $m > 0 ? array_fill( 0, $m, true ) : array(); 00074 $added = $n > 0 ? array_fill( 0, $n, true ) : array(); 00075 00076 // reduce the complexity for the next step (intentionally done twice) 00077 // remove common tokens at the start 00078 $i = 0; 00079 while ( $i < $m && $i < $n && $from[$i] === $to[$i] ) { 00080 $removed[$i] = $added[$i] = false; 00081 unset( $from[$i], $to[$i] ); 00082 ++$i; 00083 } 00084 00085 // remove common tokens at the end 00086 $j = 1; 00087 while ( $i + $j <= $m && $i + $j <= $n && $from[$m - $j] === $to[$n - $j] ) { 00088 $removed[$m - $j] = $added[$n - $j] = false; 00089 unset( $from[$m - $j], $to[$n - $j] ); 00090 ++$j; 00091 } 00092 00093 $this->from = $newFromIndex = $this->to = $newToIndex = array(); 00094 00095 // remove tokens not in both sequences 00096 $shared = array(); 00097 foreach ( $from as $key ) { 00098 $shared[$key] = false; 00099 } 00100 00101 foreach ( $to as $index => &$el ) { 00102 if ( array_key_exists( $el, $shared ) ) { 00103 // keep it 00104 $this->to[] = $el; 00105 $shared[$el] = true; 00106 $newToIndex[] = $index; 00107 } 00108 } 00109 foreach ( $from as $index => &$el ) { 00110 if ( $shared[$el] ) { 00111 // keep it 00112 $this->from[] = $el; 00113 $newFromIndex[] = $index; 00114 } 00115 } 00116 00117 unset( $shared, $from, $to ); 00118 00119 $this->m = count( $this->from ); 00120 $this->n = count( $this->to ); 00121 00122 $this->removed = $this->m > 0 ? array_fill( 0, $this->m, true ) : array(); 00123 $this->added = $this->n > 0 ? array_fill( 0, $this->n, true ) : array(); 00124 00125 if ( $this->m == 0 || $this->n == 0 ) { 00126 $this->length = 0; 00127 } else { 00128 $this->maxDifferences = ceil( ( $this->m + $this->n ) / 2.0 ); 00129 if ( $this->m * $this->n > $this->tooLong ) { 00130 // limit complexity to D^POW_LIMIT for long sequences 00131 $this->maxDifferences = floor( pow( $this->maxDifferences, $this->powLimit - 1.0 ) ); 00132 wfDebug( "Limiting max number of differences to $this->maxDifferences\n" ); 00133 } 00134 00135 /* 00136 * The common prefixes and suffixes are always part of some LCS, include 00137 * them now to reduce our search space 00138 */ 00139 $max = min( $this->m, $this->n ); 00140 for ( $forwardBound = 0; $forwardBound < $max 00141 && $this->from[$forwardBound] === $this->to[$forwardBound]; 00142 ++$forwardBound ) { 00143 $this->removed[$forwardBound] = $this->added[$forwardBound] = false; 00144 } 00145 00146 $backBoundL1 = $this->m - 1; 00147 $backBoundL2 = $this->n - 1; 00148 00149 while ( $backBoundL1 >= $forwardBound && $backBoundL2 >= $forwardBound 00150 && $this->from[$backBoundL1] === $this->to[$backBoundL2] ) { 00151 $this->removed[$backBoundL1--] = $this->added[$backBoundL2--] = false; 00152 } 00153 00154 $temp = array_fill( 0, $this->m + $this->n + 1, 0 ); 00155 $V = array( $temp, $temp ); 00156 $snake = array( 0, 0, 0 ); 00157 00158 $this->length = $forwardBound + $this->m - $backBoundL1 - 1 00159 + $this->lcs_rec( $forwardBound, $backBoundL1, 00160 $forwardBound, $backBoundL2, $V, $snake ); 00161 } 00162 00163 $this->m = $m; 00164 $this->n = $n; 00165 00166 $this->length += $i + $j - 1; 00167 00168 foreach ( $this->removed as $key => &$removed_elem ) { 00169 if ( !$removed_elem ) { 00170 $removed[$newFromIndex[$key]] = false; 00171 } 00172 } 00173 foreach ( $this->added as $key => &$added_elem ) { 00174 if ( !$added_elem ) { 00175 $added[$newToIndex[$key]] = false; 00176 } 00177 } 00178 $this->removed = $removed; 00179 $this->added = $added; 00180 } 00181 00182 function diff_range( $from_lines, $to_lines ) { 00183 // Diff and store locally 00184 $this->diff( $from_lines, $to_lines ); 00185 unset( $from_lines, $to_lines ); 00186 00187 $ranges = array(); 00188 $xi = $yi = 0; 00189 while ( $xi < $this->m || $yi < $this->n ) { 00190 // Matching "snake". 00191 while ( $xi < $this->m && $yi < $this->n 00192 && !$this->removed[$xi] 00193 && !$this->added[$yi] ) { 00194 ++$xi; 00195 ++$yi; 00196 } 00197 // Find deletes & adds. 00198 $xstart = $xi; 00199 while ( $xi < $this->m && $this->removed[$xi] ) { 00200 ++$xi; 00201 } 00202 00203 $ystart = $yi; 00204 while ( $yi < $this->n && $this->added[$yi] ) { 00205 ++$yi; 00206 } 00207 00208 if ( $xi > $xstart || $yi > $ystart ) { 00209 $ranges[] = new RangeDifference( $xstart, $xi, 00210 $ystart, $yi ); 00211 } 00212 } 00213 return $ranges; 00214 } 00215 00216 private function lcs_rec( $bottoml1, $topl1, $bottoml2, $topl2, &$V, &$snake ) { 00217 // check that both sequences are non-empty 00218 if ( $bottoml1 > $topl1 || $bottoml2 > $topl2 ) { 00219 return 0; 00220 } 00221 00222 $d = $this->find_middle_snake( $bottoml1, $topl1, $bottoml2, 00223 $topl2, $V, $snake ); 00224 00225 // need to store these so we don't lose them when they're 00226 // overwritten by the recursion 00227 $len = $snake[2]; 00228 $startx = $snake[0]; 00229 $starty = $snake[1]; 00230 00231 // the middle snake is part of the LCS, store it 00232 for ( $i = 0; $i < $len; ++$i ) { 00233 $this->removed[$startx + $i] = $this->added[$starty + $i] = false; 00234 } 00235 00236 if ( $d > 1 ) { 00237 return $len 00238 + $this->lcs_rec( $bottoml1, $startx - 1, $bottoml2, 00239 $starty - 1, $V, $snake ) 00240 + $this->lcs_rec( $startx + $len, $topl1, $starty + $len, 00241 $topl2, $V, $snake ); 00242 } elseif ( $d == 1 ) { 00243 /* 00244 * In this case the sequences differ by exactly 1 line. We have 00245 * already saved all the lines after the difference in the for loop 00246 * above, now we need to save all the lines before the difference. 00247 */ 00248 $max = min( $startx - $bottoml1, $starty - $bottoml2 ); 00249 for ( $i = 0; $i < $max; ++$i ) { 00250 $this->removed[$bottoml1 + $i] = 00251 $this->added[$bottoml2 + $i] = false; 00252 } 00253 return $max + $len; 00254 } 00255 return $len; 00256 } 00257 00258 private function find_middle_snake( $bottoml1, $topl1, $bottoml2, $topl2, &$V, &$snake ) { 00259 $from = &$this->from; 00260 $to = &$this->to; 00261 $V0 = &$V[0]; 00262 $V1 = &$V[1]; 00263 $snake0 = &$snake[0]; 00264 $snake1 = &$snake[1]; 00265 $snake2 = &$snake[2]; 00266 $bottoml1_min_1 = $bottoml1 -1; 00267 $bottoml2_min_1 = $bottoml2 -1; 00268 $N = $topl1 - $bottoml1_min_1; 00269 $M = $topl2 - $bottoml2_min_1; 00270 $delta = $N - $M; 00271 $maxabsx = $N + $bottoml1; 00272 $maxabsy = $M + $bottoml2; 00273 $limit = min( $this->maxDifferences, ceil( ( $N + $M ) / 2 ) ); 00274 00275 // value_to_add_forward: a 0 or 1 that we add to the start 00276 // offset to make it odd/even 00277 if ( ( $M & 1 ) == 1 ) { 00278 $value_to_add_forward = 1; 00279 } else { 00280 $value_to_add_forward = 0; 00281 } 00282 00283 if ( ( $N & 1 ) == 1 ) { 00284 $value_to_add_backward = 1; 00285 } else { 00286 $value_to_add_backward = 0; 00287 } 00288 00289 $start_forward = -$M; 00290 $end_forward = $N; 00291 $start_backward = -$N; 00292 $end_backward = $M; 00293 00294 $limit_min_1 = $limit - 1; 00295 $limit_plus_1 = $limit + 1; 00296 00297 $V0[$limit_plus_1] = 0; 00298 $V1[$limit_min_1] = $N; 00299 $limit = min( $this->maxDifferences, ceil( ( $N + $M ) / 2 ) ); 00300 00301 if ( ( $delta & 1 ) == 1 ) { 00302 for ( $d = 0; $d <= $limit; ++$d ) { 00303 $start_diag = max( $value_to_add_forward + $start_forward, -$d ); 00304 $end_diag = min( $end_forward, $d ); 00305 $value_to_add_forward = 1 - $value_to_add_forward; 00306 00307 // compute forward furthest reaching paths 00308 for ( $k = $start_diag; $k <= $end_diag; $k += 2 ) { 00309 if ( $k == -$d || ( $k < $d 00310 && $V0[$limit_min_1 + $k] < $V0[$limit_plus_1 + $k] ) ) { 00311 $x = $V0[$limit_plus_1 + $k]; 00312 } else { 00313 $x = $V0[$limit_min_1 + $k] + 1; 00314 } 00315 00316 $absx = $snake0 = $x + $bottoml1; 00317 $absy = $snake1 = $x - $k + $bottoml2; 00318 00319 while ( $absx < $maxabsx && $absy < $maxabsy && $from[$absx] === $to[$absy] ) { 00320 ++$absx; 00321 ++$absy; 00322 } 00323 $x = $absx -$bottoml1; 00324 00325 $snake2 = $absx -$snake0; 00326 $V0[$limit + $k] = $x; 00327 if ( $k >= $delta - $d + 1 && $k <= $delta + $d - 1 00328 && $x >= $V1[$limit + $k - $delta] ) { 00329 return 2 * $d - 1; 00330 } 00331 00332 // check to see if we can cut down the diagonal range 00333 if ( $x >= $N && $end_forward > $k - 1 ) { 00334 $end_forward = $k - 1; 00335 } elseif ( $absy - $bottoml2 >= $M ) { 00336 $start_forward = $k + 1; 00337 $value_to_add_forward = 0; 00338 } 00339 } 00340 00341 $start_diag = max( $value_to_add_backward + $start_backward, -$d ); 00342 $end_diag = min( $end_backward, $d ); 00343 $value_to_add_backward = 1 - $value_to_add_backward; 00344 00345 // compute backward furthest reaching paths 00346 for ( $k = $start_diag; $k <= $end_diag; $k += 2 ) { 00347 if ( $k == $d 00348 || ( $k != -$d && $V1[$limit_min_1 + $k] < $V1[$limit_plus_1 + $k] ) ) { 00349 $x = $V1[$limit_min_1 + $k]; 00350 } else { 00351 $x = $V1[$limit_plus_1 + $k] - 1; 00352 } 00353 00354 $y = $x - $k - $delta; 00355 00356 $snake2 = 0; 00357 while ( $x > 0 && $y > 0 00358 && $from[$x + $bottoml1_min_1] === $to[$y + $bottoml2_min_1] ) { 00359 --$x; 00360 --$y; 00361 ++$snake2; 00362 } 00363 $V1[$limit + $k] = $x; 00364 00365 // check to see if we can cut down our diagonal range 00366 if ( $x <= 0 ) { 00367 $start_backward = $k + 1; 00368 $value_to_add_backward = 0; 00369 } elseif ( $y <= 0 && $end_backward > $k - 1 ) { 00370 $end_backward = $k - 1; 00371 } 00372 } 00373 } 00374 } else { 00375 for ( $d = 0; $d <= $limit; ++$d ) { 00376 $start_diag = max( $value_to_add_forward + $start_forward, -$d ); 00377 $end_diag = min( $end_forward, $d ); 00378 $value_to_add_forward = 1 - $value_to_add_forward; 00379 00380 // compute forward furthest reaching paths 00381 for ( $k = $start_diag; $k <= $end_diag; $k += 2 ) { 00382 if ( $k == -$d 00383 || ( $k < $d && $V0[$limit_min_1 + $k] < $V0[$limit_plus_1 + $k] ) ) { 00384 $x = $V0[$limit_plus_1 + $k]; 00385 } else { 00386 $x = $V0[$limit_min_1 + $k] + 1; 00387 } 00388 00389 $absx = $snake0 = $x + $bottoml1; 00390 $absy = $snake1 = $x - $k + $bottoml2; 00391 00392 while ( $absx < $maxabsx && $absy < $maxabsy && $from[$absx] === $to[$absy] ) { 00393 ++$absx; 00394 ++$absy; 00395 } 00396 $x = $absx -$bottoml1; 00397 $snake2 = $absx -$snake0; 00398 $V0[$limit + $k] = $x; 00399 00400 // check to see if we can cut down the diagonal range 00401 if ( $x >= $N && $end_forward > $k - 1 ) { 00402 $end_forward = $k - 1; 00403 } elseif ( $absy -$bottoml2 >= $M ) { 00404 $start_forward = $k + 1; 00405 $value_to_add_forward = 0; 00406 } 00407 } 00408 00409 $start_diag = max( $value_to_add_backward + $start_backward, -$d ); 00410 $end_diag = min( $end_backward, $d ); 00411 $value_to_add_backward = 1 - $value_to_add_backward; 00412 00413 // compute backward furthest reaching paths 00414 for ( $k = $start_diag; $k <= $end_diag; $k += 2 ) { 00415 if ( $k == $d 00416 || ( $k != -$d && $V1[$limit_min_1 + $k] < $V1[$limit_plus_1 + $k] ) ) { 00417 $x = $V1[$limit_min_1 + $k]; 00418 } else { 00419 $x = $V1[$limit_plus_1 + $k] - 1; 00420 } 00421 00422 $y = $x - $k - $delta; 00423 00424 $snake2 = 0; 00425 while ( $x > 0 && $y > 0 00426 && $from[$x + $bottoml1_min_1] === $to[$y + $bottoml2_min_1] ) { 00427 --$x; 00428 --$y; 00429 ++$snake2; 00430 } 00431 $V1[$limit + $k] = $x; 00432 00433 if ( $k >= -$delta - $d && $k <= $d - $delta 00434 && $x <= $V0[$limit + $k + $delta] ) { 00435 $snake0 = $bottoml1 + $x; 00436 $snake1 = $bottoml2 + $y; 00437 return 2 * $d; 00438 } 00439 00440 // check to see if we can cut down our diagonal range 00441 if ( $x <= 0 ) { 00442 $start_backward = $k + 1; 00443 $value_to_add_backward = 0; 00444 } elseif ( $y <= 0 && $end_backward > $k - 1 ) { 00445 $end_backward = $k - 1; 00446 } 00447 } 00448 } 00449 } 00450 /* 00451 * computing the true LCS is too expensive, instead find the diagonal 00452 * with the most progress and pretend a midle snake of length 0 occurs 00453 * there. 00454 */ 00455 00456 $most_progress = self::findMostProgress( $M, $N, $limit, $V ); 00457 00458 $snake0 = $bottoml1 + $most_progress[0]; 00459 $snake1 = $bottoml2 + $most_progress[1]; 00460 $snake2 = 0; 00461 wfDebug( "Computing the LCS is too expensive. Using a heuristic.\n" ); 00462 $this->heuristicUsed = true; 00463 return 5; /* 00464 * HACK: since we didn't really finish the LCS computation 00465 * we don't really know the length of the SES. We don't do 00466 * anything with the result anyway, unless it's <=1. We know 00467 * for a fact SES > 1 so 5 is as good a number as any to 00468 * return here 00469 */ 00470 } 00471 00472 private static function findMostProgress( $M, $N, $limit, $V ) { 00473 $delta = $N - $M; 00474 00475 if ( ( $M & 1 ) == ( $limit & 1 ) ) { 00476 $forward_start_diag = max( -$M, -$limit ); 00477 } else { 00478 $forward_start_diag = max( 1 - $M, -$limit ); 00479 } 00480 00481 $forward_end_diag = min( $N, $limit ); 00482 00483 if ( ( $N & 1 ) == ( $limit & 1 ) ) { 00484 $backward_start_diag = max( -$N, -$limit ); 00485 } else { 00486 $backward_start_diag = max( 1 - $N, -$limit ); 00487 } 00488 00489 $backward_end_diag = -min( $M, $limit ); 00490 00491 $temp = array( 0, 0, 0 ); 00492 00493 00494 $max_progress = array_fill( 0, ceil( max( $forward_end_diag - $forward_start_diag, 00495 $backward_end_diag - $backward_start_diag ) / 2 ), $temp ); 00496 $num_progress = 0; // the 1st entry is current, it is initialized 00497 // with 0s 00498 00499 // first search the forward diagonals 00500 for ( $k = $forward_start_diag; $k <= $forward_end_diag; $k += 2 ) { 00501 $x = $V[0][$limit + $k]; 00502 $y = $x - $k; 00503 if ( $x > $N || $y > $M ) { 00504 continue; 00505 } 00506 00507 $progress = $x + $y; 00508 if ( $progress > $max_progress[0][2] ) { 00509 $num_progress = 0; 00510 $max_progress[0][0] = $x; 00511 $max_progress[0][1] = $y; 00512 $max_progress[0][2] = $progress; 00513 } elseif ( $progress == $max_progress[0][2] ) { 00514 ++$num_progress; 00515 $max_progress[$num_progress][0] = $x; 00516 $max_progress[$num_progress][1] = $y; 00517 $max_progress[$num_progress][2] = $progress; 00518 } 00519 } 00520 00521 $max_progress_forward = true; // initially the maximum 00522 // progress is in the forward 00523 // direction 00524 00525 // now search the backward diagonals 00526 for ( $k = $backward_start_diag; $k <= $backward_end_diag; $k += 2 ) { 00527 $x = $V[1][$limit + $k]; 00528 $y = $x - $k - $delta; 00529 if ( $x < 0 || $y < 0 ) { 00530 continue; 00531 } 00532 00533 $progress = $N - $x + $M - $y; 00534 if ( $progress > $max_progress[0][2] ) { 00535 $num_progress = 0; 00536 $max_progress_forward = false; 00537 $max_progress[0][0] = $x; 00538 $max_progress[0][1] = $y; 00539 $max_progress[0][2] = $progress; 00540 } elseif ( $progress == $max_progress[0][2] && !$max_progress_forward ) { 00541 ++$num_progress; 00542 $max_progress[$num_progress][0] = $x; 00543 $max_progress[$num_progress][1] = $y; 00544 $max_progress[$num_progress][2] = $progress; 00545 } 00546 } 00547 00548 // return the middle diagonal with maximal progress. 00549 return $max_progress[(int)floor( $num_progress / 2 )]; 00550 } 00551 00555 public function getLcsLength() { 00556 if ( $this->heuristicUsed && !$this->lcsLengthCorrectedForHeuristic ) { 00557 $this->lcsLengthCorrectedForHeuristic = true; 00558 $this->length = $this->m -array_sum( $this->added ); 00559 } 00560 return $this->length; 00561 } 00562 00563 } 00564 00571 class RangeDifference { 00572 00573 public $leftstart; 00574 public $leftend; 00575 public $leftlength; 00576 00577 public $rightstart; 00578 public $rightend; 00579 public $rightlength; 00580 00581 function __construct( $leftstart, $leftend, $rightstart, $rightend ) { 00582 $this->leftstart = $leftstart; 00583 $this->leftend = $leftend; 00584 $this->leftlength = $leftend - $leftstart; 00585 $this->rightstart = $rightstart; 00586 $this->rightend = $rightend; 00587 $this->rightlength = $rightend - $rightstart; 00588 } 00589 }