MediaWiki
REL1_24
|
00001 <?php 00039 class WikiDiff3 { 00040 00041 // Input variables 00042 private $from; 00043 private $to; 00044 private $m; 00045 private $n; 00046 00047 private $tooLong; 00048 private $powLimit; 00049 00050 // State variables 00051 private $maxDifferences; 00052 private $lcsLengthCorrectedForHeuristic = false; 00053 00054 // Output variables 00055 public $length; 00056 public $removed; 00057 public $added; 00058 public $heuristicUsed; 00059 00060 function __construct( $tooLong = 2000000, $powLimit = 1.45 ) { 00061 $this->tooLong = $tooLong; 00062 $this->powLimit = $powLimit; 00063 } 00064 00065 public function diff( /*array*/ $from, /*array*/ $to ) { 00066 // remember initial lengths 00067 $m = count( $from ); 00068 $n = count( $to ); 00069 00070 $this->heuristicUsed = false; 00071 00072 // output 00073 $removed = $m > 0 ? array_fill( 0, $m, true ) : array(); 00074 $added = $n > 0 ? array_fill( 0, $n, true ) : array(); 00075 00076 // reduce the complexity for the next step (intentionally done twice) 00077 // remove common tokens at the start 00078 $i = 0; 00079 while ( $i < $m && $i < $n && $from[$i] === $to[$i] ) { 00080 $removed[$i] = $added[$i] = false; 00081 unset( $from[$i], $to[$i] ); 00082 ++$i; 00083 } 00084 00085 // remove common tokens at the end 00086 $j = 1; 00087 while ( $i + $j <= $m && $i + $j <= $n && $from[$m - $j] === $to[$n - $j] ) { 00088 $removed[$m - $j] = $added[$n - $j] = false; 00089 unset( $from[$m - $j], $to[$n - $j] ); 00090 ++$j; 00091 } 00092 00093 $this->from = $newFromIndex = $this->to = $newToIndex = array(); 00094 00095 // remove tokens not in both sequences 00096 $shared = array(); 00097 foreach ( $from as $key ) { 00098 $shared[$key] = false; 00099 } 00100 00101 foreach ( $to as $index => &$el ) { 00102 if ( array_key_exists( $el, $shared ) ) { 00103 // keep it 00104 $this->to[] = $el; 00105 $shared[$el] = true; 00106 $newToIndex[] = $index; 00107 } 00108 } 00109 foreach ( $from as $index => &$el ) { 00110 if ( $shared[$el] ) { 00111 // keep it 00112 $this->from[] = $el; 00113 $newFromIndex[] = $index; 00114 } 00115 } 00116 00117 unset( $shared, $from, $to ); 00118 00119 $this->m = count( $this->from ); 00120 $this->n = count( $this->to ); 00121 00122 $this->removed = $this->m > 0 ? array_fill( 0, $this->m, true ) : array(); 00123 $this->added = $this->n > 0 ? array_fill( 0, $this->n, true ) : array(); 00124 00125 if ( $this->m == 0 || $this->n == 0 ) { 00126 $this->length = 0; 00127 } else { 00128 $this->maxDifferences = ceil( ( $this->m + $this->n ) / 2.0 ); 00129 if ( $this->m * $this->n > $this->tooLong ) { 00130 // limit complexity to D^POW_LIMIT for long sequences 00131 $this->maxDifferences = floor( pow( $this->maxDifferences, $this->powLimit - 1.0 ) ); 00132 wfDebug( "Limiting max number of differences to $this->maxDifferences\n" ); 00133 } 00134 00135 /* 00136 * The common prefixes and suffixes are always part of some LCS, include 00137 * them now to reduce our search space 00138 */ 00139 $max = min( $this->m, $this->n ); 00140 for ( $forwardBound = 0; $forwardBound < $max 00141 && $this->from[$forwardBound] === $this->to[$forwardBound]; 00142 ++$forwardBound 00143 ) { 00144 $this->removed[$forwardBound] = $this->added[$forwardBound] = false; 00145 } 00146 00147 $backBoundL1 = $this->m - 1; 00148 $backBoundL2 = $this->n - 1; 00149 00150 while ( $backBoundL1 >= $forwardBound && $backBoundL2 >= $forwardBound 00151 && $this->from[$backBoundL1] === $this->to[$backBoundL2] 00152 ) { 00153 $this->removed[$backBoundL1--] = $this->added[$backBoundL2--] = false; 00154 } 00155 00156 $temp = array_fill( 0, $this->m + $this->n + 1, 0 ); 00157 $V = array( $temp, $temp ); 00158 $snake = array( 0, 0, 0 ); 00159 00160 $this->length = $forwardBound + $this->m - $backBoundL1 - 1 00161 + $this->lcs_rec( 00162 $forwardBound, 00163 $backBoundL1, 00164 $forwardBound, 00165 $backBoundL2, 00166 $V, 00167 $snake 00168 ); 00169 } 00170 00171 $this->m = $m; 00172 $this->n = $n; 00173 00174 $this->length += $i + $j - 1; 00175 00176 foreach ( $this->removed as $key => &$removed_elem ) { 00177 if ( !$removed_elem ) { 00178 $removed[$newFromIndex[$key]] = false; 00179 } 00180 } 00181 foreach ( $this->added as $key => &$added_elem ) { 00182 if ( !$added_elem ) { 00183 $added[$newToIndex[$key]] = false; 00184 } 00185 } 00186 $this->removed = $removed; 00187 $this->added = $added; 00188 } 00189 00190 function diff_range( $from_lines, $to_lines ) { 00191 // Diff and store locally 00192 $this->diff( $from_lines, $to_lines ); 00193 unset( $from_lines, $to_lines ); 00194 00195 $ranges = array(); 00196 $xi = $yi = 0; 00197 while ( $xi < $this->m || $yi < $this->n ) { 00198 // Matching "snake". 00199 while ( $xi < $this->m && $yi < $this->n 00200 && !$this->removed[$xi] 00201 && !$this->added[$yi] 00202 ) { 00203 ++$xi; 00204 ++$yi; 00205 } 00206 // Find deletes & adds. 00207 $xstart = $xi; 00208 while ( $xi < $this->m && $this->removed[$xi] ) { 00209 ++$xi; 00210 } 00211 00212 $ystart = $yi; 00213 while ( $yi < $this->n && $this->added[$yi] ) { 00214 ++$yi; 00215 } 00216 00217 if ( $xi > $xstart || $yi > $ystart ) { 00218 $ranges[] = new RangeDifference( $xstart, $xi, $ystart, $yi ); 00219 } 00220 } 00221 00222 return $ranges; 00223 } 00224 00225 private function lcs_rec( $bottoml1, $topl1, $bottoml2, $topl2, &$V, &$snake ) { 00226 // check that both sequences are non-empty 00227 if ( $bottoml1 > $topl1 || $bottoml2 > $topl2 ) { 00228 return 0; 00229 } 00230 00231 $d = $this->find_middle_snake( $bottoml1, $topl1, $bottoml2, 00232 $topl2, $V, $snake ); 00233 00234 // need to store these so we don't lose them when they're 00235 // overwritten by the recursion 00236 $len = $snake[2]; 00237 $startx = $snake[0]; 00238 $starty = $snake[1]; 00239 00240 // the middle snake is part of the LCS, store it 00241 for ( $i = 0; $i < $len; ++$i ) { 00242 $this->removed[$startx + $i] = $this->added[$starty + $i] = false; 00243 } 00244 00245 if ( $d > 1 ) { 00246 return $len 00247 + $this->lcs_rec( $bottoml1, $startx - 1, $bottoml2, 00248 $starty - 1, $V, $snake ) 00249 + $this->lcs_rec( $startx + $len, $topl1, $starty + $len, 00250 $topl2, $V, $snake ); 00251 } elseif ( $d == 1 ) { 00252 /* 00253 * In this case the sequences differ by exactly 1 line. We have 00254 * already saved all the lines after the difference in the for loop 00255 * above, now we need to save all the lines before the difference. 00256 */ 00257 $max = min( $startx - $bottoml1, $starty - $bottoml2 ); 00258 for ( $i = 0; $i < $max; ++$i ) { 00259 $this->removed[$bottoml1 + $i] = 00260 $this->added[$bottoml2 + $i] = false; 00261 } 00262 00263 return $max + $len; 00264 } 00265 00266 return $len; 00267 } 00268 00269 private function find_middle_snake( $bottoml1, $topl1, $bottoml2, $topl2, &$V, &$snake ) { 00270 $from = &$this->from; 00271 $to = &$this->to; 00272 $V0 = &$V[0]; 00273 $V1 = &$V[1]; 00274 $snake0 = &$snake[0]; 00275 $snake1 = &$snake[1]; 00276 $snake2 = &$snake[2]; 00277 $bottoml1_min_1 = $bottoml1 - 1; 00278 $bottoml2_min_1 = $bottoml2 - 1; 00279 $N = $topl1 - $bottoml1_min_1; 00280 $M = $topl2 - $bottoml2_min_1; 00281 $delta = $N - $M; 00282 $maxabsx = $N + $bottoml1; 00283 $maxabsy = $M + $bottoml2; 00284 $limit = min( $this->maxDifferences, ceil( ( $N + $M ) / 2 ) ); 00285 00286 // value_to_add_forward: a 0 or 1 that we add to the start 00287 // offset to make it odd/even 00288 if ( ( $M & 1 ) == 1 ) { 00289 $value_to_add_forward = 1; 00290 } else { 00291 $value_to_add_forward = 0; 00292 } 00293 00294 if ( ( $N & 1 ) == 1 ) { 00295 $value_to_add_backward = 1; 00296 } else { 00297 $value_to_add_backward = 0; 00298 } 00299 00300 $start_forward = -$M; 00301 $end_forward = $N; 00302 $start_backward = -$N; 00303 $end_backward = $M; 00304 00305 $limit_min_1 = $limit - 1; 00306 $limit_plus_1 = $limit + 1; 00307 00308 $V0[$limit_plus_1] = 0; 00309 $V1[$limit_min_1] = $N; 00310 $limit = min( $this->maxDifferences, ceil( ( $N + $M ) / 2 ) ); 00311 00312 if ( ( $delta & 1 ) == 1 ) { 00313 for ( $d = 0; $d <= $limit; ++$d ) { 00314 $start_diag = max( $value_to_add_forward + $start_forward, -$d ); 00315 $end_diag = min( $end_forward, $d ); 00316 $value_to_add_forward = 1 - $value_to_add_forward; 00317 00318 // compute forward furthest reaching paths 00319 for ( $k = $start_diag; $k <= $end_diag; $k += 2 ) { 00320 if ( $k == -$d || ( $k < $d 00321 && $V0[$limit_min_1 + $k] < $V0[$limit_plus_1 + $k] ) 00322 ) { 00323 $x = $V0[$limit_plus_1 + $k]; 00324 } else { 00325 $x = $V0[$limit_min_1 + $k] + 1; 00326 } 00327 00328 $absx = $snake0 = $x + $bottoml1; 00329 $absy = $snake1 = $x - $k + $bottoml2; 00330 00331 while ( $absx < $maxabsx && $absy < $maxabsy && $from[$absx] === $to[$absy] ) { 00332 ++$absx; 00333 ++$absy; 00334 } 00335 $x = $absx - $bottoml1; 00336 00337 $snake2 = $absx - $snake0; 00338 $V0[$limit + $k] = $x; 00339 if ( $k >= $delta - $d + 1 && $k <= $delta + $d - 1 00340 && $x >= $V1[$limit + $k - $delta] 00341 ) { 00342 return 2 * $d - 1; 00343 } 00344 00345 // check to see if we can cut down the diagonal range 00346 if ( $x >= $N && $end_forward > $k - 1 ) { 00347 $end_forward = $k - 1; 00348 } elseif ( $absy - $bottoml2 >= $M ) { 00349 $start_forward = $k + 1; 00350 $value_to_add_forward = 0; 00351 } 00352 } 00353 00354 $start_diag = max( $value_to_add_backward + $start_backward, -$d ); 00355 $end_diag = min( $end_backward, $d ); 00356 $value_to_add_backward = 1 - $value_to_add_backward; 00357 00358 // compute backward furthest reaching paths 00359 for ( $k = $start_diag; $k <= $end_diag; $k += 2 ) { 00360 if ( $k == $d 00361 || ( $k != -$d && $V1[$limit_min_1 + $k] < $V1[$limit_plus_1 + $k] ) 00362 ) { 00363 $x = $V1[$limit_min_1 + $k]; 00364 } else { 00365 $x = $V1[$limit_plus_1 + $k] - 1; 00366 } 00367 00368 $y = $x - $k - $delta; 00369 00370 $snake2 = 0; 00371 while ( $x > 0 && $y > 0 00372 && $from[$x + $bottoml1_min_1] === $to[$y + $bottoml2_min_1] 00373 ) { 00374 --$x; 00375 --$y; 00376 ++$snake2; 00377 } 00378 $V1[$limit + $k] = $x; 00379 00380 // check to see if we can cut down our diagonal range 00381 if ( $x <= 0 ) { 00382 $start_backward = $k + 1; 00383 $value_to_add_backward = 0; 00384 } elseif ( $y <= 0 && $end_backward > $k - 1 ) { 00385 $end_backward = $k - 1; 00386 } 00387 } 00388 } 00389 } else { 00390 for ( $d = 0; $d <= $limit; ++$d ) { 00391 $start_diag = max( $value_to_add_forward + $start_forward, -$d ); 00392 $end_diag = min( $end_forward, $d ); 00393 $value_to_add_forward = 1 - $value_to_add_forward; 00394 00395 // compute forward furthest reaching paths 00396 for ( $k = $start_diag; $k <= $end_diag; $k += 2 ) { 00397 if ( $k == -$d 00398 || ( $k < $d && $V0[$limit_min_1 + $k] < $V0[$limit_plus_1 + $k] ) 00399 ) { 00400 $x = $V0[$limit_plus_1 + $k]; 00401 } else { 00402 $x = $V0[$limit_min_1 + $k] + 1; 00403 } 00404 00405 $absx = $snake0 = $x + $bottoml1; 00406 $absy = $snake1 = $x - $k + $bottoml2; 00407 00408 while ( $absx < $maxabsx && $absy < $maxabsy && $from[$absx] === $to[$absy] ) { 00409 ++$absx; 00410 ++$absy; 00411 } 00412 $x = $absx - $bottoml1; 00413 $snake2 = $absx - $snake0; 00414 $V0[$limit + $k] = $x; 00415 00416 // check to see if we can cut down the diagonal range 00417 if ( $x >= $N && $end_forward > $k - 1 ) { 00418 $end_forward = $k - 1; 00419 } elseif ( $absy - $bottoml2 >= $M ) { 00420 $start_forward = $k + 1; 00421 $value_to_add_forward = 0; 00422 } 00423 } 00424 00425 $start_diag = max( $value_to_add_backward + $start_backward, -$d ); 00426 $end_diag = min( $end_backward, $d ); 00427 $value_to_add_backward = 1 - $value_to_add_backward; 00428 00429 // compute backward furthest reaching paths 00430 for ( $k = $start_diag; $k <= $end_diag; $k += 2 ) { 00431 if ( $k == $d 00432 || ( $k != -$d && $V1[$limit_min_1 + $k] < $V1[$limit_plus_1 + $k] ) 00433 ) { 00434 $x = $V1[$limit_min_1 + $k]; 00435 } else { 00436 $x = $V1[$limit_plus_1 + $k] - 1; 00437 } 00438 00439 $y = $x - $k - $delta; 00440 00441 $snake2 = 0; 00442 while ( $x > 0 && $y > 0 00443 && $from[$x + $bottoml1_min_1] === $to[$y + $bottoml2_min_1] 00444 ) { 00445 --$x; 00446 --$y; 00447 ++$snake2; 00448 } 00449 $V1[$limit + $k] = $x; 00450 00451 if ( $k >= -$delta - $d && $k <= $d - $delta 00452 && $x <= $V0[$limit + $k + $delta] 00453 ) { 00454 $snake0 = $bottoml1 + $x; 00455 $snake1 = $bottoml2 + $y; 00456 00457 return 2 * $d; 00458 } 00459 00460 // check to see if we can cut down our diagonal range 00461 if ( $x <= 0 ) { 00462 $start_backward = $k + 1; 00463 $value_to_add_backward = 0; 00464 } elseif ( $y <= 0 && $end_backward > $k - 1 ) { 00465 $end_backward = $k - 1; 00466 } 00467 } 00468 } 00469 } 00470 /* 00471 * computing the true LCS is too expensive, instead find the diagonal 00472 * with the most progress and pretend a midle snake of length 0 occurs 00473 * there. 00474 */ 00475 00476 $most_progress = self::findMostProgress( $M, $N, $limit, $V ); 00477 00478 $snake0 = $bottoml1 + $most_progress[0]; 00479 $snake1 = $bottoml2 + $most_progress[1]; 00480 $snake2 = 0; 00481 wfDebug( "Computing the LCS is too expensive. Using a heuristic.\n" ); 00482 $this->heuristicUsed = true; 00483 00484 return 5; /* 00485 * HACK: since we didn't really finish the LCS computation 00486 * we don't really know the length of the SES. We don't do 00487 * anything with the result anyway, unless it's <=1. We know 00488 * for a fact SES > 1 so 5 is as good a number as any to 00489 * return here 00490 */ 00491 } 00492 00493 private static function findMostProgress( $M, $N, $limit, $V ) { 00494 $delta = $N - $M; 00495 00496 if ( ( $M & 1 ) == ( $limit & 1 ) ) { 00497 $forward_start_diag = max( -$M, -$limit ); 00498 } else { 00499 $forward_start_diag = max( 1 - $M, -$limit ); 00500 } 00501 00502 $forward_end_diag = min( $N, $limit ); 00503 00504 if ( ( $N & 1 ) == ( $limit & 1 ) ) { 00505 $backward_start_diag = max( -$N, -$limit ); 00506 } else { 00507 $backward_start_diag = max( 1 - $N, -$limit ); 00508 } 00509 00510 $backward_end_diag = -min( $M, $limit ); 00511 00512 $temp = array( 0, 0, 0 ); 00513 00514 $max_progress = array_fill( 0, ceil( max( $forward_end_diag - $forward_start_diag, 00515 $backward_end_diag - $backward_start_diag ) / 2 ), $temp ); 00516 $num_progress = 0; // the 1st entry is current, it is initialized 00517 // with 0s 00518 00519 // first search the forward diagonals 00520 for ( $k = $forward_start_diag; $k <= $forward_end_diag; $k += 2 ) { 00521 $x = $V[0][$limit + $k]; 00522 $y = $x - $k; 00523 if ( $x > $N || $y > $M ) { 00524 continue; 00525 } 00526 00527 $progress = $x + $y; 00528 if ( $progress > $max_progress[0][2] ) { 00529 $num_progress = 0; 00530 $max_progress[0][0] = $x; 00531 $max_progress[0][1] = $y; 00532 $max_progress[0][2] = $progress; 00533 } elseif ( $progress == $max_progress[0][2] ) { 00534 ++$num_progress; 00535 $max_progress[$num_progress][0] = $x; 00536 $max_progress[$num_progress][1] = $y; 00537 $max_progress[$num_progress][2] = $progress; 00538 } 00539 } 00540 00541 $max_progress_forward = true; // initially the maximum 00542 // progress is in the forward 00543 // direction 00544 00545 // now search the backward diagonals 00546 for ( $k = $backward_start_diag; $k <= $backward_end_diag; $k += 2 ) { 00547 $x = $V[1][$limit + $k]; 00548 $y = $x - $k - $delta; 00549 if ( $x < 0 || $y < 0 ) { 00550 continue; 00551 } 00552 00553 $progress = $N - $x + $M - $y; 00554 if ( $progress > $max_progress[0][2] ) { 00555 $num_progress = 0; 00556 $max_progress_forward = false; 00557 $max_progress[0][0] = $x; 00558 $max_progress[0][1] = $y; 00559 $max_progress[0][2] = $progress; 00560 } elseif ( $progress == $max_progress[0][2] && !$max_progress_forward ) { 00561 ++$num_progress; 00562 $max_progress[$num_progress][0] = $x; 00563 $max_progress[$num_progress][1] = $y; 00564 $max_progress[$num_progress][2] = $progress; 00565 } 00566 } 00567 00568 // return the middle diagonal with maximal progress. 00569 return $max_progress[(int)floor( $num_progress / 2 )]; 00570 } 00571 00575 public function getLcsLength() { 00576 if ( $this->heuristicUsed && !$this->lcsLengthCorrectedForHeuristic ) { 00577 $this->lcsLengthCorrectedForHeuristic = true; 00578 $this->length = $this->m - array_sum( $this->added ); 00579 } 00580 00581 return $this->length; 00582 } 00583 00584 } 00585 00592 class RangeDifference { 00593 00595 public $leftstart; 00596 00598 public $leftend; 00599 00601 public $leftlength; 00602 00604 public $rightstart; 00605 00607 public $rightend; 00608 00610 public $rightlength; 00611 00612 function __construct( $leftstart, $leftend, $rightstart, $rightend ) { 00613 $this->leftstart = $leftstart; 00614 $this->leftend = $leftend; 00615 $this->leftlength = $leftend - $leftstart; 00616 $this->rightstart = $rightstart; 00617 $this->rightend = $rightend; 00618 $this->rightlength = $rightend - $rightstart; 00619 } 00620 00621 }