MediaWiki  REL1_22
WikiDiff3.php
Go to the documentation of this file.
00001 <?php
00039 class WikiDiff3 {
00040 
00041     // Input variables
00042     private $from;
00043     private $to;
00044     private $m;
00045     private $n;
00046 
00047     private $tooLong;
00048     private $powLimit;
00049 
00050     // State variables
00051     private $maxDifferences;
00052     private $lcsLengthCorrectedForHeuristic = false;
00053 
00054     // Output variables
00055     public $length;
00056     public $removed;
00057     public $added;
00058     public $heuristicUsed;
00059 
00060     function __construct( $tooLong = 2000000, $powLimit = 1.45 ) {
00061         $this->tooLong = $tooLong;
00062         $this->powLimit = $powLimit;
00063     }
00064 
00065     public function diff( /*array*/ $from, /*array*/ $to ) {
00066         // remember initial lengths
00067         $m = count( $from );
00068         $n = count( $to );
00069 
00070         $this->heuristicUsed = false;
00071 
00072         // output
00073         $removed = $m > 0 ? array_fill( 0, $m, true ) : array();
00074         $added = $n > 0 ? array_fill( 0, $n, true ) : array();
00075 
00076         // reduce the complexity for the next step (intentionally done twice)
00077         // remove common tokens at the start
00078         $i = 0;
00079         while ( $i < $m && $i < $n && $from[$i] === $to[$i] ) {
00080             $removed[$i] = $added[$i] = false;
00081             unset( $from[$i], $to[$i] );
00082             ++$i;
00083         }
00084 
00085         // remove common tokens at the end
00086         $j = 1;
00087         while ( $i + $j <= $m && $i + $j <= $n && $from[$m - $j] === $to[$n - $j] ) {
00088             $removed[$m - $j] = $added[$n - $j] = false;
00089             unset( $from[$m - $j], $to[$n - $j] );
00090             ++$j;
00091         }
00092 
00093         $this->from = $newFromIndex = $this->to = $newToIndex = array();
00094 
00095         // remove tokens not in both sequences
00096         $shared = array();
00097         foreach ( $from as $key ) {
00098             $shared[$key] = false;
00099         }
00100 
00101         foreach ( $to as $index => &$el ) {
00102             if ( array_key_exists( $el, $shared ) ) {
00103                 // keep it
00104                 $this->to[] = $el;
00105                 $shared[$el] = true;
00106                 $newToIndex[] = $index;
00107             }
00108         }
00109         foreach ( $from as $index => &$el ) {
00110             if ( $shared[$el] ) {
00111                 // keep it
00112                 $this->from[] = $el;
00113                 $newFromIndex[] = $index;
00114             }
00115         }
00116 
00117         unset( $shared, $from, $to );
00118 
00119         $this->m = count( $this->from );
00120         $this->n = count( $this->to );
00121 
00122         $this->removed = $this->m > 0 ? array_fill( 0, $this->m, true ) : array();
00123         $this->added = $this->n > 0 ? array_fill( 0, $this->n, true ) : array();
00124 
00125         if ( $this->m == 0 || $this->n == 0 ) {
00126             $this->length = 0;
00127         } else {
00128             $this->maxDifferences = ceil( ( $this->m + $this->n ) / 2.0 );
00129             if ( $this->m * $this->n > $this->tooLong ) {
00130                 // limit complexity to D^POW_LIMIT for long sequences
00131                 $this->maxDifferences = floor( pow( $this->maxDifferences, $this->powLimit - 1.0 ) );
00132                 wfDebug( "Limiting max number of differences to $this->maxDifferences\n" );
00133             }
00134 
00135             /*
00136              * The common prefixes and suffixes are always part of some LCS, include
00137              * them now to reduce our search space
00138              */
00139             $max = min( $this->m, $this->n );
00140             for ( $forwardBound = 0; $forwardBound < $max
00141                     && $this->from[$forwardBound] === $this->to[$forwardBound];
00142                     ++$forwardBound ) {
00143                 $this->removed[$forwardBound] = $this->added[$forwardBound] = false;
00144             }
00145 
00146             $backBoundL1 = $this->m - 1;
00147             $backBoundL2 = $this->n - 1;
00148 
00149             while ( $backBoundL1 >= $forwardBound && $backBoundL2 >= $forwardBound
00150                     && $this->from[$backBoundL1] === $this->to[$backBoundL2] ) {
00151                 $this->removed[$backBoundL1--] = $this->added[$backBoundL2--] = false;
00152             }
00153 
00154             $temp = array_fill( 0, $this->m + $this->n + 1, 0 );
00155             $V = array( $temp, $temp );
00156             $snake = array( 0, 0, 0 );
00157 
00158             $this->length = $forwardBound + $this->m - $backBoundL1 - 1
00159                 + $this->lcs_rec( $forwardBound, $backBoundL1,
00160                 $forwardBound, $backBoundL2, $V, $snake );
00161         }
00162 
00163         $this->m = $m;
00164         $this->n = $n;
00165 
00166         $this->length += $i + $j - 1;
00167 
00168         foreach ( $this->removed as $key => &$removed_elem ) {
00169             if ( !$removed_elem ) {
00170                 $removed[$newFromIndex[$key]] = false;
00171             }
00172         }
00173         foreach ( $this->added as $key => &$added_elem ) {
00174             if ( !$added_elem ) {
00175                 $added[$newToIndex[$key]] = false;
00176             }
00177         }
00178         $this->removed = $removed;
00179         $this->added = $added;
00180     }
00181 
00182     function diff_range( $from_lines, $to_lines ) {
00183         // Diff and store locally
00184         $this->diff( $from_lines, $to_lines );
00185         unset( $from_lines, $to_lines );
00186 
00187         $ranges = array();
00188         $xi = $yi = 0;
00189         while ( $xi < $this->m || $yi < $this->n ) {
00190             // Matching "snake".
00191             while ( $xi < $this->m && $yi < $this->n
00192                     && !$this->removed[$xi]
00193                     && !$this->added[$yi] ) {
00194                 ++$xi;
00195                 ++$yi;
00196             }
00197             // Find deletes & adds.
00198             $xstart = $xi;
00199             while ( $xi < $this->m && $this->removed[$xi] ) {
00200                 ++$xi;
00201             }
00202 
00203             $ystart = $yi;
00204             while ( $yi < $this->n && $this->added[$yi] ) {
00205                 ++$yi;
00206             }
00207 
00208             if ( $xi > $xstart || $yi > $ystart ) {
00209                 $ranges[] = new RangeDifference( $xstart, $xi,
00210                                 $ystart, $yi );
00211             }
00212         }
00213         return $ranges;
00214     }
00215 
00216     private function lcs_rec( $bottoml1, $topl1, $bottoml2, $topl2, &$V, &$snake ) {
00217         // check that both sequences are non-empty
00218         if ( $bottoml1 > $topl1 || $bottoml2 > $topl2 ) {
00219             return 0;
00220         }
00221 
00222         $d = $this->find_middle_snake( $bottoml1, $topl1, $bottoml2,
00223                             $topl2, $V, $snake );
00224 
00225         // need to store these so we don't lose them when they're
00226         // overwritten by the recursion
00227         $len = $snake[2];
00228         $startx = $snake[0];
00229         $starty = $snake[1];
00230 
00231         // the middle snake is part of the LCS, store it
00232         for ( $i = 0; $i < $len; ++$i ) {
00233             $this->removed[$startx + $i] = $this->added[$starty + $i] = false;
00234         }
00235 
00236         if ( $d > 1 ) {
00237             return $len
00238             + $this->lcs_rec( $bottoml1, $startx - 1, $bottoml2,
00239                             $starty - 1, $V, $snake )
00240             + $this->lcs_rec( $startx + $len, $topl1, $starty + $len,
00241                             $topl2, $V, $snake );
00242         } elseif ( $d == 1 ) {
00243             /*
00244              * In this case the sequences differ by exactly 1 line. We have
00245              * already saved all the lines after the difference in the for loop
00246              * above, now we need to save all the lines before the difference.
00247              */
00248             $max = min( $startx - $bottoml1, $starty - $bottoml2 );
00249             for ( $i = 0; $i < $max; ++$i ) {
00250                 $this->removed[$bottoml1 + $i] =
00251                     $this->added[$bottoml2 + $i] = false;
00252             }
00253             return $max + $len;
00254         }
00255         return $len;
00256     }
00257 
00258     private function find_middle_snake( $bottoml1, $topl1, $bottoml2, $topl2, &$V, &$snake ) {
00259         $from = &$this->from;
00260         $to = &$this->to;
00261         $V0 = &$V[0];
00262         $V1 = &$V[1];
00263         $snake0 = &$snake[0];
00264         $snake1 = &$snake[1];
00265         $snake2 = &$snake[2];
00266         $bottoml1_min_1 = $bottoml1 -1;
00267         $bottoml2_min_1 = $bottoml2 -1;
00268         $N = $topl1 - $bottoml1_min_1;
00269         $M = $topl2 - $bottoml2_min_1;
00270         $delta = $N - $M;
00271         $maxabsx = $N + $bottoml1;
00272         $maxabsy = $M + $bottoml2;
00273         $limit = min( $this->maxDifferences, ceil( ( $N + $M ) / 2 ) );
00274 
00275         // value_to_add_forward: a 0 or 1 that we add to the start
00276         // offset to make it odd/even
00277         if ( ( $M & 1 ) == 1 ) {
00278             $value_to_add_forward = 1;
00279         } else {
00280             $value_to_add_forward = 0;
00281         }
00282 
00283         if ( ( $N & 1 ) == 1 ) {
00284             $value_to_add_backward = 1;
00285         } else {
00286             $value_to_add_backward = 0;
00287         }
00288 
00289         $start_forward = -$M;
00290         $end_forward = $N;
00291         $start_backward = -$N;
00292         $end_backward = $M;
00293 
00294         $limit_min_1 = $limit - 1;
00295         $limit_plus_1 = $limit + 1;
00296 
00297         $V0[$limit_plus_1] = 0;
00298         $V1[$limit_min_1] = $N;
00299         $limit = min( $this->maxDifferences, ceil( ( $N + $M ) / 2 ) );
00300 
00301         if ( ( $delta & 1 ) == 1 ) {
00302             for ( $d = 0; $d <= $limit; ++$d ) {
00303                 $start_diag = max( $value_to_add_forward + $start_forward, -$d );
00304                 $end_diag = min( $end_forward, $d );
00305                 $value_to_add_forward = 1 - $value_to_add_forward;
00306 
00307                 // compute forward furthest reaching paths
00308                 for ( $k = $start_diag; $k <= $end_diag; $k += 2 ) {
00309                     if ( $k == -$d || ( $k < $d
00310                             && $V0[$limit_min_1 + $k] < $V0[$limit_plus_1 + $k] ) ) {
00311                         $x = $V0[$limit_plus_1 + $k];
00312                     } else {
00313                         $x = $V0[$limit_min_1 + $k] + 1;
00314                     }
00315 
00316                     $absx = $snake0 = $x + $bottoml1;
00317                     $absy = $snake1 = $x - $k + $bottoml2;
00318 
00319                     while ( $absx < $maxabsx && $absy < $maxabsy && $from[$absx] === $to[$absy] ) {
00320                         ++$absx;
00321                         ++$absy;
00322                     }
00323                     $x = $absx -$bottoml1;
00324 
00325                     $snake2 = $absx -$snake0;
00326                     $V0[$limit + $k] = $x;
00327                     if ( $k >= $delta - $d + 1 && $k <= $delta + $d - 1
00328                             && $x >= $V1[$limit + $k - $delta] ) {
00329                         return 2 * $d - 1;
00330                     }
00331 
00332                     // check to see if we can cut down the diagonal range
00333                     if ( $x >= $N && $end_forward > $k - 1 ) {
00334                         $end_forward = $k - 1;
00335                     } elseif ( $absy - $bottoml2 >= $M ) {
00336                         $start_forward = $k + 1;
00337                         $value_to_add_forward = 0;
00338                     }
00339                 }
00340 
00341                 $start_diag = max( $value_to_add_backward + $start_backward, -$d );
00342                 $end_diag = min( $end_backward, $d );
00343                 $value_to_add_backward = 1 - $value_to_add_backward;
00344 
00345                 // compute backward furthest reaching paths
00346                 for ( $k = $start_diag; $k <= $end_diag; $k += 2 ) {
00347                     if ( $k == $d
00348                     || ( $k != -$d && $V1[$limit_min_1 + $k] < $V1[$limit_plus_1 + $k] ) ) {
00349                         $x = $V1[$limit_min_1 + $k];
00350                     } else {
00351                         $x = $V1[$limit_plus_1 + $k] - 1;
00352                     }
00353 
00354                     $y = $x - $k - $delta;
00355 
00356                     $snake2 = 0;
00357                     while ( $x > 0 && $y > 0
00358                     && $from[$x + $bottoml1_min_1] === $to[$y + $bottoml2_min_1] ) {
00359                         --$x;
00360                         --$y;
00361                         ++$snake2;
00362                     }
00363                     $V1[$limit + $k] = $x;
00364 
00365                     // check to see if we can cut down our diagonal range
00366                     if ( $x <= 0 ) {
00367                         $start_backward = $k + 1;
00368                         $value_to_add_backward = 0;
00369                     } elseif ( $y <= 0 && $end_backward > $k - 1 ) {
00370                         $end_backward = $k - 1;
00371                     }
00372                 }
00373             }
00374         } else {
00375             for ( $d = 0; $d <= $limit; ++$d ) {
00376                 $start_diag = max( $value_to_add_forward + $start_forward, -$d );
00377                 $end_diag = min( $end_forward, $d );
00378                 $value_to_add_forward = 1 - $value_to_add_forward;
00379 
00380                 // compute forward furthest reaching paths
00381                 for ( $k = $start_diag; $k <= $end_diag; $k += 2 ) {
00382                     if ( $k == -$d
00383                     || ( $k < $d && $V0[$limit_min_1 + $k] < $V0[$limit_plus_1 + $k] ) ) {
00384                         $x = $V0[$limit_plus_1 + $k];
00385                     } else {
00386                         $x = $V0[$limit_min_1 + $k] + 1;
00387                     }
00388 
00389                     $absx = $snake0 = $x + $bottoml1;
00390                     $absy = $snake1 = $x - $k + $bottoml2;
00391 
00392                     while ( $absx < $maxabsx && $absy < $maxabsy && $from[$absx] === $to[$absy] ) {
00393                         ++$absx;
00394                         ++$absy;
00395                     }
00396                     $x = $absx -$bottoml1;
00397                     $snake2 = $absx -$snake0;
00398                     $V0[$limit + $k] = $x;
00399 
00400                     // check to see if we can cut down the diagonal range
00401                     if ( $x >= $N && $end_forward > $k - 1 ) {
00402                         $end_forward = $k - 1;
00403                     } elseif ( $absy -$bottoml2 >= $M ) {
00404                         $start_forward = $k + 1;
00405                         $value_to_add_forward = 0;
00406                     }
00407                 }
00408 
00409                 $start_diag = max( $value_to_add_backward + $start_backward, -$d );
00410                 $end_diag = min( $end_backward, $d );
00411                 $value_to_add_backward = 1 - $value_to_add_backward;
00412 
00413                 // compute backward furthest reaching paths
00414                 for ( $k = $start_diag; $k <= $end_diag; $k += 2 ) {
00415                     if ( $k == $d
00416                     || ( $k != -$d && $V1[$limit_min_1 + $k] < $V1[$limit_plus_1 + $k] ) ) {
00417                         $x = $V1[$limit_min_1 + $k];
00418                     } else {
00419                         $x = $V1[$limit_plus_1 + $k] - 1;
00420                     }
00421 
00422                     $y = $x - $k - $delta;
00423 
00424                     $snake2 = 0;
00425                     while ( $x > 0 && $y > 0
00426                             && $from[$x + $bottoml1_min_1] === $to[$y + $bottoml2_min_1] ) {
00427                         --$x;
00428                         --$y;
00429                         ++$snake2;
00430                     }
00431                     $V1[$limit + $k] = $x;
00432 
00433                     if ( $k >= -$delta - $d && $k <= $d - $delta
00434                             && $x <= $V0[$limit + $k + $delta] ) {
00435                         $snake0 = $bottoml1 + $x;
00436                         $snake1 = $bottoml2 + $y;
00437                         return 2 * $d;
00438                     }
00439 
00440                     // check to see if we can cut down our diagonal range
00441                     if ( $x <= 0 ) {
00442                         $start_backward = $k + 1;
00443                         $value_to_add_backward = 0;
00444                     } elseif ( $y <= 0 && $end_backward > $k - 1 ) {
00445                         $end_backward = $k - 1;
00446                     }
00447                 }
00448             }
00449         }
00450         /*
00451          * computing the true LCS is too expensive, instead find the diagonal
00452          * with the most progress and pretend a midle snake of length 0 occurs
00453          * there.
00454          */
00455 
00456         $most_progress = self::findMostProgress( $M, $N, $limit, $V );
00457 
00458         $snake0 = $bottoml1 + $most_progress[0];
00459         $snake1 = $bottoml2 + $most_progress[1];
00460         $snake2 = 0;
00461         wfDebug( "Computing the LCS is too expensive. Using a heuristic.\n" );
00462         $this->heuristicUsed = true;
00463         return 5; /*
00464         * HACK: since we didn't really finish the LCS computation
00465         * we don't really know the length of the SES. We don't do
00466         * anything with the result anyway, unless it's <=1. We know
00467         * for a fact SES > 1 so 5 is as good a number as any to
00468         * return here
00469         */
00470     }
00471 
00472     private static function findMostProgress( $M, $N, $limit, $V ) {
00473         $delta = $N - $M;
00474 
00475         if ( ( $M & 1 ) == ( $limit & 1 ) ) {
00476             $forward_start_diag = max( -$M, -$limit );
00477         } else {
00478             $forward_start_diag = max( 1 - $M, -$limit );
00479         }
00480 
00481         $forward_end_diag = min( $N, $limit );
00482 
00483         if ( ( $N & 1 ) == ( $limit & 1 ) ) {
00484             $backward_start_diag = max( -$N, -$limit );
00485         } else {
00486             $backward_start_diag = max( 1 - $N, -$limit );
00487         }
00488 
00489         $backward_end_diag = -min( $M, $limit );
00490 
00491         $temp = array( 0, 0, 0 );
00492 
00493         $max_progress = array_fill( 0, ceil( max( $forward_end_diag - $forward_start_diag,
00494                 $backward_end_diag - $backward_start_diag ) / 2 ), $temp );
00495         $num_progress = 0; // the 1st entry is current, it is initialized
00496         // with 0s
00497 
00498         // first search the forward diagonals
00499         for ( $k = $forward_start_diag; $k <= $forward_end_diag; $k += 2 ) {
00500             $x = $V[0][$limit + $k];
00501             $y = $x - $k;
00502             if ( $x > $N || $y > $M ) {
00503                 continue;
00504             }
00505 
00506             $progress = $x + $y;
00507             if ( $progress > $max_progress[0][2] ) {
00508                 $num_progress = 0;
00509                 $max_progress[0][0] = $x;
00510                 $max_progress[0][1] = $y;
00511                 $max_progress[0][2] = $progress;
00512             } elseif ( $progress == $max_progress[0][2] ) {
00513                 ++$num_progress;
00514                 $max_progress[$num_progress][0] = $x;
00515                 $max_progress[$num_progress][1] = $y;
00516                 $max_progress[$num_progress][2] = $progress;
00517             }
00518         }
00519 
00520         $max_progress_forward = true; // initially the maximum
00521         // progress is in the forward
00522         // direction
00523 
00524         // now search the backward diagonals
00525         for ( $k = $backward_start_diag; $k <= $backward_end_diag; $k += 2 ) {
00526             $x = $V[1][$limit + $k];
00527             $y = $x - $k - $delta;
00528             if ( $x < 0 || $y < 0 ) {
00529                 continue;
00530             }
00531 
00532             $progress = $N - $x + $M - $y;
00533             if ( $progress > $max_progress[0][2] ) {
00534                 $num_progress = 0;
00535                 $max_progress_forward = false;
00536                 $max_progress[0][0] = $x;
00537                 $max_progress[0][1] = $y;
00538                 $max_progress[0][2] = $progress;
00539             } elseif ( $progress == $max_progress[0][2] && !$max_progress_forward ) {
00540                 ++$num_progress;
00541                 $max_progress[$num_progress][0] = $x;
00542                 $max_progress[$num_progress][1] = $y;
00543                 $max_progress[$num_progress][2] = $progress;
00544             }
00545         }
00546 
00547         // return the middle diagonal with maximal progress.
00548         return $max_progress[(int)floor( $num_progress / 2 )];
00549     }
00550 
00554     public function getLcsLength() {
00555         if ( $this->heuristicUsed && !$this->lcsLengthCorrectedForHeuristic ) {
00556             $this->lcsLengthCorrectedForHeuristic = true;
00557             $this->length = $this->m -array_sum( $this->added );
00558         }
00559         return $this->length;
00560     }
00561 
00562 }
00563 
00570 class RangeDifference {
00571 
00572     public $leftstart;
00573     public $leftend;
00574     public $leftlength;
00575 
00576     public $rightstart;
00577     public $rightend;
00578     public $rightlength;
00579 
00580     function __construct( $leftstart, $leftend, $rightstart, $rightend ) {
00581         $this->leftstart = $leftstart;
00582         $this->leftend = $leftend;
00583         $this->leftlength = $leftend - $leftstart;
00584         $this->rightstart = $rightstart;
00585         $this->rightend = $rightend;
00586         $this->rightlength = $rightend - $rightstart;
00587     }
00588 }