MediaWiki  REL1_19
WikiDiff3.php
Go to the documentation of this file.
00001 <?php
00039 class WikiDiff3 {
00040 
00041         // Input variables
00042         private $from;
00043         private $to;
00044         private $m;
00045         private $n;
00046 
00047         private $tooLong;
00048         private $powLimit;
00049 
00050         // State variables
00051         private $maxDifferences;
00052         private $lcsLengthCorrectedForHeuristic = false;
00053 
00054         // Output variables
00055         public $length;
00056         public $removed;
00057         public $added;
00058         public $heuristicUsed;
00059 
00060         function __construct( $tooLong = 2000000, $powLimit = 1.45 ) {
00061                 $this->tooLong = $tooLong;
00062                 $this->powLimit = $powLimit;
00063         }
00064 
00065         public function diff( /*array*/ $from, /*array*/ $to ) {
00066                 // remember initial lengths
00067                 $m = sizeof( $from );
00068                 $n = count( $to );
00069 
00070                 $this->heuristicUsed = false;
00071 
00072                 // output
00073                 $removed = $m > 0 ? array_fill( 0, $m, true ) : array();
00074                 $added = $n > 0 ? array_fill( 0, $n, true ) : array();
00075 
00076                 // reduce the complexity for the next step (intentionally done twice)
00077                 // remove common tokens at the start
00078                 $i = 0;
00079                 while ( $i < $m && $i < $n && $from[$i] === $to[$i] ) {
00080                         $removed[$i] = $added[$i] = false;
00081                         unset( $from[$i], $to[$i] );
00082                         ++$i;
00083                 }
00084 
00085                 // remove common tokens at the end
00086                 $j = 1;
00087                 while ( $i + $j <= $m && $i + $j <= $n && $from[$m - $j] === $to[$n - $j] ) {
00088                         $removed[$m - $j] = $added[$n - $j] = false;
00089                         unset( $from[$m - $j], $to[$n - $j] );
00090                         ++$j;
00091                 }
00092 
00093                 $this->from = $newFromIndex = $this->to = $newToIndex = array();
00094 
00095                 // remove tokens not in both sequences
00096                 $shared = array();
00097                 foreach ( $from as $key ) {
00098                         $shared[$key] = false;
00099                 }
00100 
00101                 foreach ( $to as $index => &$el ) {
00102                         if ( array_key_exists( $el, $shared ) ) {
00103                                 // keep it
00104                                 $this->to[] = $el;
00105                                 $shared[$el] = true;
00106                                 $newToIndex[] = $index;
00107                         }
00108                 }
00109                 foreach ( $from as $index => &$el ) {
00110                         if ( $shared[$el] ) {
00111                                 // keep it
00112                                 $this->from[] = $el;
00113                                 $newFromIndex[] = $index;
00114                         }
00115                 }
00116 
00117                 unset( $shared, $from, $to );
00118 
00119                 $this->m = count( $this->from );
00120                 $this->n = count( $this->to );
00121 
00122                 $this->removed = $this->m > 0 ? array_fill( 0, $this->m, true ) : array();
00123                 $this->added = $this->n > 0 ? array_fill( 0, $this->n, true ) : array();
00124 
00125                 if ( $this->m == 0 || $this->n == 0 ) {
00126                         $this->length = 0;
00127                 } else {
00128                         $this->maxDifferences = ceil( ( $this->m + $this->n ) / 2.0 );
00129                         if ( $this->m * $this->n > $this->tooLong ) {
00130                                 // limit complexity to D^POW_LIMIT for long sequences
00131                                 $this->maxDifferences = floor( pow( $this->maxDifferences, $this->powLimit - 1.0 ) );
00132                                 wfDebug( "Limiting max number of differences to $this->maxDifferences\n" );
00133                         }
00134 
00135                         /*
00136                          * The common prefixes and suffixes are always part of some LCS, include
00137                          * them now to reduce our search space
00138                          */
00139                         $max = min( $this->m, $this->n );
00140                         for ( $forwardBound = 0; $forwardBound < $max
00141                                         && $this->from[$forwardBound] === $this->to[$forwardBound];
00142                                         ++$forwardBound ) {
00143                                 $this->removed[$forwardBound] = $this->added[$forwardBound] = false;
00144                         }
00145 
00146                         $backBoundL1 = $this->m - 1;
00147                         $backBoundL2 = $this->n - 1;
00148 
00149                         while ( $backBoundL1 >= $forwardBound && $backBoundL2 >= $forwardBound
00150                                         && $this->from[$backBoundL1] === $this->to[$backBoundL2] ) {
00151                                 $this->removed[$backBoundL1--] = $this->added[$backBoundL2--] = false;
00152                         }
00153 
00154                         $temp = array_fill( 0, $this->m + $this->n + 1, 0 );
00155                         $V = array( $temp, $temp );
00156                         $snake = array( 0, 0, 0 );
00157 
00158                         $this->length = $forwardBound + $this->m - $backBoundL1 - 1
00159                                 + $this->lcs_rec( $forwardBound, $backBoundL1,
00160                                 $forwardBound, $backBoundL2, $V, $snake );
00161                 }
00162 
00163                 $this->m = $m;
00164                 $this->n = $n;
00165 
00166                 $this->length += $i + $j - 1;
00167 
00168                 foreach ( $this->removed as $key => &$removed_elem ) {
00169                         if ( !$removed_elem ) {
00170                                 $removed[$newFromIndex[$key]] = false;
00171                         }
00172                 }
00173                 foreach ( $this->added as $key => &$added_elem ) {
00174                         if ( !$added_elem ) {
00175                                 $added[$newToIndex[$key]] = false;
00176                         }
00177                 }
00178                 $this->removed = $removed;
00179                 $this->added = $added;
00180         }
00181 
00182         function diff_range( $from_lines, $to_lines ) {
00183                 // Diff and store locally
00184                 $this->diff( $from_lines, $to_lines );
00185                 unset( $from_lines, $to_lines );
00186 
00187                 $ranges = array();
00188                 $xi = $yi = 0;
00189                 while ( $xi < $this->m || $yi < $this->n ) {
00190                         // Matching "snake".
00191                         while ( $xi < $this->m && $yi < $this->n
00192                                         && !$this->removed[$xi]
00193                                         && !$this->added[$yi] ) {
00194                                 ++$xi;
00195                                 ++$yi;
00196                         }
00197                         // Find deletes & adds.
00198                         $xstart = $xi;
00199                         while ( $xi < $this->m && $this->removed[$xi] ) {
00200                                 ++$xi;
00201                         }
00202 
00203                         $ystart = $yi;
00204                         while ( $yi < $this->n && $this->added[$yi] ) {
00205                                 ++$yi;
00206                         }
00207 
00208                         if ( $xi > $xstart || $yi > $ystart ) {
00209                                 $ranges[] = new RangeDifference( $xstart, $xi,
00210                                                                 $ystart, $yi );
00211                         }
00212                 }
00213                 return $ranges;
00214         }
00215 
00216         private function lcs_rec( $bottoml1, $topl1, $bottoml2, $topl2, &$V, &$snake ) {
00217                 // check that both sequences are non-empty
00218                 if ( $bottoml1 > $topl1 || $bottoml2 > $topl2 ) {
00219                         return 0;
00220                 }
00221 
00222                 $d = $this->find_middle_snake( $bottoml1, $topl1, $bottoml2,
00223                                                         $topl2, $V, $snake );
00224 
00225                 // need to store these so we don't lose them when they're
00226                 // overwritten by the recursion
00227                 $len = $snake[2];
00228                 $startx = $snake[0];
00229                 $starty = $snake[1];
00230 
00231                 // the middle snake is part of the LCS, store it
00232                 for ( $i = 0; $i < $len; ++$i ) {
00233                         $this->removed[$startx + $i] = $this->added[$starty + $i] = false;
00234                 }
00235 
00236                 if ( $d > 1 ) {
00237                         return $len
00238                         + $this->lcs_rec( $bottoml1, $startx - 1, $bottoml2,
00239                                                         $starty - 1, $V, $snake )
00240                         + $this->lcs_rec( $startx + $len, $topl1, $starty + $len,
00241                                                         $topl2, $V, $snake );
00242                 } elseif ( $d == 1 ) {
00243                         /*
00244                          * In this case the sequences differ by exactly 1 line. We have
00245                          * already saved all the lines after the difference in the for loop
00246                          * above, now we need to save all the lines before the difference.
00247                          */
00248                         $max = min( $startx - $bottoml1, $starty - $bottoml2 );
00249                         for ( $i = 0; $i < $max; ++$i ) {
00250                                 $this->removed[$bottoml1 + $i] =
00251                                         $this->added[$bottoml2 + $i] = false;
00252                         }
00253                         return $max + $len;
00254                 }
00255                 return $len;
00256         }
00257 
00258         private function find_middle_snake( $bottoml1, $topl1, $bottoml2, $topl2, &$V, &$snake ) {
00259                 $from = &$this->from;
00260                 $to = &$this->to;
00261                 $V0 = &$V[0];
00262                 $V1 = &$V[1];
00263                 $snake0 = &$snake[0];
00264                 $snake1 = &$snake[1];
00265                 $snake2 = &$snake[2];
00266                 $bottoml1_min_1 = $bottoml1 -1;
00267                 $bottoml2_min_1 = $bottoml2 -1;
00268                 $N = $topl1 - $bottoml1_min_1;
00269                 $M = $topl2 - $bottoml2_min_1;
00270                 $delta = $N - $M;
00271                 $maxabsx = $N + $bottoml1;
00272                 $maxabsy = $M + $bottoml2;
00273                 $limit = min( $this->maxDifferences, ceil( ( $N + $M ) / 2 ) );
00274 
00275                 // value_to_add_forward: a 0 or 1 that we add to the start
00276                 // offset to make it odd/even
00277                 if ( ( $M & 1 ) == 1 ) {
00278                         $value_to_add_forward = 1;
00279                 } else {
00280                         $value_to_add_forward = 0;
00281                 }
00282 
00283                 if ( ( $N & 1 ) == 1 ) {
00284                         $value_to_add_backward = 1;
00285                 } else {
00286                         $value_to_add_backward = 0;
00287                 }
00288 
00289                 $start_forward = -$M;
00290                 $end_forward = $N;
00291                 $start_backward = -$N;
00292                 $end_backward = $M;
00293 
00294                 $limit_min_1 = $limit - 1;
00295                 $limit_plus_1 = $limit + 1;
00296 
00297                 $V0[$limit_plus_1] = 0;
00298                 $V1[$limit_min_1] = $N;
00299                 $limit = min( $this->maxDifferences, ceil( ( $N + $M ) / 2 ) );
00300 
00301                 if ( ( $delta & 1 ) == 1 ) {
00302                         for ( $d = 0; $d <= $limit; ++$d ) {
00303                                 $start_diag = max( $value_to_add_forward + $start_forward, -$d );
00304                                 $end_diag = min( $end_forward, $d );
00305                                 $value_to_add_forward = 1 - $value_to_add_forward;
00306 
00307                                 // compute forward furthest reaching paths
00308                                 for ( $k = $start_diag; $k <= $end_diag; $k += 2 ) {
00309                                         if ( $k == -$d || ( $k < $d
00310                                                         && $V0[$limit_min_1 + $k] < $V0[$limit_plus_1 + $k] ) ) {
00311                                                 $x = $V0[$limit_plus_1 + $k];
00312                                         } else {
00313                                                 $x = $V0[$limit_min_1 + $k] + 1;
00314                                         }
00315 
00316                                         $absx = $snake0 = $x + $bottoml1;
00317                                         $absy = $snake1 = $x - $k + $bottoml2;
00318 
00319                                         while ( $absx < $maxabsx && $absy < $maxabsy && $from[$absx] === $to[$absy] ) {
00320                                                 ++$absx;
00321                                                 ++$absy;
00322                                         }
00323                                         $x = $absx -$bottoml1;
00324 
00325                                         $snake2 = $absx -$snake0;
00326                                         $V0[$limit + $k] = $x;
00327                                         if ( $k >= $delta - $d + 1 && $k <= $delta + $d - 1
00328                                                         && $x >= $V1[$limit + $k - $delta] ) {
00329                                                 return 2 * $d - 1;
00330                                         }
00331 
00332                                         // check to see if we can cut down the diagonal range
00333                                         if ( $x >= $N && $end_forward > $k - 1 ) {
00334                                                 $end_forward = $k - 1;
00335                                         } elseif ( $absy - $bottoml2 >= $M ) {
00336                                                 $start_forward = $k + 1;
00337                                                 $value_to_add_forward = 0;
00338                                         }
00339                                 }
00340 
00341                                 $start_diag = max( $value_to_add_backward + $start_backward, -$d );
00342                                 $end_diag = min( $end_backward, $d );
00343                                 $value_to_add_backward = 1 - $value_to_add_backward;
00344 
00345                                 // compute backward furthest reaching paths
00346                                 for ( $k = $start_diag; $k <= $end_diag; $k += 2 ) {
00347                                         if ( $k == $d
00348                                         || ( $k != -$d && $V1[$limit_min_1 + $k] < $V1[$limit_plus_1 + $k] ) ) {
00349                                                 $x = $V1[$limit_min_1 + $k];
00350                                         } else {
00351                                                 $x = $V1[$limit_plus_1 + $k] - 1;
00352                                         }
00353 
00354                                         $y = $x - $k - $delta;
00355 
00356                                         $snake2 = 0;
00357                                         while ( $x > 0 && $y > 0
00358                                         && $from[$x + $bottoml1_min_1] === $to[$y + $bottoml2_min_1] ) {
00359                                                 --$x;
00360                                                 --$y;
00361                                                 ++$snake2;
00362                                         }
00363                                         $V1[$limit + $k] = $x;
00364 
00365                                         // check to see if we can cut down our diagonal range
00366                                         if ( $x <= 0 ) {
00367                                                 $start_backward = $k + 1;
00368                                                 $value_to_add_backward = 0;
00369                                         } elseif ( $y <= 0 && $end_backward > $k - 1 ) {
00370                                                 $end_backward = $k - 1;
00371                                         }
00372                                 }
00373                         }
00374                 } else {
00375                         for ( $d = 0; $d <= $limit; ++$d ) {
00376                                 $start_diag = max( $value_to_add_forward + $start_forward, -$d );
00377                                 $end_diag = min( $end_forward, $d );
00378                                 $value_to_add_forward = 1 - $value_to_add_forward;
00379 
00380                                 // compute forward furthest reaching paths
00381                                 for ( $k = $start_diag; $k <= $end_diag; $k += 2 ) {
00382                                         if ( $k == -$d
00383                                         || ( $k < $d && $V0[$limit_min_1 + $k] < $V0[$limit_plus_1 + $k] ) ) {
00384                                                 $x = $V0[$limit_plus_1 + $k];
00385                                         } else {
00386                                                 $x = $V0[$limit_min_1 + $k] + 1;
00387                                         }
00388 
00389                                         $absx = $snake0 = $x + $bottoml1;
00390                                         $absy = $snake1 = $x - $k + $bottoml2;
00391 
00392                                         while ( $absx < $maxabsx && $absy < $maxabsy && $from[$absx] === $to[$absy] ) {
00393                                                 ++$absx;
00394                                                 ++$absy;
00395                                         }
00396                                         $x = $absx -$bottoml1;
00397                                         $snake2 = $absx -$snake0;
00398                                         $V0[$limit + $k] = $x;
00399 
00400                                         // check to see if we can cut down the diagonal range
00401                                         if ( $x >= $N && $end_forward > $k - 1 ) {
00402                                                 $end_forward = $k - 1;
00403                                         } elseif ( $absy -$bottoml2 >= $M ) {
00404                                                 $start_forward = $k + 1;
00405                                                 $value_to_add_forward = 0;
00406                                         }
00407                                 }
00408 
00409                                 $start_diag = max( $value_to_add_backward + $start_backward, -$d );
00410                                 $end_diag = min( $end_backward, $d );
00411                                 $value_to_add_backward = 1 - $value_to_add_backward;
00412 
00413                                 // compute backward furthest reaching paths
00414                                 for ( $k = $start_diag; $k <= $end_diag; $k += 2 ) {
00415                                         if ( $k == $d
00416                                         || ( $k != -$d && $V1[$limit_min_1 + $k] < $V1[$limit_plus_1 + $k] ) ) {
00417                                                 $x = $V1[$limit_min_1 + $k];
00418                                         } else {
00419                                                 $x = $V1[$limit_plus_1 + $k] - 1;
00420                                         }
00421 
00422                                         $y = $x - $k - $delta;
00423 
00424                                         $snake2 = 0;
00425                                         while ( $x > 0 && $y > 0
00426                                                         && $from[$x + $bottoml1_min_1] === $to[$y + $bottoml2_min_1] ) {
00427                                                 --$x;
00428                                                 --$y;
00429                                                 ++$snake2;
00430                                         }
00431                                         $V1[$limit + $k] = $x;
00432 
00433                                         if ( $k >= -$delta - $d && $k <= $d - $delta
00434                                                         && $x <= $V0[$limit + $k + $delta] ) {
00435                                                 $snake0 = $bottoml1 + $x;
00436                                                 $snake1 = $bottoml2 + $y;
00437                                                 return 2 * $d;
00438                                         }
00439 
00440                                         // check to see if we can cut down our diagonal range
00441                                         if ( $x <= 0 ) {
00442                                                 $start_backward = $k + 1;
00443                                                 $value_to_add_backward = 0;
00444                                         } elseif ( $y <= 0 && $end_backward > $k - 1 ) {
00445                                                 $end_backward = $k - 1;
00446                                         }
00447                                 }
00448                         }
00449                 }
00450                 /*
00451                  * computing the true LCS is too expensive, instead find the diagonal
00452                  * with the most progress and pretend a midle snake of length 0 occurs
00453                  * there.
00454                  */
00455 
00456                 $most_progress = self::findMostProgress( $M, $N, $limit, $V );
00457 
00458                 $snake0 = $bottoml1 + $most_progress[0];
00459                 $snake1 = $bottoml2 + $most_progress[1];
00460                 $snake2 = 0;
00461                 wfDebug( "Computing the LCS is too expensive. Using a heuristic.\n" );
00462                 $this->heuristicUsed = true;
00463                 return 5; /*
00464                 * HACK: since we didn't really finish the LCS computation
00465                 * we don't really know the length of the SES. We don't do
00466                 * anything with the result anyway, unless it's <=1. We know
00467                 * for a fact SES > 1 so 5 is as good a number as any to
00468                 * return here
00469                 */
00470         }
00471 
00472         private static function findMostProgress( $M, $N, $limit, $V ) {
00473                 $delta = $N - $M;
00474 
00475                 if ( ( $M & 1 ) == ( $limit & 1 ) ) {
00476                         $forward_start_diag = max( -$M, -$limit );
00477                 } else {
00478                         $forward_start_diag = max( 1 - $M, -$limit );
00479                 }
00480 
00481                 $forward_end_diag = min( $N, $limit );
00482 
00483                 if ( ( $N & 1 ) == ( $limit & 1 ) ) {
00484                         $backward_start_diag = max( -$N, -$limit );
00485                 } else {
00486                         $backward_start_diag = max( 1 - $N, -$limit );
00487                 }
00488 
00489                 $backward_end_diag = -min( $M, $limit );
00490 
00491                 $temp = array( 0, 0, 0 );
00492 
00493 
00494                 $max_progress = array_fill( 0, ceil( max( $forward_end_diag - $forward_start_diag,
00495                                 $backward_end_diag - $backward_start_diag ) / 2 ), $temp );
00496                 $num_progress = 0; // the 1st entry is current, it is initialized
00497                 // with 0s
00498 
00499                 // first search the forward diagonals
00500                 for ( $k = $forward_start_diag; $k <= $forward_end_diag; $k += 2 ) {
00501                         $x = $V[0][$limit + $k];
00502                         $y = $x - $k;
00503                         if ( $x > $N || $y > $M ) {
00504                                 continue;
00505                         }
00506 
00507                         $progress = $x + $y;
00508                         if ( $progress > $max_progress[0][2] ) {
00509                                 $num_progress = 0;
00510                                 $max_progress[0][0] = $x;
00511                                 $max_progress[0][1] = $y;
00512                                 $max_progress[0][2] = $progress;
00513                         } elseif ( $progress == $max_progress[0][2] ) {
00514                                 ++$num_progress;
00515                                 $max_progress[$num_progress][0] = $x;
00516                                 $max_progress[$num_progress][1] = $y;
00517                                 $max_progress[$num_progress][2] = $progress;
00518                         }
00519                 }
00520 
00521                 $max_progress_forward = true; // initially the maximum
00522                 // progress is in the forward
00523                 // direction
00524 
00525                 // now search the backward diagonals
00526                 for ( $k = $backward_start_diag; $k <= $backward_end_diag; $k += 2 ) {
00527                         $x = $V[1][$limit + $k];
00528                         $y = $x - $k - $delta;
00529                         if ( $x < 0 || $y < 0 ) {
00530                                 continue;
00531                         }
00532 
00533                         $progress = $N - $x + $M - $y;
00534                         if ( $progress > $max_progress[0][2] ) {
00535                                 $num_progress = 0;
00536                                 $max_progress_forward = false;
00537                                 $max_progress[0][0] = $x;
00538                                 $max_progress[0][1] = $y;
00539                                 $max_progress[0][2] = $progress;
00540                         } elseif ( $progress == $max_progress[0][2] && !$max_progress_forward ) {
00541                                 ++$num_progress;
00542                                 $max_progress[$num_progress][0] = $x;
00543                                 $max_progress[$num_progress][1] = $y;
00544                                 $max_progress[$num_progress][2] = $progress;
00545                         }
00546                 }
00547 
00548                 // return the middle diagonal with maximal progress.
00549                 return $max_progress[(int)floor( $num_progress / 2 )];
00550         }
00551 
00555         public function getLcsLength() {
00556                 if ( $this->heuristicUsed && !$this->lcsLengthCorrectedForHeuristic ) {
00557                         $this->lcsLengthCorrectedForHeuristic = true;
00558                         $this->length = $this->m -array_sum( $this->added );
00559                 }
00560                 return $this->length;
00561         }
00562 
00563 }
00564 
00571 class RangeDifference {
00572 
00573         public $leftstart;
00574         public $leftend;
00575         public $leftlength;
00576 
00577         public $rightstart;
00578         public $rightend;
00579         public $rightlength;
00580 
00581         function __construct( $leftstart, $leftend, $rightstart, $rightend ) {
00582                 $this->leftstart = $leftstart;
00583                 $this->leftend = $leftend;
00584                 $this->leftlength = $leftend - $leftstart;
00585                 $this->rightstart = $rightstart;
00586                 $this->rightend = $rightend;
00587                 $this->rightlength = $rightend - $rightstart;
00588         }
00589 }