[ Index ]

PHP Cross Reference of MediaWiki-1.24.0

title

Body

[close]

/includes/diff/ -> WikiDiff3.php (source)

   1  <?php
   2  /**
   3   * New version of the difference engine
   4   *
   5   * Copyright © 2008 Guy Van den Broeck <[email protected]>
   6   *
   7   * This program is free software; you can redistribute it and/or modify
   8   * it under the terms of the GNU General Public License as published by
   9   * the Free Software Foundation; either version 2 of the License, or
  10   * (at your option) any later version.
  11   *
  12   * This program is distributed in the hope that it will be useful,
  13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15   * GNU General Public License for more details.
  16   *
  17   * You should have received a copy of the GNU General Public License
  18   * along with this program; if not, write to the Free Software
  19   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  20   * http://www.gnu.org/copyleft/gpl.html
  21   *
  22   * @file
  23   * @ingroup DifferenceEngine
  24   */
  25  
  26  /**
  27   * This diff implementation is mainly lifted from the LCS algorithm of the Eclipse project which
  28   * in turn is based on Myers' "An O(ND) difference algorithm and its variations"
  29   * (http://citeseer.ist.psu.edu/myers86ond.html) with range compression (see Wu et al.'s
  30   * "An O(NP) Sequence Comparison Algorithm").
  31   *
  32   * This implementation supports an upper bound on the execution time.
  33   *
  34   * Complexity: O((M + N)D) worst case time, O(M + N + D^2) expected time, O(M + N) space
  35   *
  36   * @author Guy Van den Broeck
  37   * @ingroup DifferenceEngine
  38   */
  39  class WikiDiff3 {
  40  
  41      // Input variables
  42      private $from;
  43      private $to;
  44      private $m;
  45      private $n;
  46  
  47      private $tooLong;
  48      private $powLimit;
  49  
  50      // State variables
  51      private $maxDifferences;
  52      private $lcsLengthCorrectedForHeuristic = false;
  53  
  54      // Output variables
  55      public $length;
  56      public $removed;
  57      public $added;
  58      public $heuristicUsed;
  59  
  60  	function __construct( $tooLong = 2000000, $powLimit = 1.45 ) {
  61          $this->tooLong = $tooLong;
  62          $this->powLimit = $powLimit;
  63      }
  64  
  65  	public function diff( /*array*/ $from, /*array*/ $to ) {
  66          // remember initial lengths
  67          $m = count( $from );
  68          $n = count( $to );
  69  
  70          $this->heuristicUsed = false;
  71  
  72          // output
  73          $removed = $m > 0 ? array_fill( 0, $m, true ) : array();
  74          $added = $n > 0 ? array_fill( 0, $n, true ) : array();
  75  
  76          // reduce the complexity for the next step (intentionally done twice)
  77          // remove common tokens at the start
  78          $i = 0;
  79          while ( $i < $m && $i < $n && $from[$i] === $to[$i] ) {
  80              $removed[$i] = $added[$i] = false;
  81              unset( $from[$i], $to[$i] );
  82              ++$i;
  83          }
  84  
  85          // remove common tokens at the end
  86          $j = 1;
  87          while ( $i + $j <= $m && $i + $j <= $n && $from[$m - $j] === $to[$n - $j] ) {
  88              $removed[$m - $j] = $added[$n - $j] = false;
  89              unset( $from[$m - $j], $to[$n - $j] );
  90              ++$j;
  91          }
  92  
  93          $this->from = $newFromIndex = $this->to = $newToIndex = array();
  94  
  95          // remove tokens not in both sequences
  96          $shared = array();
  97          foreach ( $from as $key ) {
  98              $shared[$key] = false;
  99          }
 100  
 101          foreach ( $to as $index => &$el ) {
 102              if ( array_key_exists( $el, $shared ) ) {
 103                  // keep it
 104                  $this->to[] = $el;
 105                  $shared[$el] = true;
 106                  $newToIndex[] = $index;
 107              }
 108          }
 109          foreach ( $from as $index => &$el ) {
 110              if ( $shared[$el] ) {
 111                  // keep it
 112                  $this->from[] = $el;
 113                  $newFromIndex[] = $index;
 114              }
 115          }
 116  
 117          unset( $shared, $from, $to );
 118  
 119          $this->m = count( $this->from );
 120          $this->n = count( $this->to );
 121  
 122          $this->removed = $this->m > 0 ? array_fill( 0, $this->m, true ) : array();
 123          $this->added = $this->n > 0 ? array_fill( 0, $this->n, true ) : array();
 124  
 125          if ( $this->m == 0 || $this->n == 0 ) {
 126              $this->length = 0;
 127          } else {
 128              $this->maxDifferences = ceil( ( $this->m + $this->n ) / 2.0 );
 129              if ( $this->m * $this->n > $this->tooLong ) {
 130                  // limit complexity to D^POW_LIMIT for long sequences
 131                  $this->maxDifferences = floor( pow( $this->maxDifferences, $this->powLimit - 1.0 ) );
 132                  wfDebug( "Limiting max number of differences to $this->maxDifferences\n" );
 133              }
 134  
 135              /*
 136               * The common prefixes and suffixes are always part of some LCS, include
 137               * them now to reduce our search space
 138               */
 139              $max = min( $this->m, $this->n );
 140              for ( $forwardBound = 0; $forwardBound < $max
 141                  && $this->from[$forwardBound] === $this->to[$forwardBound];
 142                  ++$forwardBound
 143              ) {
 144                  $this->removed[$forwardBound] = $this->added[$forwardBound] = false;
 145              }
 146  
 147              $backBoundL1 = $this->m - 1;
 148              $backBoundL2 = $this->n - 1;
 149  
 150              while ( $backBoundL1 >= $forwardBound && $backBoundL2 >= $forwardBound
 151                  && $this->from[$backBoundL1] === $this->to[$backBoundL2]
 152              ) {
 153                  $this->removed[$backBoundL1--] = $this->added[$backBoundL2--] = false;
 154              }
 155  
 156              $temp = array_fill( 0, $this->m + $this->n + 1, 0 );
 157              $V = array( $temp, $temp );
 158              $snake = array( 0, 0, 0 );
 159  
 160              $this->length = $forwardBound + $this->m - $backBoundL1 - 1
 161                  + $this->lcs_rec(
 162                      $forwardBound,
 163                      $backBoundL1,
 164                      $forwardBound,
 165                      $backBoundL2,
 166                      $V,
 167                      $snake
 168              );
 169          }
 170  
 171          $this->m = $m;
 172          $this->n = $n;
 173  
 174          $this->length += $i + $j - 1;
 175  
 176          foreach ( $this->removed as $key => &$removed_elem ) {
 177              if ( !$removed_elem ) {
 178                  $removed[$newFromIndex[$key]] = false;
 179              }
 180          }
 181          foreach ( $this->added as $key => &$added_elem ) {
 182              if ( !$added_elem ) {
 183                  $added[$newToIndex[$key]] = false;
 184              }
 185          }
 186          $this->removed = $removed;
 187          $this->added = $added;
 188      }
 189  
 190  	function diff_range( $from_lines, $to_lines ) {
 191          // Diff and store locally
 192          $this->diff( $from_lines, $to_lines );
 193          unset( $from_lines, $to_lines );
 194  
 195          $ranges = array();
 196          $xi = $yi = 0;
 197          while ( $xi < $this->m || $yi < $this->n ) {
 198              // Matching "snake".
 199              while ( $xi < $this->m && $yi < $this->n
 200                  && !$this->removed[$xi]
 201                  && !$this->added[$yi]
 202              ) {
 203                  ++$xi;
 204                  ++$yi;
 205              }
 206              // Find deletes & adds.
 207              $xstart = $xi;
 208              while ( $xi < $this->m && $this->removed[$xi] ) {
 209                  ++$xi;
 210              }
 211  
 212              $ystart = $yi;
 213              while ( $yi < $this->n && $this->added[$yi] ) {
 214                  ++$yi;
 215              }
 216  
 217              if ( $xi > $xstart || $yi > $ystart ) {
 218                  $ranges[] = new RangeDifference( $xstart, $xi, $ystart, $yi );
 219              }
 220          }
 221  
 222          return $ranges;
 223      }
 224  
 225  	private function lcs_rec( $bottoml1, $topl1, $bottoml2, $topl2, &$V, &$snake ) {
 226          // check that both sequences are non-empty
 227          if ( $bottoml1 > $topl1 || $bottoml2 > $topl2 ) {
 228              return 0;
 229          }
 230  
 231          $d = $this->find_middle_snake( $bottoml1, $topl1, $bottoml2,
 232              $topl2, $V, $snake );
 233  
 234          // need to store these so we don't lose them when they're
 235          // overwritten by the recursion
 236          $len = $snake[2];
 237          $startx = $snake[0];
 238          $starty = $snake[1];
 239  
 240          // the middle snake is part of the LCS, store it
 241          for ( $i = 0; $i < $len; ++$i ) {
 242              $this->removed[$startx + $i] = $this->added[$starty + $i] = false;
 243          }
 244  
 245          if ( $d > 1 ) {
 246              return $len
 247              + $this->lcs_rec( $bottoml1, $startx - 1, $bottoml2,
 248                  $starty - 1, $V, $snake )
 249              + $this->lcs_rec( $startx + $len, $topl1, $starty + $len,
 250                  $topl2, $V, $snake );
 251          } elseif ( $d == 1 ) {
 252              /*
 253               * In this case the sequences differ by exactly 1 line. We have
 254               * already saved all the lines after the difference in the for loop
 255               * above, now we need to save all the lines before the difference.
 256               */
 257              $max = min( $startx - $bottoml1, $starty - $bottoml2 );
 258              for ( $i = 0; $i < $max; ++$i ) {
 259                  $this->removed[$bottoml1 + $i] =
 260                      $this->added[$bottoml2 + $i] = false;
 261              }
 262  
 263              return $max + $len;
 264          }
 265  
 266          return $len;
 267      }
 268  
 269  	private function find_middle_snake( $bottoml1, $topl1, $bottoml2, $topl2, &$V, &$snake ) {
 270          $from = &$this->from;
 271          $to = &$this->to;
 272          $V0 = &$V[0];
 273          $V1 = &$V[1];
 274          $snake0 = &$snake[0];
 275          $snake1 = &$snake[1];
 276          $snake2 = &$snake[2];
 277          $bottoml1_min_1 = $bottoml1 - 1;
 278          $bottoml2_min_1 = $bottoml2 - 1;
 279          $N = $topl1 - $bottoml1_min_1;
 280          $M = $topl2 - $bottoml2_min_1;
 281          $delta = $N - $M;
 282          $maxabsx = $N + $bottoml1;
 283          $maxabsy = $M + $bottoml2;
 284          $limit = min( $this->maxDifferences, ceil( ( $N + $M ) / 2 ) );
 285  
 286          // value_to_add_forward: a 0 or 1 that we add to the start
 287          // offset to make it odd/even
 288          if ( ( $M & 1 ) == 1 ) {
 289              $value_to_add_forward = 1;
 290          } else {
 291              $value_to_add_forward = 0;
 292          }
 293  
 294          if ( ( $N & 1 ) == 1 ) {
 295              $value_to_add_backward = 1;
 296          } else {
 297              $value_to_add_backward = 0;
 298          }
 299  
 300          $start_forward = -$M;
 301          $end_forward = $N;
 302          $start_backward = -$N;
 303          $end_backward = $M;
 304  
 305          $limit_min_1 = $limit - 1;
 306          $limit_plus_1 = $limit + 1;
 307  
 308          $V0[$limit_plus_1] = 0;
 309          $V1[$limit_min_1] = $N;
 310          $limit = min( $this->maxDifferences, ceil( ( $N + $M ) / 2 ) );
 311  
 312          if ( ( $delta & 1 ) == 1 ) {
 313              for ( $d = 0; $d <= $limit; ++$d ) {
 314                  $start_diag = max( $value_to_add_forward + $start_forward, -$d );
 315                  $end_diag = min( $end_forward, $d );
 316                  $value_to_add_forward = 1 - $value_to_add_forward;
 317  
 318                  // compute forward furthest reaching paths
 319                  for ( $k = $start_diag; $k <= $end_diag; $k += 2 ) {
 320                      if ( $k == -$d || ( $k < $d
 321                              && $V0[$limit_min_1 + $k] < $V0[$limit_plus_1 + $k] )
 322                      ) {
 323                          $x = $V0[$limit_plus_1 + $k];
 324                      } else {
 325                          $x = $V0[$limit_min_1 + $k] + 1;
 326                      }
 327  
 328                      $absx = $snake0 = $x + $bottoml1;
 329                      $absy = $snake1 = $x - $k + $bottoml2;
 330  
 331                      while ( $absx < $maxabsx && $absy < $maxabsy && $from[$absx] === $to[$absy] ) {
 332                          ++$absx;
 333                          ++$absy;
 334                      }
 335                      $x = $absx - $bottoml1;
 336  
 337                      $snake2 = $absx - $snake0;
 338                      $V0[$limit + $k] = $x;
 339                      if ( $k >= $delta - $d + 1 && $k <= $delta + $d - 1
 340                          && $x >= $V1[$limit + $k - $delta]
 341                      ) {
 342                          return 2 * $d - 1;
 343                      }
 344  
 345                      // check to see if we can cut down the diagonal range
 346                      if ( $x >= $N && $end_forward > $k - 1 ) {
 347                          $end_forward = $k - 1;
 348                      } elseif ( $absy - $bottoml2 >= $M ) {
 349                          $start_forward = $k + 1;
 350                          $value_to_add_forward = 0;
 351                      }
 352                  }
 353  
 354                  $start_diag = max( $value_to_add_backward + $start_backward, -$d );
 355                  $end_diag = min( $end_backward, $d );
 356                  $value_to_add_backward = 1 - $value_to_add_backward;
 357  
 358                  // compute backward furthest reaching paths
 359                  for ( $k = $start_diag; $k <= $end_diag; $k += 2 ) {
 360                      if ( $k == $d
 361                          || ( $k != -$d && $V1[$limit_min_1 + $k] < $V1[$limit_plus_1 + $k] )
 362                      ) {
 363                          $x = $V1[$limit_min_1 + $k];
 364                      } else {
 365                          $x = $V1[$limit_plus_1 + $k] - 1;
 366                      }
 367  
 368                      $y = $x - $k - $delta;
 369  
 370                      $snake2 = 0;
 371                      while ( $x > 0 && $y > 0
 372                          && $from[$x + $bottoml1_min_1] === $to[$y + $bottoml2_min_1]
 373                      ) {
 374                          --$x;
 375                          --$y;
 376                          ++$snake2;
 377                      }
 378                      $V1[$limit + $k] = $x;
 379  
 380                      // check to see if we can cut down our diagonal range
 381                      if ( $x <= 0 ) {
 382                          $start_backward = $k + 1;
 383                          $value_to_add_backward = 0;
 384                      } elseif ( $y <= 0 && $end_backward > $k - 1 ) {
 385                          $end_backward = $k - 1;
 386                      }
 387                  }
 388              }
 389          } else {
 390              for ( $d = 0; $d <= $limit; ++$d ) {
 391                  $start_diag = max( $value_to_add_forward + $start_forward, -$d );
 392                  $end_diag = min( $end_forward, $d );
 393                  $value_to_add_forward = 1 - $value_to_add_forward;
 394  
 395                  // compute forward furthest reaching paths
 396                  for ( $k = $start_diag; $k <= $end_diag; $k += 2 ) {
 397                      if ( $k == -$d
 398                          || ( $k < $d && $V0[$limit_min_1 + $k] < $V0[$limit_plus_1 + $k] )
 399                      ) {
 400                          $x = $V0[$limit_plus_1 + $k];
 401                      } else {
 402                          $x = $V0[$limit_min_1 + $k] + 1;
 403                      }
 404  
 405                      $absx = $snake0 = $x + $bottoml1;
 406                      $absy = $snake1 = $x - $k + $bottoml2;
 407  
 408                      while ( $absx < $maxabsx && $absy < $maxabsy && $from[$absx] === $to[$absy] ) {
 409                          ++$absx;
 410                          ++$absy;
 411                      }
 412                      $x = $absx - $bottoml1;
 413                      $snake2 = $absx - $snake0;
 414                      $V0[$limit + $k] = $x;
 415  
 416                      // check to see if we can cut down the diagonal range
 417                      if ( $x >= $N && $end_forward > $k - 1 ) {
 418                          $end_forward = $k - 1;
 419                      } elseif ( $absy - $bottoml2 >= $M ) {
 420                          $start_forward = $k + 1;
 421                          $value_to_add_forward = 0;
 422                      }
 423                  }
 424  
 425                  $start_diag = max( $value_to_add_backward + $start_backward, -$d );
 426                  $end_diag = min( $end_backward, $d );
 427                  $value_to_add_backward = 1 - $value_to_add_backward;
 428  
 429                  // compute backward furthest reaching paths
 430                  for ( $k = $start_diag; $k <= $end_diag; $k += 2 ) {
 431                      if ( $k == $d
 432                          || ( $k != -$d && $V1[$limit_min_1 + $k] < $V1[$limit_plus_1 + $k] )
 433                      ) {
 434                          $x = $V1[$limit_min_1 + $k];
 435                      } else {
 436                          $x = $V1[$limit_plus_1 + $k] - 1;
 437                      }
 438  
 439                      $y = $x - $k - $delta;
 440  
 441                      $snake2 = 0;
 442                      while ( $x > 0 && $y > 0
 443                          && $from[$x + $bottoml1_min_1] === $to[$y + $bottoml2_min_1]
 444                      ) {
 445                          --$x;
 446                          --$y;
 447                          ++$snake2;
 448                      }
 449                      $V1[$limit + $k] = $x;
 450  
 451                      if ( $k >= -$delta - $d && $k <= $d - $delta
 452                          && $x <= $V0[$limit + $k + $delta]
 453                      ) {
 454                          $snake0 = $bottoml1 + $x;
 455                          $snake1 = $bottoml2 + $y;
 456  
 457                          return 2 * $d;
 458                      }
 459  
 460                      // check to see if we can cut down our diagonal range
 461                      if ( $x <= 0 ) {
 462                          $start_backward = $k + 1;
 463                          $value_to_add_backward = 0;
 464                      } elseif ( $y <= 0 && $end_backward > $k - 1 ) {
 465                          $end_backward = $k - 1;
 466                      }
 467                  }
 468              }
 469          }
 470          /*
 471           * computing the true LCS is too expensive, instead find the diagonal
 472           * with the most progress and pretend a midle snake of length 0 occurs
 473           * there.
 474           */
 475  
 476          $most_progress = self::findMostProgress( $M, $N, $limit, $V );
 477  
 478          $snake0 = $bottoml1 + $most_progress[0];
 479          $snake1 = $bottoml2 + $most_progress[1];
 480          $snake2 = 0;
 481          wfDebug( "Computing the LCS is too expensive. Using a heuristic.\n" );
 482          $this->heuristicUsed = true;
 483  
 484          return 5; /*
 485          * HACK: since we didn't really finish the LCS computation
 486          * we don't really know the length of the SES. We don't do
 487          * anything with the result anyway, unless it's <=1. We know
 488          * for a fact SES > 1 so 5 is as good a number as any to
 489          * return here
 490          */
 491      }
 492  
 493  	private static function findMostProgress( $M, $N, $limit, $V ) {
 494          $delta = $N - $M;
 495  
 496          if ( ( $M & 1 ) == ( $limit & 1 ) ) {
 497              $forward_start_diag = max( -$M, -$limit );
 498          } else {
 499              $forward_start_diag = max( 1 - $M, -$limit );
 500          }
 501  
 502          $forward_end_diag = min( $N, $limit );
 503  
 504          if ( ( $N & 1 ) == ( $limit & 1 ) ) {
 505              $backward_start_diag = max( -$N, -$limit );
 506          } else {
 507              $backward_start_diag = max( 1 - $N, -$limit );
 508          }
 509  
 510          $backward_end_diag = -min( $M, $limit );
 511  
 512          $temp = array( 0, 0, 0 );
 513  
 514          $max_progress = array_fill( 0, ceil( max( $forward_end_diag - $forward_start_diag,
 515                  $backward_end_diag - $backward_start_diag ) / 2 ), $temp );
 516          $num_progress = 0; // the 1st entry is current, it is initialized
 517          // with 0s
 518  
 519          // first search the forward diagonals
 520          for ( $k = $forward_start_diag; $k <= $forward_end_diag; $k += 2 ) {
 521              $x = $V[0][$limit + $k];
 522              $y = $x - $k;
 523              if ( $x > $N || $y > $M ) {
 524                  continue;
 525              }
 526  
 527              $progress = $x + $y;
 528              if ( $progress > $max_progress[0][2] ) {
 529                  $num_progress = 0;
 530                  $max_progress[0][0] = $x;
 531                  $max_progress[0][1] = $y;
 532                  $max_progress[0][2] = $progress;
 533              } elseif ( $progress == $max_progress[0][2] ) {
 534                  ++$num_progress;
 535                  $max_progress[$num_progress][0] = $x;
 536                  $max_progress[$num_progress][1] = $y;
 537                  $max_progress[$num_progress][2] = $progress;
 538              }
 539          }
 540  
 541          $max_progress_forward = true; // initially the maximum
 542          // progress is in the forward
 543          // direction
 544  
 545          // now search the backward diagonals
 546          for ( $k = $backward_start_diag; $k <= $backward_end_diag; $k += 2 ) {
 547              $x = $V[1][$limit + $k];
 548              $y = $x - $k - $delta;
 549              if ( $x < 0 || $y < 0 ) {
 550                  continue;
 551              }
 552  
 553              $progress = $N - $x + $M - $y;
 554              if ( $progress > $max_progress[0][2] ) {
 555                  $num_progress = 0;
 556                  $max_progress_forward = false;
 557                  $max_progress[0][0] = $x;
 558                  $max_progress[0][1] = $y;
 559                  $max_progress[0][2] = $progress;
 560              } elseif ( $progress == $max_progress[0][2] && !$max_progress_forward ) {
 561                  ++$num_progress;
 562                  $max_progress[$num_progress][0] = $x;
 563                  $max_progress[$num_progress][1] = $y;
 564                  $max_progress[$num_progress][2] = $progress;
 565              }
 566          }
 567  
 568          // return the middle diagonal with maximal progress.
 569          return $max_progress[(int)floor( $num_progress / 2 )];
 570      }
 571  
 572      /**
 573       * @return mixed
 574       */
 575  	public function getLcsLength() {
 576          if ( $this->heuristicUsed && !$this->lcsLengthCorrectedForHeuristic ) {
 577              $this->lcsLengthCorrectedForHeuristic = true;
 578              $this->length = $this->m - array_sum( $this->added );
 579          }
 580  
 581          return $this->length;
 582      }
 583  
 584  }
 585  
 586  /**
 587   * Alternative representation of a set of changes, by the index
 588   * ranges that are changed.
 589   *
 590   * @ingroup DifferenceEngine
 591   */
 592  class RangeDifference {
 593  
 594      /** @var int */
 595      public $leftstart;
 596  
 597      /** @var int */
 598      public $leftend;
 599  
 600      /** @var int */
 601      public $leftlength;
 602  
 603      /** @var int */
 604      public $rightstart;
 605  
 606      /** @var int */
 607      public $rightend;
 608  
 609      /** @var int */
 610      public $rightlength;
 611  
 612  	function __construct( $leftstart, $leftend, $rightstart, $rightend ) {
 613          $this->leftstart = $leftstart;
 614          $this->leftend = $leftend;
 615          $this->leftlength = $leftend - $leftstart;
 616          $this->rightstart = $rightstart;
 617          $this->rightend = $rightend;
 618          $this->rightlength = $rightend - $rightstart;
 619      }
 620  
 621  }


Generated: Fri Nov 28 14:03:12 2014 Cross-referenced by PHPXref 0.7.1