[ Index ]

PHP Cross Reference of MediaWiki-1.24.0

title

Body

[close]

/includes/ -> HistoryBlob.php (source)

   1  <?php
   2  /**
   3   * Efficient concatenated text storage.
   4   *
   5   * This program is free software; you can redistribute it and/or modify
   6   * it under the terms of the GNU General Public License as published by
   7   * the Free Software Foundation; either version 2 of the License, or
   8   * (at your option) any later version.
   9   *
  10   * This program is distributed in the hope that it will be useful,
  11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13   * GNU General Public License for more details.
  14   *
  15   * You should have received a copy of the GNU General Public License along
  16   * with this program; if not, write to the Free Software Foundation, Inc.,
  17   * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18   * http://www.gnu.org/copyleft/gpl.html
  19   *
  20   * @file
  21   */
  22  
  23  /**
  24   * Base class for general text storage via the "object" flag in old_flags, or
  25   * two-part external storage URLs. Used to represent efficient concatenated
  26   * storage, and migration-related pointer objects.
  27   */
  28  interface HistoryBlob {
  29      /**
  30       * Adds an item of text, returns a stub object which points to the item.
  31       * You must call setLocation() on the stub object before storing it to the
  32       * database
  33       *
  34       * @param string $text
  35       *
  36       * @return string The key for getItem()
  37       */
  38  	function addItem( $text );
  39  
  40      /**
  41       * Get item by key, or false if the key is not present
  42       *
  43       * @param string $key
  44       *
  45       * @return string|bool
  46       */
  47  	function getItem( $key );
  48  
  49      /**
  50       * Set the "default text"
  51       * This concept is an odd property of the current DB schema, whereby each text item has a revision
  52       * associated with it. The default text is the text of the associated revision. There may, however,
  53       * be other revisions in the same object.
  54       *
  55       * Default text is not required for two-part external storage URLs.
  56       *
  57       * @param string $text
  58       */
  59  	function setText( $text );
  60  
  61      /**
  62       * Get default text. This is called from Revision::getRevisionText()
  63       *
  64       * @return string
  65       */
  66  	function getText();
  67  }
  68  
  69  /**
  70   * Concatenated gzip (CGZ) storage
  71   * Improves compression ratio by concatenating like objects before gzipping
  72   */
  73  class ConcatenatedGzipHistoryBlob implements HistoryBlob {
  74      public $mVersion = 0, $mCompressed = false, $mItems = array(), $mDefaultHash = '';
  75      public $mSize = 0;
  76      public $mMaxSize = 10000000;
  77      public $mMaxCount = 100;
  78  
  79      /**
  80       * Constructor
  81       */
  82  	public function __construct() {
  83          if ( !function_exists( 'gzdeflate' ) ) {
  84              throw new MWException( "Need zlib support to read or write this "
  85                  . "kind of history object (ConcatenatedGzipHistoryBlob)\n" );
  86          }
  87      }
  88  
  89      /**
  90       * @param string $text
  91       * @return string
  92       */
  93  	public function addItem( $text ) {
  94          $this->uncompress();
  95          $hash = md5( $text );
  96          if ( !isset( $this->mItems[$hash] ) ) {
  97              $this->mItems[$hash] = $text;
  98              $this->mSize += strlen( $text );
  99          }
 100          return $hash;
 101      }
 102  
 103      /**
 104       * @param string $hash
 105       * @return array|bool
 106       */
 107  	public function getItem( $hash ) {
 108          $this->uncompress();
 109          if ( array_key_exists( $hash, $this->mItems ) ) {
 110              return $this->mItems[$hash];
 111          } else {
 112              return false;
 113          }
 114      }
 115  
 116      /**
 117       * @param string $text
 118       * @return void
 119       */
 120  	public function setText( $text ) {
 121          $this->uncompress();
 122          $this->mDefaultHash = $this->addItem( $text );
 123      }
 124  
 125      /**
 126       * @return array|bool
 127       */
 128  	public function getText() {
 129          $this->uncompress();
 130          return $this->getItem( $this->mDefaultHash );
 131      }
 132  
 133      /**
 134       * Remove an item
 135       *
 136       * @param string $hash
 137       */
 138  	public function removeItem( $hash ) {
 139          $this->mSize -= strlen( $this->mItems[$hash] );
 140          unset( $this->mItems[$hash] );
 141      }
 142  
 143      /**
 144       * Compress the bulk data in the object
 145       */
 146  	public function compress() {
 147          if ( !$this->mCompressed ) {
 148              $this->mItems = gzdeflate( serialize( $this->mItems ) );
 149              $this->mCompressed = true;
 150          }
 151      }
 152  
 153      /**
 154       * Uncompress bulk data
 155       */
 156  	public function uncompress() {
 157          if ( $this->mCompressed ) {
 158              $this->mItems = unserialize( gzinflate( $this->mItems ) );
 159              $this->mCompressed = false;
 160          }
 161      }
 162  
 163      /**
 164       * @return array
 165       */
 166  	function __sleep() {
 167          $this->compress();
 168          return array( 'mVersion', 'mCompressed', 'mItems', 'mDefaultHash' );
 169      }
 170  
 171  	function __wakeup() {
 172          $this->uncompress();
 173      }
 174  
 175      /**
 176       * Helper function for compression jobs
 177       * Returns true until the object is "full" and ready to be committed
 178       *
 179       * @return bool
 180       */
 181  	public function isHappy() {
 182          return $this->mSize < $this->mMaxSize
 183              && count( $this->mItems ) < $this->mMaxCount;
 184      }
 185  }
 186  
 187  /**
 188   * Pointer object for an item within a CGZ blob stored in the text table.
 189   */
 190  class HistoryBlobStub {
 191      /**
 192       * @var array One-step cache variable to hold base blobs; operations that
 193       * pull multiple revisions may often pull multiple times from the same
 194       * blob. By keeping the last-used one open, we avoid redundant
 195       * unserialization and decompression overhead.
 196       */
 197      protected static $blobCache = array();
 198  
 199      /** @var int */
 200      public $mOldId;
 201  
 202      /** @var string */
 203      public $mHash;
 204  
 205      /** @var string */
 206      public $mRef;
 207  
 208      /**
 209       * @param string $hash The content hash of the text
 210       * @param int $oldid The old_id for the CGZ object
 211       */
 212  	function __construct( $hash = '', $oldid = 0 ) {
 213          $this->mHash = $hash;
 214      }
 215  
 216      /**
 217       * Sets the location (old_id) of the main object to which this object
 218       * points
 219       * @param int $id
 220       */
 221  	function setLocation( $id ) {
 222          $this->mOldId = $id;
 223      }
 224  
 225      /**
 226       * Sets the location (old_id) of the referring object
 227       * @param string $id
 228       */
 229  	function setReferrer( $id ) {
 230          $this->mRef = $id;
 231      }
 232  
 233      /**
 234       * Gets the location of the referring object
 235       * @return string
 236       */
 237  	function getReferrer() {
 238          return $this->mRef;
 239      }
 240  
 241      /**
 242       * @return string
 243       */
 244  	function getText() {
 245          if ( isset( self::$blobCache[$this->mOldId] ) ) {
 246              $obj = self::$blobCache[$this->mOldId];
 247          } else {
 248              $dbr = wfGetDB( DB_SLAVE );
 249              $row = $dbr->selectRow(
 250                  'text',
 251                  array( 'old_flags', 'old_text' ),
 252                  array( 'old_id' => $this->mOldId )
 253              );
 254  
 255              if ( !$row ) {
 256                  return false;
 257              }
 258  
 259              $flags = explode( ',', $row->old_flags );
 260              if ( in_array( 'external', $flags ) ) {
 261                  $url = $row->old_text;
 262                  $parts = explode( '://', $url, 2 );
 263                  if ( !isset( $parts[1] ) || $parts[1] == '' ) {
 264                      return false;
 265                  }
 266                  $row->old_text = ExternalStore::fetchFromUrl( $url );
 267  
 268              }
 269  
 270              if ( !in_array( 'object', $flags ) ) {
 271                  return false;
 272              }
 273  
 274              if ( in_array( 'gzip', $flags ) ) {
 275                  // This shouldn't happen, but a bug in the compress script
 276                  // may at times gzip-compress a HistoryBlob object row.
 277                  $obj = unserialize( gzinflate( $row->old_text ) );
 278              } else {
 279                  $obj = unserialize( $row->old_text );
 280              }
 281  
 282              if ( !is_object( $obj ) ) {
 283                  // Correct for old double-serialization bug.
 284                  $obj = unserialize( $obj );
 285              }
 286  
 287              // Save this item for reference; if pulling many
 288              // items in a row we'll likely use it again.
 289              $obj->uncompress();
 290              self::$blobCache = array( $this->mOldId => $obj );
 291          }
 292  
 293          return $obj->getItem( $this->mHash );
 294      }
 295  
 296      /**
 297       * Get the content hash
 298       *
 299       * @return string
 300       */
 301  	function getHash() {
 302          return $this->mHash;
 303      }
 304  }
 305  
 306  /**
 307   * To speed up conversion from 1.4 to 1.5 schema, text rows can refer to the
 308   * leftover cur table as the backend. This avoids expensively copying hundreds
 309   * of megabytes of data during the conversion downtime.
 310   *
 311   * Serialized HistoryBlobCurStub objects will be inserted into the text table
 312   * on conversion if $wgLegacySchemaConversion is set to true.
 313   */
 314  class HistoryBlobCurStub {
 315      /** @var int */
 316      public $mCurId;
 317  
 318      /**
 319       * @param int $curid The cur_id pointed to
 320       */
 321  	function __construct( $curid = 0 ) {
 322          $this->mCurId = $curid;
 323      }
 324  
 325      /**
 326       * Sets the location (cur_id) of the main object to which this object
 327       * points
 328       *
 329       * @param int $id
 330       */
 331  	function setLocation( $id ) {
 332          $this->mCurId = $id;
 333      }
 334  
 335      /**
 336       * @return string|bool
 337       */
 338  	function getText() {
 339          $dbr = wfGetDB( DB_SLAVE );
 340          $row = $dbr->selectRow( 'cur', array( 'cur_text' ), array( 'cur_id' => $this->mCurId ) );
 341          if ( !$row ) {
 342              return false;
 343          }
 344          return $row->cur_text;
 345      }
 346  }
 347  
 348  /**
 349   * Diff-based history compression
 350   * Requires xdiff 1.5+ and zlib
 351   */
 352  class DiffHistoryBlob implements HistoryBlob {
 353      /** @var array Uncompressed item cache */
 354      public $mItems = array();
 355  
 356      /** @var int Total uncompressed size */
 357      public $mSize = 0;
 358  
 359      /**
 360       * @var array Array of diffs. If a diff D from A to B is notated D = B - A,
 361       * and Z is an empty string:
 362       *
 363       *              { item[map[i]] - item[map[i-1]]   where i > 0
 364       *    diff[i] = {
 365       *              { item[map[i]] - Z                where i = 0
 366       */
 367      public $mDiffs;
 368  
 369      /** @var array The diff map, see above */
 370      public $mDiffMap;
 371  
 372      /** @var int The key for getText()
 373       */
 374      public $mDefaultKey;
 375  
 376      /** @var string Compressed storage */
 377      public $mCompressed;
 378  
 379      /** @var bool True if the object is locked against further writes */
 380      public $mFrozen = false;
 381  
 382      /**
 383       * @var int The maximum uncompressed size before the object becomes sad
 384       * Should be less than max_allowed_packet
 385       */
 386      public $mMaxSize = 10000000;
 387  
 388      /** @var int The maximum number of text items before the object becomes sad */
 389      public $mMaxCount = 100;
 390  
 391      /** Constants from xdiff.h */
 392      const XDL_BDOP_INS = 1;
 393      const XDL_BDOP_CPY = 2;
 394      const XDL_BDOP_INSB = 3;
 395  
 396  	function __construct() {
 397          if ( !function_exists( 'gzdeflate' ) ) {
 398              throw new MWException( "Need zlib support to read or write DiffHistoryBlob\n" );
 399          }
 400      }
 401  
 402      /**
 403       * @throws MWException
 404       * @param string $text
 405       * @return int
 406       */
 407  	function addItem( $text ) {
 408          if ( $this->mFrozen ) {
 409              throw new MWException( __METHOD__ . ": Cannot add more items after sleep/wakeup" );
 410          }
 411  
 412          $this->mItems[] = $text;
 413          $this->mSize += strlen( $text );
 414          $this->mDiffs = null; // later
 415          return count( $this->mItems ) - 1;
 416      }
 417  
 418      /**
 419       * @param string $key
 420       * @return string
 421       */
 422  	function getItem( $key ) {
 423          return $this->mItems[$key];
 424      }
 425  
 426      /**
 427       * @param string $text
 428       */
 429  	function setText( $text ) {
 430          $this->mDefaultKey = $this->addItem( $text );
 431      }
 432  
 433      /**
 434       * @return string
 435       */
 436  	function getText() {
 437          return $this->getItem( $this->mDefaultKey );
 438      }
 439  
 440      /**
 441       * @throws MWException
 442       */
 443  	function compress() {
 444          if ( !function_exists( 'xdiff_string_rabdiff' ) ) {
 445              throw new MWException( "Need xdiff 1.5+ support to write DiffHistoryBlob\n" );
 446          }
 447          if ( isset( $this->mDiffs ) ) {
 448              // Already compressed
 449              return;
 450          }
 451          if ( !count( $this->mItems ) ) {
 452              // Empty
 453              return;
 454          }
 455  
 456          // Create two diff sequences: one for main text and one for small text
 457          $sequences = array(
 458              'small' => array(
 459                  'tail' => '',
 460                  'diffs' => array(),
 461                  'map' => array(),
 462              ),
 463              'main' => array(
 464                  'tail' => '',
 465                  'diffs' => array(),
 466                  'map' => array(),
 467              ),
 468          );
 469          $smallFactor = 0.5;
 470  
 471          $mItemsCount = count( $this->mItems );
 472          for ( $i = 0; $i < $mItemsCount; $i++ ) {
 473              $text = $this->mItems[$i];
 474              if ( $i == 0 ) {
 475                  $seqName = 'main';
 476              } else {
 477                  $mainTail = $sequences['main']['tail'];
 478                  if ( strlen( $text ) < strlen( $mainTail ) * $smallFactor ) {
 479                      $seqName = 'small';
 480                  } else {
 481                      $seqName = 'main';
 482                  }
 483              }
 484              $seq =& $sequences[$seqName];
 485              $tail = $seq['tail'];
 486              $diff = $this->diff( $tail, $text );
 487              $seq['diffs'][] = $diff;
 488              $seq['map'][] = $i;
 489              $seq['tail'] = $text;
 490          }
 491          unset( $seq ); // unlink dangerous alias
 492  
 493          // Knit the sequences together
 494          $tail = '';
 495          $this->mDiffs = array();
 496          $this->mDiffMap = array();
 497          foreach ( $sequences as $seq ) {
 498              if ( !count( $seq['diffs'] ) ) {
 499                  continue;
 500              }
 501              if ( $tail === '' ) {
 502                  $this->mDiffs[] = $seq['diffs'][0];
 503              } else {
 504                  $head = $this->patch( '', $seq['diffs'][0] );
 505                  $this->mDiffs[] = $this->diff( $tail, $head );
 506              }
 507              $this->mDiffMap[] = $seq['map'][0];
 508              $diffsCount = count( $seq['diffs'] );
 509              for ( $i = 1; $i < $diffsCount; $i++ ) {
 510                  $this->mDiffs[] = $seq['diffs'][$i];
 511                  $this->mDiffMap[] = $seq['map'][$i];
 512              }
 513              $tail = $seq['tail'];
 514          }
 515      }
 516  
 517      /**
 518       * @param string $t1
 519       * @param string $t2
 520       * @return string
 521       */
 522  	function diff( $t1, $t2 ) {
 523          # Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff
 524          # "String is not zero-terminated"
 525          wfSuppressWarnings();
 526          $diff = xdiff_string_rabdiff( $t1, $t2 ) . '';
 527          wfRestoreWarnings();
 528          return $diff;
 529      }
 530  
 531      /**
 532       * @param string $base
 533       * @param string $diff
 534       * @return bool|string
 535       */
 536  	function patch( $base, $diff ) {
 537          if ( function_exists( 'xdiff_string_bpatch' ) ) {
 538              wfSuppressWarnings();
 539              $text = xdiff_string_bpatch( $base, $diff ) . '';
 540              wfRestoreWarnings();
 541              return $text;
 542          }
 543  
 544          # Pure PHP implementation
 545  
 546          $header = unpack( 'Vofp/Vcsize', substr( $diff, 0, 8 ) );
 547  
 548          # Check the checksum if hash extension is available
 549          $ofp = $this->xdiffAdler32( $base );
 550          if ( $ofp !== false && $ofp !== substr( $diff, 0, 4 ) ) {
 551              wfDebug( __METHOD__ . ": incorrect base checksum\n" );
 552              return false;
 553          }
 554          if ( $header['csize'] != strlen( $base ) ) {
 555              wfDebug( __METHOD__ . ": incorrect base length\n" );
 556              return false;
 557          }
 558  
 559          $p = 8;
 560          $out = '';
 561          while ( $p < strlen( $diff ) ) {
 562              $x = unpack( 'Cop', substr( $diff, $p, 1 ) );
 563              $op = $x['op'];
 564              ++$p;
 565              switch ( $op ) {
 566              case self::XDL_BDOP_INS:
 567                  $x = unpack( 'Csize', substr( $diff, $p, 1 ) );
 568                  $p++;
 569                  $out .= substr( $diff, $p, $x['size'] );
 570                  $p += $x['size'];
 571                  break;
 572              case self::XDL_BDOP_INSB:
 573                  $x = unpack( 'Vcsize', substr( $diff, $p, 4 ) );
 574                  $p += 4;
 575                  $out .= substr( $diff, $p, $x['csize'] );
 576                  $p += $x['csize'];
 577                  break;
 578              case self::XDL_BDOP_CPY:
 579                  $x = unpack( 'Voff/Vcsize', substr( $diff, $p, 8 ) );
 580                  $p += 8;
 581                  $out .= substr( $base, $x['off'], $x['csize'] );
 582                  break;
 583              default:
 584                  wfDebug( __METHOD__ . ": invalid op\n" );
 585                  return false;
 586              }
 587          }
 588          return $out;
 589      }
 590  
 591      /**
 592       * Compute a binary "Adler-32" checksum as defined by LibXDiff, i.e. with
 593       * the bytes backwards and initialised with 0 instead of 1. See bug 34428.
 594       *
 595       * @param string $s
 596       * @return string|bool False if the hash extension is not available
 597       */
 598  	function xdiffAdler32( $s ) {
 599          if ( !function_exists( 'hash' ) ) {
 600              return false;
 601          }
 602  
 603          static $init;
 604          if ( $init === null ) {
 605              $init = str_repeat( "\xf0", 205 ) . "\xee" . str_repeat( "\xf0", 67 ) . "\x02";
 606          }
 607  
 608          // The real Adler-32 checksum of $init is zero, so it initialises the
 609          // state to zero, as it is at the start of LibXDiff's checksum
 610          // algorithm. Appending the subject string then simulates LibXDiff.
 611          return strrev( hash( 'adler32', $init . $s, true ) );
 612      }
 613  
 614  	function uncompress() {
 615          if ( !$this->mDiffs ) {
 616              return;
 617          }
 618          $tail = '';
 619          $mDiffsCount = count( $this->mDiffs );
 620          for ( $diffKey = 0; $diffKey < $mDiffsCount; $diffKey++ ) {
 621              $textKey = $this->mDiffMap[$diffKey];
 622              $text = $this->patch( $tail, $this->mDiffs[$diffKey] );
 623              $this->mItems[$textKey] = $text;
 624              $tail = $text;
 625          }
 626      }
 627  
 628      /**
 629       * @return array
 630       */
 631  	function __sleep() {
 632          $this->compress();
 633          if ( !count( $this->mItems ) ) {
 634              // Empty object
 635              $info = false;
 636          } else {
 637              // Take forward differences to improve the compression ratio for sequences
 638              $map = '';
 639              $prev = 0;
 640              foreach ( $this->mDiffMap as $i ) {
 641                  if ( $map !== '' ) {
 642                      $map .= ',';
 643                  }
 644                  $map .= $i - $prev;
 645                  $prev = $i;
 646              }
 647              $info = array(
 648                  'diffs' => $this->mDiffs,
 649                  'map' => $map
 650              );
 651          }
 652          if ( isset( $this->mDefaultKey ) ) {
 653              $info['default'] = $this->mDefaultKey;
 654          }
 655          $this->mCompressed = gzdeflate( serialize( $info ) );
 656          return array( 'mCompressed' );
 657      }
 658  
 659  	function __wakeup() {
 660          // addItem() doesn't work if mItems is partially filled from mDiffs
 661          $this->mFrozen = true;
 662          $info = unserialize( gzinflate( $this->mCompressed ) );
 663          unset( $this->mCompressed );
 664  
 665          if ( !$info ) {
 666              // Empty object
 667              return;
 668          }
 669  
 670          if ( isset( $info['default'] ) ) {
 671              $this->mDefaultKey = $info['default'];
 672          }
 673          $this->mDiffs = $info['diffs'];
 674          if ( isset( $info['base'] ) ) {
 675              // Old format
 676              $this->mDiffMap = range( 0, count( $this->mDiffs ) - 1 );
 677              array_unshift( $this->mDiffs,
 678                  pack( 'VVCV', 0, 0, self::XDL_BDOP_INSB, strlen( $info['base'] ) ) .
 679                  $info['base'] );
 680          } else {
 681              // New format
 682              $map = explode( ',', $info['map'] );
 683              $cur = 0;
 684              $this->mDiffMap = array();
 685              foreach ( $map as $i ) {
 686                  $cur += $i;
 687                  $this->mDiffMap[] = $cur;
 688              }
 689          }
 690          $this->uncompress();
 691      }
 692  
 693      /**
 694       * Helper function for compression jobs
 695       * Returns true until the object is "full" and ready to be committed
 696       *
 697       * @return bool
 698       */
 699  	function isHappy() {
 700          return $this->mSize < $this->mMaxSize
 701              && count( $this->mItems ) < $this->mMaxCount;
 702      }
 703  
 704  }


Generated: Fri Nov 28 14:03:12 2014 Cross-referenced by PHPXref 0.7.1