[ Index ]

PHP Cross Reference of MediaWiki-1.24.0

title

Body

[close]

/maintenance/storage/ -> compressOld.php (source)

   1  <?php
   2  /**
   3   * Compress the text of a wiki.
   4   *
   5   * Usage:
   6   *
   7   * Non-wikimedia
   8   * php compressOld.php [options...]
   9   *
  10   * Wikimedia
  11   * php compressOld.php <database> [options...]
  12   *
  13   * Options are:
  14   *  -t <type>           set compression type to either:
  15   *                          gzip: compress revisions independently
  16   *                          concat: concatenate revisions and compress in chunks (default)
  17   *  -c <chunk-size>     maximum number of revisions in a concat chunk
  18   *  -b <begin-date>     earliest date to check for uncompressed revisions (14 digits; concat only)
  19   *  -e <end-date>       latest revision date to compress (14 digits; concat only)
  20   *  -s <startid>        the id to start from (referring to the text table for
  21   *                      type gzip, and to the page table for type concat)
  22   *  -n <endid>          the page_id to stop at (only when using concat compression type)
  23   *  --extdb <cluster>   store specified revisions in an external cluster (untested)
  24   *
  25   * This program is free software; you can redistribute it and/or modify
  26   * it under the terms of the GNU General Public License as published by
  27   * the Free Software Foundation; either version 2 of the License, or
  28   * (at your option) any later version.
  29   *
  30   * This program is distributed in the hope that it will be useful,
  31   * but WITHOUT ANY WARRANTY; without even the implied warranty of
  32   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  33   * GNU General Public License for more details.
  34   *
  35   * You should have received a copy of the GNU General Public License along
  36   * with this program; if not, write to the Free Software Foundation, Inc.,
  37   * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  38   * http://www.gnu.org/copyleft/gpl.html
  39   *
  40   * @file
  41   * @ingroup Maintenance ExternalStorage
  42   */
  43  
  44  require_once  __DIR__ . '/../Maintenance.php';
  45  
  46  /**
  47   * Maintenance script that compresses the text of a wiki.
  48   *
  49   * @ingroup Maintenance ExternalStorage
  50   */
  51  class CompressOld extends Maintenance {
  52      /**
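  53       * Text-loading styles for compressWithConcat(): one text query per revision (LS_INDIVIDUAL) or a single revision/text join (LS_CHUNKED).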
  54       */
  55      const LS_INDIVIDUAL = 0;
  56      const LS_CHUNKED = 1;
  57  
  58  	public function __construct() {
  59          parent::__construct();
  60          $this->mDescription = 'Compress the text of a wiki';
  61          $this->addOption( 'type', 'Set compression type to either: gzip|concat', false, true, 't' );
  62          $this->addOption(
  63              'chunksize',
  64              'Maximum number of revisions in a concat chunk',
  65              false,
  66              true,
  67              'c'
  68          );
  69          $this->addOption(
  70              'begin-date',
  71              'Earliest date to check for uncompressed revisions',
  72              false,
  73              true,
  74              'b'
  75          );
  76          $this->addOption( 'end-date', 'Latest revision date to compress', false, true, 'e' );
  77          $this->addOption(
  78              'startid',
  79              'The id to start from (gzip -> text table, concat -> page table)',
  80              false,
  81              true,
  82              's'
  83          );
  84          $this->addOption(
  85              'extdb',
  86              'Store specified revisions in an external cluster (untested)',
  87              false,
  88              true
  89          );
  90          $this->addOption(
  91              'endid',
  92              'The page_id to stop at (only when using concat compression type)',
  93              false,
  94              true,
  95              'n'
  96          );
  97      }
  98  
  99  	public function execute() {
 100          global $wgDBname;
 101          if ( !function_exists( "gzdeflate" ) ) {
 102              $this->error( "You must enable zlib support in PHP to compress old revisions!\n" .
 103                  "Please see http://www.php.net/manual/en/ref.zlib.php\n", true );
 104          }
 105  
 106          $type = $this->getOption( 'type', 'concat' );
 107          $chunkSize = $this->getOption( 'chunksize', 20 );
 108          $startId = $this->getOption( 'startid', 0 );
 109          $beginDate = $this->getOption( 'begin-date', '' );
 110          $endDate = $this->getOption( 'end-date', '' );
 111          $extDB = $this->getOption( 'extdb', '' );
 112          $endId = $this->getOption( 'endid', false );
 113  
 114          if ( $type != 'concat' && $type != 'gzip' ) {
 115              $this->error( "Type \"{$type}\" not supported" );
 116          }
 117  
 118          if ( $extDB != '' ) {
 119              $this->output( "Compressing database {$wgDBname} to external cluster {$extDB}\n"
 120                  . str_repeat( '-', 76 ) . "\n\n" );
 121          } else {
 122              $this->output( "Compressing database {$wgDBname}\n"
 123                  . str_repeat( '-', 76 ) . "\n\n" );
 124          }
 125  
 126          $success = true;
 127          if ( $type == 'concat' ) {
 128              $success = $this->compressWithConcat( $startId, $chunkSize, $beginDate,
 129                  $endDate, $extDB, $endId );
 130          } else {
 131              $this->compressOldPages( $startId, $extDB );
 132          }
 133  
 134          if ( $success ) {
 135              $this->output( "Done.\n" );
 136          }
 137      }
 138  
 139      /**
 140       * @todo document
 141       * @param int $start
 142       * @param string $extdb
 143       */
 144  	private function compressOldPages( $start = 0, $extdb = '' ) {
 145          $chunksize = 50;
 146          $this->output( "Starting from old_id $start...\n" );
 147          $dbw = wfGetDB( DB_MASTER );
 148          do {
 149              $res = $dbw->select(
 150                  'text',
 151                  array( 'old_id', 'old_flags', 'old_text' ),
 152                  "old_id>=$start",
 153                  __METHOD__,
 154                  array( 'ORDER BY' => 'old_id', 'LIMIT' => $chunksize, 'FOR UPDATE' )
 155              );
 156  
 157              if ( $res->numRows() == 0 ) {
 158                  break;
 159              }
 160  
 161              $last = $start;
 162  
 163              foreach ( $res as $row ) {
 164                  # print "  {$row->old_id} - {$row->old_namespace}:{$row->old_title}\n";
 165                  $this->compressPage( $row, $extdb );
 166                  $last = $row->old_id;
 167              }
 168  
 169              $start = $last + 1; # Deletion may leave long empty stretches
 170              $this->output( "$start...\n" );
 171          } while ( true );
 172      }
 173  
 174      /**
 175       * @todo document
 176       * @param stdClass $row
 177       * @param string $extdb
 178       * @return bool
 179       */
 180  	private function compressPage( $row, $extdb ) {
 181          if ( false !== strpos( $row->old_flags, 'gzip' )
 182              || false !== strpos( $row->old_flags, 'object' )
 183          ) {
 184              #print "Already compressed row {$row->old_id}\n";
 185              return false;
 186          }
 187          $dbw = wfGetDB( DB_MASTER );
 188          $flags = $row->old_flags ? "{$row->old_flags},gzip" : "gzip";
 189          $compress = gzdeflate( $row->old_text );
 190  
 191          # Store in external storage if required
 192          if ( $extdb !== '' ) {
 193              $storeObj = new ExternalStoreDB;
 194              $compress = $storeObj->store( $extdb, $compress );
 195              if ( $compress === false ) {
 196                  $this->error( "Unable to store object" );
 197  
 198                  return false;
 199              }
 200          }
 201  
 202          # Update text row
 203          $dbw->update( 'text',
 204              array( /* SET */
 205                  'old_flags' => $flags,
 206                  'old_text' => $compress
 207              ), array( /* WHERE */
 208                  'old_id' => $row->old_id
 209              ), __METHOD__,
 210              array( 'LIMIT' => 1 )
 211          );
 212  
 213          return true;
 214      }
 215  
 216      /**
 217       * @param int $startId
 218       * @param int $maxChunkSize
 219       * @param string $beginDate
 220       * @param string $endDate
 221       * @param string $extdb
 222       * @param bool|int $maxPageId
 223       * @return bool
 224       */
 225  	private function compressWithConcat( $startId, $maxChunkSize, $beginDate,
 226          $endDate, $extdb = "", $maxPageId = false
 227      ) {
 228          $loadStyle = self::LS_CHUNKED;
 229  
 230          $dbr = wfGetDB( DB_SLAVE );
 231          $dbw = wfGetDB( DB_MASTER );
 232  
 233          # Set up external storage
 234          if ( $extdb != '' ) {
 235              $storeObj = new ExternalStoreDB;
 236          }
 237  
 238          # Get all articles by page_id
 239          if ( !$maxPageId ) {
 240              $maxPageId = $dbr->selectField( 'page', 'max(page_id)', '', __METHOD__ );
 241          }
 242          $this->output( "Starting from $startId of $maxPageId\n" );
 243          $pageConds = array();
 244  
 245          /*
 246          if ( $exclude_ns0 ) {
 247              print "Excluding main namespace\n";
 248              $pageConds[] = 'page_namespace<>0';
 249          }
 250          if ( $queryExtra ) {
 251                      $pageConds[] = $queryExtra;
 252          }
 253           */
 254  
 255          # For each article, get a list of revisions which fit the criteria
 256  
 257          # No recompression, use a condition on old_flags
 258          # Don't compress object type entities, because that might produce data loss when
 259          # overwriting bulk storage concat rows. Don't compress external references, because
 260          # the script doesn't yet delete rows from external storage.
 261          $conds = array(
 262              'old_flags NOT ' . $dbr->buildLike( $dbr->anyString(), 'object', $dbr->anyString() )
 263              . ' AND old_flags NOT '
 264              . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() )
 265          );
 266  
 267          if ( $beginDate ) {
 268              if ( !preg_match( '/^\d{14}$/', $beginDate ) ) {
 269                  $this->error( "Invalid begin date \"$beginDate\"\n" );
 270  
 271                  return false;
 272              }
 273              $conds[] = "rev_timestamp>'" . $beginDate . "'";
 274          }
 275          if ( $endDate ) {
 276              if ( !preg_match( '/^\d{14}$/', $endDate ) ) {
 277                  $this->error( "Invalid end date \"$endDate\"\n" );
 278  
 279                  return false;
 280              }
 281              $conds[] = "rev_timestamp<'" . $endDate . "'";
 282          }
 283          if ( $loadStyle == self::LS_CHUNKED ) {
 284              $tables = array( 'revision', 'text' );
 285              $fields = array( 'rev_id', 'rev_text_id', 'old_flags', 'old_text' );
 286              $conds[] = 'rev_text_id=old_id';
 287              $revLoadOptions = 'FOR UPDATE';
 288          } else {
 289              $tables = array( 'revision' );
 290              $fields = array( 'rev_id', 'rev_text_id' );
 291              $revLoadOptions = array();
 292          }
 293  
 294          # Don't work with current revisions
 295          # Don't lock the page table for update either -- TS 2006-04-04
 296          #$tables[] = 'page';
 297          #$conds[] = 'page_id=rev_page AND rev_id != page_latest';
 298  
 299          for ( $pageId = $startId; $pageId <= $maxPageId; $pageId++ ) {
 300              wfWaitForSlaves();
 301  
 302              # Wake up
 303              $dbr->ping();
 304  
 305              # Get the page row
 306              $pageRes = $dbr->select( 'page',
 307                  array( 'page_id', 'page_namespace', 'page_title', 'page_latest' ),
 308                  $pageConds + array( 'page_id' => $pageId ), __METHOD__ );
 309              if ( $pageRes->numRows() == 0 ) {
 310                  continue;
 311              }
 312              $pageRow = $dbr->fetchObject( $pageRes );
 313  
 314              # Display progress
 315              $titleObj = Title::makeTitle( $pageRow->page_namespace, $pageRow->page_title );
 316              $this->output( "$pageId\t" . $titleObj->getPrefixedDBkey() . " " );
 317  
 318              # Load revisions
 319              $revRes = $dbw->select( $tables, $fields,
 320                  array_merge( array(
 321                      'rev_page' => $pageRow->page_id,
 322                      # Don't operate on the current revision
 323                      # Use < instead of <> in case the current revision has changed
 324                      # since the page select, which wasn't locking
 325                      'rev_id < ' . $pageRow->page_latest
 326                  ), $conds ),
 327                  __METHOD__,
 328                  $revLoadOptions
 329              );
 330              $revs = array();
 331              foreach ( $revRes as $revRow ) {
 332                  $revs[] = $revRow;
 333              }
 334  
 335              if ( count( $revs ) < 2 ) {
 336                  # No revisions matching, no further processing
 337                  $this->output( "\n" );
 338                  continue;
 339              }
 340  
 341              # For each chunk
 342              $i = 0;
 343              while ( $i < count( $revs ) ) {
 344                  if ( $i < count( $revs ) - $maxChunkSize ) {
 345                      $thisChunkSize = $maxChunkSize;
 346                  } else {
 347                      $thisChunkSize = count( $revs ) - $i;
 348                  }
 349  
 350                  $chunk = new ConcatenatedGzipHistoryBlob();
 351                  $stubs = array();
 352                  $dbw->begin( __METHOD__ );
 353                  $usedChunk = false;
 354                  $primaryOldid = $revs[$i]->rev_text_id;
 355  
 356                  // @codingStandardsIgnoreStart Ignore avoid function calls in a FOR loop test part warning
 357                  # Get the text of each revision and add it to the object
 358                  for ( $j = 0; $j < $thisChunkSize && $chunk->isHappy(); $j++ ) {
 359                      // @codingStandardsIgnoreEnd
 360                      $oldid = $revs[$i + $j]->rev_text_id;
 361  
 362                      # Get text
 363                      if ( $loadStyle == self::LS_INDIVIDUAL ) {
 364                          $textRow = $dbw->selectRow( 'text',
 365                              array( 'old_flags', 'old_text' ),
 366                              array( 'old_id' => $oldid ),
 367                              __METHOD__,
 368                              'FOR UPDATE'
 369                          );
 370                          $text = Revision::getRevisionText( $textRow );
 371                      } else {
 372                          $text = Revision::getRevisionText( $revs[$i + $j] );
 373                      }
 374  
 375                      if ( $text === false ) {
 376                          $this->error( "\nError, unable to get text in old_id $oldid" );
 377                          #$dbw->delete( 'old', array( 'old_id' => $oldid ) );
 378                      }
 379  
 380                      if ( $extdb == "" && $j == 0 ) {
 381                          $chunk->setText( $text );
 382                          $this->output( '.' );
 383                      } else {
 384                          # Don't make a stub if it's going to be longer than the article
 385                          # Stubs are typically about 100 bytes
 386                          if ( strlen( $text ) < 120 ) {
 387                              $stub = false;
 388                              $this->output( 'x' );
 389                          } else {
 390                              $stub = new HistoryBlobStub( $chunk->addItem( $text ) );
 391                              $stub->setLocation( $primaryOldid );
 392                              $stub->setReferrer( $oldid );
 393                              $this->output( '.' );
 394                              $usedChunk = true;
 395                          }
 396                          $stubs[$j] = $stub;
 397                      }
 398                  }
 399                  $thisChunkSize = $j;
 400  
 401                  # If we couldn't actually use any stubs because the pages were too small, do nothing
 402                  if ( $usedChunk ) {
 403                      if ( $extdb != "" ) {
 404                          # Move blob objects to External Storage
 405                          $stored = $storeObj->store( $extdb, serialize( $chunk ) );
 406                          if ( $stored === false ) {
 407                              $this->error( "Unable to store object" );
 408  
 409                              return false;
 410                          }
 411                          # Store External Storage URLs instead of Stub placeholders
 412                          foreach ( $stubs as $stub ) {
 413                              if ( $stub === false ) {
 414                                  continue;
 415                              }
 416                              # $stored should provide base path to a BLOB
 417                              $url = $stored . "/" . $stub->getHash();
 418                              $dbw->update( 'text',
 419                                  array( /* SET */
 420                                      'old_text' => $url,
 421                                      'old_flags' => 'external,utf-8',
 422                                  ), array( /* WHERE */
 423                                      'old_id' => $stub->getReferrer(),
 424                                  )
 425                              );
 426                          }
 427                      } else {
 428                          # Store the main object locally
 429                          $dbw->update( 'text',
 430                              array( /* SET */
 431                                  'old_text' => serialize( $chunk ),
 432                                  'old_flags' => 'object,utf-8',
 433                              ), array( /* WHERE */
 434                                  'old_id' => $primaryOldid
 435                              )
 436                          );
 437  
 438                          # Store the stub objects
 439                          for ( $j = 1; $j < $thisChunkSize; $j++ ) {
 440                              # Skip if not compressing and don't overwrite the first revision
 441                              if ( $stubs[$j] !== false && $revs[$i + $j]->rev_text_id != $primaryOldid ) {
 442                                  $dbw->update( 'text',
 443                                      array( /* SET */
 444                                          'old_text' => serialize( $stubs[$j] ),
 445                                          'old_flags' => 'object,utf-8',
 446                                      ), array( /* WHERE */
 447                                          'old_id' => $revs[$i + $j]->rev_text_id
 448                                      )
 449                                  );
 450                              }
 451                          }
 452                      }
 453                  }
 454                  # Done, next
 455                  $this->output( "/" );
 456                  $dbw->commit( __METHOD__ );
 457                  $i += $thisChunkSize;
 458                  wfWaitForSlaves();
 459              }
 460              $this->output( "\n" );
 461          }
 462  
 463          return true;
 464      }
 465  }
 466  
 467  $maintClass = 'CompressOld';
 468  require_once RUN_MAINTENANCE_IF_MAIN;


Generated: Fri Nov 28 14:03:12 2014 Cross-referenced by PHPXref 0.7.1