MediaWiki  REL1_24
compressOld.php
Go to the documentation of this file.
00001 <?php
00044 require_once __DIR__ . '/../Maintenance.php';
00045 
/**
 * Maintenance script that compresses the text of old revisions.
 *
 * Two strategies are available, selected with --type:
 *  - gzip:   deflate every row of the text table individually;
 *  - concat: pack several old revisions of a page into one
 *            ConcatenatedGzipHistoryBlob and replace the remaining rows
 *            with HistoryBlobStub objects pointing at it.
 */
class CompressOld extends Maintenance {
    /**
     * Ways compressWithConcat() can load revision text:
     * LS_INDIVIDUAL issues one query per text row, LS_CHUNKED fetches the
     * text together with the revision rows in a single joined query.
     */
    const LS_INDIVIDUAL = 0;
    const LS_CHUNKED = 1;

    /**
     * Declare the script description and its command line options.
     */
    public function __construct() {
        parent::__construct();
        $this->mDescription = 'Compress the text of a wiki';
        $this->addOption( 'type', 'Set compression type to either: gzip|concat', false, true, 't' );
        $this->addOption(
            'chunksize',
            'Maximum number of revisions in a concat chunk',
            false,
            true,
            'c'
        );
        $this->addOption(
            'begin-date',
            'Earliest date to check for uncompressed revisions',
            false,
            true,
            'b'
        );
        $this->addOption( 'end-date', 'Latest revision date to compress', false, true, 'e' );
        $this->addOption(
            'startid',
            'The id to start from (gzip -> text table, concat -> page table)',
            false,
            true,
            's'
        );
        $this->addOption(
            'extdb',
            'Store specified revisions in an external cluster (untested)',
            false,
            true
        );
        $this->addOption(
            'endid',
            'The page_id to stop at (only when using concat compression type)',
            false,
            true,
            'n'
        );
    }

    /**
     * Entry point: read the options, print a banner and run the selected
     * compression strategy. "Done." is only printed when the strategy did
     * not report a failure.
     */
    public function execute() {
        global $wgDBname;
        if ( !function_exists( "gzdeflate" ) ) {
            $this->error( "You must enable zlib support in PHP to compress old revisions!\n" .
                "Please see http://www.php.net/manual/en/ref.zlib.php\n", true );
        }

        $type = $this->getOption( 'type', 'concat' );
        $chunkSize = $this->getOption( 'chunksize', 20 );
        $startId = $this->getOption( 'startid', 0 );
        $beginDate = $this->getOption( 'begin-date', '' );
        $endDate = $this->getOption( 'end-date', '' );
        $extDB = $this->getOption( 'extdb', '' );
        $endId = $this->getOption( 'endid', false );

        if ( $type !== 'concat' && $type !== 'gzip' ) {
            # Abort here: otherwise any unknown type would fall into the gzip
            # branch below and start rewriting the text table.
            $this->error( "Type \"{$type}\" not supported", true );

            return;
        }

        $target = $extDB != '' ? " to external cluster {$extDB}" : '';
        $this->output( "Compressing database {$wgDBname}{$target}\n"
            . str_repeat( '-', 76 ) . "\n\n" );

        $success = true;
        if ( $type === 'concat' ) {
            $success = $this->compressWithConcat( $startId, $chunkSize, $beginDate,
                $endDate, $extDB, $endId );
        } else {
            $this->compressOldPages( $startId, $extDB );
        }

        if ( $success ) {
            $this->output( "Done.\n" );
        }
    }

    /**
     * Gzip-compress the text table row by row, in batches of 50 rows ordered
     * by old_id, until no row with an id at or above the cursor is left.
     *
     * @param int $start First old_id to look at
     * @param string $extdb External cluster to store the compressed text in,
     *   or '' to keep it in the text table
     */
    private function compressOldPages( $start = 0, $extdb = '' ) {
        $chunksize = 50;
        # The id is interpolated into a raw SQL condition and comes from the
        # command line, so force it to an integer first.
        $start = (int)$start;
        $this->output( "Starting from old_id $start...\n" );
        $dbw = wfGetDB( DB_MASTER );
        do {
            $res = $dbw->select(
                'text',
                array( 'old_id', 'old_flags', 'old_text' ),
                "old_id>=$start",
                __METHOD__,
                array( 'ORDER BY' => 'old_id', 'LIMIT' => $chunksize, 'FOR UPDATE' )
            );

            if ( $res->numRows() == 0 ) {
                break;
            }

            $last = $start;

            foreach ( $res as $row ) {
                $this->compressPage( $row, $extdb );
                $last = $row->old_id;
            }

            # Continue after the last id actually seen rather than adding the
            # batch size: deletion may leave long empty stretches.
            $start = $last + 1;
            $this->output( "$start...\n" );
        } while ( true );
    }

    /**
     * Gzip-compress a single text row and write it back.
     *
     * Rows whose flags already mention gzip or object are left alone, and so
     * are rows flagged external: their old_text is expected to hold a
     * reference into external storage rather than the text itself (compare
     * the condition compressWithConcat() puts on old_flags).
     *
     * @param stdClass $row Row of the text table with old_id, old_flags and old_text
     * @param string $extdb External cluster to store the compressed text in,
     *   or '' to keep it in the text table
     * @return bool True if the row was rewritten, false if it was skipped or
     *   could not be compressed or stored
     */
    private function compressPage( $row, $extdb ) {
        foreach ( array( 'gzip', 'object', 'external' ) as $skipFlag ) {
            if ( strpos( $row->old_flags, $skipFlag ) !== false ) {
                return false;
            }
        }

        $flags = $row->old_flags ? "{$row->old_flags},gzip" : "gzip";
        $compress = gzdeflate( $row->old_text );
        if ( $compress === false ) {
            # Never overwrite the stored text with a failed compression result
            $this->error( "Unable to compress text in old_id {$row->old_id}" );

            return false;
        }

        # Store in external storage if required
        if ( $extdb !== '' ) {
            $storeObj = new ExternalStoreDB;
            $compress = $storeObj->store( $extdb, $compress );
            if ( $compress === false ) {
                $this->error( "Unable to store object" );

                return false;
            }
            # old_text now holds what store() returned instead of the data
            # itself; flag the row the same way compressWithConcat() flags
            # rows it moves to external storage.
            $flags .= ',external';
        }

        # Update text row
        $dbw = wfGetDB( DB_MASTER );
        $dbw->update( 'text',
            array( /* SET */
                'old_flags' => $flags,
                'old_text' => $compress
            ), array( /* WHERE */
                'old_id' => $row->old_id
            ), __METHOD__,
            array( 'LIMIT' => 1 )
        );

        return true;
    }

    /**
     * Compress old revisions page by page, packing up to $maxChunkSize
     * revisions into one ConcatenatedGzipHistoryBlob per chunk.
     *
     * Without an external cluster, the first text row of a chunk receives the
     * serialized blob and the other rows receive serialized stubs. With an
     * external cluster, the blob is stored there and each stubbed row is
     * pointed at it.
     *
     * @param int $startId page_id to start from
     * @param int $maxChunkSize Maximum number of revisions per chunk (at least 1)
     * @param string $beginDate Only revisions newer than this 14-digit
     *   timestamp are considered; '' for no lower bound
     * @param string $endDate Only revisions older than this 14-digit
     *   timestamp are considered; '' for no upper bound
     * @param string $extdb External cluster to store blobs in, or '' for local storage
     * @param bool|int $maxPageId page_id to stop at, or false for the highest page_id
     * @return bool False on invalid arguments or external storage failure, true otherwise
     */
    private function compressWithConcat( $startId, $maxChunkSize, $beginDate,
        $endDate, $extdb = "", $maxPageId = false
    ) {
        $loadStyle = self::LS_CHUNKED;

        # A chunk size below 1 would never advance the chunk cursor below,
        # so the loop over a page's revisions would never terminate.
        $maxChunkSize = (int)$maxChunkSize;
        if ( $maxChunkSize < 1 ) {
            $this->error( "Invalid chunk size \"$maxChunkSize\"\n" );

            return false;
        }
        $startId = (int)$startId;

        $dbr = wfGetDB( DB_SLAVE );
        $dbw = wfGetDB( DB_MASTER );

        # Set up external storage
        $storeObj = null;
        if ( $extdb != '' ) {
            $storeObj = new ExternalStoreDB;
        }

        # Get all articles by page_id
        if ( !$maxPageId ) {
            $maxPageId = $dbr->selectField( 'page', 'max(page_id)', '', __METHOD__ );
        }
        # Both bounds come from the command line or the database as strings;
        # compare them as integers.
        $maxPageId = (int)$maxPageId;
        $this->output( "Starting from $startId of $maxPageId\n" );

        # For each article, get a list of revisions which fit the criteria

        # No recompression, use a condition on old_flags
        # Don't compress object type entities, because that might produce data loss when
        # overwriting bulk storage concat rows. Don't compress external references, because
        # the script doesn't yet delete rows from external storage.
        $conds = array(
            'old_flags NOT ' . $dbr->buildLike( $dbr->anyString(), 'object', $dbr->anyString() )
            . ' AND old_flags NOT '
            . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() )
        );

        # The dates are only embedded in SQL after being checked to be exactly 14 digits
        if ( $beginDate ) {
            if ( !preg_match( '/^\d{14}$/', $beginDate ) ) {
                $this->error( "Invalid begin date \"$beginDate\"\n" );

                return false;
            }
            $conds[] = "rev_timestamp>'" . $beginDate . "'";
        }
        if ( $endDate ) {
            if ( !preg_match( '/^\d{14}$/', $endDate ) ) {
                $this->error( "Invalid end date \"$endDate\"\n" );

                return false;
            }
            $conds[] = "rev_timestamp<'" . $endDate . "'";
        }
        if ( $loadStyle == self::LS_CHUNKED ) {
            $tables = array( 'revision', 'text' );
            $fields = array( 'rev_id', 'rev_text_id', 'old_flags', 'old_text' );
            $conds[] = 'rev_text_id=old_id';
            $revLoadOptions = 'FOR UPDATE';
        } else {
            $tables = array( 'revision' );
            $fields = array( 'rev_id', 'rev_text_id' );
            $revLoadOptions = array();
        }

        # The page table is deliberately not joined in: current revisions are
        # excluded per page below, without locking the page table for update.

        for ( $pageId = $startId; $pageId <= $maxPageId; $pageId++ ) {
            wfWaitForSlaves();

            # Wake up
            $dbr->ping();

            # Get the page row
            $pageRes = $dbr->select( 'page',
                array( 'page_id', 'page_namespace', 'page_title', 'page_latest' ),
                array( 'page_id' => $pageId ), __METHOD__ );
            if ( $pageRes->numRows() == 0 ) {
                continue;
            }
            $pageRow = $dbr->fetchObject( $pageRes );

            # Display progress
            $titleObj = Title::makeTitle( $pageRow->page_namespace, $pageRow->page_title );
            $this->output( "$pageId\t" . $titleObj->getPrefixedDBkey() . " " );

            # Load revisions
            $revRes = $dbw->select( $tables, $fields,
                array_merge( array(
                    'rev_page' => $pageRow->page_id,
                    # Don't operate on the current revision
                    # Use < instead of <> in case the current revision has changed
                    # since the page select, which wasn't locking
                    'rev_id < ' . $pageRow->page_latest
                ), $conds ),
                __METHOD__,
                $revLoadOptions
            );
            $revs = array();
            foreach ( $revRes as $revRow ) {
                $revs[] = $revRow;
            }
            $revCount = count( $revs );

            if ( $revCount < 2 ) {
                # No revisions matching, no further processing
                $this->output( "\n" );
                continue;
            }

            # For each chunk
            $i = 0;
            while ( $i < $revCount ) {
                $thisChunkSize = min( $maxChunkSize, $revCount - $i );

                $chunk = new ConcatenatedGzipHistoryBlob();
                $stubs = array();
                $dbw->begin( __METHOD__ );
                $usedChunk = false;
                $primaryOldid = $revs[$i]->rev_text_id;

                // @codingStandardsIgnoreStart Ignore avoid function calls in a FOR loop test part warning
                # Get the text of each revision and add it to the object
                for ( $j = 0; $j < $thisChunkSize && $chunk->isHappy(); $j++ ) {
                    // @codingStandardsIgnoreEnd
                    $oldid = $revs[$i + $j]->rev_text_id;

                    # Get text
                    if ( $loadStyle == self::LS_INDIVIDUAL ) {
                        $textRow = $dbw->selectRow( 'text',
                            array( 'old_flags', 'old_text' ),
                            array( 'old_id' => $oldid ),
                            __METHOD__,
                            'FOR UPDATE'
                        );
                        $text = Revision::getRevisionText( $textRow );
                    } else {
                        $text = Revision::getRevisionText( $revs[$i + $j] );
                    }

                    if ( $text === false ) {
                        # NOTE(review): the failure is only reported; the false
                        # value is still handed to the chunk below. Confirm
                        # whether the chunk should be abandoned instead.
                        $this->error( "\nError, unable to get text in old_id $oldid" );
                    }

                    if ( $extdb == "" && $j == 0 ) {
                        $chunk->setText( $text );
                        $this->output( '.' );
                    } else {
                        # Don't make a stub if it's going to be longer than the article
                        # Stubs are typically about 100 bytes
                        if ( strlen( $text ) < 120 ) {
                            $stub = false;
                            $this->output( 'x' );
                        } else {
                            $stub = new HistoryBlobStub( $chunk->addItem( $text ) );
                            $stub->setLocation( $primaryOldid );
                            $stub->setReferrer( $oldid );
                            $this->output( '.' );
                            $usedChunk = true;
                        }
                        $stubs[$j] = $stub;
                    }
                }
                # The loop may have stopped early because the chunk was full
                $thisChunkSize = $j;

                # If we couldn't actually use any stubs because the pages were too small, do nothing
                if ( $usedChunk ) {
                    if ( $extdb != "" ) {
                        # Move blob objects to External Storage
                        $stored = $storeObj->store( $extdb, serialize( $chunk ) );
                        if ( $stored === false ) {
                            $this->error( "Unable to store object" );
                            # Don't leave the transaction opened above dangling
                            $dbw->rollback( __METHOD__ );

                            return false;
                        }
                        # Store External Storage URLs instead of Stub placeholders
                        foreach ( $stubs as $stub ) {
                            if ( $stub === false ) {
                                continue;
                            }
                            # $stored should provide base path to a BLOB
                            $url = $stored . "/" . $stub->getHash();
                            $dbw->update( 'text',
                                array( /* SET */
                                    'old_text' => $url,
                                    'old_flags' => 'external,utf-8',
                                ), array( /* WHERE */
                                    'old_id' => $stub->getReferrer(),
                                ),
                                __METHOD__
                            );
                        }
                    } else {
                        # Store the main object locally
                        $dbw->update( 'text',
                            array( /* SET */
                                'old_text' => serialize( $chunk ),
                                'old_flags' => 'object,utf-8',
                            ), array( /* WHERE */
                                'old_id' => $primaryOldid
                            ),
                            __METHOD__
                        );

                        # Store the stub objects
                        for ( $j = 1; $j < $thisChunkSize; $j++ ) {
                            # Skip if not compressing and don't overwrite the first revision
                            if ( $stubs[$j] !== false && $revs[$i + $j]->rev_text_id != $primaryOldid ) {
                                $dbw->update( 'text',
                                    array( /* SET */
                                        'old_text' => serialize( $stubs[$j] ),
                                        'old_flags' => 'object,utf-8',
                                    ), array( /* WHERE */
                                        'old_id' => $revs[$i + $j]->rev_text_id
                                    ),
                                    __METHOD__
                                );
                            }
                        }
                    }
                }
                # Done, next
                $this->output( "/" );
                $dbw->commit( __METHOD__ );
                $i += $thisChunkSize;
                wfWaitForSlaves();
            }
            $this->output( "\n" );
        }

        return true;
    }
}
00466 
# Name the class implemented in this file, then include the file named by
# RUN_MAINTENANCE_IF_MAIN (presumably the runner that instantiates $maintClass
# when this script is the entry point -- confirm against Maintenance.php).
00467 $maintClass = 'CompressOld';
00468 require_once RUN_MAINTENANCE_IF_MAIN;