MediaWiki  REL1_21
fixBug20757.php
Go to the documentation of this file.
00001 <?php
00024 require_once( __DIR__ . '/../Maintenance.php' );
00025 
/**
 * Maintenance script to fix bug 20757, assuming that blob_tracking is intact.
 *
 * The script scans the text table for rows flagged "object" (and not
 * "external") whose content starts with a serialized HistoryBlobStub. For each
 * such stub it looks up the stub's target text ID in blob_tracking, checks that
 * the text identified by the stub's hash can be fetched from the tracked
 * external blob, and, unless --dry-run is given, rewrites the text row so that
 * it points directly at that external blob. A matching blob_tracking row is
 * added for the rewritten text row.
 */
class FixBug20757 extends Maintenance {
	/** Maximum number of text rows fetched by one batch query. */
	public $batchSize = 10000;

	/** Per-page cache: page ID => array( rev_text_id => rev_id ). */
	public $mapCache = array();

	/** Total number of rev_text_id entries currently held in $mapCache. */
	public $mapCacheSize = 0;

	/** Entries are evicted from $mapCache while $mapCacheSize exceeds this. */
	public $maxMapCacheSize = 1000000;

	/**
	 * Register the script description and its command line options.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = 'Script to fix bug 20757 assuming that blob_tracking is intact';
		$this->addOption( 'dry-run', 'Report only' );
		$this->addOption( 'start', 'old_id to start at', false, true );
	}

	/**
	 * Walk the text table in batches of $batchSize rows, classify every
	 * HistoryBlobStub row as good, fixable or unrecoverable, repair the
	 * fixable ones (unless in dry-run mode) and print a summary.
	 */
	public function execute() {
		// $dbr is used for every SELECT below, $dbw for the UPDATE/INSERT
		$dbr = wfGetDB( DB_SLAVE );
		$dbw = wfGetDB( DB_MASTER );

		$dryRun = $this->getOption( 'dry-run' );
		if ( $dryRun ) {
			print "Dry run only.\n";
		}

		// Only rows with old_id greater than this value are examined
		$startId = $this->getOption( 'start', 0 );
		$numGood = 0;
		$numFixed = 0;
		$numBad = 0;

		// Used for the progress display only
		$totalRevs = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );

		if ( $dbr->getType() == 'mysql'
			&& version_compare( $dbr->getServerVersion(), '4.1.0', '>=' ) )
		{
			// In MySQL 4.1+, the binary field old_text has a non-working LOWER() function
			$lowerLeft = 'LOWER(CONVERT(LEFT(old_text,22) USING latin1))';
		} else {
			// No CONVERT() in MySQL 4.0
			$lowerLeft = 'LOWER(LEFT(old_text,22))';
		}

		while ( true ) {
			print "ID: $startId / $totalRevs\r";

			// Fetch the next batch of candidate stub rows, ordered by old_id
			$res = $dbr->select(
				'text',
				array( 'old_id', 'old_flags', 'old_text' ),
				array(
					'old_id > ' . intval( $startId ),
					'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\'',
					"$lowerLeft = 'o:15:\"historyblobstub\"'",
				),
				__METHOD__,
				array(
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize,
				)
			);

			if ( !$res->numRows() ) {
				// No candidates left
				break;
			}

			$secondaryIds = array();
			$stubs = array();

			foreach ( $res as $row ) {
				// Remember how far we got so the next batch starts after this row
				$startId = $row->old_id;

				// Basic sanity checks
				$obj = unserialize( $row->old_text );
				if ( $obj === false ) {
					print "{$row->old_id}: unrecoverable: cannot unserialize\n";
					++$numBad;
					continue;
				}

				if ( !is_object( $obj ) ) {
					print "{$row->old_id}: unrecoverable: unserialized to type " .
						gettype( $obj ) . ", possible double-serialization\n";
					++$numBad;
					continue;
				}

				if ( strtolower( get_class( $obj ) ) !== 'historyblobstub' ) {
					print "{$row->old_id}: unrecoverable: unexpected object class " .
						get_class( $obj ) . "\n";
					++$numBad;
					continue;
				}

				// Process flags: rows without a UTF-8 flag are treated as legacy-encoded
				$flags = explode( ',', $row->old_flags );
				$legacyEncoding = !( in_array( 'utf-8', $flags ) || in_array( 'utf8', $flags ) );

				// Queue the stub for future batch processing
				$id = intval( $obj->mOldId );
				$secondaryIds[] = $id;
				$stubs[$row->old_id] = array(
					'legacyEncoding' => $legacyEncoding,
					'secondaryId' => $id,
					'hash' => $obj->mHash,
				);
			}

			$secondaryIds = array_unique( $secondaryIds );

			if ( !count( $secondaryIds ) ) {
				// Every row of this batch failed the sanity checks; go to the next batch
				continue;
			}

			// Run the batch query on blob_tracking
			$res = $dbr->select(
				'blob_tracking',
				'*',
				array(
					'bt_text_id' => $secondaryIds,
				),
				__METHOD__
			);
			$trackedBlobs = array();
			foreach ( $res as $row ) {
				$trackedBlobs[$row->bt_text_id] = $row;
			}

			// Process the stubs
			foreach ( $stubs as $primaryId => $stub ) {
				$secondaryId = $stub['secondaryId'];
				if ( !isset( $trackedBlobs[$secondaryId] ) ) {
					// No tracked blob. Work out what went wrong
					$secondaryRow = $dbr->selectRow(
						'text',
						array( 'old_flags', 'old_text' ),
						array( 'old_id' => $secondaryId ),
						__METHOD__
					);
					if ( !$secondaryRow ) {
						print "$primaryId: unrecoverable: secondary row is missing\n";
						++$numBad;
					} elseif ( $this->isUnbrokenStub( $stub, $secondaryRow ) ) {
						// Not broken yet, and not in the tracked clusters so it won't get
						// broken by the current RCT run.
						++$numGood;
					} elseif ( strpos( $secondaryRow->old_flags, 'external' ) !== false ) {
						print "$primaryId: unrecoverable: secondary gone to {$secondaryRow->old_text}\n";
						++$numBad;
					} else {
						print "$primaryId: unrecoverable: miscellaneous corruption of secondary row\n";
						++$numBad;
					}
					unset( $stubs[$primaryId] );
					continue;
				}
				$trackRow = $trackedBlobs[$secondaryId];

				// Check that the specified text really is available in the tracked source row
				$url = "DB://{$trackRow->bt_cluster}/{$trackRow->bt_blob_id}/{$stub['hash']}";
				$text = ExternalStore::fetchFromURL( $url );
				if ( $text === false ) {
					print "$primaryId: unrecoverable: source text missing\n";
					++$numBad;
					unset( $stubs[$primaryId] );
					continue;
				}
				if ( md5( $text ) !== $stub['hash'] ) {
					print "$primaryId: unrecoverable: content hashes do not match\n";
					++$numBad;
					unset( $stubs[$primaryId] );
					continue;
				}

				// Find the page_id and rev_id
				// The page is probably the same as the page of the secondary row
				$pageId = intval( $trackRow->bt_page );
				if ( !$pageId ) {
					$revId = $pageId = 0;
				} else {
					$revId = $this->findTextIdInPage( $pageId, $primaryId );
					if ( !$revId ) {
						// Actually an orphan
						$pageId = $revId = 0;
					}
				}

				$newFlags = $stub['legacyEncoding'] ? 'external' : 'external,utf-8';

				if ( !$dryRun ) {
					// Reset the text row to point to the original copy
					$dbw->begin( __METHOD__ );
					$dbw->update(
						'text',
						// SET
						array(
							'old_flags' => $newFlags,
							'old_text' => $url
						),
						// WHERE
						array( 'old_id' => $primaryId ),
						__METHOD__
					);

					// Add a blob_tracking row so that the new reference can be recompressed
					// without needing to run trackBlobs.php again
					$dbw->insert( 'blob_tracking',
						array(
							'bt_page' => $pageId,
							'bt_rev_id' => $revId,
							'bt_text_id' => $primaryId,
							'bt_cluster' => $trackRow->bt_cluster,
							'bt_blob_id' => $trackRow->bt_blob_id,
							'bt_cgz_hash' => $stub['hash'],
							'bt_new_url' => null,
							'bt_moved' => 0,
						),
						__METHOD__
					);
					$dbw->commit( __METHOD__ );
					$this->waitForSlaves();
				}

				// Note: in dry-run mode the row is still reported and counted as fixed
				print "$primaryId: resolved to $url\n";
				++$numFixed;
			}
		}

		print "\n";
		print "Fixed: $numFixed\n";
		print "Unrecoverable: $numBad\n";
		print "Good stubs: $numGood\n";
	}

	/**
	 * Throttle writes: call wfWaitForSlaves() only once the call counter
	 * exceeds 50, then start counting again.
	 *
	 * The previous condition, "++$iteration > 50 == 0", was evaluated as
	 * "( ++$iteration > 50 ) == 0" and was therefore true on every call, so
	 * the script waited after each single update. The counter was also
	 * incremented twice per call.
	 */
	public function waitForSlaves() {
		static $iteration = 0;
		if ( ++$iteration > 50 ) {
			wfWaitForSlaves();
			$iteration = 0;
		}
	}

	/**
	 * Look up the revision of a page that uses the given text row.
	 *
	 * @param int $pageId Page whose revisions are searched
	 * @param int $textId Text row ID (rev_text_id) to look for
	 * @return int|null The rev_id, or null if no revision of the page uses the text row
	 */
	public function findTextIdInPage( $pageId, $textId ) {
		$ids = $this->getRevTextMap( $pageId );
		return isset( $ids[$textId] ) ? $ids[$textId] : null;
	}

	/**
	 * Get the rev_text_id => rev_id map of a page, loading it from the
	 * revision table on first use and caching it in $mapCache.
	 *
	 * @param int $pageId
	 * @return array Map of rev_text_id => rev_id
	 */
	public function getRevTextMap( $pageId ) {
		if ( !isset( $this->mapCache[$pageId] ) ) {
			// Limit cache size by dropping the oldest entries first
			while ( $this->mapCacheSize > $this->maxMapCacheSize ) {
				// Do not rely on wherever the internal array pointer happens to be
				reset( $this->mapCache );
				$key = key( $this->mapCache );
				if ( $key === null ) {
					// Nothing left to evict; resynchronise the counter rather than spin
					$this->mapCacheSize = 0;
					break;
				}
				$this->mapCacheSize -= count( $this->mapCache[$key] );
				unset( $this->mapCache[$key] );
			}

			$dbr = wfGetDB( DB_SLAVE );
			$map = array();
			$res = $dbr->select( 'revision',
				array( 'rev_id', 'rev_text_id' ),
				array( 'rev_page' => $pageId ),
				__METHOD__
			);
			foreach ( $res as $row ) {
				$map[$row->rev_text_id] = $row->rev_id;
			}
			$this->mapCache[$pageId] = $map;
			$this->mapCacheSize += count( $map );
		}
		return $this->mapCache[$pageId];
	}

	/**
	 * Check whether a stub can still be resolved through its secondary text row,
	 * i.e. the secondary row still holds an object from which the item named by
	 * the stub's hash can be retrieved.
	 *
	 * @param array $stub Stub description as built by execute(); only 'hash' is read
	 * @param object $secondaryRow Text row with old_flags and old_text fields
	 * @return bool
	 */
	public function isUnbrokenStub( $stub, $secondaryRow ) {
		$flags = explode( ',', $secondaryRow->old_flags );
		$text = $secondaryRow->old_text;

		if ( in_array( 'external', $flags ) ) {
			$url = $text;
			// A usable URL needs a non-empty part after "://"
			$parts = explode( '://', $url, 2 );
			if ( count( $parts ) < 2 || $parts[1] == "" ) {
				return false;
			}
			$text = ExternalStore::fetchFromURL( $url );
			if ( $text === false ) {
				// Nothing could be fetched, so nothing can be unserialized below
				return false;
			}
		}

		if ( !in_array( 'object', $flags ) ) {
			return false;
		}

		if ( in_array( 'gzip', $flags ) ) {
			$obj = unserialize( gzinflate( $text ) );
		} else {
			$obj = unserialize( $text );
		}

		if ( !is_object( $obj ) && is_string( $obj ) ) {
			// Correct for old double-serialization bug.
			$obj = unserialize( $obj );
		}

		if ( !is_object( $obj ) ) {
			return false;
		}

		// NOTE(review): assumes usable blob objects expose uncompress() and
		// getItem(); anything else is reported as broken instead of causing a
		// fatal "undefined method" error - confirm against the HistoryBlob classes.
		if ( !method_exists( $obj, 'uncompress' ) || !method_exists( $obj, 'getItem' ) ) {
			return false;
		}

		$obj->uncompress();
		$text = $obj->getItem( $stub['hash'] );
		return $text !== false;
	}
}
00349 
// Name of the maintenance class defined in this file.
// NOTE(review): presumably read by the script that RUN_MAINTENANCE_IF_MAIN
// points to - confirm in Maintenance.php.
$maintClass = 'FixBug20757';
require_once RUN_MAINTENANCE_IF_MAIN;