MediaWiki  REL1_19
fixBug20757.php
Go to the documentation of this file.
00001 <?php
00024 require_once( dirname( __FILE__ ) . '/../Maintenance.php' );
00025 
/**
 * Maintenance script that repairs text rows affected by bug 20757.
 *
 * It scans the text table for rows that still contain a serialized
 * HistoryBlobStub, looks up the blob_tracking row recorded for the text row
 * the stub points at, and, when the tracked external blob still yields an item
 * whose MD5 equals the stub's hash, rewrites the stub row as a direct
 * "DB://cluster/blob_id/hash" external reference. A blob_tracking row is
 * inserted for each rewritten text row.
 *
 * Options:
 *  --dry-run  Report what would be done without writing to the database.
 *  --start    Only rows with old_id greater than this value are examined.
 */
class FixBug20757 extends Maintenance {
	/** Maximum number of text rows fetched by each batch query. */
	public $batchSize = 10000;

	/** Cache of (rev_text_id => rev_id) maps, keyed by page ID. */
	public $mapCache = array();

	/** Total number of entries currently held in all cached maps. */
	public $mapCacheSize = 0;

	/** Entry count above which cached maps are evicted before another is added. */
	public $maxMapCacheSize = 1000000;

	/** Number of waitForSlaves() calls between actual waits for replication. */
	public $slaveWaitInterval = 50;

	/**
	 * Register the script description and its command line options.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = 'Script to fix bug 20757 assuming that blob_tracking is intact';
		$this->addOption( 'dry-run', 'Report only' );
		$this->addOption( 'start', 'old_id to start at', false, true );
	}

	/**
	 * Walk the text table in batches, classify every HistoryBlobStub row as
	 * good, fixable or unrecoverable, repair the fixable ones (unless
	 * --dry-run was given) and print a summary at the end.
	 */
	public function execute() {
		$dbr = wfGetDB( DB_SLAVE );
		$dbw = wfGetDB( DB_MASTER );

		$dryRun = $this->getOption( 'dry-run' );
		if ( $dryRun ) {
			print "Dry run only.\n";
		}

		// The batch query below uses "old_id > $startId", so the row whose
		// old_id equals the given start value is itself not examined.
		$startId = $this->getOption( 'start', 0 );
		$numGood = 0;
		$numFixed = 0;
		$numBad = 0;

		// Highest old_id in the text table; only used for the progress display
		$totalRevs = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );

		if ( $dbr->getType() == 'mysql'
			&& version_compare( $dbr->getServerVersion(), '4.1.0', '>=' ) )
		{
			// In MySQL 4.1+, the binary field old_text has a non-working LOWER() function
			$lowerLeft = 'LOWER(CONVERT(LEFT(old_text,22) USING latin1))';
		} else {
			// No CONVERT() in MySQL 4.0
			$lowerLeft = 'LOWER(LEFT(old_text,22))';
		}

		while ( true ) {
			print "ID: $startId / $totalRevs\r";

			// Fetch the next batch of non-external object rows whose text
			// starts with a serialized HistoryBlobStub header
			$res = $dbr->select(
				'text',
				array( 'old_id', 'old_flags', 'old_text' ),
				array(
					'old_id > ' . intval( $startId ),
					'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\'',
					"$lowerLeft = 'o:15:\"historyblobstub\"'",
				),
				__METHOD__,
				array(
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize,
				)
			);

			if ( !$res->numRows() ) {
				break;
			}

			$secondaryIds = array();
			$stubs = array();

			foreach ( $res as $row ) {
				// Remember how far we got so that the next batch continues after this row
				$startId = $row->old_id;

				// Basic sanity checks
				$obj = unserialize( $row->old_text );
				if ( $obj === false ) {
					print "{$row->old_id}: unrecoverable: cannot unserialize\n";
					++$numBad;
					continue;
				}

				if ( !is_object( $obj ) ) {
					print "{$row->old_id}: unrecoverable: unserialized to type " .
						gettype( $obj ) . ", possible double-serialization\n";
					++$numBad;
					continue;
				}

				if ( strtolower( get_class( $obj ) ) !== 'historyblobstub' ) {
					print "{$row->old_id}: unrecoverable: unexpected object class " .
						get_class( $obj ) . "\n";
					++$numBad;
					continue;
				}

				// Process flags: a row without a UTF-8 flag is treated as legacy-encoded
				$flags = explode( ',', $row->old_flags );
				$legacyEncoding = !in_array( 'utf-8', $flags ) && !in_array( 'utf8', $flags );

				// Queue the stub for future batch processing
				$id = intval( $obj->mOldId );
				$secondaryIds[] = $id;
				$stubs[$row->old_id] = array(
					'legacyEncoding' => $legacyEncoding,
					'secondaryId' => $id,
					'hash' => $obj->mHash,
				);
			}

			$secondaryIds = array_unique( $secondaryIds );

			if ( !count( $secondaryIds ) ) {
				// Every row of this batch failed the sanity checks
				continue;
			}

			// Run the batch query on blob_tracking
			$res = $dbr->select(
				'blob_tracking',
				'*',
				array(
					'bt_text_id' => $secondaryIds,
				),
				__METHOD__
			);
			$trackedBlobs = array();
			foreach ( $res as $row ) {
				$trackedBlobs[$row->bt_text_id] = $row;
			}

			// Process the stubs
			foreach ( $stubs as $primaryId => $stub ) {
				$secondaryId = $stub['secondaryId'];
				if ( !isset( $trackedBlobs[$secondaryId] ) ) {
					// No tracked blob. Work out what went wrong
					$secondaryRow = $dbr->selectRow(
						'text',
						array( 'old_flags', 'old_text' ),
						array( 'old_id' => $secondaryId ),
						__METHOD__
					);
					if ( !$secondaryRow ) {
						print "$primaryId: unrecoverable: secondary row is missing\n";
						++$numBad;
					} elseif ( $this->isUnbrokenStub( $stub, $secondaryRow ) ) {
						// Not broken yet, and not in the tracked clusters so it won't get
						// broken by the current RCT run.
						++$numGood;
					} elseif ( strpos( $secondaryRow->old_flags, 'external' ) !== false ) {
						print "$primaryId: unrecoverable: secondary gone to {$secondaryRow->old_text}\n";
						++$numBad;
					} else {
						print "$primaryId: unrecoverable: miscellaneous corruption of secondary row\n";
						++$numBad;
					}
					unset( $stubs[$primaryId] );
					continue;
				}
				$trackRow = $trackedBlobs[$secondaryId];

				// Check that the specified text really is available in the tracked source row
				$url = "DB://{$trackRow->bt_cluster}/{$trackRow->bt_blob_id}/{$stub['hash']}";
				$text = ExternalStore::fetchFromURL( $url );
				if ( $text === false ) {
					print "$primaryId: unrecoverable: source text missing\n";
					++$numBad;
					unset( $stubs[$primaryId] );
					continue;
				}
				if ( md5( $text ) !== $stub['hash'] ) {
					print "$primaryId: unrecoverable: content hashes do not match\n";
					++$numBad;
					unset( $stubs[$primaryId] );
					continue;
				}

				// Find the page_id and rev_id
				// The page is probably the same as the page of the secondary row
				$pageId = intval( $trackRow->bt_page );
				if ( !$pageId ) {
					$revId = $pageId = 0;
				} else {
					$revId = $this->findTextIdInPage( $pageId, $primaryId );
					if ( !$revId ) {
						// Actually an orphan
						$pageId = $revId = 0;
					}
				}

				$newFlags = $stub['legacyEncoding'] ? 'external' : 'external,utf-8';

				if ( !$dryRun ) {
					// Reset the text row to point to the original copy
					$dbw->begin();
					$dbw->update(
						'text',
						// SET
						array(
							'old_flags' => $newFlags,
							'old_text' => $url
						),
						// WHERE
						array( 'old_id' => $primaryId ),
						__METHOD__
					);

					// Add a blob_tracking row so that the new reference can be recompressed
					// without needing to run trackBlobs.php again
					$dbw->insert( 'blob_tracking',
						array(
							'bt_page' => $pageId,
							'bt_rev_id' => $revId,
							'bt_text_id' => $primaryId,
							'bt_cluster' => $trackRow->bt_cluster,
							'bt_blob_id' => $trackRow->bt_blob_id,
							'bt_cgz_hash' => $stub['hash'],
							'bt_new_url' => null,
							'bt_moved' => 0,
						),
						__METHOD__
					);
					$dbw->commit();
					$this->waitForSlaves();
				}

				print "$primaryId: resolved to $url\n";
				++$numFixed;
			}
		}

		print "\n";
		print "Fixed: $numFixed\n";
		print "Unrecoverable: $numBad\n";
		print "Good stubs: $numGood\n";
	}

	/**
	 * Call wfWaitForSlaves() once every $slaveWaitInterval invocations.
	 *
	 * The previous condition, "++$iteration > 50 == 0" after an extra
	 * increment, parsed as "( ++$iteration > 50 ) == 0" and was therefore true
	 * on every call, so the counter never throttled anything.
	 */
	public function waitForSlaves() {
		static $iteration = 0;
		if ( ++$iteration >= $this->slaveWaitInterval ) {
			wfWaitForSlaves();
			$iteration = 0;
		}
	}

	/**
	 * Look up the revision of a page that uses the given text row.
	 *
	 * @param $pageId Page ID whose revisions are searched
	 * @param $textId Text row ID (rev_text_id) to look for
	 * @return The matching rev_id, or null if no revision of the page uses it
	 */
	public function findTextIdInPage( $pageId, $textId ) {
		$ids = $this->getRevTextMap( $pageId );
		return isset( $ids[$textId] ) ? $ids[$textId] : null;
	}

	/**
	 * Get the (rev_text_id => rev_id) map for all revisions of a page,
	 * loading it from the revision table on a cache miss.
	 *
	 * @param $pageId Page ID
	 * @return array Map of rev_text_id to rev_id
	 */
	public function getRevTextMap( $pageId ) {
		if ( !isset( $this->mapCache[$pageId] ) ) {
			// Limit cache size. reset() makes the eviction start at the first
			// remaining map instead of depending on wherever the array's
			// internal pointer happens to be, and the count() guard stops the
			// loop if the cache is empty.
			while ( $this->mapCacheSize > $this->maxMapCacheSize
				&& count( $this->mapCache ) )
			{
				reset( $this->mapCache );
				$key = key( $this->mapCache );
				$this->mapCacheSize -= count( $this->mapCache[$key] );
				unset( $this->mapCache[$key] );
			}

			$dbr = wfGetDB( DB_SLAVE );
			$map = array();
			$res = $dbr->select( 'revision',
				array( 'rev_id', 'rev_text_id' ),
				array( 'rev_page' => $pageId ),
				__METHOD__
			);
			foreach ( $res as $row ) {
				$map[$row->rev_text_id] = $row->rev_id;
			}
			$this->mapCache[$pageId] = $map;
			$this->mapCacheSize += count( $map );
		}
		return $this->mapCache[$pageId];
	}

	/**
	 * Determine whether the item a stub refers to can still be read from the
	 * secondary text row.
	 *
	 * The row's text is fetched from external storage if flagged 'external',
	 * inflated if flagged 'gzip', unserialized, and then asked for the item
	 * with the stub's hash.
	 *
	 * @param $stub array Stub description built by execute(); only 'hash' is read
	 * @param $secondaryRow Row object with old_flags and old_text fields
	 * @return bool True if the object's getItem() returned something other than false
	 */
	public function isUnbrokenStub( $stub, $secondaryRow ) {
		$flags = explode( ',', $secondaryRow->old_flags );
		if ( !in_array( 'object', $flags ) ) {
			// Checked first so that no external fetch is made for a row that
			// would be rejected anyway
			return false;
		}

		$text = $secondaryRow->old_text;
		if ( in_array( 'external', $flags ) ) {
			$url = $text;
			$parts = explode( '://', $url, 2 );
			if ( !isset( $parts[1] ) || $parts[1] == '' ) {
				// Not of the form proto://path
				return false;
			}
			$text = ExternalStore::fetchFromURL( $url );
			if ( $text === false ) {
				// The external blob could not be fetched
				return false;
			}
		}

		if ( in_array( 'gzip', $flags ) ) {
			$text = gzinflate( $text );
			if ( $text === false ) {
				return false;
			}
		}

		$obj = unserialize( $text );
		if ( is_string( $obj ) ) {
			// Correct for old double-serialization bug.
			$obj = unserialize( $obj );
		}

		if ( !is_object( $obj ) ) {
			return false;
		}

		// An object without these methods cannot supply the item; treat it as
		// broken rather than dying with a fatal error in the middle of a run.
		if ( !method_exists( $obj, 'uncompress' ) || !method_exists( $obj, 'getItem' ) ) {
			return false;
		}

		$obj->uncompress();
		$text = $obj->getItem( $stub['hash'] );
		return $text !== false;
	}
}
00344 
// Name of the maintenance class defined in this file; presumably read by the
// script that RUN_MAINTENANCE_IF_MAIN points to -- confirm in Maintenance.php.
$maintClass = 'FixBug20757';
require_once RUN_MAINTENANCE_IF_MAIN;
00347