MediaWiki  REL1_24
fixBug20757.php
Go to the documentation of this file.
00001 <?php
00024 require_once __DIR__ . '/../Maintenance.php';
00025 
/**
 * Maintenance script to fix bug 20757, assuming that blob_tracking is intact.
 *
 * It scans the text table for rows whose old_text is a serialized
 * HistoryBlobStub (flags contain "object" but not "external"), looks up the
 * row the stub refers to in blob_tracking, verifies that the referenced item
 * can be fetched from external storage with a matching MD5 hash, and then
 * rewrites the text row to point directly at that external URL.
 *
 * Options:
 *   --dry-run  Report only; no rows are written.
 *   --start    old_id to start at (rows with old_id greater than this are scanned).
 */
class FixBug20757 extends Maintenance {
    /** Maximum number of text rows selected per batch query (used as LIMIT) */
    public $batchSize = 10000;

    /** Cache of page ID => ( rev_text_id => rev_id ) maps, see getRevTextMap() */
    public $mapCache = array();

    /** Total number of rev_text_id entries currently held in $mapCache */
    public $mapCacheSize = 0;

    /** Entry count above which old maps are evicted from $mapCache */
    public $maxMapCacheSize = 1000000;

    function __construct() {
        parent::__construct();
        $this->mDescription = 'Script to fix bug 20757 assuming that blob_tracking is intact';
        $this->addOption( 'dry-run', 'Report only' );
        $this->addOption( 'start', 'old_id to start at', false, true );
    }

    /**
     * Scan the text table in batches and repair every recoverable stub row.
     *
     * Progress and per-row diagnostics are printed to standard output, and a
     * summary of fixed / unrecoverable / good stubs is printed at the end.
     */
    function execute() {
        $dbr = wfGetDB( DB_SLAVE );
        $dbw = wfGetDB( DB_MASTER );

        $dryRun = $this->getOption( 'dry-run' );
        if ( $dryRun ) {
            print "Dry run only.\n";
        }

        $startId = $this->getOption( 'start', 0 );
        $numGood = 0;
        $numFixed = 0;
        $numBad = 0;

        $totalRevs = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );

        if ( $dbr->getType() == 'mysql' ) {
            // In MySQL 4.1+, the binary field old_text has a non-working LOWER() function
            $lowerLeft = 'LOWER(CONVERT(LEFT(old_text,22) USING latin1))';
        } else {
            // Without this branch $lowerLeft was undefined on non-MySQL backends,
            // which produced an invalid " = '...'" condition in the query below.
            // NOTE(review): assumes the backend provides LEFT() — confirm for
            // every database type this script is expected to run on.
            $lowerLeft = 'LOWER(LEFT(old_text,22))';
        }

        while ( true ) {
            print "ID: $startId / $totalRevs\r";

            // Select the next batch of rows that look like serialized stubs.
            // 22 is the length of the serialized class-name prefix compared against.
            $res = $dbr->select(
                'text',
                array( 'old_id', 'old_flags', 'old_text' ),
                array(
                    'old_id > ' . intval( $startId ),
                    'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\'',
                    "$lowerLeft = 'o:15:\"historyblobstub\"'",
                ),
                __METHOD__,
                array(
                    'ORDER BY' => 'old_id',
                    'LIMIT' => $this->batchSize,
                )
            );

            if ( !$res->numRows() ) {
                break;
            }

            $secondaryIds = array();
            $stubs = array();

            foreach ( $res as $row ) {
                // Remember how far we got so the next batch continues after this row
                $startId = $row->old_id;

                // Basic sanity checks
                $obj = unserialize( $row->old_text );
                if ( $obj === false ) {
                    print "{$row->old_id}: unrecoverable: cannot unserialize\n";
                    ++$numBad;
                    continue;
                }

                if ( !is_object( $obj ) ) {
                    print "{$row->old_id}: unrecoverable: unserialized to type " .
                        gettype( $obj ) . ", possible double-serialization\n";
                    ++$numBad;
                    continue;
                }

                if ( strtolower( get_class( $obj ) ) !== 'historyblobstub' ) {
                    print "{$row->old_id}: unrecoverable: unexpected object class " .
                        get_class( $obj ) . "\n";
                    ++$numBad;
                    continue;
                }

                // Process flags: rows without a utf-8 flag are treated as legacy-encoded
                $flags = explode( ',', $row->old_flags );
                $legacyEncoding = !( in_array( 'utf-8', $flags ) || in_array( 'utf8', $flags ) );

                // Queue the stub for future batch processing
                $id = intval( $obj->mOldId );
                $secondaryIds[] = $id;
                $stubs[$row->old_id] = array(
                    'legacyEncoding' => $legacyEncoding,
                    'secondaryId' => $id,
                    'hash' => $obj->mHash,
                );
            }

            $secondaryIds = array_unique( $secondaryIds );

            if ( !count( $secondaryIds ) ) {
                // Every row in this batch was rejected above; move on to the next batch
                continue;
            }

            // Run the batch query on blob_tracking
            $res = $dbr->select(
                'blob_tracking',
                '*',
                array(
                    'bt_text_id' => $secondaryIds,
                ),
                __METHOD__
            );
            $trackedBlobs = array();
            foreach ( $res as $row ) {
                $trackedBlobs[$row->bt_text_id] = $row;
            }

            // Process the stubs
            foreach ( $stubs as $primaryId => $stub ) {
                $secondaryId = $stub['secondaryId'];
                if ( !isset( $trackedBlobs[$secondaryId] ) ) {
                    // No tracked blob. Work out what went wrong
                    $secondaryRow = $dbr->selectRow(
                        'text',
                        array( 'old_flags', 'old_text' ),
                        array( 'old_id' => $secondaryId ),
                        __METHOD__
                    );
                    if ( !$secondaryRow ) {
                        print "$primaryId: unrecoverable: secondary row is missing\n";
                        ++$numBad;
                    } elseif ( $this->isUnbrokenStub( $stub, $secondaryRow ) ) {
                        // Not broken yet, and not in the tracked clusters so it won't get
                        // broken by the current RCT run.
                        ++$numGood;
                    } elseif ( strpos( $secondaryRow->old_flags, 'external' ) !== false ) {
                        print "$primaryId: unrecoverable: secondary gone to {$secondaryRow->old_text}\n";
                        ++$numBad;
                    } else {
                        print "$primaryId: unrecoverable: miscellaneous corruption of secondary row\n";
                        ++$numBad;
                    }
                    unset( $stubs[$primaryId] );
                    continue;
                }
                $trackRow = $trackedBlobs[$secondaryId];

                // Check that the specified text really is available in the tracked source row
                $url = "DB://{$trackRow->bt_cluster}/{$trackRow->bt_blob_id}/{$stub['hash']}";
                $text = ExternalStore::fetchFromURL( $url );
                if ( $text === false ) {
                    print "$primaryId: unrecoverable: source text missing\n";
                    ++$numBad;
                    unset( $stubs[$primaryId] );
                    continue;
                }
                if ( md5( $text ) !== $stub['hash'] ) {
                    print "$primaryId: unrecoverable: content hashes do not match\n";
                    ++$numBad;
                    unset( $stubs[$primaryId] );
                    continue;
                }

                // Find the page_id and rev_id
                // The page is probably the same as the page of the secondary row
                $pageId = intval( $trackRow->bt_page );
                if ( !$pageId ) {
                    $revId = $pageId = 0;
                } else {
                    $revId = $this->findTextIdInPage( $pageId, $primaryId );
                    if ( !$revId ) {
                        // Actually an orphan
                        $pageId = $revId = 0;
                    }
                }

                $newFlags = $stub['legacyEncoding'] ? 'external' : 'external,utf-8';

                if ( !$dryRun ) {
                    // Reset the text row to point to the original copy
                    $dbw->begin( __METHOD__ );
                    $dbw->update(
                        'text',
                        // SET
                        array(
                            'old_flags' => $newFlags,
                            'old_text' => $url
                        ),
                        // WHERE
                        array( 'old_id' => $primaryId ),
                        __METHOD__
                    );

                    // Add a blob_tracking row so that the new reference can be recompressed
                    // without needing to run trackBlobs.php again
                    $dbw->insert( 'blob_tracking',
                        array(
                            'bt_page' => $pageId,
                            'bt_rev_id' => $revId,
                            'bt_text_id' => $primaryId,
                            'bt_cluster' => $trackRow->bt_cluster,
                            'bt_blob_id' => $trackRow->bt_blob_id,
                            'bt_cgz_hash' => $stub['hash'],
                            'bt_new_url' => null,
                            'bt_moved' => 0,
                        ),
                        __METHOD__
                    );
                    $dbw->commit( __METHOD__ );
                    $this->waitForSlaves();
                }

                print "$primaryId: resolved to $url\n";
                ++$numFixed;
            }
        }

        print "\n";
        print "Fixed: $numFixed\n";
        print "Unrecoverable: $numBad\n";
        print "Good stubs: $numGood\n";
    }

    /**
     * Throttle writes: call wfWaitForSlaves() once every 50 invocations.
     *
     * The previous condition, "++$iteration > 50 == 0", incremented the counter
     * a second time and, because ">" binds tighter than "==", evaluated as
     * "( $iteration > 50 ) == 0". That was true on every call, so the script
     * waited after every single commit.
     */
    function waitForSlaves() {
        static $iteration = 0;
        if ( ++$iteration >= 50 ) {
            wfWaitForSlaves();
            $iteration = 0;
        }
    }

    /**
     * Look up the revision of a page that uses a given text row.
     *
     * @param int $pageId Page ID to search in
     * @param int $textId Text row ID (rev_text_id) to look for
     * @return int|string|null The rev_id, or null if the page has no such revision
     */
    function findTextIdInPage( $pageId, $textId ) {
        $ids = $this->getRevTextMap( $pageId );

        return isset( $ids[$textId] ) ? $ids[$textId] : null;
    }

    /**
     * Get the rev_text_id => rev_id map of a page, loading it from the
     * revision table on first use and caching it in $mapCache.
     *
     * @param int $pageId
     * @return array Map of rev_text_id => rev_id
     */
    function getRevTextMap( $pageId ) {
        if ( !isset( $this->mapCache[$pageId] ) ) {
            // Limit cache size by dropping the oldest cached maps first
            while ( $this->mapCacheSize > $this->maxMapCacheSize ) {
                // Rewind explicitly: key() alone depends on where the array's
                // internal pointer happens to be and may return null.
                reset( $this->mapCache );
                $key = key( $this->mapCache );
                if ( $key === null ) {
                    // Nothing left to evict; resynchronise the counter and stop
                    $this->mapCacheSize = 0;
                    break;
                }
                $this->mapCacheSize -= count( $this->mapCache[$key] );
                unset( $this->mapCache[$key] );
            }

            $dbr = wfGetDB( DB_SLAVE );
            $map = array();
            $res = $dbr->select( 'revision',
                array( 'rev_id', 'rev_text_id' ),
                array( 'rev_page' => $pageId ),
                __METHOD__
            );
            foreach ( $res as $row ) {
                $map[$row->rev_text_id] = $row->rev_id;
            }
            $this->mapCache[$pageId] = $map;
            $this->mapCacheSize += count( $map );
        }

        return $this->mapCache[$pageId];
    }

    /**
     * Check whether the row a stub points to still holds a blob object from
     * which the stub's item can be read.
     *
     * @param array $stub Stub description as built in execute(); only 'hash' is read
     * @param object $secondaryRow Text row with old_flags and old_text fields
     * @return bool True if the item identified by $stub['hash'] can be retrieved
     */
    function isUnbrokenStub( $stub, $secondaryRow ) {
        $flags = explode( ',', $secondaryRow->old_flags );
        $text = $secondaryRow->old_text;
        if ( in_array( 'external', $flags ) ) {
            $url = $text;
            // A usable URL needs a non-empty part after "://"
            $parts = explode( '://', $url, 2 );
            if ( !isset( $parts[1] ) || $parts[1] == "" ) {
                return false;
            }
            $text = ExternalStore::fetchFromURL( $url );
            if ( $text === false ) {
                // External text is unavailable, so the stub cannot be resolved
                return false;
            }
        }
        if ( !in_array( 'object', $flags ) ) {
            return false;
        }

        if ( in_array( 'gzip', $flags ) ) {
            $text = gzinflate( $text );
            if ( $text === false ) {
                return false;
            }
        }
        $obj = unserialize( $text );

        if ( !is_object( $obj ) && is_string( $obj ) ) {
            // Correct for old double-serialization bug.
            $obj = unserialize( $obj );
        }

        if ( !is_object( $obj ) ) {
            return false;
        }

        // Only blob classes offering both methods can be queried; anything else
        // is reported as broken instead of aborting the run with a fatal error.
        if ( !method_exists( $obj, 'uncompress' ) || !method_exists( $obj, 'getItem' ) ) {
            return false;
        }

        $obj->uncompress();
        $text = $obj->getItem( $stub['hash'] );

        return $text !== false;
    }
}
00349 
00350 $maintClass = 'FixBug20757'; // Name of the maintenance class declared in this file
00351 require_once RUN_MAINTENANCE_IF_MAIN; // NOTE(review): presumably instantiates and runs $maintClass when this file is the entry script — confirm in Maintenance.php