MediaWiki  REL1_22
fixBug20757.php
Go to the documentation of this file.
00001 <?php
00024 require_once __DIR__ . '/../Maintenance.php';
00025 
/**
 * Maintenance script that repairs text rows whose old_text is a serialized
 * HistoryBlobStub (flags contain "object" but not "external").
 *
 * For every such stub the script looks up the blob_tracking row of the text
 * row the stub points at (the "secondary" row), verifies that the stub's hash
 * can still be fetched from the tracked external blob, and then rewrites the
 * stub's own text row to point directly at DB://<cluster>/<blob_id>/<hash>.
 * A blob_tracking row is inserted for the rewritten text row as well.
 */
class FixBug20757 extends Maintenance {
    /** Maximum number of text rows fetched per batch query */
    public $batchSize = 10000;

    /** Cache of page ID => array( rev_text_id => rev_id ), see getRevTextMap() */
    public $mapCache = array();

    /** Total number of rev_text_id entries currently held in $mapCache */
    public $mapCacheSize = 0;

    /** Entry count above which getRevTextMap() starts evicting cached pages */
    public $maxMapCacheSize = 1000000;

    public function __construct() {
        parent::__construct();
        $this->mDescription = 'Script to fix bug 20757 assuming that blob_tracking is intact';
        $this->addOption( 'dry-run', 'Report only' );
        $this->addOption( 'start', 'old_id to start at', false, true );
    }

    /**
     * Scan the text table in batches of $batchSize rows (ordered by old_id,
     * beginning after the --start option), classify every HistoryBlobStub row
     * as good, fixed or unrecoverable, and print a summary at the end.
     *
     * With --dry-run nothing is written; the per-row report is still printed.
     */
    public function execute() {
        $dbr = wfGetDB( DB_SLAVE );
        $dbw = wfGetDB( DB_MASTER );

        $dryRun = $this->getOption( 'dry-run' );
        if ( $dryRun ) {
            print "Dry run only.\n";
        }

        $startId = $this->getOption( 'start', 0 );
        $numGood = 0;
        $numFixed = 0;
        $numBad = 0;

        // Highest old_id, used only for the progress display
        $totalRevs = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );

        if ( $dbr->getType() == 'mysql'
            && version_compare( $dbr->getServerVersion(), '4.1.0', '>=' ) )
        {
            // In MySQL 4.1+, the binary field old_text has a non-working LOWER() function
            $lowerLeft = 'LOWER(CONVERT(LEFT(old_text,22) USING latin1))';
        } else {
            // No CONVERT() in MySQL 4.0
            $lowerLeft = 'LOWER(LEFT(old_text,22))';
        }

        while ( true ) {
            print "ID: $startId / $totalRevs\r";

            // Fetch the next batch of rows that look like serialized stubs
            $res = $dbr->select(
                'text',
                array( 'old_id', 'old_flags', 'old_text' ),
                array(
                    'old_id > ' . intval( $startId ),
                    'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\'',
                    "$lowerLeft = 'o:15:\"historyblobstub\"'",
                ),
                __METHOD__,
                array(
                    'ORDER BY' => 'old_id',
                    'LIMIT' => $this->batchSize,
                )
            );

            if ( !$res->numRows() ) {
                break;
            }

            $secondaryIds = array();
            $stubs = array();

            foreach ( $res as $row ) {
                // Advance the batch cursor even for rows that are rejected below
                $startId = $row->old_id;

                // Basic sanity checks
                // NOTE(review): unserialize() of database content; this assumes
                // the text table is trusted input.
                $obj = unserialize( $row->old_text );
                if ( $obj === false ) {
                    print "{$row->old_id}: unrecoverable: cannot unserialize\n";
                    ++$numBad;
                    continue;
                }

                if ( !is_object( $obj ) ) {
                    print "{$row->old_id}: unrecoverable: unserialized to type " .
                        gettype( $obj ) . ", possible double-serialization\n";
                    ++$numBad;
                    continue;
                }

                if ( strtolower( get_class( $obj ) ) !== 'historyblobstub' ) {
                    print "{$row->old_id}: unrecoverable: unexpected object class " .
                        get_class( $obj ) . "\n";
                    ++$numBad;
                    continue;
                }

                // Process flags: rows without a utf-8 flag are treated as legacy encoded
                $flags = explode( ',', $row->old_flags );
                if ( in_array( 'utf-8', $flags ) || in_array( 'utf8', $flags ) ) {
                    $legacyEncoding = false;
                } else {
                    $legacyEncoding = true;
                }

                // Queue the stub for future batch processing
                $id = intval( $obj->mOldId );
                $secondaryIds[] = $id;
                $stubs[$row->old_id] = array(
                    'legacyEncoding' => $legacyEncoding,
                    'secondaryId' => $id,
                    'hash' => $obj->mHash,
                );
            }

            $secondaryIds = array_unique( $secondaryIds );

            // Every row of this batch was rejected; $startId has already moved on
            if ( !count( $secondaryIds ) ) {
                continue;
            }

            // Run the batch query on blob_tracking
            $res = $dbr->select(
                'blob_tracking',
                '*',
                array(
                    'bt_text_id' => $secondaryIds,
                ),
                __METHOD__
            );
            $trackedBlobs = array();
            foreach ( $res as $row ) {
                $trackedBlobs[$row->bt_text_id] = $row;
            }

            // Process the stubs
            foreach ( $stubs as $primaryId => $stub ) {
                $secondaryId = $stub['secondaryId'];
                if ( !isset( $trackedBlobs[$secondaryId] ) ) {
                    // No tracked blob. Work out what went wrong
                    $secondaryRow = $dbr->selectRow(
                        'text',
                        array( 'old_flags', 'old_text' ),
                        array( 'old_id' => $secondaryId ),
                        __METHOD__
                    );
                    if ( !$secondaryRow ) {
                        print "$primaryId: unrecoverable: secondary row is missing\n";
                        ++$numBad;
                    } elseif ( $this->isUnbrokenStub( $stub, $secondaryRow ) ) {
                        // Not broken yet, and not in the tracked clusters so it won't get
                        // broken by the current RCT run.
                        ++$numGood;
                    } elseif ( strpos( $secondaryRow->old_flags, 'external' ) !== false ) {
                        print "$primaryId: unrecoverable: secondary gone to {$secondaryRow->old_text}\n";
                        ++$numBad;
                    } else {
                        print "$primaryId: unrecoverable: miscellaneous corruption of secondary row\n";
                        ++$numBad;
                    }
                    unset( $stubs[$primaryId] );
                    continue;
                }
                $trackRow = $trackedBlobs[$secondaryId];

                // Check that the specified text really is available in the tracked source row
                $url = "DB://{$trackRow->bt_cluster}/{$trackRow->bt_blob_id}/{$stub['hash']}";
                $text = ExternalStore::fetchFromURL( $url );
                if ( $text === false ) {
                    print "$primaryId: unrecoverable: source text missing\n";
                    ++$numBad;
                    unset( $stubs[$primaryId] );
                    continue;
                }
                if ( md5( $text ) !== $stub['hash'] ) {
                    print "$primaryId: unrecoverable: content hashes do not match\n";
                    ++$numBad;
                    unset( $stubs[$primaryId] );
                    continue;
                }

                // Find the page_id and rev_id
                // The page is probably the same as the page of the secondary row
                $pageId = intval( $trackRow->bt_page );
                if ( !$pageId ) {
                    $revId = $pageId = 0;
                } else {
                    $revId = $this->findTextIdInPage( $pageId, $primaryId );
                    if ( !$revId ) {
                        // Actually an orphan
                        $pageId = $revId = 0;
                    }
                }

                $newFlags = $stub['legacyEncoding'] ? 'external' : 'external,utf-8';

                if ( !$dryRun ) {
                    // Reset the text row to point to the original copy
                    $dbw->begin( __METHOD__ );
                    $dbw->update(
                        'text',
                        // SET
                        array(
                            'old_flags' => $newFlags,
                            'old_text' => $url
                        ),
                        // WHERE
                        array( 'old_id' => $primaryId ),
                        __METHOD__
                    );

                    // Add a blob_tracking row so that the new reference can be recompressed
                    // without needing to run trackBlobs.php again
                    $dbw->insert( 'blob_tracking',
                        array(
                            'bt_page' => $pageId,
                            'bt_rev_id' => $revId,
                            'bt_text_id' => $primaryId,
                            'bt_cluster' => $trackRow->bt_cluster,
                            'bt_blob_id' => $trackRow->bt_blob_id,
                            'bt_cgz_hash' => $stub['hash'],
                            'bt_new_url' => null,
                            'bt_moved' => 0,
                        ),
                        __METHOD__
                    );
                    $dbw->commit( __METHOD__ );
                    $this->waitForSlaves();
                }

                print "$primaryId: resolved to $url\n";
                ++$numFixed;
            }
        }

        print "\n";
        print "Fixed: $numFixed\n";
        print "Unrecoverable: $numBad\n";
        print "Good stubs: $numGood\n";
    }

    /**
     * Throttle for the write loop: calls wfWaitForSlaves() on every 50th
     * invocation and is a cheap no-op otherwise.
     */
    public function waitForSlaves() {
        // The counter is shared by all calls; it is bumped exactly once per
        // call and reset after each wait so the wait happens once per 50 calls.
        static $iteration = 0;
        if ( ++$iteration >= 50 ) {
            wfWaitForSlaves();
            $iteration = 0;
        }
    }

    /**
     * Look up the revision of a page that uses a given text row.
     *
     * @param int $pageId Page to search
     * @param int $textId old_id of the text row
     * @return int|null rev_id whose rev_text_id equals $textId, or null if
     *   the page has no such revision
     */
    public function findTextIdInPage( $pageId, $textId ) {
        $ids = $this->getRevTextMap( $pageId );
        return isset( $ids[$textId] ) ? $ids[$textId] : null;
    }

    /**
     * Get the rev_text_id => rev_id map for all revisions of a page, loading
     * it from the revision table on first use and caching it in $mapCache.
     *
     * @param int $pageId
     * @return array Map of rev_text_id => rev_id
     */
    public function getRevTextMap( $pageId ) {
        if ( !isset( $this->mapCache[$pageId] ) ) {
            // Limit cache size by dropping entries from the front of the array.
            // reset() makes key() independent of wherever the internal array
            // pointer was left, and the emptiness check guarantees termination.
            while ( $this->mapCacheSize > $this->maxMapCacheSize
                && count( $this->mapCache ) > 0 )
            {
                reset( $this->mapCache );
                $key = key( $this->mapCache );
                $this->mapCacheSize -= count( $this->mapCache[$key] );
                unset( $this->mapCache[$key] );
            }

            $dbr = wfGetDB( DB_SLAVE );
            $map = array();
            $res = $dbr->select( 'revision',
                array( 'rev_id', 'rev_text_id' ),
                array( 'rev_page' => $pageId ),
                __METHOD__
            );
            foreach ( $res as $row ) {
                $map[$row->rev_text_id] = $row->rev_id;
            }
            $this->mapCache[$pageId] = $map;
            $this->mapCacheSize += count( $map );
        }
        return $this->mapCache[$pageId];
    }

    /**
     * Check whether the text row a stub points at still holds a blob object
     * from which the stub's hash can be retrieved.
     *
     * @param array $stub Stub description as built in execute(); only the
     *   'hash' key is read here
     * @param object $secondaryRow Row of the text table with old_flags and
     *   old_text fields
     * @return bool True if the item identified by $stub['hash'] could be
     *   fetched from the secondary row's blob; false if the row is not flagged
     *   as an object, cannot be fetched/decoded, or lacks the item
     */
    public function isUnbrokenStub( $stub, $secondaryRow ) {
        $flags = explode( ',', $secondaryRow->old_flags );

        // Only serialized objects can contain the item; checking this first
        // avoids an external fetch whose result would be discarded anyway.
        if ( !in_array( 'object', $flags ) ) {
            return false;
        }

        $text = $secondaryRow->old_text;
        if ( in_array( 'external', $flags ) ) {
            $url = $text;
            // Require a "<proto>://<path>" URL with a non-empty path
            $parts = explode( '://', $url, 2 );
            if ( count( $parts ) < 2 || $parts[1] == "" ) {
                return false;
            }
            $text = ExternalStore::fetchFromURL( $url );
            if ( $text === false ) {
                // External text is unavailable
                return false;
            }
        }

        if ( in_array( 'gzip', $flags ) ) {
            $text = gzinflate( $text );
            if ( $text === false ) {
                // Not valid deflate data
                return false;
            }
        }

        // NOTE(review): unserialize() of stored content; this assumes the text
        // table and external store are trusted input.
        $obj = unserialize( $text );

        if ( is_string( $obj ) ) {
            // Correct for old double-serialization bug.
            $obj = unserialize( $obj );
        }

        if ( !is_object( $obj ) ) {
            return false;
        }

        // An object of an unexpected class cannot serve the item; report it as
        // broken instead of triggering a fatal error on the calls below.
        if ( !method_exists( $obj, 'uncompress' ) || !method_exists( $obj, 'getItem' ) ) {
            return false;
        }

        $obj->uncompress();
        $text = $obj->getItem( $stub['hash'] );
        return $text !== false;
    }
}
00349 
// Name of the Maintenance subclass declared in this file; presumably read by
// the runner included below to decide which class to run (confirm in Maintenance.php).
00350 $maintClass = 'FixBug20757';
// RUN_MAINTENANCE_IF_MAIN is not defined in this file; presumably it is
// provided by the Maintenance.php include at the top (TODO confirm).
00351 require_once RUN_MAINTENANCE_IF_MAIN;