MediaWiki
REL1_22
|
<?php
/**
 * Maintenance script that repairs text rows affected by bug 20757, on the
 * assumption that the blob_tracking table is intact.
 *
 * It looks for rows in the text table that hold a serialized HistoryBlobStub
 * directly (flagged "object" but not "external"), resolves the blob the stub
 * refers to via blob_tracking, and rewrites the row so it points straight at
 * the external storage URL of that blob.
 */

require_once __DIR__ . '/../Maintenance.php';

/**
 * Finds HistoryBlobStub text rows and re-points them at their tracked
 * external blob.
 */
class FixBug20757 extends Maintenance {
	/** Maximum number of text rows fetched per batch query. */
	public $batchSize = 10000;

	/** Cache of page ID => ( rev_text_id => rev_id ) maps, see getRevTextMap(). */
	public $mapCache = array();

	/** Total number of rev_text_id entries currently held in $mapCache. */
	public $mapCacheSize = 0;

	/** Entry count above which getRevTextMap() starts evicting cached maps. */
	public $maxMapCacheSize = 1000000;

	/**
	 * Registers the script description and its command line options.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = 'Script to fix bug 20757 assuming that blob_tracking is intact';
		$this->addOption( 'dry-run', 'Report only' );
		$this->addOption( 'start', 'old_id to start at', false, true );
	}

	/**
	 * Scans the text table in batches of $batchSize rows and fixes every
	 * recoverable stub row.
	 *
	 * Each candidate row is classified as fixed, unrecoverable or good, with a
	 * line printed for the first two, and a summary printed at the end. With
	 * --dry-run nothing is written, but rows that would be fixed are still
	 * reported and counted as fixed.
	 */
	public function execute() {
		$dbr = wfGetDB( DB_SLAVE );
		$dbw = wfGetDB( DB_MASTER );

		$dryRun = $this->getOption( 'dry-run' );
		if ( $dryRun ) {
			print "Dry run only.\n";
		}

		// Rows with old_id strictly greater than this value are examined
		$startId = $this->getOption( 'start', 0 );
		$numGood = 0;
		$numFixed = 0;
		$numBad = 0;

		// Only used for the progress display
		$totalRevs = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );

		if ( $dbr->getType() == 'mysql'
			&& version_compare( $dbr->getServerVersion(), '4.1.0', '>=' ) )
		{
			// In MySQL 4.1+, the binary field old_text has a non-working LOWER() function
			$lowerLeft = 'LOWER(CONVERT(LEFT(old_text,22) USING latin1))';
		} else {
			// No CONVERT() in MySQL 4.0
			$lowerLeft = 'LOWER(LEFT(old_text,22))';
		}

		while ( true ) {
			print "ID: $startId / $totalRevs\r";

			// Fetch the next batch of rows whose text starts with a serialized
			// HistoryBlobStub and which are flagged as objects but not external
			$res = $dbr->select(
				'text',
				array( 'old_id', 'old_flags', 'old_text' ),
				array(
					'old_id > ' . intval( $startId ),
					'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\'',
					"$lowerLeft = 'o:15:\"historyblobstub\"'",
				),
				__METHOD__,
				array(
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize,
				)
			);

			if ( !$res->numRows() ) {
				break;
			}

			$secondaryIds = array();
			$stubs = array();

			foreach ( $res as $row ) {
				// Advance the cursor so the next batch starts after this row
				$startId = $row->old_id;

				// Basic sanity checks
				$obj = unserialize( $row->old_text );
				if ( $obj === false ) {
					print "{$row->old_id}: unrecoverable: cannot unserialize\n";
					++$numBad;
					continue;
				}

				if ( !is_object( $obj ) ) {
					print "{$row->old_id}: unrecoverable: unserialized to type " .
						gettype( $obj ) . ", possible double-serialization\n";
					++$numBad;
					continue;
				}

				if ( strtolower( get_class( $obj ) ) !== 'historyblobstub' ) {
					print "{$row->old_id}: unrecoverable: unexpected object class " .
						get_class( $obj ) . "\n";
					++$numBad;
					continue;
				}

				// Process flags
				$flags = explode( ',', $row->old_flags );
				if ( in_array( 'utf-8', $flags ) || in_array( 'utf8', $flags ) ) {
					$legacyEncoding = false;
				} else {
					$legacyEncoding = true;
				}

				// Queue the stub for future batch processing
				$id = intval( $obj->mOldId );
				$secondaryIds[] = $id;
				$stubs[$row->old_id] = array(
					'legacyEncoding' => $legacyEncoding,
					'secondaryId' => $id,
					'hash' => $obj->mHash,
				);
			}

			$secondaryIds = array_unique( $secondaryIds );

			if ( !count( $secondaryIds ) ) {
				// Every row in this batch was rejected; move on to the next batch
				continue;
			}

			// Run the batch query on blob_tracking
			$res = $dbr->select(
				'blob_tracking',
				'*',
				array(
					'bt_text_id' => $secondaryIds,
				),
				__METHOD__
			);
			$trackedBlobs = array();
			foreach ( $res as $row ) {
				$trackedBlobs[$row->bt_text_id] = $row;
			}

			// Process the stubs
			foreach ( $stubs as $primaryId => $stub ) {
				$secondaryId = $stub['secondaryId'];
				if ( !isset( $trackedBlobs[$secondaryId] ) ) {
					// No tracked blob. Work out what went wrong
					$secondaryRow = $dbr->selectRow(
						'text',
						array( 'old_flags', 'old_text' ),
						array( 'old_id' => $secondaryId ),
						__METHOD__
					);
					if ( !$secondaryRow ) {
						print "$primaryId: unrecoverable: secondary row is missing\n";
						++$numBad;
					} elseif ( $this->isUnbrokenStub( $stub, $secondaryRow ) ) {
						// Not broken yet, and not in the tracked clusters so it won't get
						// broken by the current RCT run.
						++$numGood;
					} elseif ( strpos( $secondaryRow->old_flags, 'external' ) !== false ) {
						print "$primaryId: unrecoverable: secondary gone to {$secondaryRow->old_text}\n";
						++$numBad;
					} else {
						print "$primaryId: unrecoverable: miscellaneous corruption of secondary row\n";
						++$numBad;
					}
					unset( $stubs[$primaryId] );
					continue;
				}
				$trackRow = $trackedBlobs[$secondaryId];

				// Check that the specified text really is available in the tracked source row
				$url = "DB://{$trackRow->bt_cluster}/{$trackRow->bt_blob_id}/{$stub['hash']}";
				$text = ExternalStore::fetchFromURL( $url );
				if ( $text === false ) {
					print "$primaryId: unrecoverable: source text missing\n";
					++$numBad;
					unset( $stubs[$primaryId] );
					continue;
				}
				if ( md5( $text ) !== $stub['hash'] ) {
					print "$primaryId: unrecoverable: content hashes do not match\n";
					++$numBad;
					unset( $stubs[$primaryId] );
					continue;
				}

				// Find the page_id and rev_id
				// The page is probably the same as the page of the secondary row
				$pageId = intval( $trackRow->bt_page );
				if ( !$pageId ) {
					$revId = $pageId = 0;
				} else {
					$revId = $this->findTextIdInPage( $pageId, $primaryId );
					if ( !$revId ) {
						// Actually an orphan
						$pageId = $revId = 0;
					}
				}

				$newFlags = $stub['legacyEncoding'] ? 'external' : 'external,utf-8';

				if ( !$dryRun ) {
					// Reset the text row to point to the original copy
					$dbw->begin( __METHOD__ );
					$dbw->update(
						'text',
						// SET
						array(
							'old_flags' => $newFlags,
							'old_text' => $url
						),
						// WHERE
						array( 'old_id' => $primaryId ),
						__METHOD__
					);

					// Add a blob_tracking row so that the new reference can be recompressed
					// without needing to run trackBlobs.php again
					$dbw->insert( 'blob_tracking',
						array(
							'bt_page' => $pageId,
							'bt_rev_id' => $revId,
							'bt_text_id' => $primaryId,
							'bt_cluster' => $trackRow->bt_cluster,
							'bt_blob_id' => $trackRow->bt_blob_id,
							'bt_cgz_hash' => $stub['hash'],
							'bt_new_url' => null,
							'bt_moved' => 0,
						),
						__METHOD__
					);
					$dbw->commit( __METHOD__ );
					$this->waitForSlaves();
				}

				print "$primaryId: resolved to $url\n";
				++$numFixed;
			}
		}

		print "\n";
		print "Fixed: $numFixed\n";
		print "Unrecoverable: $numBad\n";
		print "Good stubs: $numGood\n";
	}

	/**
	 * Calls wfWaitForSlaves() on every 50th invocation.
	 *
	 * The counter is incremented exactly once per call and reset after each
	 * wait. The previous condition, "++$iteration > 50 == 0", parsed as
	 * "( ++$iteration > 50 ) == 0" and was therefore true on every call, so
	 * the script waited after every single commit.
	 */
	public function waitForSlaves() {
		static $iteration = 0;
		if ( ++$iteration >= 50 ) {
			wfWaitForSlaves();
			$iteration = 0;
		}
	}

	/**
	 * Looks up the revision of a page that uses the given text row.
	 *
	 * @param int $pageId Page to search (revision.rev_page)
	 * @param int $textId Text row ID (revision.rev_text_id)
	 * @return int|string|null The matching rev_id, or null if no revision of
	 *   the page references that text ID
	 */
	public function findTextIdInPage( $pageId, $textId ) {
		$ids = $this->getRevTextMap( $pageId );
		return isset( $ids[$textId] ) ? $ids[$textId] : null;
	}

	/**
	 * Returns the rev_text_id => rev_id map for all revisions of a page,
	 * loading it from the revision table on first use and caching it.
	 *
	 * Before a new map is loaded, cached maps are evicted from the front of
	 * the cache while the cached entry count exceeds $maxMapCacheSize.
	 *
	 * @param int $pageId
	 * @return array Map of rev_text_id => rev_id
	 */
	public function getRevTextMap( $pageId ) {
		if ( !isset( $this->mapCache[$pageId] ) ) {
			// Limit cache size
			while ( $this->mapCacheSize > $this->maxMapCacheSize ) {
				// Always evict from the front, independent of the array's
				// internal pointer position
				reset( $this->mapCache );
				$key = key( $this->mapCache );
				if ( $key === null ) {
					// Nothing left to evict: resynchronise the counter
					// rather than looping forever
					$this->mapCacheSize = 0;
					break;
				}
				$this->mapCacheSize -= count( $this->mapCache[$key] );
				unset( $this->mapCache[$key] );
			}

			$dbr = wfGetDB( DB_SLAVE );
			$map = array();
			$res = $dbr->select( 'revision',
				array( 'rev_id', 'rev_text_id' ),
				array( 'rev_page' => $pageId ),
				__METHOD__
			);
			foreach ( $res as $row ) {
				$map[$row->rev_text_id] = $row->rev_id;
			}
			$this->mapCache[$pageId] = $map;
			$this->mapCacheSize += count( $map );
		}
		return $this->mapCache[$pageId];
	}

	/**
	 * Checks whether the secondary text row still holds a blob object from
	 * which the item named by the stub's hash can be read.
	 *
	 * The row's text is fetched from external storage when flagged
	 * "external", inflated when flagged "gzip", and unserialized. Every
	 * failure along the way yields false rather than an error.
	 *
	 * NOTE(review): unserialize() is applied to stored text and can
	 * instantiate arbitrary classes; this assumes the text table and
	 * external store contents are trusted.
	 *
	 * @param array $stub Stub description built in execute(); only 'hash' is read
	 * @param object $secondaryRow Row from the text table with old_flags and old_text
	 * @return bool True if the object's getItem() returns something other than
	 *   false for the stub's hash
	 */
	public function isUnbrokenStub( $stub, $secondaryRow ) {
		$flags = explode( ',', $secondaryRow->old_flags );
		$text = $secondaryRow->old_text;
		if ( in_array( 'external', $flags ) ) {
			$url = $text;
			// The URL must have a non-empty part after the "://" separator
			$parts = explode( '://', $url, 2 );
			if ( count( $parts ) < 2 || $parts[1] == "" ) {
				return false;
			}
			$text = ExternalStore::fetchFromURL( $url );
			if ( $text === false ) {
				// Nothing could be fetched, so there is nothing to inspect
				return false;
			}
		}
		if ( !in_array( 'object', $flags ) ) {
			return false;
		}

		if ( in_array( 'gzip', $flags ) ) {
			$text = gzinflate( $text );
			if ( $text === false ) {
				return false;
			}
		}
		$obj = unserialize( $text );

		if ( is_string( $obj ) ) {
			// Correct for old double-serialization bug.
			$obj = unserialize( $obj );
		}

		if ( !is_object( $obj ) ) {
			return false;
		}

		// An object of an unexpected class cannot be queried for the item;
		// report it as broken instead of dying with a fatal error
		if ( !method_exists( $obj, 'uncompress' ) || !method_exists( $obj, 'getItem' ) ) {
			return false;
		}

		$obj->uncompress();
		$text = $obj->getItem( $stub['hash'] );
		return $text !== false;
	}
}

$maintClass = 'FixBug20757';
require_once RUN_MAINTENANCE_IF_MAIN;