MediaWiki
REL1_19
|
<?php

require_once( dirname( __FILE__ ) . '/../Maintenance.php' );

/**
 * Maintenance script that repairs text rows affected by bug 20757.
 *
 * It scans the text table for rows whose old_text is a serialized
 * HistoryBlobStub (flags contain "object" but not "external"), looks up the
 * stub's secondary text ID in blob_tracking, verifies that the referenced
 * external blob still holds text matching the stub's hash, and then rewrites
 * the text row to point directly at that external URL. A matching
 * blob_tracking row is inserted for the repaired text row.
 */
class FixBug20757 extends Maintenance {
	/** Number of text rows fetched per query in execute() */
	var $batchSize = 10000;

	/** Cache of page ID => ( rev_text_id => rev_id ) maps, see getRevTextMap() */
	var $mapCache = array();

	/** Total number of rev_text_id entries currently held in $mapCache */
	var $mapCacheSize = 0;

	/** Entry count above which getRevTextMap() starts evicting cached maps */
	var $maxMapCacheSize = 1000000;

	function __construct() {
		parent::__construct();
		$this->mDescription = 'Script to fix bug 20757 assuming that blob_tracking is intact';
		$this->addOption( 'dry-run', 'Report only' );
		$this->addOption( 'start', 'old_id to start at', false, true );
	}

	/**
	 * Walk the text table in batches of $batchSize rows, classify each
	 * HistoryBlobStub row as good, fixable or unrecoverable, and (unless
	 * --dry-run was given) rewrite the fixable ones. Progress and a final
	 * summary are printed to standard output.
	 */
	function execute() {
		$dbr = wfGetDB( DB_SLAVE );
		$dbw = wfGetDB( DB_MASTER );

		$dryRun = $this->getOption( 'dry-run' );
		if ( $dryRun ) {
			print "Dry run only.\n";
		}

		$startId = $this->getOption( 'start', 0 );
		$numGood = 0;
		$numFixed = 0;
		$numBad = 0;

		// Only used for the progress display
		$totalRevs = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );

		if ( $dbr->getType() == 'mysql'
			&& version_compare( $dbr->getServerVersion(), '4.1.0', '>=' ) )
		{
			// In MySQL 4.1+, the binary field old_text has a non-working LOWER() function
			$lowerLeft = 'LOWER(CONVERT(LEFT(old_text,22) USING latin1))';
		} else {
			// No CONVERT() in MySQL 4.0
			$lowerLeft = 'LOWER(LEFT(old_text,22))';
		}

		while ( true ) {
			print "ID: $startId / $totalRevs\r";

			// Fetch the next batch of rows that look like serialized HistoryBlobStub objects
			$res = $dbr->select(
				'text',
				array( 'old_id', 'old_flags', 'old_text' ),
				array(
					'old_id > ' . intval( $startId ),
					'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\'',
					"$lowerLeft = 'o:15:\"historyblobstub\"'",
				),
				__METHOD__,
				array(
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize,
				)
			);

			if ( !$res->numRows() ) {
				break;
			}

			$secondaryIds = array();
			$stubs = array();

			foreach ( $res as $row ) {
				// Advance the batch cursor even for rows that are rejected below
				$startId = $row->old_id;

				// Basic sanity checks
				$obj = unserialize( $row->old_text );
				if ( $obj === false ) {
					print "{$row->old_id}: unrecoverable: cannot unserialize\n";
					++$numBad;
					continue;
				}

				if ( !is_object( $obj ) ) {
					print "{$row->old_id}: unrecoverable: unserialized to type " .
						gettype( $obj ) . ", possible double-serialization\n";
					++$numBad;
					continue;
				}

				if ( strtolower( get_class( $obj ) ) !== 'historyblobstub' ) {
					print "{$row->old_id}: unrecoverable: unexpected object class " .
						get_class( $obj ) . "\n";
					++$numBad;
					continue;
				}

				// Process flags: rows without a UTF-8 flag are treated as legacy-encoded
				$flags = explode( ',', $row->old_flags );
				if ( in_array( 'utf-8', $flags ) || in_array( 'utf8', $flags ) ) {
					$legacyEncoding = false;
				} else {
					$legacyEncoding = true;
				}

				// Queue the stub for future batch processing
				$id = intval( $obj->mOldId );
				$secondaryIds[] = $id;
				$stubs[$row->old_id] = array(
					'legacyEncoding' => $legacyEncoding,
					'secondaryId' => $id,
					'hash' => $obj->mHash,
				);
			}

			$secondaryIds = array_unique( $secondaryIds );

			if ( !count( $secondaryIds ) ) {
				// Every row in this batch was rejected; move on to the next batch
				continue;
			}

			// Run the batch query on blob_tracking
			$res = $dbr->select(
				'blob_tracking',
				'*',
				array(
					'bt_text_id' => $secondaryIds,
				),
				__METHOD__
			);
			$trackedBlobs = array();
			foreach ( $res as $row ) {
				$trackedBlobs[$row->bt_text_id] = $row;
			}

			// Process the stubs
			foreach ( $stubs as $primaryId => $stub ) {
				$secondaryId = $stub['secondaryId'];
				if ( !isset( $trackedBlobs[$secondaryId] ) ) {
					// No tracked blob. Work out what went wrong
					$secondaryRow = $dbr->selectRow(
						'text',
						array( 'old_flags', 'old_text' ),
						array( 'old_id' => $secondaryId ),
						__METHOD__
					);
					if ( !$secondaryRow ) {
						print "$primaryId: unrecoverable: secondary row is missing\n";
						++$numBad;
					} elseif ( $this->isUnbrokenStub( $stub, $secondaryRow ) ) {
						// Not broken yet, and not in the tracked clusters so it won't get
						// broken by the current RCT run.
						++$numGood;
					} elseif ( strpos( $secondaryRow->old_flags, 'external' ) !== false ) {
						print "$primaryId: unrecoverable: secondary gone to {$secondaryRow->old_text}\n";
						++$numBad;
					} else {
						print "$primaryId: unrecoverable: miscellaneous corruption of secondary row\n";
						++$numBad;
					}
					unset( $stubs[$primaryId] );
					continue;
				}
				$trackRow = $trackedBlobs[$secondaryId];

				// Check that the specified text really is available in the tracked source row
				$url = "DB://{$trackRow->bt_cluster}/{$trackRow->bt_blob_id}/{$stub['hash']}";
				$text = ExternalStore::fetchFromURL( $url );
				if ( $text === false ) {
					print "$primaryId: unrecoverable: source text missing\n";
					++$numBad;
					unset( $stubs[$primaryId] );
					continue;
				}
				if ( md5( $text ) !== $stub['hash'] ) {
					print "$primaryId: unrecoverable: content hashes do not match\n";
					++$numBad;
					unset( $stubs[$primaryId] );
					continue;
				}

				// Find the page_id and rev_id
				// The page is probably the same as the page of the secondary row
				$pageId = intval( $trackRow->bt_page );
				if ( !$pageId ) {
					$revId = $pageId = 0;
				} else {
					$revId = $this->findTextIdInPage( $pageId, $primaryId );
					if ( !$revId ) {
						// Actually an orphan
						$pageId = $revId = 0;
					}
				}

				$newFlags = $stub['legacyEncoding'] ? 'external' : 'external,utf-8';

				if ( !$dryRun ) {
					// Reset the text row to point to the original copy
					$dbw->begin();
					$dbw->update(
						'text',
						// SET
						array(
							'old_flags' => $newFlags,
							'old_text' => $url
						),
						// WHERE
						array( 'old_id' => $primaryId ),
						__METHOD__
					);

					// Add a blob_tracking row so that the new reference can be recompressed
					// without needing to run trackBlobs.php again
					$dbw->insert( 'blob_tracking',
						array(
							'bt_page' => $pageId,
							'bt_rev_id' => $revId,
							'bt_text_id' => $primaryId,
							'bt_cluster' => $trackRow->bt_cluster,
							'bt_blob_id' => $trackRow->bt_blob_id,
							'bt_cgz_hash' => $stub['hash'],
							'bt_new_url' => null,
							'bt_moved' => 0,
						),
						__METHOD__
					);
					$dbw->commit();
					$this->waitForSlaves();
				}

				print "$primaryId: resolved to $url\n";
				++$numFixed;
			}
		}

		print "\n";
		print "Fixed: $numFixed\n";
		print "Unrecoverable: $numBad\n";
		print "Good stubs: $numGood\n";
	}

	/**
	 * Call wfWaitForSlaves() once every 50 invocations.
	 *
	 * The previous condition, "++$iteration > 50 == 0", parsed as
	 * "( ++$iteration > 50 ) == 0" and was therefore true on every call, so
	 * the script waited for the slaves after every single commit; the counter
	 * was also incremented twice per call.
	 */
	function waitForSlaves() {
		static $iteration = 0;
		if ( ++$iteration >= 50 ) {
			wfWaitForSlaves();
			$iteration = 0;
		}
	}

	/**
	 * Look up the revision ID that uses a given text ID within a page.
	 *
	 * @param $pageId int Page ID to search in
	 * @param $textId int Text ID (rev_text_id) to look for
	 * @return mixed The rev_id as returned by the database, or null if the
	 *   page has no revision with that text ID
	 */
	function findTextIdInPage( $pageId, $textId ) {
		$ids = $this->getRevTextMap( $pageId );
		return isset( $ids[$textId] ) ? $ids[$textId] : null;
	}

	/**
	 * Get the rev_text_id => rev_id map for all revisions of a page, loading
	 * it from the revision table on first use and caching it in $mapCache.
	 *
	 * @param $pageId int Page ID
	 * @return array Map of rev_text_id => rev_id
	 */
	function getRevTextMap( $pageId ) {
		if ( !isset( $this->mapCache[$pageId] ) ) {
			// Limit cache size: drop maps from the front of the cache until the
			// entry count is back under the limit. The emptiness check prevents
			// spinning forever if the size counter and the cache ever disagree.
			while ( $this->mapCacheSize > $this->maxMapCacheSize && count( $this->mapCache ) ) {
				reset( $this->mapCache );
				$key = key( $this->mapCache );
				$this->mapCacheSize -= count( $this->mapCache[$key] );
				unset( $this->mapCache[$key] );
			}

			$dbr = wfGetDB( DB_SLAVE );
			$map = array();
			$res = $dbr->select( 'revision',
				array( 'rev_id', 'rev_text_id' ),
				array( 'rev_page' => $pageId ),
				__METHOD__
			);
			foreach ( $res as $row ) {
				$map[$row->rev_text_id] = $row->rev_id;
			}
			$this->mapCache[$pageId] = $map;
			$this->mapCacheSize += count( $map );
		}
		return $this->mapCache[$pageId];
	}

	/**
	 * Check whether a stub can still be resolved through its secondary text row.
	 *
	 * The secondary row's text is fetched from external storage if flagged
	 * "external", inflated if flagged "gzip", and unserialized. The stub is
	 * considered unbroken when the resulting object returns something other
	 * than false from getItem() for the stub's hash.
	 *
	 * @param $stub array Stub description as built in execute(); only the
	 *   'hash' key is read here
	 * @param $secondaryRow object Row from the text table with old_flags and
	 *   old_text fields
	 * @return bool
	 */
	function isUnbrokenStub( $stub, $secondaryRow ) {
		$flags = explode( ',', $secondaryRow->old_flags );
		$text = $secondaryRow->old_text;
		if ( in_array( 'external', $flags ) ) {
			$url = $text;
			// A usable URL needs a non-empty part after "://"
			$parts = explode( '://', $url, 2 );
			if ( !isset( $parts[1] ) || $parts[1] == "" ) {
				return false;
			}
			$text = ExternalStore::fetchFromUrl( $url );
			if ( $text === false ) {
				// Nothing could be fetched, so there is nothing to unserialize
				return false;
			}
		}
		if ( !in_array( 'object', $flags ) ) {
			return false;
		}

		if ( in_array( 'gzip', $flags ) ) {
			$text = gzinflate( $text );
			if ( $text === false ) {
				return false;
			}
		}
		$obj = unserialize( $text );

		if ( !is_object( $obj ) && is_string( $obj ) ) {
			// Correct for old double-serialization bug.
			$obj = unserialize( $obj );
		}

		if ( !is_object( $obj ) ) {
			return false;
		}

		$obj->uncompress();
		$text = $obj->getItem( $stub['hash'] );
		return $text !== false;
	}
}

$maintClass = 'FixBug20757';
require_once( RUN_MAINTENANCE_IF_MAIN );