MediaWiki
REL1_24
|
<?php
/**
 * Adds blobs from one or more external storage (ES) clusters to the
 * blob_tracking table and, when the gmp extension is available, records
 * blobs that nothing points to in the blob_orphans table.
 *
 * Usage: php trackBlobs.php <cluster> [... <cluster>]
 *
 * The tracking tables are dropped and rebuilt from scratch on every run.
 */

// $args is expected to be populated by commandLine.inc.
require __DIR__ . '/../commandLine.inc';

if ( count( $args ) < 1 ) {
	echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n";
	echo "Adds blobs from a given ES cluster to the blob_tracking table\n";
	echo "Automatically deletes the tracking table and starts from the start again when restarted.\n";

	exit( 1 );
}
$tracker = new TrackBlobs( $args );
$tracker->run();
echo "All done.\n";

/**
 * Builds the blob_tracking / blob_orphans tables for a set of ES clusters.
 */
class TrackBlobs {
	/** Cluster names to track, and the lazily built SQL clause matching their DB:// URLs */
	public $clusters, $textClause;

	/** True when the gmp extension is loaded, i.e. orphan blobs can be searched for */
	public $doBlobOrphans;

	/** Map of cluster name => GMP bitfield with one bit set per tracked blob ID */
	public $trackedBlobs = array();

	/** Maximum number of rows read per SELECT */
	public $batchSize = 1000;

	/** Number of batches between progress reports */
	public $reportingInterval = 10;

	/**
	 * @param array $clusters List of ES cluster names
	 */
	public function __construct( $clusters ) {
		$this->clusters = $clusters;
		if ( extension_loaded( 'gmp' ) ) {
			$this->doBlobOrphans = true;
			foreach ( $clusters as $cluster ) {
				$this->trackedBlobs[$cluster] = gmp_init( 0 );
			}
		} else {
			echo "Warning: the gmp extension is needed to find orphan blobs\n";
		}
	}

	/**
	 * Run every pass in order: integrity check, table reset, revision scan,
	 * orphan text scan and (with gmp only) orphan blob scan.
	 */
	public function run() {
		$this->checkIntegrity();
		$this->initTrackingTable();
		$this->trackRevisions();
		$this->trackOrphanText();
		if ( $this->doBlobOrphans ) {
			$this->findOrphanBlobs();
		}
	}

	/**
	 * Abort the script (exit code 1) if the text or archive tables contain
	 * rows that this script's messages say must be fixed first.
	 */
	public function checkIntegrity() {
		echo "Doing integrity check...\n";
		$dbr = wfGetDB( DB_SLAVE );

		// Scan for HistoryBlobStub objects in the text table (bug 20757)

		$exists = $dbr->selectField( 'text', 1,
			'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
			'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'',
			__METHOD__
		);

		if ( $exists ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
				"This script could destroy these objects if it continued. Run resolveStubs.php\n" .
				"to fix this.\n";
			exit( 1 );
		}

		// Scan the archive table for HistoryBlobStub objects or external flags (bug 22624)
		$flags = $dbr->selectField( 'archive', 'ar_flags',
			'ar_flags LIKE \'%external%\' OR (' .
			'ar_flags LIKE \'%object%\' ' .
			'AND LOWER(CONVERT(LEFT(ar_text,22) USING latin1)) = \'o:15:"historyblobstub"\' )',
			__METHOD__
		);

		// selectField() yields false when no row matched; only inspect real flag strings.
		if ( $flags !== false && strpos( $flags, 'external' ) !== false ) {
			echo "Integrity check failed: found external storage pointers in your archive table.\n" .
				"Run normaliseArchiveTable.php to fix this.\n";
			exit( 1 );
		} elseif ( $flags ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your archive table.\n" .
				"These objects are probably already broken, continuing would make them\n" .
				"unrecoverable. Run \"normaliseArchiveTable.php --fix-cgz-bug\" to fix this.\n";
			exit( 1 );
		}

		echo "Integrity check OK\n";
	}

	/**
	 * Drop any existing tracking tables and recreate them from blob_tracking.sql.
	 *
	 * Each table is checked on its own so that a partially cleaned-up database
	 * (only one of the two tables present) is still reset correctly.
	 */
	public function initTrackingTable() {
		$dbw = wfGetDB( DB_MASTER );
		foreach ( array( 'blob_tracking', 'blob_orphans' ) as $table ) {
			if ( $dbw->tableExists( $table ) ) {
				$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ), __METHOD__ );
			}
		}
		$dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
	}

	/**
	 * Build (once) and return the SQL condition matching old_text values that
	 * start with "DB://<cluster>/" for any of the configured clusters.
	 *
	 * @return string
	 */
	public function getTextClause() {
		if ( !$this->textClause ) {
			$dbr = wfGetDB( DB_SLAVE );
			$this->textClause = '';
			foreach ( $this->clusters as $cluster ) {
				if ( $this->textClause != '' ) {
					$this->textClause .= ' OR ';
				}
				$this->textClause .= 'old_text' .
					$dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
			}
		}

		return $this->textClause;
	}

	/**
	 * Parse a "DB://<cluster>/<id>[/<hash>]" pointer.
	 *
	 * @param string $text
	 * @return array|bool Array with keys 'cluster', 'id' (int) and 'hash'
	 *   (hex string, or null when absent); false if $text is not a valid pointer
	 */
	public function interpretPointer( $text ) {
		if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
			return false;
		}

		return array(
			'cluster' => $m[1],
			'id' => intval( $m[2] ),
			'hash' => isset( $m[3] ) ? $m[3] : null
		);
	}

	/**
	 * Scan the revision table (joined to text) in rev_id order and insert one
	 * blob_tracking row per revision whose text is an external pointer into
	 * one of the configured clusters.
	 */
	public function trackRevisions() {
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );

		$textClause = $this->getTextClause();
		$startId = 0;
		$endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
		$batchesDone = 0;
		$rowsInserted = 0;

		echo "Finding revisions...\n";

		while ( true ) {
			$res = $dbr->select( array( 'revision', 'text' ),
				array( 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ),
				array(
					'rev_id > ' . $dbr->addQuotes( $startId ),
					'rev_text_id=old_id',
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
				),
				__METHOD__,
				array(
					'ORDER BY' => 'rev_id',
					'LIMIT' => $this->batchSize
				)
			);
			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = array();
			foreach ( $res as $row ) {
				// Advance the cursor even for rows that end up being skipped.
				$startId = $row->rev_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
					echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
					continue;
				}
				$insertBatch[] = array(
					'bt_page' => $row->rev_page,
					'bt_rev_id' => $row->rev_id,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				);
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
			}
			$rowsInserted += count( $insertBatch );

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves();
			}
		}
		echo "Found $rowsInserted revisions\n";
	}

	/**
	 * Scan the text table in old_id order for external pointers into the
	 * configured clusters that have no blob_tracking row yet, and insert them
	 * with bt_page and bt_rev_id set to 0.
	 */
	public function trackOrphanText() {
		# Wait until the blob_tracking table is available in the slave
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );
		$pos = $dbw->getMasterPos();
		$dbr->masterPosWait( $pos, 100000 );

		$textClause = $this->getTextClause();
		$startId = 0;
		$endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
		$rowsInserted = 0;
		$batchesDone = 0;

		echo "Finding orphan text...\n";

		# Scan the text table for orphan text
		while ( true ) {
			$res = $dbr->select( array( 'text', 'blob_tracking' ),
				array( 'old_id', 'old_flags', 'old_text' ),
				array(
					'old_id>' . $dbr->addQuotes( $startId ),
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
					'bt_text_id IS NULL'
				),
				__METHOD__,
				array(
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize
				),
				array( 'blob_tracking' => array( 'LEFT JOIN', 'bt_text_id=old_id' ) )
			);

			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = array();
			foreach ( $res as $row ) {
				// Advance the cursor even for rows that end up being skipped.
				$startId = $row->old_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in old_id {$row->old_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
					echo "Invalid cluster returned in SQL query\n";
					continue;
				}

				$insertBatch[] = array(
					'bt_page' => 0,
					'bt_rev_id' => 0,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				);
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
			}

			$rowsInserted += count( $insertBatch );
			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves();
			}
		}
		echo "Found $rowsInserted orphan text rows\n";
	}

	/**
	 * For each configured cluster, build a bitfield of the blob IDs that
	 * actually exist in its blobs table, subtract the blobs recorded by the
	 * earlier passes, and insert the remainder into blob_orphans.
	 *
	 * Requires the gmp extension; returns early with a message without it.
	 */
	public function findOrphanBlobs() {
		if ( !extension_loaded( 'gmp' ) ) {
			echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";

			return;
		}

		$dbw = wfGetDB( DB_MASTER );

		foreach ( $this->clusters as $cluster ) {
			echo "Searching for orphan blobs in $cluster...\n";
			$lb = wfGetLBFactory()->getExternalLB( $cluster );
			try {
				$extDB = $lb->getConnection( DB_SLAVE );
			} catch ( DBConnectionError $e ) {
				if ( strpos( $e->error, 'Unknown database' ) !== false ) {
					echo "No database on $cluster\n";
				} else {
					echo "Error on $cluster: " . $e->getMessage() . "\n";
				}
				continue;
			}
			$table = $extDB->getLBInfo( 'blobs table' );
			if ( is_null( $table ) ) {
				$table = 'blobs';
			}
			if ( !$extDB->tableExists( $table ) ) {
				echo "No blobs table on cluster $cluster\n";
				continue;
			}
			$startId = 0;
			$batchesDone = 0;
			$actualBlobs = gmp_init( 0 );
			$endId = $extDB->selectField( $table, 'MAX(blob_id)', false, __METHOD__ );

			// Build a bitmap of actual blob rows
			while ( true ) {
				$res = $extDB->select( $table,
					array( 'blob_id' ),
					array( 'blob_id > ' . $extDB->addQuotes( $startId ) ),
					__METHOD__,
					array( 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' )
				);

				if ( !$res->numRows() ) {
					break;
				}

				foreach ( $res as $row ) {
					gmp_setbit( $actualBlobs, $row->blob_id );
					// Rows arrive in blob_id order, so the last one seen is the cursor.
					$startId = $row->blob_id;
				}

				++$batchesDone;
				if ( $batchesDone >= $this->reportingInterval ) {
					$batchesDone = 0;
					echo "$startId / $endId\n";
				}
			}

			// Find actual blobs that weren't tracked by the previous passes
			// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
			$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

			// Traverse the orphan list
			$insertBatch = array();
			$id = 0;
			$numOrphans = 0;
			while ( true ) {
				// gmp_scan1() returns the index of the next set bit at or after $id, or -1.
				$id = gmp_scan1( $orphans, $id );
				if ( $id == -1 ) {
					break;
				}
				$insertBatch[] = array(
					'bo_cluster' => $cluster,
					'bo_blob_id' => $id
				);
				// Flush as soon as the batch is full so no insert exceeds batchSize rows.
				if ( count( $insertBatch ) >= $this->batchSize ) {
					$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
					$insertBatch = array();
				}

				++$id;
				++$numOrphans;
			}
			if ( $insertBatch ) {
				$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
			}
			echo "Found $numOrphans orphan(s) in $cluster\n";
		}
	}
}