MediaWiki
REL1_22
|
<?php
/**
 * Adds blobs from one or more external storage (ES) clusters to the
 * blob_tracking table, and records blobs that nothing points at in
 * blob_orphans.
 *
 * Usage: php trackBlobs.php <cluster> [... <cluster>]
 *
 * The tracking tables are dropped and rebuilt on every run, so restarting
 * the script starts the scan again from the beginning.
 */

require __DIR__ . '/../commandLine.inc';

if ( count( $args ) < 1 ) {
	echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n";
	echo "Adds blobs from a given ES cluster to the blob_tracking table\n";
	echo "Automatically deletes the tracking table and starts from the start again when restarted.\n";

	exit( 1 );
}
$tracker = new TrackBlobs( $args );
$tracker->run();
echo "All done.\n";

/**
 * Scans the revision, text and external blobs tables and records which
 * external storage blobs are referenced (blob_tracking) and which are not
 * (blob_orphans).
 */
class TrackBlobs {
	/**
	 * $clusters: list of cluster names to scan, as given on the command line.
	 * $textClause: lazily built SQL condition matching old_text pointers into
	 * any of $clusters (see getTextClause()).
	 */
	public $clusters, $textClause;

	/** True when the gmp extension is loaded, enabling the orphan blob pass. */
	public $doBlobOrphans = false;

	/** Map of cluster name => GMP bitfield with one bit set per tracked blob ID. */
	public $trackedBlobs = array();

	/** Maximum number of rows fetched or inserted per query. */
	public $batchSize = 1000;

	/** Number of batches between progress reports. */
	public $reportingInterval = 10;

	/**
	 * @param array $clusters Names of the external storage clusters to scan
	 */
	public function __construct( $clusters ) {
		$this->clusters = $clusters;
		if ( extension_loaded( 'gmp' ) ) {
			$this->doBlobOrphans = true;
			foreach ( $clusters as $cluster ) {
				$this->trackedBlobs[$cluster] = gmp_init( 0 );
			}
		} else {
			echo "Warning: the gmp extension is needed to find orphan blobs\n";
		}
	}

	/**
	 * Run every pass in order: integrity check, table reset, revision scan,
	 * orphan text scan and, when GMP is available, the orphan blob scan.
	 */
	public function run() {
		$this->checkIntegrity();
		$this->initTrackingTable();
		$this->trackRevisions();
		$this->trackOrphanText();
		if ( $this->doBlobOrphans ) {
			$this->findOrphanBlobs();
		}
	}

	/**
	 * Refuse to continue (exit status 1) when the text or archive table
	 * contains rows this script is not prepared to handle.
	 */
	public function checkIntegrity() {
		echo "Doing integrity check...\n";
		$dbr = wfGetDB( DB_SLAVE );

		// Scan for HistoryBlobStub objects in the text table (bug 20757)
		$exists = $dbr->selectField( 'text', 1,
			'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
			'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'',
			__METHOD__
		);

		if ( $exists ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
				"This script could destroy these objects if it continued. Run resolveStubs.php\n" .
				"to fix this.\n";
			exit( 1 );
		}

		// Scan the archive table for HistoryBlobStub objects or external flags (bug 22624)
		$flags = $dbr->selectField( 'archive', 'ar_flags',
			'ar_flags LIKE \'%external%\' OR (' .
			'ar_flags LIKE \'%object%\' ' .
			'AND LOWER(CONVERT(LEFT(ar_text,22) USING latin1)) = \'o:15:"historyblobstub"\' )',
			__METHOD__
		);

		// $flags is false when no archive row matched, otherwise the ar_flags
		// value of one offending row; its content tells the two cases apart.
		if ( $flags !== false && strpos( $flags, 'external' ) !== false ) {
			echo "Integrity check failed: found external storage pointers in your archive table.\n" .
				"Run normaliseArchiveTable.php to fix this.\n";
			exit( 1 );
		} elseif ( $flags ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your archive table.\n" .
				"These objects are probably already broken, continuing would make them\n" .
				"unrecoverable. Run \"normaliseArchiveTable.php --fix-cgz-bug\" to fix this.\n";
			exit( 1 );
		}

		echo "Integrity check OK\n";
	}

	/**
	 * Drop any tracking tables left over from a previous run and recreate
	 * them from blob_tracking.sql.
	 */
	public function initTrackingTable() {
		$dbw = wfGetDB( DB_MASTER );
		// Check each table on its own so that a partially initialised or
		// partially dropped state from an interrupted run does not make the
		// DROP fail. NOTE(review): assumes blob_tracking.sql creates both
		// tables -- confirm against that file.
		foreach ( array( 'blob_tracking', 'blob_orphans' ) as $table ) {
			if ( $dbw->tableExists( $table ) ) {
				$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ) );
			}
		}
		$dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
	}

	/**
	 * Build (once) and return an SQL condition that matches old_text values
	 * starting with "DB://<cluster>/" for any of the configured clusters.
	 *
	 * @return string Alternatives joined with OR; empty when there are no clusters
	 */
	public function getTextClause() {
		if ( !$this->textClause ) {
			$dbr = wfGetDB( DB_SLAVE );
			$this->textClause = '';
			foreach ( $this->clusters as $cluster ) {
				if ( $this->textClause != '' ) {
					$this->textClause .= ' OR ';
				}
				$this->textClause .= 'old_text' . $dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
			}
		}
		return $this->textClause;
	}

	/**
	 * Parse an external storage pointer of the form
	 * DB://<cluster>/<id> or DB://<cluster>/<id>/<hex hash>.
	 *
	 * @param string $text Pointer text, as stored in old_text
	 * @return array|bool Array with keys 'cluster' (string), 'id' (int) and
	 *   'hash' (string, or null when absent); false if $text does not match
	 */
	public function interpretPointer( $text ) {
		if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
			return false;
		}
		return array(
			'cluster' => $m[1],
			'id' => intval( $m[2] ),
			// The hash group is missing from $m when the pointer has no third component
			'hash' => isset( $m[3] ) ? $m[3] : null
		);
	}

	/**
	 * Scan the revision table, joined to text, for rows whose text is stored
	 * in one of the configured clusters, and record each one in blob_tracking.
	 */
	public function trackRevisions() {
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );

		$textClause = $this->getTextClause();
		$startId = 0;
		$endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
		$batchesDone = 0;
		$rowsInserted = 0;

		echo "Finding revisions...\n";

		while ( true ) {
			$res = $dbr->select( array( 'revision', 'text' ),
				array( 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ),
				array(
					'rev_id > ' . $dbr->addQuotes( $startId ),
					'rev_text_id=old_id',
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
				),
				__METHOD__,
				array(
					'ORDER BY' => 'rev_id',
					'LIMIT' => $this->batchSize
				)
			);
			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = array();
			foreach ( $res as $row ) {
				// Advance the cursor even for rows that are skipped below
				$startId = $row->rev_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
					continue;
				}
				// Strict comparison: numeric-looking names such as "1e1" and "10"
				// compare equal loosely, which would pass this check and then hit
				// an undefined trackedBlobs key below.
				if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
					echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
					continue;
				}
				$insertBatch[] = array(
					'bt_page' => $row->rev_page,
					'bt_rev_id' => $row->rev_id,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				);
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
			$rowsInserted += count( $insertBatch );

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves();
			}
		}
		echo "Found $rowsInserted revisions\n";
	}

	/**
	 * Scan the text table for external pointers into the configured clusters
	 * that have no blob_tracking row yet (i.e. were not reached through the
	 * revision table), and record them with bt_page and bt_rev_id set to 0.
	 */
	public function trackOrphanText() {
		# Wait until the blob_tracking table is available in the slave
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );
		$pos = $dbw->getMasterPos();
		$dbr->masterPosWait( $pos, 100000 );

		$textClause = $this->getTextClause();
		$startId = 0;
		$endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
		$rowsInserted = 0;
		$batchesDone = 0;

		echo "Finding orphan text...\n";

		# Scan the text table for orphan text
		while ( true ) {
			$res = $dbr->select( array( 'text', 'blob_tracking' ),
				array( 'old_id', 'old_flags', 'old_text' ),
				array(
					'old_id>' . $dbr->addQuotes( $startId ),
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
					'bt_text_id IS NULL'
				),
				__METHOD__,
				array(
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize
				),
				array( 'blob_tracking' => array( 'LEFT JOIN', 'bt_text_id=old_id' ) )
			);

			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = array();
			foreach ( $res as $row ) {
				// Advance the cursor even for rows that are skipped below
				$startId = $row->old_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in old_id {$row->old_id}\n";
					continue;
				}
				// Strict comparison, see trackRevisions()
				if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
					echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
					continue;
				}

				$insertBatch[] = array(
					'bt_page' => 0,
					'bt_rev_id' => 0,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				);
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );

			$rowsInserted += count( $insertBatch );
			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves();
			}
		}
		echo "Found $rowsInserted orphan text rows\n";
	}

	/**
	 * For each configured cluster, scan its blobs table and record in
	 * blob_orphans every blob ID that neither trackRevisions() nor
	 * trackOrphanText() marked as tracked.
	 *
	 * Requires the gmp extension; the sets of IDs are held as bitfields.
	 */
	public function findOrphanBlobs() {
		if ( !extension_loaded( 'gmp' ) ) {
			echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";
			return;
		}

		$dbw = wfGetDB( DB_MASTER );

		foreach ( $this->clusters as $cluster ) {
			echo "Searching for orphan blobs in $cluster...\n";
			$lb = wfGetLBFactory()->getExternalLB( $cluster );
			try {
				$extDB = $lb->getConnection( DB_SLAVE );
			} catch ( DBConnectionError $e ) {
				if ( strpos( $e->error, 'Unknown database' ) !== false ) {
					echo "No database on $cluster\n";
				} else {
					echo "Error on $cluster: " . $e->getMessage() . "\n";
				}
				continue;
			}
			$table = $extDB->getLBInfo( 'blobs table' );
			if ( is_null( $table ) ) {
				$table = 'blobs';
			}
			if ( !$extDB->tableExists( $table ) ) {
				echo "No blobs table on cluster $cluster\n";
				continue;
			}
			$startId = 0;
			$batchesDone = 0;
			$actualBlobs = gmp_init( 0 );
			$endId = $extDB->selectField( $table, 'MAX(blob_id)', false, __METHOD__ );

			// Build a bitmap of actual blob rows
			while ( true ) {
				$res = $extDB->select( $table,
					array( 'blob_id' ),
					array( 'blob_id > ' . $extDB->addQuotes( $startId ) ),
					__METHOD__,
					array( 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' )
				);

				if ( !$res->numRows() ) {
					break;
				}

				foreach ( $res as $row ) {
					gmp_setbit( $actualBlobs, $row->blob_id );
					// Rows arrive ordered by blob_id, so the last one seen is
					// where the next batch resumes
					$startId = $row->blob_id;
				}

				++$batchesDone;
				if ( $batchesDone >= $this->reportingInterval ) {
					$batchesDone = 0;
					echo "$startId / $endId\n";
				}
			}

			// Find actual blobs that weren't tracked by the previous passes
			// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
			$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

			// Traverse the orphan list
			$insertBatch = array();
			$id = 0;
			$numOrphans = 0;
			while ( true ) {
				// Index of the next set bit at or after $id, or -1 when none is left
				$id = gmp_scan1( $orphans, $id );
				if ( $id === -1 ) {
					break;
				}
				$insertBatch[] = array(
					'bo_cluster' => $cluster,
					'bo_blob_id' => $id
				);
				// Flush as soon as the batch is full so inserts never exceed batchSize rows
				if ( count( $insertBatch ) >= $this->batchSize ) {
					$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
					$insertBatch = array();
				}

				++$id;
				++$numOrphans;
			}
			if ( $insertBatch ) {
				$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
			}
			echo "Found $numOrphans orphan(s) in $cluster\n";
		}
	}
}