MediaWiki
REL1_23
|
<?php
/**
 * Adds blobs from one or more external storage (ES) clusters to the
 * blob_tracking table, and (when the gmp extension is available) records
 * blobs that nothing points at in the blob_orphans table.
 *
 * Usage: php trackBlobs.php <cluster> [... <cluster>]
 *
 * The tracking tables are dropped and rebuilt from scratch on every run.
 */

// NOTE(review): $args is not defined in this file; presumably commandLine.inc
// populates it with the positional command-line arguments -- confirm.
require __DIR__ . '/../commandLine.inc';

if ( count( $args ) < 1 ) {
	echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n";
	echo "Adds blobs from a given ES cluster to the blob_tracking table\n";
	echo "Automatically deletes the tracking table and starts from the start again when restarted.\n";

	exit( 1 );
}
$tracker = new TrackBlobs( $args );
$tracker->run();
echo "All done.\n";

/**
 * Builds the blob_tracking table (and optionally blob_orphans) for a list of
 * external storage clusters.
 */
class TrackBlobs {
	/** Cluster names given to the constructor; lazily built SQL clause matching their DB:// pointers */
	public $clusters, $textClause;

	/** Set to true by the constructor when gmp is loaded; otherwise left unset (null) */
	public $doBlobOrphans;

	/** Map of cluster name => GMP number used as a bitfield of referenced blob IDs */
	public $trackedBlobs = array();

	/** Maximum number of rows fetched per SELECT and written per orphan INSERT */
	public $batchSize = 1000;

	/** Number of batches between progress reports */
	public $reportingInterval = 10;

	/**
	 * @param array $clusters Names of the external storage clusters to track
	 */
	public function __construct( $clusters ) {
		$this->clusters = $clusters;
		if ( extension_loaded( 'gmp' ) ) {
			$this->doBlobOrphans = true;
			foreach ( $clusters as $cluster ) {
				// One empty bitfield per cluster; bits are set as pointers are found
				$this->trackedBlobs[$cluster] = gmp_init( 0 );
			}
		} else {
			echo "Warning: the gmp extension is needed to find orphan blobs\n";
		}
	}

	/**
	 * Run all passes in order: integrity check, table reset, revision
	 * tracking, orphan text tracking and, if enabled, orphan blob detection.
	 */
	public function run() {
		$this->checkIntegrity();
		$this->initTrackingTable();
		$this->trackRevisions();
		$this->trackOrphanText();
		if ( $this->doBlobOrphans ) {
			$this->findOrphanBlobs();
		}
	}

	/**
	 * Abort the script (exit status 1) if the text or archive tables contain
	 * rows this script is not prepared to handle.
	 */
	public function checkIntegrity() {
		echo "Doing integrity check...\n";
		$dbr = wfGetDB( DB_SLAVE );

		// Scan for HistoryBlobStub objects in the text table (bug 20757)

		$exists = $dbr->selectField( 'text', 1,
			'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
			'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'',
			__METHOD__
		);

		if ( $exists ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
				"This script could destroy these objects if it continued. Run resolveStubs.php\n" .
				"to fix this.\n";
			exit( 1 );
		}

		// Scan the archive table for HistoryBlobStub objects or external flags (bug 22624)
		$flags = $dbr->selectField( 'archive', 'ar_flags',
			'ar_flags LIKE \'%external%\' OR (' .
			'ar_flags LIKE \'%object%\' ' .
			'AND LOWER(CONVERT(LEFT(ar_text,22) USING latin1)) = \'o:15:"historyblobstub"\' )',
			__METHOD__
		);

		if ( strpos( $flags, 'external' ) !== false ) {
			echo "Integrity check failed: found external storage pointers in your archive table.\n" .
				"Run normaliseArchiveTable.php to fix this.\n";
			exit( 1 );
		} elseif ( $flags ) {
			// A row matched but is not flagged external, so it matched the stub condition
			echo "Integrity check failed: found HistoryBlobStub objects in your archive table.\n" .
				"These objects are probably already broken, continuing would make them\n" .
				"unrecoverable. Run \"normaliseArchiveTable.php --fix-cgz-bug\" to fix this.\n";
			exit( 1 );
		}

		echo "Integrity check OK\n";
	}

	/**
	 * Drop any existing tracking tables and re-create them from
	 * blob_tracking.sql.
	 *
	 * Each table is checked and dropped on its own, so a half-initialised
	 * state (only one of the two tables present) neither fails on a DROP of a
	 * missing table nor leaves a stale table behind.
	 */
	public function initTrackingTable() {
		$dbw = wfGetDB( DB_MASTER );
		foreach ( array( 'blob_tracking', 'blob_orphans' ) as $table ) {
			if ( $dbw->tableExists( $table ) ) {
				$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ) );
			}
		}
		// NOTE(review): blob_tracking.sql is presumed to create both tables -- confirm
		$dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
	}

	/**
	 * Get the SQL condition matching old_text values that start with
	 * "DB://<cluster>/" for any configured cluster. Built once, then cached
	 * in $this->textClause.
	 *
	 * @return string
	 */
	public function getTextClause() {
		if ( !$this->textClause ) {
			$dbr = wfGetDB( DB_SLAVE );
			$this->textClause = '';
			foreach ( $this->clusters as $cluster ) {
				if ( $this->textClause != '' ) {
					$this->textClause .= ' OR ';
				}
				$this->textClause .= 'old_text' .
					$dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
			}
		}

		return $this->textClause;
	}

	/**
	 * Parse an external storage pointer of the form DB://cluster/id or
	 * DB://cluster/id/hash.
	 *
	 * @param string $text
	 * @return array|bool Array with keys 'cluster', 'id' (int) and 'hash'
	 *   (string, or null when absent), or false if $text is not a valid pointer
	 */
	public function interpretPointer( $text ) {
		if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
			return false;
		}

		return array(
			'cluster' => $m[1],
			'id' => intval( $m[2] ),
			// The hash group is missing from $m when the pointer has no third segment
			'hash' => isset( $m[3] ) ? $m[3] : null
		);
	}

	/**
	 * Scan the revision table (joined to text) for rows pointing into the
	 * configured clusters and add one blob_tracking row for each.
	 */
	public function trackRevisions() {
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );

		$textClause = $this->getTextClause();
		$startId = 0;
		$endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
		$batchesDone = 0;
		$rowsInserted = 0;

		echo "Finding revisions...\n";

		while ( true ) {
			$res = $dbr->select( array( 'revision', 'text' ),
				array( 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ),
				array(
					'rev_id > ' . $dbr->addQuotes( $startId ),
					'rev_text_id=old_id',
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
				),
				__METHOD__,
				array(
					'ORDER BY' => 'rev_id',
					'LIMIT' => $this->batchSize
				)
			);
			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = array();
			foreach ( $res as $row ) {
				// Advance the cursor even for rows that get skipped below
				$startId = $row->rev_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters ) ) {
					echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
					continue;
				}
				$insertBatch[] = array(
					'bt_page' => $row->rev_page,
					'bt_rev_id' => $row->rev_id,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				);
				if ( $this->doBlobOrphans ) {
					// Mark this blob as referenced for the later orphan blob pass
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
			$rowsInserted += count( $insertBatch );

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves();
			}
		}
		echo "Found $rowsInserted revisions\n";
	}

	/**
	 * Scan the text table for rows pointing into the configured clusters
	 * that have no blob_tracking row yet (i.e. were not found through the
	 * revision table) and track them with bt_page and bt_rev_id set to 0.
	 */
	public function trackOrphanText() {
		# Wait until the blob_tracking table is available in the slave
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );
		$pos = $dbw->getMasterPos();
		$dbr->masterPosWait( $pos, 100000 );

		$textClause = $this->getTextClause();
		$startId = 0;
		$endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
		$rowsInserted = 0;
		$batchesDone = 0;

		echo "Finding orphan text...\n";

		# Scan the text table for orphan text
		while ( true ) {
			$res = $dbr->select( array( 'text', 'blob_tracking' ),
				array( 'old_id', 'old_flags', 'old_text' ),
				array(
					'old_id>' . $dbr->addQuotes( $startId ),
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
					// Only text rows with no matching blob_tracking row
					'bt_text_id IS NULL'
				),
				__METHOD__,
				array(
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize
				),
				array( 'blob_tracking' => array( 'LEFT JOIN', 'bt_text_id=old_id' ) )
			);

			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = array();
			foreach ( $res as $row ) {
				// Advance the cursor even for rows that get skipped below
				$startId = $row->old_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in old_id {$row->old_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters ) ) {
					echo "Invalid cluster returned in SQL query\n";
					continue;
				}

				$insertBatch[] = array(
					'bt_page' => 0,
					'bt_rev_id' => 0,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				);
				if ( $this->doBlobOrphans ) {
					// Mark this blob as referenced for the later orphan blob pass
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );

			$rowsInserted += count( $insertBatch );
			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves();
			}
		}
		echo "Found $rowsInserted orphan text rows\n";
	}

	/**
	 * For each cluster, find blob rows that were not marked as referenced by
	 * trackRevisions() or trackOrphanText(), and record them in blob_orphans.
	 *
	 * Requires the gmp extension: the set of blob IDs is held as a bitfield
	 * in a GMP number. Clusters that cannot be connected to, or that have no
	 * blobs table, are reported and skipped.
	 */
	public function findOrphanBlobs() {
		if ( !extension_loaded( 'gmp' ) ) {
			echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";

			return;
		}

		$dbw = wfGetDB( DB_MASTER );

		foreach ( $this->clusters as $cluster ) {
			echo "Searching for orphan blobs in $cluster...\n";
			$lb = wfGetLBFactory()->getExternalLB( $cluster );
			try {
				$extDB = $lb->getConnection( DB_SLAVE );
			} catch ( DBConnectionError $e ) {
				if ( strpos( $e->error, 'Unknown database' ) !== false ) {
					echo "No database on $cluster\n";
				} else {
					echo "Error on $cluster: " . $e->getMessage() . "\n";
				}
				continue;
			}
			// Fall back to the table name 'blobs' when the connection has none configured
			$table = $extDB->getLBInfo( 'blobs table' );
			if ( is_null( $table ) ) {
				$table = 'blobs';
			}
			if ( !$extDB->tableExists( $table ) ) {
				echo "No blobs table on cluster $cluster\n";
				continue;
			}
			$startId = 0;
			$batchesDone = 0;
			$actualBlobs = gmp_init( 0 );
			$endId = $extDB->selectField( $table, 'MAX(blob_id)', false, __METHOD__ );

			// Build a bitmap of actual blob rows
			while ( true ) {
				$res = $extDB->select( $table,
					array( 'blob_id' ),
					array( 'blob_id > ' . $extDB->addQuotes( $startId ) ),
					__METHOD__,
					array( 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' )
				);

				if ( !$res->numRows() ) {
					break;
				}

				foreach ( $res as $row ) {
					gmp_setbit( $actualBlobs, $row->blob_id );
					// Rows are ordered by blob_id, so the last one seen is the next cursor
					$startId = $row->blob_id;
				}

				++$batchesDone;
				if ( $batchesDone >= $this->reportingInterval ) {
					$batchesDone = 0;
					echo "$startId / $endId\n";
				}
			}

			// Find actual blobs that weren't tracked by the previous passes
			// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
			$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

			// Traverse the orphan list
			$insertBatch = array();
			$id = 0;
			$numOrphans = 0;
			while ( true ) {
				// Next set bit at or after $id; -1 means there are none left
				$id = gmp_scan1( $orphans, $id );
				if ( $id == -1 ) {
					break;
				}
				$insertBatch[] = array(
					'bo_cluster' => $cluster,
					'bo_blob_id' => $id
				);
				// Flush as soon as the batch is full, so no insert exceeds batchSize rows
				if ( count( $insertBatch ) >= $this->batchSize ) {
					$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
					$insertBatch = array();
				}

				++$id;
				++$numOrphans;
			}
			if ( $insertBatch ) {
				$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
			}
			echo "Found $numOrphans orphan(s) in $cluster\n";
		}
	}
}