MediaWiki  REL1_19
trackBlobs.php
Go to the documentation of this file.
<?php
/**
 * Command-line entry point: adds the blobs of one or more external storage
 * clusters to the blob_tracking table.
 *
 * $args is expected to be populated by commandLine.inc with the cluster names
 * given on the command line.
 */
require( dirname( __FILE__ ) . '/../commandLine.inc' );

// At least one cluster name is mandatory; otherwise print the usage and bail out.
if ( !count( $args ) ) {
	echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n",
		"Adds blobs from a given ES cluster to the blob_tracking table\n",
		"Automatically deletes the tracking table and starts from the start again when restarted.\n";

	exit( 1 );
}

$tracker = new TrackBlobs( $args );
$tracker->run();
echo "All done.\n";
00038 
/**
 * Fills the blob_tracking and blob_orphans tables for a set of external
 * storage clusters.
 *
 * The work is done in passes (see run()):
 *  - an integrity check that aborts the script when the text or archive
 *    tables contain rows this script refuses to work with,
 *  - re-creation of the tracking tables,
 *  - a scan of revision/text for DB:// pointers into the given clusters,
 *  - a scan of text for pointers that no tracked revision refers to,
 *  - optionally (GMP required) a scan of each cluster's blobs table for
 *    blob IDs that were not seen by the two previous scans.
 */
class TrackBlobs {
	/** Array of cluster names to process, as passed to the constructor */
	public $clusters;

	/** Cached SQL condition matching old_text against the clusters; see getTextClause() */
	public $textClause;

	/** True when the gmp extension was available at construction time */
	public $doBlobOrphans = false;

	/**
	 * Map of cluster name => GMP number used as a bitfield: bit N is set once
	 * a pointer to blob ID N of that cluster has been tracked.
	 */
	public $trackedBlobs = array();

	/** Maximum number of rows fetched per SELECT, and rows per orphan INSERT */
	public $batchSize = 1000;

	/** Number of batches between two progress reports */
	public $reportingInterval = 10;

	/**
	 * @param $clusters Array of external storage cluster names
	 */
	function __construct( $clusters ) {
		$this->clusters = $clusters;
		if ( extension_loaded( 'gmp' ) ) {
			$this->doBlobOrphans = true;
			foreach ( $clusters as $cluster ) {
				$this->trackedBlobs[$cluster] = gmp_init( 0 );
			}
		} else {
			echo "Warning: the gmp extension is needed to find orphan blobs\n";
		}
	}

	/**
	 * Run every pass in order. The orphan blob pass is skipped when the gmp
	 * extension was not available at construction time.
	 */
	function run() {
		$this->checkIntegrity();
		$this->initTrackingTable();
		$this->trackRevisions();
		$this->trackOrphanText();
		if ( $this->doBlobOrphans ) {
			$this->findOrphanBlobs();
		}
	}

	/**
	 * Refuse to continue when the text or archive tables contain
	 * HistoryBlobStub objects, or when the archive table contains external
	 * storage pointers. Terminates the script with exit status 1 on failure.
	 */
	function checkIntegrity() {
		echo "Doing integrity check...\n";
		$dbr = wfGetDB( DB_SLAVE );

		// Scan for HistoryBlobStub objects in the text table (bug 20757)

		$exists = $dbr->selectField( 'text', 1,
			'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
			'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'',
			__METHOD__
		);

		if ( $exists ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
				"This script could destroy these objects if it continued. Run resolveStubs.php\n" .
				"to fix this.\n";
			exit( 1 );
		}

		// Scan the archive table for HistoryBlobStub objects or external flags (bug 22624)
		$flags = $dbr->selectField( 'archive', 'ar_flags',
			'ar_flags LIKE \'%external%\' OR (' .
			'ar_flags LIKE \'%object%\' ' .
			'AND LOWER(CONVERT(LEFT(ar_text,22) USING latin1)) = \'o:15:"historyblobstub"\' )',
			__METHOD__
		);

		// selectField() yields false when no row matched; only inspect real flag strings
		if ( $flags !== false && strpos( $flags, 'external' ) !== false ) {
			echo "Integrity check failed: found external storage pointers in your archive table.\n" .
				"Run normaliseArchiveTable.php to fix this.\n";
			exit( 1 );
		} elseif ( $flags ) {
			echo "Integrity check failed: found HistoryBlobStub objects in your archive table.\n" .
				"These objects are probably already broken, continuing would make them\n" .
				"unrecoverable. Run \"normaliseArchiveTable.php --fix-cgz-bug\" to fix this.\n";
			exit( 1 );
		}

		echo "Integrity check OK\n";
	}

	/**
	 * Drop any existing tracking tables and re-create them from
	 * blob_tracking.sql, which lives next to this script.
	 *
	 * Each table is checked on its own, so a partially created or partially
	 * dropped pair of tables left by an earlier run does not make the DROP or
	 * the schema load fail.
	 */
	function initTrackingTable() {
		$dbw = wfGetDB( DB_MASTER );
		foreach ( array( 'blob_tracking', 'blob_orphans' ) as $table ) {
			if ( $dbw->tableExists( $table ) ) {
				$dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ), __METHOD__ );
			}
		}
		$dbw->sourceFile( dirname( __FILE__ ) . '/blob_tracking.sql' );
	}

	/**
	 * Get the SQL condition selecting text rows whose old_text starts with
	 * "DB://<cluster>/" for any of the configured clusters. The condition is
	 * built on first use and cached in $this->textClause.
	 *
	 * @return String: one LIKE expression per cluster, joined with OR
	 */
	function getTextClause() {
		if ( !$this->textClause ) {
			$dbr = wfGetDB( DB_SLAVE );
			$conds = array();
			foreach ( $this->clusters as $cluster ) {
				$conds[] = 'old_text' . $dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
			}
			$this->textClause = implode( ' OR ', $conds );
		}
		return $this->textClause;
	}

	/**
	 * Split a pointer of the form DB://<cluster>/<id>[/<hex hash>] into its parts.
	 *
	 * @param $text String: contents of an old_text field
	 * @return Array with keys 'cluster' (string), 'id' (integer) and 'hash'
	 *   (string, or null when the pointer has no hash part), or false when
	 *   $text does not have that form
	 */
	function interpretPointer( $text ) {
		if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
			return false;
		}
		return array(
			'cluster' => $m[1],
			'id' => intval( $m[2] ),
			// preg_match() omits a trailing group that did not participate in the match
			'hash' => isset( $m[3] ) ? $m[3] : null
		);
	}

	/**
	 * Scan the revision table, joined with text, for external pointers into
	 * the configured clusters and add one blob_tracking row per revision found.
	 * Rows are read from the slave in batches ordered by rev_id.
	 */
	function trackRevisions() {
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );

		$textClause = $this->getTextClause();
		$startId = 0;
		// Only used for the progress report
		$endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
		$batchesDone = 0;
		$rowsInserted = 0;

		echo "Finding revisions...\n";

		while ( true ) {
			$res = $dbr->select( array( 'revision', 'text' ),
				array( 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ),
				array(
					'rev_id > ' . $dbr->addQuotes( $startId ),
					'rev_text_id=old_id',
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
				),
				__METHOD__,
				array(
					'ORDER BY' => 'rev_id',
					'LIMIT' => $this->batchSize
				)
			);
			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = array();
			foreach ( $res as $row ) {
				// Advance the cursor even for rows that are skipped below
				$startId = $row->rev_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
					echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
					continue;
				}
				$insertBatch[] = array(
					'bt_page' => $row->rev_page,
					'bt_rev_id' => $row->rev_id,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				);
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			// Every row of the batch may have been skipped as invalid
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
				$rowsInserted += count( $insertBatch );
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves();
			}
		}
		echo "Found $rowsInserted revisions\n";
	}

	/**
	 * Scan the text table for external pointers into the configured clusters
	 * that have no matching blob_tracking row, and add them to blob_tracking
	 * with bt_page and bt_rev_id set to 0.
	 */
	function trackOrphanText() {
		# Wait until the blob_tracking table is available in the slave
		$dbw = wfGetDB( DB_MASTER );
		$dbr = wfGetDB( DB_SLAVE );
		$pos = $dbw->getMasterPos();
		$dbr->masterPosWait( $pos, 100000 );

		$textClause = $this->getTextClause();
		$startId = 0;
		// Only used for the progress report
		$endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
		$rowsInserted = 0;
		$batchesDone = 0;

		echo "Finding orphan text...\n";

		# Scan the text table for orphan text
		while ( true ) {
			// The LEFT JOIN combined with "bt_text_id IS NULL" keeps only text
			// rows without any blob_tracking row
			$res = $dbr->select( array( 'text', 'blob_tracking' ),
				array( 'old_id', 'old_flags', 'old_text' ),
				array(
					'old_id>' . $dbr->addQuotes( $startId ),
					$textClause,
					'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
					'bt_text_id IS NULL'
				),
				__METHOD__,
				array(
					'ORDER BY' => 'old_id',
					'LIMIT' => $this->batchSize
				),
				array( 'blob_tracking' => array( 'LEFT JOIN', 'bt_text_id=old_id' ) )
			);

			if ( !$res->numRows() ) {
				break;
			}

			$insertBatch = array();
			foreach ( $res as $row ) {
				// Advance the cursor even for rows that are skipped below
				$startId = $row->old_id;
				$info = $this->interpretPointer( $row->old_text );
				if ( !$info ) {
					echo "Invalid DB:// URL in old_id {$row->old_id}\n";
					continue;
				}
				if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
					echo "Invalid cluster returned in SQL query\n";
					continue;
				}

				$insertBatch[] = array(
					'bt_page' => 0,
					'bt_rev_id' => 0,
					'bt_text_id' => $row->old_id,
					'bt_cluster' => $info['cluster'],
					'bt_blob_id' => $info['id'],
					'bt_cgz_hash' => $info['hash']
				);
				if ( $this->doBlobOrphans ) {
					gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
				}
			}
			// Every row of the batch may have been skipped as invalid
			if ( $insertBatch ) {
				$dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
				$rowsInserted += count( $insertBatch );
			}

			++$batchesDone;
			if ( $batchesDone >= $this->reportingInterval ) {
				$batchesDone = 0;
				echo "$startId / $endId\n";
				wfWaitForSlaves();
			}
		}
		echo "Found $rowsInserted orphan text rows\n";
	}

	/**
	 * For every configured cluster, compare the blob IDs present in the
	 * cluster's blobs table with the IDs recorded in $this->trackedBlobs by
	 * the earlier passes, and insert the untracked ones into blob_orphans.
	 *
	 * Both sets are held as GMP bitfields, so this pass needs the gmp
	 * extension. Clusters that cannot be reached or that have no blobs table
	 * are reported and skipped.
	 */
	function findOrphanBlobs() {
		if ( !extension_loaded( 'gmp' ) ) {
			echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";
			return;
		}

		$dbw = wfGetDB( DB_MASTER );

		foreach ( $this->clusters as $cluster ) {
			echo "Searching for orphan blobs in $cluster...\n";
			$lb = wfGetLBFactory()->getExternalLB( $cluster );
			try {
				$extDB = $lb->getConnection( DB_SLAVE );
			} catch ( DBConnectionError $e ) {
				if ( strpos( $e->error, 'Unknown database' ) !== false ) {
					echo "No database on $cluster\n";
				} else {
					echo "Error on $cluster: " . $e->getMessage() . "\n";
				}
				continue;
			}
			// Fall back to the table name "blobs" when the connection has no
			// 'blobs table' setting
			$table = $extDB->getLBInfo( 'blobs table' );
			if ( is_null( $table ) ) {
				$table = 'blobs';
			}
			if ( !$extDB->tableExists( $table ) ) {
				echo "No blobs table on cluster $cluster\n";
				continue;
			}
			$startId = 0;
			$batchesDone = 0;
			$actualBlobs = gmp_init( 0 );
			// Only used for the progress report
			$endId = $extDB->selectField( $table, 'MAX(blob_id)', false, __METHOD__ );

			// Build a bitmap of actual blob rows
			while ( true ) {
				$res = $extDB->select( $table,
					array( 'blob_id' ),
					array( 'blob_id > ' . $extDB->addQuotes( $startId ) ),
					__METHOD__,
					array( 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' )
				);

				if ( !$res->numRows() ) {
					break;
				}

				foreach ( $res as $row ) {
					gmp_setbit( $actualBlobs, $row->blob_id );
					// Rows are ordered by blob_id, so the last one seen is the cursor
					$startId = $row->blob_id;
				}

				++$batchesDone;
				if ( $batchesDone >= $this->reportingInterval ) {
					$batchesDone = 0;
					echo "$startId / $endId\n";
				}
			}

			// Find actual blobs that weren't tracked by the previous passes
			// This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
			$orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

			// Traverse the orphan list
			$insertBatch = array();
			$id = 0;
			$numOrphans = 0;
			while ( true ) {
				// Next set bit at or after $id; -1 means there is none left
				$id = gmp_scan1( $orphans, $id );
				if ( $id == -1 ) {
					break;
				}
				$insertBatch[] = array(
					'bo_cluster' => $cluster,
					'bo_blob_id' => $id
				);
				// Flush as soon as the batch holds exactly batchSize rows
				if ( count( $insertBatch ) >= $this->batchSize ) {
					$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
					$insertBatch = array();
				}

				++$id;
				++$numOrphans;
			}
			if ( $insertBatch ) {
				$dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
			}
			echo "Found $numOrphans orphan(s) in $cluster\n";
		}
	}
}