MediaWiki  REL1_23
trackBlobs.php
Go to the documentation of this file.
00001 <?php
00025 require __DIR__ . '/../commandLine.inc';
00026 
// At least one external storage cluster name is required on the command line.
if ( count( $args ) === 0 ) {
    echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n" .
        "Adds blobs from a given ES cluster to the blob_tracking table\n" .
        "Automatically deletes the tracking table and starts from the start again when restarted.\n";

    exit( 1 );
}

// Every command line argument is treated as a cluster name.
$tracker = new TrackBlobs( $args );
$tracker->run();

echo "All done.\n";
00037 
/**
 * Rebuilds the blob_tracking and blob_orphans tables for a set of external
 * storage clusters.
 *
 * The work is done in passes: text rows referenced by revisions, text rows
 * that no revision references, and (only when the gmp extension is loaded)
 * blob rows on the clusters that no text row points to.
 */
class TrackBlobs {
    /** @var array Names of the external storage clusters to track, without duplicates */
    public $clusters;

    /** @var string|null SQL condition matching old_text pointers into $clusters; built lazily */
    public $textClause;

    /** @var bool|null True when the gmp extension is loaded, so orphan blobs can be searched for */
    public $doBlobOrphans;

    /** @var array Map of cluster name => GMP number used as a bitfield of referenced blob IDs */
    public $trackedBlobs = array();

    /** @var int Maximum number of rows fetched per SELECT batch */
    public $batchSize = 1000;

    /** @var int Number of batches between two progress reports */
    public $reportingInterval = 10;

    /**
     * @param array $clusters Names of the external storage clusters to track
     */
    public function __construct( $clusters ) {
        // A cluster named twice would otherwise be scanned twice by
        // findOrphanBlobs() and have its orphans inserted twice.
        $this->clusters = array_values( array_unique( $clusters ) );

        if ( extension_loaded( 'gmp' ) ) {
            $this->doBlobOrphans = true;
            foreach ( $this->clusters as $cluster ) {
                $this->trackedBlobs[$cluster] = gmp_init( 0 );
            }
        } else {
            echo "Warning: the gmp extension is needed to find orphan blobs\n";
        }
    }

    /**
     * Run all passes in order. The orphan blob pass is skipped when the gmp
     * extension is not available.
     */
    public function run() {
        $this->checkIntegrity();
        $this->initTrackingTable();
        $this->trackRevisions();
        $this->trackOrphanText();
        if ( $this->doBlobOrphans ) {
            $this->findOrphanBlobs();
        }
    }

    /**
     * Abort the script (exit status 1) if the text or archive table contains
     * rows this script must not be run against: HistoryBlobStub objects, or
     * external storage pointers in the archive table.
     */
    public function checkIntegrity() {
        echo "Doing integrity check...\n";
        $dbr = wfGetDB( DB_SLAVE );

        // Scan for HistoryBlobStub objects in the text table (bug 20757)

        $exists = $dbr->selectField( 'text', 1,
            'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
            'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'',
            __METHOD__
        );

        if ( $exists ) {
            echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
                "This script could destroy these objects if it continued. Run resolveStubs.php\n" .
                "to fix this.\n";
            exit( 1 );
        }

        // Scan the archive table for HistoryBlobStub objects or external flags (bug 22624)
        $flags = $dbr->selectField( 'archive', 'ar_flags',
            'ar_flags LIKE \'%external%\' OR (' .
            'ar_flags LIKE \'%object%\' ' .
            'AND LOWER(CONVERT(LEFT(ar_text,22) USING latin1)) = \'o:15:"historyblobstub"\' )',
            __METHOD__
        );

        // $flags holds the ar_flags value of one matching row, if any; the two
        // failure modes are told apart by whether it mentions "external".
        if ( strpos( $flags, 'external' ) !== false ) {
            echo "Integrity check failed: found external storage pointers in your archive table.\n" .
                "Run normaliseArchiveTable.php to fix this.\n";
            exit( 1 );
        } elseif ( $flags ) {
            echo "Integrity check failed: found HistoryBlobStub objects in your archive table.\n" .
                "These objects are probably already broken, continuing would make them\n" .
                "unrecoverable. Run \"normaliseArchiveTable.php --fix-cgz-bug\" to fix this.\n";
            exit( 1 );
        }

        echo "Integrity check OK\n";
    }

    /**
     * Drop any existing tracking tables and recreate them from
     * blob_tracking.sql.
     *
     * Each table is checked separately, so a partially created or partially
     * dropped schema from an earlier run does not make the DROP fail or leave
     * a stale table behind.
     */
    public function initTrackingTable() {
        $dbw = wfGetDB( DB_MASTER );
        foreach ( array( 'blob_tracking', 'blob_orphans' ) as $table ) {
            if ( $dbw->tableExists( $table ) ) {
                $dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ), __METHOD__ );
            }
        }
        // NOTE(review): blob_tracking.sql is presumed to create both tables - confirm.
        $dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
    }

    /**
     * Get the SQL condition matching text rows whose old_text is a DB://
     * pointer into one of the tracked clusters. The result is built once and
     * cached in $this->textClause.
     *
     * @return string LIKE conditions joined with OR, not parenthesized
     */
    public function getTextClause() {
        if ( !$this->textClause ) {
            $dbr = wfGetDB( DB_SLAVE );
            $likes = array();
            foreach ( $this->clusters as $cluster ) {
                $likes[] = 'old_text' . $dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
            }
            $this->textClause = implode( ' OR ', $likes );
        }
        return $this->textClause;
    }

    /**
     * Split an external storage pointer of the form DB://cluster/id or
     * DB://cluster/id/hash into its parts.
     *
     * @param string $text Pointer text, as stored in old_text
     * @return array|bool Array with keys 'cluster' (string), 'id' (int) and
     *   'hash' (string, or null when the pointer has no hash part); false
     *   if $text does not have the expected form
     */
    public function interpretPointer( $text ) {
        if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
            return false;
        }
        return array(
            'cluster' => $m[1],
            'id' => intval( $m[2] ),
            // The hash group is absent from $m when the pointer has no third part
            'hash' => isset( $m[3] ) ? $m[3] : null
        );
    }

    /**
     * Scan the revision table in rev_id order and record, in blob_tracking,
     * every revision whose text row points into one of the tracked clusters.
     */
    public function trackRevisions() {
        $dbw = wfGetDB( DB_MASTER );
        $dbr = wfGetDB( DB_SLAVE );

        $textClause = $this->getTextClause();
        $startId = 0;
        $endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
        $batchesDone = 0;
        $rowsInserted = 0;

        echo "Finding revisions...\n";

        while ( true ) {
            $res = $dbr->select( array( 'revision', 'text' ),
                array( 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ),
                array(
                    'rev_id > ' . $dbr->addQuotes( $startId ),
                    'rev_text_id=old_id',
                    $textClause,
                    'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
                ),
                __METHOD__,
                array(
                    'ORDER BY' => 'rev_id',
                    'LIMIT' => $this->batchSize
                )
            );
            if ( !$res->numRows() ) {
                break;
            }

            $insertBatch = array();
            foreach ( $res as $row ) {
                // Advance the cursor even for rows that end up being skipped
                $startId = $row->rev_id;
                $info = $this->interpretPointer( $row->old_text );
                if ( !$info ) {
                    echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
                    continue;
                }
                if ( !in_array( $info['cluster'], $this->clusters ) ) {
                    echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
                    continue;
                }
                $insertBatch[] = array(
                    'bt_page' => $row->rev_page,
                    'bt_rev_id' => $row->rev_id,
                    'bt_text_id' => $row->old_id,
                    'bt_cluster' => $info['cluster'],
                    'bt_blob_id' => $info['id'],
                    'bt_cgz_hash' => $info['hash']
                );
                if ( $this->doBlobOrphans ) {
                    // Remember the blob as referenced, for findOrphanBlobs()
                    gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
                }
            }
            $dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
            $rowsInserted += count( $insertBatch );

            ++$batchesDone;
            if ( $batchesDone >= $this->reportingInterval ) {
                $batchesDone = 0;
                echo "$startId / $endId\n";
                wfWaitForSlaves();
            }
        }
        echo "Found $rowsInserted revisions\n";
    }

    /**
     * Scan the text table in old_id order for rows that point into one of the
     * tracked clusters but have no blob_tracking row yet, and record them in
     * blob_tracking with bt_page and bt_rev_id set to 0.
     */
    public function trackOrphanText() {
        # Wait until the blob_tracking table is available in the slave
        $dbw = wfGetDB( DB_MASTER );
        $dbr = wfGetDB( DB_SLAVE );
        $pos = $dbw->getMasterPos();
        $dbr->masterPosWait( $pos, 100000 );

        $textClause = $this->getTextClause();
        $startId = 0;
        $endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
        $rowsInserted = 0;
        $batchesDone = 0;

        echo "Finding orphan text...\n";

        # Scan the text table for orphan text
        while ( true ) {
            // The LEFT JOIN combined with "bt_text_id IS NULL" keeps only text
            // rows without a matching blob_tracking row.
            $res = $dbr->select( array( 'text', 'blob_tracking' ),
                array( 'old_id', 'old_flags', 'old_text' ),
                array(
                    'old_id>' . $dbr->addQuotes( $startId ),
                    $textClause,
                    'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
                    'bt_text_id IS NULL'
                ),
                __METHOD__,
                array(
                    'ORDER BY' => 'old_id',
                    'LIMIT' => $this->batchSize
                ),
                array( 'blob_tracking' => array( 'LEFT JOIN', 'bt_text_id=old_id' ) )
            );

            if ( !$res->numRows() ) {
                break;
            }

            $insertBatch = array();
            foreach ( $res as $row ) {
                // Advance the cursor even for rows that end up being skipped
                $startId = $row->old_id;
                $info = $this->interpretPointer( $row->old_text );
                if ( !$info ) {
                    echo "Invalid DB:// URL in old_id {$row->old_id}\n";
                    continue;
                }
                if ( !in_array( $info['cluster'], $this->clusters ) ) {
                    echo "Invalid cluster returned in SQL query\n";
                    continue;
                }

                $insertBatch[] = array(
                    'bt_page' => 0,
                    'bt_rev_id' => 0,
                    'bt_text_id' => $row->old_id,
                    'bt_cluster' => $info['cluster'],
                    'bt_blob_id' => $info['id'],
                    'bt_cgz_hash' => $info['hash']
                );
                if ( $this->doBlobOrphans ) {
                    // Remember the blob as referenced, for findOrphanBlobs()
                    gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
                }
            }
            $dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );

            $rowsInserted += count( $insertBatch );
            ++$batchesDone;
            if ( $batchesDone >= $this->reportingInterval ) {
                $batchesDone = 0;
                echo "$startId / $endId\n";
                wfWaitForSlaves();
            }
        }
        echo "Found $rowsInserted orphan text rows\n";
    }

    /**
     * For each tracked cluster, find blob rows that were not marked as
     * referenced by trackRevisions() or trackOrphanText() and record them in
     * blob_orphans.
     *
     * Requires the gmp extension; both the set of existing blob IDs and the
     * set of referenced blob IDs are held as GMP bitfields. Clusters that
     * cannot be connected to, or that have no blobs table, are reported and
     * skipped.
     */
    public function findOrphanBlobs() {
        if ( !extension_loaded( 'gmp' ) ) {
            echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";
            return;
        }

        $dbw = wfGetDB( DB_MASTER );

        foreach ( $this->clusters as $cluster ) {
            echo "Searching for orphan blobs in $cluster...\n";
            $lb = wfGetLBFactory()->getExternalLB( $cluster );
            try {
                $extDB = $lb->getConnection( DB_SLAVE );
            } catch ( DBConnectionError $e ) {
                if ( strpos( $e->error, 'Unknown database' ) !== false ) {
                    echo "No database on $cluster\n";
                } else {
                    echo "Error on $cluster: " . $e->getMessage() . "\n";
                }
                continue;
            }
            // Fall back to the table name "blobs" when the connection does
            // not specify one
            $table = $extDB->getLBInfo( 'blobs table' );
            if ( is_null( $table ) ) {
                $table = 'blobs';
            }
            if ( !$extDB->tableExists( $table ) ) {
                echo "No blobs table on cluster $cluster\n";
                continue;
            }
            $startId = 0;
            $batchesDone = 0;
            $actualBlobs = gmp_init( 0 );
            $endId = $extDB->selectField( $table, 'MAX(blob_id)', false, __METHOD__ );

            // Build a bitmap of actual blob rows
            while ( true ) {
                $res = $extDB->select( $table,
                    array( 'blob_id' ),
                    array( 'blob_id > ' . $extDB->addQuotes( $startId ) ),
                    __METHOD__,
                    array( 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' )
                );

                if ( !$res->numRows() ) {
                    break;
                }

                foreach ( $res as $row ) {
                    gmp_setbit( $actualBlobs, $row->blob_id );
                    // Rows arrive in blob_id order, so the last value seen is
                    // the starting point of the next batch.
                    $startId = $row->blob_id;
                }

                ++$batchesDone;
                if ( $batchesDone >= $this->reportingInterval ) {
                    $batchesDone = 0;
                    echo "$startId / $endId\n";
                }
            }

            // Find actual blobs that weren't tracked by the previous passes
            // This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
            $orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

            // Traverse the orphan list
            $insertBatch = array();
            $id = 0;
            $numOrphans = 0;
            while ( true ) {
                // Position of the next set bit at or after $id, or -1 when none is left
                $id = gmp_scan1( $orphans, $id );
                if ( $id == -1 ) {
                    break;
                }
                $insertBatch[] = array(
                    'bo_cluster' => $cluster,
                    'bo_blob_id' => $id
                );
                // Flush as soon as a full batch has been collected
                if ( count( $insertBatch ) >= $this->batchSize ) {
                    $dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
                    $insertBatch = array();
                }

                ++$id;
                ++$numOrphans;
            }
            if ( $insertBatch ) {
                $dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
            }
            echo "Found $numOrphans orphan(s) in $cluster\n";
        }
    }
}