MediaWiki  REL1_24
trackBlobs.php
Go to the documentation of this file.
00001 <?php
require __DIR__ . '/../commandLine.inc';

// $args is not defined in this file; presumably commandLine.inc fills it with
// the positional command-line arguments (here: the cluster names).
if ( count( $args ) === 0 ) {
    // No cluster given: explain the usage and signal failure to the shell.
    echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n" .
        "Adds blobs from a given ES cluster to the blob_tracking table\n" .
        "Automatically deletes the tracking table and starts from the start again when restarted.\n";

    exit( 1 );
}

// Track every cluster named on the command line in a single run.
$tracker = new TrackBlobs( $args );
$tracker->run();

echo "All done.\n";
00037 
/**
 * Records which external storage blobs are referenced by the wiki.
 *
 * For the clusters given to the constructor, a run:
 *  - refuses to start if the text or archive tables contain rows this script
 *    could damage (see checkIntegrity()),
 *  - recreates the tracking tables,
 *  - writes one blob_tracking row per revision and per unreferenced text row
 *    whose text is a DB://<cluster>/... pointer,
 *  - optionally (GMP required) writes one blob_orphans row per blob that
 *    exists on a cluster but was not seen by the previous passes.
 */
class TrackBlobs {
    /** @var array Names of the external storage clusters to track */
    public $clusters;

    /** @var string|null SQL condition cached by getTextClause() */
    public $textClause;

    /** @var bool True when the GMP extension is available for the orphan blob pass */
    public $doBlobOrphans = false;

    /** @var array Map of cluster name => GMP bitfield with one bit set per tracked blob ID */
    public $trackedBlobs = array();

    /** @var int Maximum number of rows fetched (and inserted) per batch */
    public $batchSize = 1000;

    /** @var int Number of batches between progress reports */
    public $reportingInterval = 10;

    /**
     * @param array $clusters Names of the external storage clusters to track
     */
    public function __construct( $clusters ) {
        $this->clusters = $clusters;
        if ( extension_loaded( 'gmp' ) ) {
            $this->doBlobOrphans = true;
            // Start every cluster with an empty bitfield of tracked blob IDs.
            foreach ( $clusters as $cluster ) {
                $this->trackedBlobs[$cluster] = gmp_init( 0 );
            }
        } else {
            echo "Warning: the gmp extension is needed to find orphan blobs\n";
        }
    }

    /**
     * Run all passes in order. The orphan blob pass is skipped when GMP is
     * not available.
     */
    public function run() {
        $this->checkIntegrity();
        $this->initTrackingTable();
        $this->trackRevisions();
        $this->trackOrphanText();
        if ( $this->doBlobOrphans ) {
            $this->findOrphanBlobs();
        }
    }

    /**
     * Abort the script (exit status 1) if the text or archive tables contain
     * rows that must be repaired before tracking can safely continue.
     */
    public function checkIntegrity() {
        echo "Doing integrity check...\n";
        $dbr = wfGetDB( DB_SLAVE );

        // Scan for HistoryBlobStub objects in the text table (bug 20757)

        $exists = $dbr->selectField( 'text', 1,
            'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
            'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'',
            __METHOD__
        );

        if ( $exists ) {
            echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
                "This script could destroy these objects if it continued. Run resolveStubs.php\n" .
                "to fix this.\n";
            exit( 1 );
        }

        // Scan the archive table for HistoryBlobStub objects or external flags (bug 22624)
        $flags = $dbr->selectField( 'archive', 'ar_flags',
            'ar_flags LIKE \'%external%\' OR (' .
            'ar_flags LIKE \'%object%\' ' .
            'AND LOWER(CONVERT(LEFT(ar_text,22) USING latin1)) = \'o:15:"historyblobstub"\' )',
            __METHOD__
        );

        // The flags of the matching row tell the two failure modes apart: an
        // "external" flag means an external storage pointer, any other match
        // can only have come from the HistoryBlobStub half of the condition.
        if ( strpos( $flags, 'external' ) !== false ) {
            echo "Integrity check failed: found external storage pointers in your archive table.\n" .
                "Run normaliseArchiveTable.php to fix this.\n";
            exit( 1 );
        } elseif ( $flags ) {
            echo "Integrity check failed: found HistoryBlobStub objects in your archive table.\n" .
                "These objects are probably already broken, continuing would make them\n" .
                "unrecoverable. Run \"normaliseArchiveTable.php --fix-cgz-bug\" to fix this.\n";
            exit( 1 );
        }

        echo "Integrity check OK\n";
    }

    /**
     * Drop any tracking tables left over from an earlier run and recreate
     * them from blob_tracking.sql.
     *
     * Each table is checked on its own so that a half-initialised state (only
     * one of the two tables present) does not make the DROP fail.
     */
    public function initTrackingTable() {
        $dbw = wfGetDB( DB_MASTER );
        foreach ( array( 'blob_tracking', 'blob_orphans' ) as $table ) {
            if ( $dbw->tableExists( $table ) ) {
                $dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ) );
            }
        }
        // Presumably this SQL file creates both tables dropped above.
        $dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
    }

    /**
     * Get the SQL condition matching text rows that point into any of the
     * tracked clusters: one old_text LIKE test for the "DB://<cluster>/"
     * prefix per cluster, joined with OR. The result is built once and cached.
     *
     * @return string
     */
    public function getTextClause() {
        if ( !$this->textClause ) {
            $dbr = wfGetDB( DB_SLAVE );
            $this->textClause = '';
            foreach ( $this->clusters as $cluster ) {
                if ( $this->textClause != '' ) {
                    $this->textClause .= ' OR ';
                }
                $this->textClause .= 'old_text' . $dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
            }
        }

        return $this->textClause;
    }

    /**
     * Split an external storage pointer of the form
     * DB://<cluster>/<id> or DB://<cluster>/<id>/<hex hash> into its parts.
     *
     * @param string $text Pointer text as stored in the text table
     * @return array|bool Array with keys 'cluster' (string), 'id' (int) and
     *   'hash' (string, or null when the pointer has no hash part), or false
     *   if $text is not a valid pointer
     */
    public function interpretPointer( $text ) {
        if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
            return false;
        }

        return array(
            'cluster' => $m[1],
            'id' => intval( $m[2] ),
            // The hash group is absent from $m when the pointer has no hash part.
            'hash' => isset( $m[3] ) ? $m[3] : null
        );
    }

    /**
     * Scan the revision table, in rev_id order, for revisions whose text row
     * is an external pointer into one of the tracked clusters, and insert a
     * blob_tracking row for each of them.
     */
    public function trackRevisions() {
        $dbw = wfGetDB( DB_MASTER );
        $dbr = wfGetDB( DB_SLAVE );

        $textClause = $this->getTextClause();
        $startId = 0;
        $endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
        $batchesDone = 0;
        $rowsInserted = 0;

        echo "Finding revisions...\n";

        while ( true ) {
            $res = $dbr->select( array( 'revision', 'text' ),
                array( 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ),
                array(
                    'rev_id > ' . $dbr->addQuotes( $startId ),
                    'rev_text_id=old_id',
                    $textClause,
                    'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
                ),
                __METHOD__,
                array(
                    'ORDER BY' => 'rev_id',
                    'LIMIT' => $this->batchSize
                )
            );
            if ( !$res->numRows() ) {
                break;
            }

            $insertBatch = array();
            foreach ( $res as $row ) {
                // Advance the cursor even for rows that are skipped below, so
                // the next batch starts after them.
                $startId = $row->rev_id;
                $info = $this->interpretPointer( $row->old_text );
                if ( !$info ) {
                    echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
                    continue;
                }
                // Strict comparison: cluster names are strings and must match exactly.
                if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
                    echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
                    continue;
                }
                $insertBatch[] = array(
                    'bt_page' => $row->rev_page,
                    'bt_rev_id' => $row->rev_id,
                    'bt_text_id' => $row->old_id,
                    'bt_cluster' => $info['cluster'],
                    'bt_blob_id' => $info['id'],
                    'bt_cgz_hash' => $info['hash']
                );
                if ( $this->doBlobOrphans ) {
                    // Remember the blob as referenced for findOrphanBlobs().
                    gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
                }
            }
            $dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
            $rowsInserted += count( $insertBatch );

            ++$batchesDone;
            if ( $batchesDone >= $this->reportingInterval ) {
                $batchesDone = 0;
                echo "$startId / $endId\n";
                wfWaitForSlaves();
            }
        }
        echo "Found $rowsInserted revisions\n";
    }

    /**
     * Scan the text table, in old_id order, for external pointers into the
     * tracked clusters that have no blob_tracking row yet (i.e. that were not
     * reached through the revision table), and insert a blob_tracking row
     * with bt_page = 0 and bt_rev_id = 0 for each of them.
     */
    public function trackOrphanText() {
        # Wait until the blob_tracking table is available in the slave
        $dbw = wfGetDB( DB_MASTER );
        $dbr = wfGetDB( DB_SLAVE );
        $pos = $dbw->getMasterPos();
        $dbr->masterPosWait( $pos, 100000 );

        $textClause = $this->getTextClause();
        $startId = 0;
        $endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
        $rowsInserted = 0;
        $batchesDone = 0;

        echo "Finding orphan text...\n";

        # Scan the text table for orphan text
        while ( true ) {
            // The LEFT JOIN combined with "bt_text_id IS NULL" keeps only the
            // text rows that have no matching blob_tracking row.
            $res = $dbr->select( array( 'text', 'blob_tracking' ),
                array( 'old_id', 'old_flags', 'old_text' ),
                array(
                    'old_id>' . $dbr->addQuotes( $startId ),
                    $textClause,
                    'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
                    'bt_text_id IS NULL'
                ),
                __METHOD__,
                array(
                    'ORDER BY' => 'old_id',
                    'LIMIT' => $this->batchSize
                ),
                array( 'blob_tracking' => array( 'LEFT JOIN', 'bt_text_id=old_id' ) )
            );

            if ( !$res->numRows() ) {
                break;
            }

            $insertBatch = array();
            foreach ( $res as $row ) {
                // Advance the cursor even for rows that are skipped below.
                $startId = $row->old_id;
                $info = $this->interpretPointer( $row->old_text );
                if ( !$info ) {
                    echo "Invalid DB:// URL in old_id {$row->old_id}\n";
                    continue;
                }
                // Strict comparison: cluster names are strings and must match exactly.
                if ( !in_array( $info['cluster'], $this->clusters, true ) ) {
                    echo "Invalid cluster returned in SQL query\n";
                    continue;
                }

                $insertBatch[] = array(
                    'bt_page' => 0,
                    'bt_rev_id' => 0,
                    'bt_text_id' => $row->old_id,
                    'bt_cluster' => $info['cluster'],
                    'bt_blob_id' => $info['id'],
                    'bt_cgz_hash' => $info['hash']
                );
                if ( $this->doBlobOrphans ) {
                    // Remember the blob as referenced for findOrphanBlobs().
                    gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
                }
            }
            $dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );

            $rowsInserted += count( $insertBatch );
            ++$batchesDone;
            if ( $batchesDone >= $this->reportingInterval ) {
                $batchesDone = 0;
                echo "$startId / $endId\n";
                wfWaitForSlaves();
            }
        }
        echo "Found $rowsInserted orphan text rows\n";
    }

    /**
     * For each tracked cluster, compare the blob IDs that actually exist in
     * its blobs table with the IDs recorded by trackRevisions() and
     * trackOrphanText(), and insert a blob_orphans row for every blob that
     * exists but was never recorded.
     *
     * Requires the GMP extension; without it a message is printed and nothing
     * else happens. Clusters that cannot be reached or that have no blobs
     * table are reported and skipped.
     */
    public function findOrphanBlobs() {
        if ( !extension_loaded( 'gmp' ) ) {
            echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";

            return;
        }

        $dbw = wfGetDB( DB_MASTER );

        foreach ( $this->clusters as $cluster ) {
            echo "Searching for orphan blobs in $cluster...\n";
            $lb = wfGetLBFactory()->getExternalLB( $cluster );
            try {
                $extDB = $lb->getConnection( DB_SLAVE );
            } catch ( DBConnectionError $e ) {
                if ( strpos( $e->error, 'Unknown database' ) !== false ) {
                    echo "No database on $cluster\n";
                } else {
                    echo "Error on $cluster: " . $e->getMessage() . "\n";
                }
                continue;
            }
            // Use the table name configured for this connection, if any.
            $table = $extDB->getLBInfo( 'blobs table' );
            if ( is_null( $table ) ) {
                $table = 'blobs';
            }
            if ( !$extDB->tableExists( $table ) ) {
                echo "No blobs table on cluster $cluster\n";
                continue;
            }
            $startId = 0;
            $batchesDone = 0;
            $actualBlobs = gmp_init( 0 );
            $endId = $extDB->selectField( $table, 'MAX(blob_id)', false, __METHOD__ );

            // Build a bitmap of actual blob rows
            while ( true ) {
                $res = $extDB->select( $table,
                    array( 'blob_id' ),
                    array( 'blob_id > ' . $extDB->addQuotes( $startId ) ),
                    __METHOD__,
                    array( 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' )
                );

                if ( !$res->numRows() ) {
                    break;
                }

                foreach ( $res as $row ) {
                    gmp_setbit( $actualBlobs, $row->blob_id );
                    // Rows arrive ordered by blob_id, so the last one seen is
                    // where the next batch has to resume.
                    $startId = $row->blob_id;
                }

                ++$batchesDone;
                if ( $batchesDone >= $this->reportingInterval ) {
                    $batchesDone = 0;
                    echo "$startId / $endId\n";
                }
            }

            // Find actual blobs that weren't tracked by the previous passes
            // This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
            $orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

            // Traverse the orphan list
            $insertBatch = array();
            $id = 0;
            $numOrphans = 0;
            while ( true ) {
                // Index of the next set bit at or after $id; -1 when none is left.
                $id = gmp_scan1( $orphans, $id );
                if ( $id == -1 ) {
                    break;
                }
                $insertBatch[] = array(
                    'bo_cluster' => $cluster,
                    'bo_blob_id' => $id
                );
                // Flush as soon as a full batch has been collected.
                if ( count( $insertBatch ) >= $this->batchSize ) {
                    $dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
                    $insertBatch = array();
                }

                ++$id;
                ++$numOrphans;
            }
            // Write whatever is left over from the last, partial batch.
            if ( $insertBatch ) {
                $dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
            }
            echo "Found $numOrphans orphan(s) in $cluster\n";
        }
    }
}