MediaWiki  REL1_22
trackBlobs.php
Go to the documentation of this file.
00001 <?php
00025 require __DIR__ . '/../commandLine.inc';
00026 
00027 
// Command-line entry point: at least one external storage cluster name is
// required; every positional argument is treated as a cluster to scan.
if ( !count( $args ) ) {
    echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n" .
        "Adds blobs from a given ES cluster to the blob_tracking table\n" .
        "Automatically deletes the tracking table and starts from the start again when restarted.\n";

    exit( 1 );
}

// Run every tracking pass over the requested clusters.
$blobTracker = new TrackBlobs( $args );
$blobTracker->run();
echo "All done.\n";
00038 
/**
 * Builds the blob_tracking and blob_orphans tables for a set of external
 * storage clusters.
 *
 * The passes, in the order run() executes them:
 *  - checkIntegrity(): refuse to run if the text or archive table contains
 *    rows this script cannot handle safely.
 *  - initTrackingTable(): recreate the tracking tables from blob_tracking.sql.
 *  - trackRevisions(): record blobs referenced through the revision table.
 *  - trackOrphanText(): record blobs referenced by text rows that were not
 *    picked up by the revision pass.
 *  - findOrphanBlobs(): record blobs present on a cluster but never tracked
 *    (only when the gmp extension is available).
 */
class TrackBlobs {
    /** Cluster names to scan, exactly as passed to the constructor */
    public $clusters;

    /** Cached SQL condition built by getTextClause(); unset until first use */
    public $textClause;

    /** Set to true by the constructor when the gmp extension is loaded */
    public $doBlobOrphans;

    /** Map of cluster name => GMP bitfield with one bit set per tracked blob ID */
    public $trackedBlobs = array();

    /** LIMIT used for each batched SELECT */
    public $batchSize = 1000;

    /** Number of batches processed between two progress reports */
    public $reportingInterval = 10;

    /**
     * @param array $clusters Names of the external storage clusters to track
     */
    public function __construct( $clusters ) {
        $this->clusters = $clusters;
        if ( !extension_loaded( 'gmp' ) ) {
            echo "Warning: the gmp extension is needed to find orphan blobs\n";
            return;
        }

        // One empty bitfield per cluster; bits are set as blobs get tracked.
        $this->doBlobOrphans = true;
        foreach ( $clusters as $cluster ) {
            $this->trackedBlobs[$cluster] = gmp_init( 0 );
        }
    }

    /**
     * Run every pass in order. The orphan blob pass is skipped when the gmp
     * extension was not available at construction time.
     */
    public function run() {
        $this->checkIntegrity();
        $this->initTrackingTable();
        $this->trackRevisions();
        $this->trackOrphanText();
        if ( $this->doBlobOrphans ) {
            $this->findOrphanBlobs();
        }
    }

    /**
     * Abort the script (exit status 1) if the text or archive table contains
     * HistoryBlobStub objects, or if the archive table contains external
     * storage pointers. Prints the name of the script to run to repair each
     * condition.
     */
    public function checkIntegrity() {
        echo "Doing integrity check...\n";
        $dbr = wfGetDB( DB_SLAVE );

        // Scan for HistoryBlobStub objects in the text table (bug 20757)

        $exists = $dbr->selectField( 'text', 1,
            'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
            'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'',
            __METHOD__
        );

        if ( $exists ) {
            echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
                "This script could destroy these objects if it continued. Run resolveStubs.php\n" .
                "to fix this.\n";
            exit( 1 );
        }

        // Scan the archive table for HistoryBlobStub objects or external flags (bug 22624)
        $flags = $dbr->selectField( 'archive', 'ar_flags',
            'ar_flags LIKE \'%external%\' OR (' .
            'ar_flags LIKE \'%object%\' ' .
            'AND LOWER(CONVERT(LEFT(ar_text,22) USING latin1)) = \'o:15:"historyblobstub"\' )',
            __METHOD__
        );

        // The flags of the matching row tell which of the two conditions was hit.
        if ( strpos( $flags, 'external' ) !== false ) {
            echo "Integrity check failed: found external storage pointers in your archive table.\n" .
                "Run normaliseArchiveTable.php to fix this.\n";
            exit( 1 );
        } elseif ( $flags ) {
            echo "Integrity check failed: found HistoryBlobStub objects in your archive table.\n" .
                "These objects are probably already broken, continuing would make them\n" .
                "unrecoverable. Run \"normaliseArchiveTable.php --fix-cgz-bug\" to fix this.\n";
            exit( 1 );
        }

        echo "Integrity check OK\n";
    }

    /**
     * Drop any tracking tables left over from a previous run and recreate
     * them from blob_tracking.sql.
     *
     * Each table is checked on its own, so a run that left only one of the
     * two tables behind neither triggers a DROP of a missing table nor leaves
     * a stale table in the way of the schema file.
     */
    public function initTrackingTable() {
        $dbw = wfGetDB( DB_MASTER );
        foreach ( array( 'blob_tracking', 'blob_orphans' ) as $table ) {
            if ( $dbw->tableExists( $table ) ) {
                $dbw->query( 'DROP TABLE ' . $dbw->tableName( $table ), __METHOD__ );
            }
        }
        $dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
    }

    /**
     * Build (once) and return the SQL condition matching text rows whose
     * old_text starts with "DB://<cluster>/" for any of the requested
     * clusters. The per-cluster conditions are joined with OR.
     *
     * @return string SQL fragment; empty when there are no clusters
     */
    public function getTextClause() {
        if ( !$this->textClause ) {
            $dbr = wfGetDB( DB_SLAVE );
            $conds = array();
            foreach ( $this->clusters as $cluster ) {
                $conds[] = 'old_text' . $dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
            }
            $this->textClause = implode( ' OR ', $conds );
        }
        return $this->textClause;
    }

    /**
     * Split an external storage pointer of the form
     * DB://<cluster>/<id> or DB://<cluster>/<id>/<hex hash>.
     *
     * @param string $text Pointer text
     * @return array|bool Array with keys 'cluster' (string), 'id' (int) and
     *   'hash' (string, or null when the pointer has no hash part), or false
     *   when $text does not have the expected form
     */
    public function interpretPointer( $text ) {
        if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
            return false;
        }
        return array(
            'cluster' => $m[1],
            'id' => intval( $m[2] ),
            // preg_match() leaves out the trailing group when no hash is present
            'hash' => isset( $m[3] ) ? $m[3] : null
        );
    }

    /**
     * Scan the revision table, joined to the text table, for rows stored on
     * the requested clusters, and insert one blob_tracking row per pointer.
     * Works in batches of $batchSize ordered by rev_id.
     */
    public function trackRevisions() {
        $dbw = wfGetDB( DB_MASTER );
        $dbr = wfGetDB( DB_SLAVE );

        $textClause = $this->getTextClause();
        $startId = 0;
        // Only used for the progress report
        $endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
        $batchesDone = 0;
        $rowsInserted = 0;

        echo "Finding revisions...\n";

        while ( true ) {
            $res = $dbr->select( array( 'revision', 'text' ),
                array( 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ),
                array(
                    'rev_id > ' . $dbr->addQuotes( $startId ),
                    'rev_text_id=old_id',
                    $textClause,
                    'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
                ),
                __METHOD__,
                array(
                    'ORDER BY' => 'rev_id',
                    'LIMIT' => $this->batchSize
                )
            );
            if ( !$res->numRows() ) {
                break;
            }

            $insertBatch = array();
            foreach ( $res as $row ) {
                // Advance the cursor even for rows that end up being skipped
                $startId = $row->rev_id;
                $info = $this->interpretPointer( $row->old_text );
                if ( !$info ) {
                    echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
                    continue;
                }
                if ( !in_array( $info['cluster'], $this->clusters ) ) {
                    echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
                    continue;
                }
                $insertBatch[] = array(
                    'bt_page' => $row->rev_page,
                    'bt_rev_id' => $row->rev_id,
                    'bt_text_id' => $row->old_id,
                    'bt_cluster' => $info['cluster'],
                    'bt_blob_id' => $info['id'],
                    'bt_cgz_hash' => $info['hash']
                );
                if ( $this->doBlobOrphans ) {
                    gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
                }
            }
            $dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
            $rowsInserted += count( $insertBatch );

            ++$batchesDone;
            if ( $batchesDone >= $this->reportingInterval ) {
                $batchesDone = 0;
                echo "$startId / $endId\n";
                wfWaitForSlaves();
            }
        }
        echo "Found $rowsInserted revisions\n";
    }

    /**
     * Scan the text table for rows stored on the requested clusters that have
     * no blob_tracking row yet, and insert them with bt_page and bt_rev_id
     * set to 0. Works in batches of $batchSize ordered by old_id.
     */
    public function trackOrphanText() {
        # Wait until the blob_tracking table is available in the slave
        $dbw = wfGetDB( DB_MASTER );
        $dbr = wfGetDB( DB_SLAVE );
        $pos = $dbw->getMasterPos();
        // getMasterPos() can return false (e.g. when the master is not
        // replicated); there is nothing to wait for in that case, and
        // masterPosWait() expects a position object.
        if ( $pos ) {
            $dbr->masterPosWait( $pos, 100000 );
        }

        $textClause = $this->getTextClause();
        $startId = 0;
        // Only used for the progress report
        $endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
        $rowsInserted = 0;
        $batchesDone = 0;

        echo "Finding orphan text...\n";

        # Scan the text table for orphan text
        while ( true ) {
            $res = $dbr->select( array( 'text', 'blob_tracking' ),
                array( 'old_id', 'old_flags', 'old_text' ),
                array(
                    'old_id>' . $dbr->addQuotes( $startId ),
                    $textClause,
                    'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
                    // LEFT JOIN miss: no tracking row exists for this text row
                    'bt_text_id IS NULL'
                ),
                __METHOD__,
                array(
                    'ORDER BY' => 'old_id',
                    'LIMIT' => $this->batchSize
                ),
                array( 'blob_tracking' => array( 'LEFT JOIN', 'bt_text_id=old_id' ) )
            );

            if ( !$res->numRows() ) {
                break;
            }

            $insertBatch = array();
            foreach ( $res as $row ) {
                // Advance the cursor even for rows that end up being skipped
                $startId = $row->old_id;
                $info = $this->interpretPointer( $row->old_text );
                if ( !$info ) {
                    echo "Invalid DB:// URL in old_id {$row->old_id}\n";
                    continue;
                }
                if ( !in_array( $info['cluster'], $this->clusters ) ) {
                    echo "Invalid cluster returned in SQL query\n";
                    continue;
                }

                $insertBatch[] = array(
                    'bt_page' => 0,
                    'bt_rev_id' => 0,
                    'bt_text_id' => $row->old_id,
                    'bt_cluster' => $info['cluster'],
                    'bt_blob_id' => $info['id'],
                    'bt_cgz_hash' => $info['hash']
                );
                if ( $this->doBlobOrphans ) {
                    gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
                }
            }
            $dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );

            $rowsInserted += count( $insertBatch );
            ++$batchesDone;
            if ( $batchesDone >= $this->reportingInterval ) {
                $batchesDone = 0;
                echo "$startId / $endId\n";
                wfWaitForSlaves();
            }
        }
        echo "Found $rowsInserted orphan text rows\n";
    }

    /**
     * For each cluster, build a bitfield of the blob IDs that actually exist
     * in its blobs table, subtract the IDs tracked by the earlier passes, and
     * insert the remainder into blob_orphans.
     *
     * Clusters that cannot be connected to, or that have no blobs table, are
     * reported and skipped. Does nothing (apart from a message) without the
     * gmp extension.
     */
    public function findOrphanBlobs() {
        if ( !extension_loaded( 'gmp' ) ) {
            echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";
            return;
        }

        $dbw = wfGetDB( DB_MASTER );

        foreach ( $this->clusters as $cluster ) {
            echo "Searching for orphan blobs in $cluster...\n";
            $lb = wfGetLBFactory()->getExternalLB( $cluster );
            try {
                $extDB = $lb->getConnection( DB_SLAVE );
            } catch ( DBConnectionError $e ) {
                if ( strpos( $e->error, 'Unknown database' ) !== false ) {
                    echo "No database on $cluster\n";
                } else {
                    echo "Error on $cluster: " . $e->getMessage() . "\n";
                }
                continue;
            }
            // Fall back to the default table name when the LB info has none
            $table = $extDB->getLBInfo( 'blobs table' );
            if ( is_null( $table ) ) {
                $table = 'blobs';
            }
            if ( !$extDB->tableExists( $table ) ) {
                echo "No blobs table on cluster $cluster\n";
                continue;
            }
            $startId = 0;
            $batchesDone = 0;
            $actualBlobs = gmp_init( 0 );
            // Only used for the progress report
            $endId = $extDB->selectField( $table, 'MAX(blob_id)', false, __METHOD__ );

            // Build a bitmap of actual blob rows
            while ( true ) {
                $res = $extDB->select( $table,
                    array( 'blob_id' ),
                    array( 'blob_id > ' . $extDB->addQuotes( $startId ) ),
                    __METHOD__,
                    array( 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' )
                );

                if ( !$res->numRows() ) {
                    break;
                }

                foreach ( $res as $row ) {
                    gmp_setbit( $actualBlobs, $row->blob_id );
                    // Rows arrive ordered by blob_id, so the last one seen is
                    // where the next batch resumes.
                    $startId = $row->blob_id;
                }

                ++$batchesDone;
                if ( $batchesDone >= $this->reportingInterval ) {
                    $batchesDone = 0;
                    echo "$startId / $endId\n";
                }
            }

            // Find actual blobs that weren't tracked by the previous passes
            // This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
            $orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );

            // Traverse the orphan list
            $insertBatch = array();
            $id = 0;
            $numOrphans = 0;
            while ( true ) {
                // Next set bit at or after $id; -1 when there are no more
                $id = gmp_scan1( $orphans, $id );
                if ( $id == -1 ) {
                    break;
                }
                $insertBatch[] = array(
                    'bo_cluster' => $cluster,
                    'bo_blob_id' => $id
                );
                // Flush as soon as a full batch is collected
                if ( count( $insertBatch ) >= $this->batchSize ) {
                    $dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
                    $insertBatch = array();
                }

                ++$id;
                ++$numOrphans;
            }
            if ( $insertBatch ) {
                $dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
            }
            echo "Found $numOrphans orphan(s) in $cluster\n";
        }
    }
}