[ Index ]

PHP Cross Reference of MediaWiki-1.24.0

title

Body

[close]

/maintenance/storage/ -> trackBlobs.php (source)

   1  <?php
   2  /**
   3   * Adds blobs from a given external storage cluster to the blob_tracking table.
   4   *
   5   * This program is free software; you can redistribute it and/or modify
   6   * it under the terms of the GNU General Public License as published by
   7   * the Free Software Foundation; either version 2 of the License, or
   8   * (at your option) any later version.
   9   *
  10   * This program is distributed in the hope that it will be useful,
  11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13   * GNU General Public License for more details.
  14   *
  15   * You should have received a copy of the GNU General Public License along
  16   * with this program; if not, write to the Free Software Foundation, Inc.,
  17   * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18   * http://www.gnu.org/copyleft/gpl.html
  19   *
  20   * @file
  21   * @ingroup Maintenance
  22   * @see wfWaitForSlaves()
  23   */
  24  
  25  require  __DIR__ . '/../commandLine.inc';
  26  
  27  if ( count( $args ) < 1 ) {
  28      echo "Usage: php trackBlobs.php <cluster> [... <cluster>]\n";
  29      echo "Adds blobs from a given ES cluster to the blob_tracking table\n";
  30      echo "Automatically deletes the tracking table and starts from the start again when restarted.\n";
  31  
  32      exit( 1 );
  33  }
  34  $tracker = new TrackBlobs( $args );
  35  $tracker->run();
  36  echo "All done.\n";
  37  
  38  class TrackBlobs {
  39      public $clusters, $textClause;
  40      public $doBlobOrphans;
  41      public $trackedBlobs = array();
  42  
  43      public $batchSize = 1000;
  44      public $reportingInterval = 10;
  45  
  46  	function __construct( $clusters ) {
  47          $this->clusters = $clusters;
  48          if ( extension_loaded( 'gmp' ) ) {
  49              $this->doBlobOrphans = true;
  50              foreach ( $clusters as $cluster ) {
  51                  $this->trackedBlobs[$cluster] = gmp_init( 0 );
  52              }
  53          } else {
  54              echo "Warning: the gmp extension is needed to find orphan blobs\n";
  55          }
  56      }
  57  
  58  	function run() {
  59          $this->checkIntegrity();
  60          $this->initTrackingTable();
  61          $this->trackRevisions();
  62          $this->trackOrphanText();
  63          if ( $this->doBlobOrphans ) {
  64              $this->findOrphanBlobs();
  65          }
  66      }
  67  
  68  	function checkIntegrity() {
  69          echo "Doing integrity check...\n";
  70          $dbr = wfGetDB( DB_SLAVE );
  71  
  72          // Scan for HistoryBlobStub objects in the text table (bug 20757)
  73  
  74          $exists = $dbr->selectField( 'text', 1,
  75              'old_flags LIKE \'%object%\' AND old_flags NOT LIKE \'%external%\' ' .
  76              'AND LOWER(CONVERT(LEFT(old_text,22) USING latin1)) = \'o:15:"historyblobstub"\'',
  77              __METHOD__
  78          );
  79  
  80          if ( $exists ) {
  81              echo "Integrity check failed: found HistoryBlobStub objects in your text table.\n" .
  82                  "This script could destroy these objects if it continued. Run resolveStubs.php\n" .
  83                  "to fix this.\n";
  84              exit( 1 );
  85          }
  86  
  87          // Scan the archive table for HistoryBlobStub objects or external flags (bug 22624)
  88          $flags = $dbr->selectField( 'archive', 'ar_flags',
  89              'ar_flags LIKE \'%external%\' OR (' .
  90              'ar_flags LIKE \'%object%\' ' .
  91              'AND LOWER(CONVERT(LEFT(ar_text,22) USING latin1)) = \'o:15:"historyblobstub"\' )',
  92              __METHOD__
  93          );
  94  
  95          if ( strpos( $flags, 'external' ) !== false ) {
  96              echo "Integrity check failed: found external storage pointers in your archive table.\n" .
  97                  "Run normaliseArchiveTable.php to fix this.\n";
  98              exit( 1 );
  99          } elseif ( $flags ) {
 100              echo "Integrity check failed: found HistoryBlobStub objects in your archive table.\n" .
 101                  "These objects are probably already broken, continuing would make them\n" .
 102                  "unrecoverable. Run \"normaliseArchiveTable.php --fix-cgz-bug\" to fix this.\n";
 103              exit( 1 );
 104          }
 105  
 106          echo "Integrity check OK\n";
 107      }
 108  
 109  	function initTrackingTable() {
 110          $dbw = wfGetDB( DB_MASTER );
 111          if ( $dbw->tableExists( 'blob_tracking' ) ) {
 112              $dbw->query( 'DROP TABLE ' . $dbw->tableName( 'blob_tracking' ) );
 113              $dbw->query( 'DROP TABLE ' . $dbw->tableName( 'blob_orphans' ) );
 114          }
 115          $dbw->sourceFile( __DIR__ . '/blob_tracking.sql' );
 116      }
 117  
 118  	function getTextClause() {
 119          if ( !$this->textClause ) {
 120              $dbr = wfGetDB( DB_SLAVE );
 121              $this->textClause = '';
 122              foreach ( $this->clusters as $cluster ) {
 123                  if ( $this->textClause != '' ) {
 124                      $this->textClause .= ' OR ';
 125                  }
 126                  $this->textClause .= 'old_text' . $dbr->buildLike( "DB://$cluster/", $dbr->anyString() );
 127              }
 128          }
 129  
 130          return $this->textClause;
 131      }
 132  
 133  	function interpretPointer( $text ) {
 134          if ( !preg_match( '!^DB://(\w+)/(\d+)(?:/([0-9a-fA-F]+)|)$!', $text, $m ) ) {
 135              return false;
 136          }
 137  
 138          return array(
 139              'cluster' => $m[1],
 140              'id' => intval( $m[2] ),
 141              'hash' => isset( $m[3] ) ? $m[3] : null
 142          );
 143      }
 144  
 145      /**
 146       *  Scan the revision table for rows stored in the specified clusters
 147       */
 148  	function trackRevisions() {
 149          $dbw = wfGetDB( DB_MASTER );
 150          $dbr = wfGetDB( DB_SLAVE );
 151  
 152          $textClause = $this->getTextClause();
 153          $startId = 0;
 154          $endId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
 155          $batchesDone = 0;
 156          $rowsInserted = 0;
 157  
 158          echo "Finding revisions...\n";
 159  
 160          while ( true ) {
 161              $res = $dbr->select( array( 'revision', 'text' ),
 162                  array( 'rev_id', 'rev_page', 'old_id', 'old_flags', 'old_text' ),
 163                  array(
 164                      'rev_id > ' . $dbr->addQuotes( $startId ),
 165                      'rev_text_id=old_id',
 166                      $textClause,
 167                      'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
 168                  ),
 169                  __METHOD__,
 170                  array(
 171                      'ORDER BY' => 'rev_id',
 172                      'LIMIT' => $this->batchSize
 173                  )
 174              );
 175              if ( !$res->numRows() ) {
 176                  break;
 177              }
 178  
 179              $insertBatch = array();
 180              foreach ( $res as $row ) {
 181                  $startId = $row->rev_id;
 182                  $info = $this->interpretPointer( $row->old_text );
 183                  if ( !$info ) {
 184                      echo "Invalid DB:// URL in rev_id {$row->rev_id}\n";
 185                      continue;
 186                  }
 187                  if ( !in_array( $info['cluster'], $this->clusters ) ) {
 188                      echo "Invalid cluster returned in SQL query: {$info['cluster']}\n";
 189                      continue;
 190                  }
 191                  $insertBatch[] = array(
 192                      'bt_page' => $row->rev_page,
 193                      'bt_rev_id' => $row->rev_id,
 194                      'bt_text_id' => $row->old_id,
 195                      'bt_cluster' => $info['cluster'],
 196                      'bt_blob_id' => $info['id'],
 197                      'bt_cgz_hash' => $info['hash']
 198                  );
 199                  if ( $this->doBlobOrphans ) {
 200                      gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
 201                  }
 202              }
 203              $dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
 204              $rowsInserted += count( $insertBatch );
 205  
 206              ++$batchesDone;
 207              if ( $batchesDone >= $this->reportingInterval ) {
 208                  $batchesDone = 0;
 209                  echo "$startId / $endId\n";
 210                  wfWaitForSlaves();
 211              }
 212          }
 213          echo "Found $rowsInserted revisions\n";
 214      }
 215  
 216      /**
 217       * Scan the text table for orphan text
 218       * Orphan text here does not imply DB corruption -- deleted text tracked by the
 219       * archive table counts as orphan for our purposes.
 220       */
 221  	function trackOrphanText() {
 222          # Wait until the blob_tracking table is available in the slave
 223          $dbw = wfGetDB( DB_MASTER );
 224          $dbr = wfGetDB( DB_SLAVE );
 225          $pos = $dbw->getMasterPos();
 226          $dbr->masterPosWait( $pos, 100000 );
 227  
 228          $textClause = $this->getTextClause( $this->clusters );
 229          $startId = 0;
 230          $endId = $dbr->selectField( 'text', 'MAX(old_id)', false, __METHOD__ );
 231          $rowsInserted = 0;
 232          $batchesDone = 0;
 233  
 234          echo "Finding orphan text...\n";
 235  
 236          # Scan the text table for orphan text
 237          while ( true ) {
 238              $res = $dbr->select( array( 'text', 'blob_tracking' ),
 239                  array( 'old_id', 'old_flags', 'old_text' ),
 240                  array(
 241                      'old_id>' . $dbr->addQuotes( $startId ),
 242                      $textClause,
 243                      'old_flags ' . $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() ),
 244                      'bt_text_id IS NULL'
 245                  ),
 246                  __METHOD__,
 247                  array(
 248                      'ORDER BY' => 'old_id',
 249                      'LIMIT' => $this->batchSize
 250                  ),
 251                  array( 'blob_tracking' => array( 'LEFT JOIN', 'bt_text_id=old_id' ) )
 252              );
 253              $ids = array();
 254              foreach ( $res as $row ) {
 255                  $ids[] = $row->old_id;
 256              }
 257  
 258              if ( !$res->numRows() ) {
 259                  break;
 260              }
 261  
 262              $insertBatch = array();
 263              foreach ( $res as $row ) {
 264                  $startId = $row->old_id;
 265                  $info = $this->interpretPointer( $row->old_text );
 266                  if ( !$info ) {
 267                      echo "Invalid DB:// URL in old_id {$row->old_id}\n";
 268                      continue;
 269                  }
 270                  if ( !in_array( $info['cluster'], $this->clusters ) ) {
 271                      echo "Invalid cluster returned in SQL query\n";
 272                      continue;
 273                  }
 274  
 275                  $insertBatch[] = array(
 276                      'bt_page' => 0,
 277                      'bt_rev_id' => 0,
 278                      'bt_text_id' => $row->old_id,
 279                      'bt_cluster' => $info['cluster'],
 280                      'bt_blob_id' => $info['id'],
 281                      'bt_cgz_hash' => $info['hash']
 282                  );
 283                  if ( $this->doBlobOrphans ) {
 284                      gmp_setbit( $this->trackedBlobs[$info['cluster']], $info['id'] );
 285                  }
 286              }
 287              $dbw->insert( 'blob_tracking', $insertBatch, __METHOD__ );
 288  
 289              $rowsInserted += count( $insertBatch );
 290              ++$batchesDone;
 291              if ( $batchesDone >= $this->reportingInterval ) {
 292                  $batchesDone = 0;
 293                  echo "$startId / $endId\n";
 294                  wfWaitForSlaves();
 295              }
 296          }
 297          echo "Found $rowsInserted orphan text rows\n";
 298      }
 299  
 300      /**
 301       * Scan the blobs table for rows not registered in blob_tracking (and thus not
 302       * registered in the text table).
 303       *
 304       * Orphan blobs are indicative of DB corruption. They are inaccessible and
 305       * should probably be deleted.
 306       */
 307  	function findOrphanBlobs() {
 308          if ( !extension_loaded( 'gmp' ) ) {
 309              echo "Can't find orphan blobs, need bitfield support provided by GMP.\n";
 310  
 311              return;
 312          }
 313  
 314          $dbw = wfGetDB( DB_MASTER );
 315  
 316          foreach ( $this->clusters as $cluster ) {
 317              echo "Searching for orphan blobs in $cluster...\n";
 318              $lb = wfGetLBFactory()->getExternalLB( $cluster );
 319              try {
 320                  $extDB = $lb->getConnection( DB_SLAVE );
 321              } catch ( DBConnectionError $e ) {
 322                  if ( strpos( $e->error, 'Unknown database' ) !== false ) {
 323                      echo "No database on $cluster\n";
 324                  } else {
 325                      echo "Error on $cluster: " . $e->getMessage() . "\n";
 326                  }
 327                  continue;
 328              }
 329              $table = $extDB->getLBInfo( 'blobs table' );
 330              if ( is_null( $table ) ) {
 331                  $table = 'blobs';
 332              }
 333              if ( !$extDB->tableExists( $table ) ) {
 334                  echo "No blobs table on cluster $cluster\n";
 335                  continue;
 336              }
 337              $startId = 0;
 338              $batchesDone = 0;
 339              $actualBlobs = gmp_init( 0 );
 340              $endId = $extDB->selectField( $table, 'MAX(blob_id)', false, __METHOD__ );
 341  
 342              // Build a bitmap of actual blob rows
 343              while ( true ) {
 344                  $res = $extDB->select( $table,
 345                      array( 'blob_id' ),
 346                      array( 'blob_id > ' . $extDB->addQuotes( $startId ) ),
 347                      __METHOD__,
 348                      array( 'LIMIT' => $this->batchSize, 'ORDER BY' => 'blob_id' )
 349                  );
 350  
 351                  if ( !$res->numRows() ) {
 352                      break;
 353                  }
 354  
 355                  foreach ( $res as $row ) {
 356                      gmp_setbit( $actualBlobs, $row->blob_id );
 357                  }
 358                  $startId = $row->blob_id;
 359  
 360                  ++$batchesDone;
 361                  if ( $batchesDone >= $this->reportingInterval ) {
 362                      $batchesDone = 0;
 363                      echo "$startId / $endId\n";
 364                  }
 365              }
 366  
 367              // Find actual blobs that weren't tracked by the previous passes
 368              // This is a set-theoretic difference A \ B, or in bitwise terms, A & ~B
 369              $orphans = gmp_and( $actualBlobs, gmp_com( $this->trackedBlobs[$cluster] ) );
 370  
 371              // Traverse the orphan list
 372              $insertBatch = array();
 373              $id = 0;
 374              $numOrphans = 0;
 375              while ( true ) {
 376                  $id = gmp_scan1( $orphans, $id );
 377                  if ( $id == -1 ) {
 378                      break;
 379                  }
 380                  $insertBatch[] = array(
 381                      'bo_cluster' => $cluster,
 382                      'bo_blob_id' => $id
 383                  );
 384                  if ( count( $insertBatch ) > $this->batchSize ) {
 385                      $dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
 386                      $insertBatch = array();
 387                  }
 388  
 389                  ++$id;
 390                  ++$numOrphans;
 391              }
 392              if ( $insertBatch ) {
 393                  $dbw->insert( 'blob_orphans', $insertBatch, __METHOD__ );
 394              }
 395              echo "Found $numOrphans orphan(s) in $cluster\n";
 396          }
 397      }
 398  }


Generated: Fri Nov 28 14:03:12 2014 Cross-referenced by PHPXref 0.7.1