[ Index ]

PHP Cross Reference of MediaWiki-1.24.0

title

Body

[close]

/maintenance/ -> updateCollation.php (source)

   1  <?php
   2  /**
   3   * Find all rows in the categorylinks table whose collation is out-of-date
   4   * (cl_collation != $wgCategoryCollation) and repopulate cl_sortkey
   5   * using the page title and cl_sortkey_prefix.
   6   *
   7   * This program is free software; you can redistribute it and/or modify
   8   * it under the terms of the GNU General Public License as published by
   9   * the Free Software Foundation; either version 2 of the License, or
  10   * (at your option) any later version.
  11   *
  12   * This program is distributed in the hope that it will be useful,
  13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15   * GNU General Public License for more details.
  16   *
  17   * You should have received a copy of the GNU General Public License along
  18   * with this program; if not, write to the Free Software Foundation, Inc.,
  19   * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  20   * http://www.gnu.org/copyleft/gpl.html
  21   *
  22   * @file
  23   * @ingroup Maintenance
  24   * @author Aryeh Gregor (Simetrical)
  25   */
  26  
  27  #$optionsWithArgs = array( 'begin', 'max-slave-lag' );
  28  
  29  require_once  __DIR__ . '/Maintenance.php';
  30  
  31  /**
  32   * Maintenance script that will find all rows in the categorylinks table
  33   * whose collation is out-of-date.
  34   *
  35   * @ingroup Maintenance
  36   */
  37  class UpdateCollation extends Maintenance {
  38      const BATCH_SIZE = 10000; // Number of rows to process in one batch
  39      const SYNC_INTERVAL = 20; // Wait for slaves after this many batches
  40  
  41      public $sizeHistogram = array();
  42  
  43  	public function __construct() {
  44          parent::__construct();
  45  
  46          global $wgCategoryCollation;
  47          $this->mDescription = <<<TEXT
  48  This script will find all rows in the categorylinks table whose collation is
  49  out-of-date (cl_collation != '$wgCategoryCollation') and repopulate cl_sortkey
  50  using the page title and cl_sortkey_prefix.  If all collations are
  51  up-to-date, it will do nothing.
  52  TEXT;
  53  
  54          $this->addOption( 'force', 'Run on all rows, even if the collation is ' .
  55              'supposed to be up-to-date.' );
  56          $this->addOption( 'previous-collation', 'Set the previous value of ' .
  57              '$wgCategoryCollation here to speed up this script, especially if your ' .
  58              'categorylinks table is large. This will only update rows with that ' .
  59              'collation, though, so it may miss out-of-date rows with a different, ' .
  60              'even older collation.', false, true );
  61          $this->addOption( 'target-collation', 'Set this to the new collation type to ' .
  62              'use instead of $wgCategoryCollation. Usually you should not use this, ' .
  63              'you should just update $wgCategoryCollation in LocalSettings.php.',
  64              false, true );
  65          $this->addOption( 'dry-run', 'Don\'t actually change the collations, just ' .
  66              'compile statistics.' );
  67          $this->addOption( 'verbose-stats', 'Show more statistics.' );
  68      }
  69  
  70  	public function execute() {
  71          global $wgCategoryCollation;
  72  
  73          $dbw = $this->getDB( DB_MASTER );
  74          $force = $this->getOption( 'force' );
  75          $dryRun = $this->getOption( 'dry-run' );
  76          $verboseStats = $this->getOption( 'verbose-stats' );
  77          if ( $this->hasOption( 'target-collation' ) ) {
  78              $collationName = $this->getOption( 'target-collation' );
  79              $collation = Collation::factory( $collationName );
  80          } else {
  81              $collationName = $wgCategoryCollation;
  82              $collation = Collation::singleton();
  83          }
  84  
  85          // Collation sanity check: in some cases the constructor will work,
  86          // but this will raise an exception, breaking all category pages
  87          $collation->getFirstLetter( 'MediaWiki' );
  88  
  89          $options = array(
  90              'LIMIT' => self::BATCH_SIZE,
  91              'ORDER BY' => 'cl_to, cl_type, cl_from',
  92              'STRAIGHT_JOIN',
  93          );
  94  
  95          if ( $force || $dryRun ) {
  96              $collationConds = array();
  97          } else {
  98              if ( $this->hasOption( 'previous-collation' ) ) {
  99                  $collationConds['cl_collation'] = $this->getOption( 'previous-collation' );
 100              } else {
 101                  $collationConds = array( 0 =>
 102                      'cl_collation != ' . $dbw->addQuotes( $collationName )
 103                  );
 104              }
 105  
 106              $count = $dbw->estimateRowCount(
 107                  'categorylinks',
 108                  '*',
 109                  $collationConds,
 110                  __METHOD__
 111              );
 112              // Improve estimate if feasible
 113              if ( $count < 1000000 ) {
 114                  $count = $dbw->selectField(
 115                      'categorylinks',
 116                      'COUNT(*)',
 117                      $collationConds,
 118                      __METHOD__
 119                  );
 120              }
 121              if ( $count == 0 ) {
 122                  $this->output( "Collations up-to-date.\n" );
 123  
 124                  return;
 125              }
 126              $this->output( "Fixing collation for $count rows.\n" );
 127          }
 128  
 129          $count = 0;
 130          $batchCount = 0;
 131          $batchConds = array();
 132          do {
 133              $this->output( "Selecting next " . self::BATCH_SIZE . " rows..." );
 134              $res = $dbw->select(
 135                  array( 'categorylinks', 'page' ),
 136                  array( 'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
 137                      'cl_sortkey', 'cl_type', 'page_namespace', 'page_title'
 138                  ),
 139                  array_merge( $collationConds, $batchConds, array( 'cl_from = page_id' ) ),
 140                  __METHOD__,
 141                  $options
 142              );
 143              $this->output( " processing..." );
 144  
 145              if ( !$dryRun ) {
 146                  $dbw->begin( __METHOD__ );
 147              }
 148              foreach ( $res as $row ) {
 149                  $title = Title::newFromRow( $row );
 150                  if ( !$row->cl_collation ) {
 151                      # This is an old-style row, so the sortkey needs to be
 152                      # converted.
 153                      if ( $row->cl_sortkey == $title->getText()
 154                          || $row->cl_sortkey == $title->getPrefixedText()
 155                      ) {
 156                          $prefix = '';
 157                      } else {
 158                          # Custom sortkey, use it as a prefix
 159                          $prefix = $row->cl_sortkey;
 160                      }
 161                  } else {
 162                      $prefix = $row->cl_sortkey_prefix;
 163                  }
 164                  # cl_type will be wrong for lots of pages if cl_collation is 0,
 165                  # so let's update it while we're here.
 166                  if ( $title->getNamespace() == NS_CATEGORY ) {
 167                      $type = 'subcat';
 168                  } elseif ( $title->getNamespace() == NS_FILE ) {
 169                      $type = 'file';
 170                  } else {
 171                      $type = 'page';
 172                  }
 173                  $newSortKey = $collation->getSortKey(
 174                      $title->getCategorySortkey( $prefix ) );
 175                  if ( $verboseStats ) {
 176                      $this->updateSortKeySizeHistogram( $newSortKey );
 177                  }
 178  
 179                  if ( !$dryRun ) {
 180                      $dbw->update(
 181                          'categorylinks',
 182                          array(
 183                              'cl_sortkey' => $newSortKey,
 184                              'cl_sortkey_prefix' => $prefix,
 185                              'cl_collation' => $collationName,
 186                              'cl_type' => $type,
 187                              'cl_timestamp = cl_timestamp',
 188                          ),
 189                          array( 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ),
 190                          __METHOD__
 191                      );
 192                  }
 193                  if ( $row ) {
 194                      $batchConds = array( $this->getBatchCondition( $row, $dbw ) );
 195                  }
 196              }
 197              if ( !$dryRun ) {
 198                  $dbw->commit( __METHOD__ );
 199              }
 200  
 201              $count += $res->numRows();
 202              $this->output( "$count done.\n" );
 203  
 204              if ( !$dryRun && ++$batchCount % self::SYNC_INTERVAL == 0 ) {
 205                  $this->output( "Waiting for slaves ... " );
 206                  wfWaitForSlaves();
 207                  $this->output( "done\n" );
 208              }
 209          } while ( $res->numRows() == self::BATCH_SIZE );
 210  
 211          $this->output( "$count rows processed\n" );
 212  
 213          if ( $verboseStats ) {
 214              $this->output( "\n" );
 215              $this->showSortKeySizeHistogram();
 216          }
 217      }
 218  
 219      /**
 220       * Return an SQL expression selecting rows which sort above the given row,
 221       * assuming an ordering of cl_to, cl_type, cl_from
 222       * @param stdClass $row
 223       * @param DatabaseBase $dbw
 224       * @return string
 225       */
 226  	function getBatchCondition( $row, $dbw ) {
 227          $fields = array( 'cl_to', 'cl_type', 'cl_from' );
 228          $first = true;
 229          $cond = false;
 230          $prefix = false;
 231          foreach ( $fields as $field ) {
 232              $encValue = $dbw->addQuotes( $row->$field );
 233              $inequality = "$field > $encValue";
 234              $equality = "$field = $encValue";
 235              if ( $first ) {
 236                  $cond = $inequality;
 237                  $prefix = $equality;
 238                  $first = false;
 239              } else {
 240                  $cond .= " OR ($prefix AND $inequality)";
 241                  $prefix .= " AND $equality";
 242              }
 243          }
 244  
 245          return $cond;
 246      }
 247  
 248  	function updateSortKeySizeHistogram( $key ) {
 249          $length = strlen( $key );
 250          if ( !isset( $this->sizeHistogram[$length] ) ) {
 251              $this->sizeHistogram[$length] = 0;
 252          }
 253          $this->sizeHistogram[$length]++;
 254      }
 255  
 256  	function showSortKeySizeHistogram() {
 257          $maxLength = max( array_keys( $this->sizeHistogram ) );
 258          if ( $maxLength == 0 ) {
 259              return;
 260          }
 261          $numBins = 20;
 262          $coarseHistogram = array_fill( 0, $numBins, 0 );
 263          $coarseBoundaries = array();
 264          $boundary = 0;
 265          for ( $i = 0; $i < $numBins - 1; $i++ ) {
 266              $boundary += $maxLength / $numBins;
 267              $coarseBoundaries[$i] = round( $boundary );
 268          }
 269          $coarseBoundaries[$numBins - 1] = $maxLength + 1;
 270          $raw = '';
 271          for ( $i = 0; $i <= $maxLength; $i++ ) {
 272              if ( $raw !== '' ) {
 273                  $raw .= ', ';
 274              }
 275              if ( !isset( $this->sizeHistogram[$i] ) ) {
 276                  $val = 0;
 277              } else {
 278                  $val = $this->sizeHistogram[$i];
 279              }
 280              for ( $coarseIndex = 0; $coarseIndex < $numBins - 1; $coarseIndex++ ) {
 281                  if ( $coarseBoundaries[$coarseIndex] > $i ) {
 282                      $coarseHistogram[$coarseIndex] += $val;
 283                      break;
 284                  }
 285              }
 286              if ( $coarseIndex == $numBins - 1 ) {
 287                  $coarseHistogram[$coarseIndex] += $val;
 288              }
 289              $raw .= $val;
 290          }
 291  
 292          $this->output( "Sort key size histogram\nRaw data: $raw\n\n" );
 293  
 294          $maxBinVal = max( $coarseHistogram );
 295          $scale = 60 / $maxBinVal;
 296          $prevBoundary = 0;
 297          for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
 298              if ( !isset( $coarseHistogram[$coarseIndex] ) ) {
 299                  $val = 0;
 300              } else {
 301                  $val = $coarseHistogram[$coarseIndex];
 302              }
 303              $boundary = $coarseBoundaries[$coarseIndex];
 304              $this->output( sprintf( "%-10s %-10d |%s\n",
 305                  $prevBoundary . '-' . ( $boundary - 1 ) . ': ',
 306                  $val,
 307                  str_repeat( '*', $scale * $val ) ) );
 308              $prevBoundary = $boundary;
 309          }
 310      }
 311  }
 312  
 313  $maintClass = "UpdateCollation";
 314  require_once RUN_MAINTENANCE_IF_MAIN;


Generated: Fri Nov 28 14:03:12 2014 Cross-referenced by PHPXref 0.7.1