MediaWiki  REL1_20
updateCollation.php
Go to the documentation of this file.
00001 <?php
00027 #$optionsWithArgs = array( 'begin', 'max-slave-lag' );
00028 
00029 require_once( __DIR__ . '/Maintenance.php' );
00030 
/**
 * Maintenance script that recomputes cl_sortkey (and, while it is there,
 * cl_sortkey_prefix, cl_collation and cl_type) for rows of the categorylinks
 * table, working in batches of BATCH_SIZE rows.
 *
 * By default only rows whose cl_collation differs from the target collation
 * are touched; see the option descriptions registered in the constructor.
 */
class UpdateCollation extends Maintenance {
	const BATCH_SIZE = 50; // Number of rows to process in one batch
	const SYNC_INTERVAL = 20; // Wait for slaves after this many batches

	/**
	 * Sort key length (in bytes) => number of sort keys of that length.
	 * Only filled in when --verbose-stats is given.
	 */
	public $sizeHistogram = array();

	/**
	 * Set the script description and register the command line options.
	 */
	public function __construct() {
		parent::__construct();

		// Interpolated into the description below.
		global $wgCategoryCollation;
		$this->mDescription = <<<TEXT
This script will find all rows in the categorylinks table whose collation is
out-of-date (cl_collation != '$wgCategoryCollation') and repopulate cl_sortkey
using the page title and cl_sortkey_prefix.  If everything's collation is
up-to-date, it will do nothing.
TEXT;

		$this->addOption( 'force', 'Run on all rows, even if the collation is ' .
			'supposed to be up-to-date.' );
		$this->addOption( 'previous-collation', 'Set the previous value of ' .
			'$wgCategoryCollation here to speed up this script, especially if your ' .
			'categorylinks table is large. This will only update rows with that ' .
			'collation, though, so it may miss out-of-date rows with a different, ' .
			'even older collation.', false, true );
		$this->addOption( 'target-collation', 'Set this to the new collation type to ' .
			'use instead of $wgCategoryCollation. Usually you should not use this, ' .
			'you should just update $wgCategoryCollation in LocalSettings.php.',
			false, true );
		$this->addOption( 'dry-run', 'Don\'t actually change the collations, just ' .
			'compile statistics.' );
		$this->addOption( 'verbose-stats', 'Show more statistics.' );
	}

	/**
	 * Select categorylinks rows batch by batch, recompute their sort keys and
	 * (unless --dry-run is given) write them back, one transaction per batch.
	 *
	 * With --force or --dry-run every row is visited: the collation filter is
	 * dropped and batches are paged by (cl_from, cl_to). Otherwise the loop
	 * relies on updated rows no longer matching the collation filter.
	 *
	 * NOTE(review): when --previous-collation equals the target collation and
	 * --force is not given, updated rows still match the filter and no paging
	 * condition is applied, so full batches would be re-selected indefinitely.
	 * Confirm that combination is never used.
	 */
	public function execute() {
		global $wgCategoryCollation, $wgMiserMode;

		$dbw = $this->getDB( DB_MASTER );
		$force = $this->getOption( 'force' );
		$dryRun = $this->getOption( 'dry-run' );
		$verboseStats = $this->getOption( 'verbose-stats' );
		if ( $this->hasOption( 'target-collation' ) ) {
			$collationName = $this->getOption( 'target-collation' );
			$collation = Collation::factory( $collationName );
		} else {
			$collationName = $wgCategoryCollation;
			$collation = Collation::singleton();
		}

		$options = array( 'LIMIT' => self::BATCH_SIZE, 'STRAIGHT_JOIN' );
		$collationConds = array();

		// In these modes processed rows keep matching the query, so the
		// batches have to be ordered and paged explicitly.
		$paged = $force || $dryRun;

		if ( $paged ) {
			$options['ORDER BY'] = 'cl_from, cl_to';
		} else {
			if ( $this->hasOption( 'previous-collation' ) ) {
				$collationConds['cl_collation'] = $this->getOption( 'previous-collation' );
			} else {
				$collationConds[] = 'cl_collation != ' . $dbw->addQuotes( $collationName );
			}

			if ( !$wgMiserMode ) {
				$count = $dbw->selectField(
					'categorylinks',
					'COUNT(*)',
					$collationConds,
					__METHOD__
				);
			} else {
				// In miser mode settle for an estimate instead of COUNT(*).
				$count = $dbw->estimateRowCount(
					'categorylinks',
					'*',
					$collationConds,
					__METHOD__
				);
			}
			if ( $count == 0 ) {
				$this->output( "Collations up-to-date.\n" );
				return;
			}
			$this->output( "Fixing collation for $count rows.\n" );
		}

		$count = 0;
		$batchCount = 0;
		$batchConds = array();
		do {
			$this->output( "Selecting next " . self::BATCH_SIZE . " rows..." );
			$res = $dbw->select(
				array( 'categorylinks', 'page' ),
				array( 'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
					'cl_sortkey', 'page_namespace', 'page_title'
				),
				array_merge( $collationConds, $batchConds, array( 'cl_from = page_id' ) ),
				__METHOD__,
				$options
			);
			$this->output( " processing..." );

			if ( !$dryRun ) {
				$dbw->begin( __METHOD__ );
			}

			// Tracked explicitly: the foreach variable is undefined after an
			// empty first batch and stale after an empty later one.
			$lastRow = null;
			foreach ( $res as $row ) {
				$lastRow = $row;
				$title = Title::newFromRow( $row );
				$prefix = $this->getSortKeyPrefix( $row, $title );
				$newSortKey = $collation->getSortKey(
					$title->getCategorySortkey( $prefix ) );
				if ( $verboseStats ) {
					$this->updateSortKeySizeHistogram( $newSortKey );
				}

				if ( !$dryRun ) {
					$dbw->update(
						'categorylinks',
						array(
							'cl_sortkey' => $newSortKey,
							'cl_sortkey_prefix' => $prefix,
							'cl_collation' => $collationName,
							# cl_type will be wrong for lots of pages if cl_collation
							# is 0, so let's update it while we're here.
							'cl_type' => $this->getLinkType( $title ),
							# Assign the column to itself; presumably this keeps the
							# database from auto-updating the timestamp - TODO confirm.
							'cl_timestamp = cl_timestamp',
						),
						array( 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ),
						__METHOD__
					);
				}
			}
			if ( !$dryRun ) {
				$dbw->commit( __METHOD__ );
			}

			if ( $paged && $lastRow !== null ) {
				// Continue strictly after the last (cl_from, cl_to) pair seen.
				$encFrom = $dbw->addQuotes( $lastRow->cl_from );
				$encTo = $dbw->addQuotes( $lastRow->cl_to );
				$batchConds = array(
					"(cl_from = $encFrom AND cl_to > $encTo) " .
					" OR cl_from > $encFrom" );
			}

			$count += $res->numRows();
			$this->output( "$count done.\n" );

			if ( !$dryRun && ++$batchCount % self::SYNC_INTERVAL == 0 ) {
				$this->output( "Waiting for slaves ... " );
				wfWaitForSlaves();
				$this->output( "done\n" );
			}
			// A short batch means there is nothing left to select.
		} while ( $res->numRows() == self::BATCH_SIZE );

		$this->output( "$count rows processed\n" );

		if ( $verboseStats ) {
			$this->output( "\n" );
			$this->showSortKeySizeHistogram();
		}
	}

	/**
	 * Work out the sort key prefix to store for a categorylinks row.
	 *
	 * @param $row object Result row with cl_collation, cl_sortkey and
	 *   cl_sortkey_prefix
	 * @param $title Title Title built from the same row
	 * @return string
	 */
	private function getSortKeyPrefix( $row, $title ) {
		if ( $row->cl_collation ) {
			return $row->cl_sortkey_prefix;
		}

		# This is an old-style row, so the sortkey needs to be converted.
		if ( $row->cl_sortkey == $title->getText()
			|| $row->cl_sortkey == $title->getPrefixedText()
		) {
			return '';
		}

		# Custom sortkey, use it as a prefix
		return $row->cl_sortkey;
	}

	/**
	 * Map the namespace of a title to the cl_type value written for it.
	 *
	 * @param $title Title
	 * @return string 'subcat', 'file' or 'page'
	 */
	private function getLinkType( $title ) {
		$namespace = $title->getNamespace();
		if ( $namespace == NS_CATEGORY ) {
			return 'subcat';
		}
		if ( $namespace == NS_FILE ) {
			return 'file';
		}
		return 'page';
	}

	/**
	 * Count one sort key in the length histogram.
	 *
	 * @param $key string Sort key; its length is measured with strlen()
	 */
	public function updateSortKeySizeHistogram( $key ) {
		$length = strlen( $key );
		if ( !isset( $this->sizeHistogram[$length] ) ) {
			$this->sizeHistogram[$length] = 0;
		}
		$this->sizeHistogram[$length]++;
	}

	/**
	 * Output the collected sort key lengths, first as a raw comma-separated
	 * list of counts per length and then as a 20-bin bar chart.
	 *
	 * Outputs nothing when no sort keys were recorded or when every recorded
	 * key has length 0.
	 */
	public function showSortKeySizeHistogram() {
		if ( !$this->sizeHistogram ) {
			// max() must not be called on an empty array.
			return;
		}
		$maxLength = max( array_keys( $this->sizeHistogram ) );
		if ( $maxLength == 0 ) {
			return;
		}

		$numBins = 20;
		$maxBarWidth = 60;
		$lastBin = $numBins - 1;

		// $coarseBoundaries[$i] is the exclusive upper bound of bin $i.
		$coarseHistogram = array_fill( 0, $numBins, 0 );
		$coarseBoundaries = array();
		$boundary = 0;
		for ( $i = 0; $i < $lastBin; $i++ ) {
			$boundary += $maxLength / $numBins;
			$coarseBoundaries[$i] = round( $boundary );
		}
		$coarseBoundaries[$lastBin] = $maxLength + 1;

		$rawCounts = array();
		for ( $i = 0; $i <= $maxLength; $i++ ) {
			$val = isset( $this->sizeHistogram[$i] ) ? $this->sizeHistogram[$i] : 0;
			$rawCounts[] = $val;

			// First bin whose upper bound exceeds this length; anything not
			// caught by an earlier bin goes into the last one.
			$bin = $lastBin;
			for ( $coarseIndex = 0; $coarseIndex < $lastBin; $coarseIndex++ ) {
				if ( $coarseBoundaries[$coarseIndex] > $i ) {
					$bin = $coarseIndex;
					break;
				}
			}
			$coarseHistogram[$bin] += $val;
		}
		$raw = implode( ', ', $rawCounts );

		$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

		// At least 1 here, because at least one sort key has been counted.
		$maxBinVal = max( $coarseHistogram );
		$prevBoundary = 0;
		for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
			$val = $coarseHistogram[$coarseIndex];
			$boundary = $coarseBoundaries[$coarseIndex];
			// Multiply before dividing so that the fullest bin gets exactly
			// $maxBarWidth stars, and hand str_repeat() a real integer.
			$barLength = (int)floor( $maxBarWidth * $val / $maxBinVal );
			$this->output( sprintf( "%-10s %-10d |%s\n",
				$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
				$val,
				str_repeat( '*', $barLength ) ) );
			$prevBoundary = $boundary;
		}
	}
}
00279 
// Publish this script's class name in $maintClass, then include the file
// named by RUN_MAINTENANCE_IF_MAIN (defined outside this file; presumably the
// runner that instantiates and executes that class - TODO confirm).
$maintClass = 'UpdateCollation';
require_once RUN_MAINTENANCE_IF_MAIN;