MediaWiki  REL1_21
updateCollation.php
Go to the documentation of this file.
00001 <?php
00027 #$optionsWithArgs = array( 'begin', 'max-slave-lag' );
00028 
00029 require_once( __DIR__ . '/Maintenance.php' );
00030 
/**
 * Maintenance script that finds rows in the categorylinks table whose
 * collation is out of date and recomputes cl_sortkey (and cl_type) for them
 * from the page title and the sort key prefix.
 */
class UpdateCollation extends Maintenance {
	const BATCH_SIZE = 10000; // Number of rows to process in one batch
	const SYNC_INTERVAL = 20; // Wait for slaves after this many batches

	/**
	 * Map of sort key length (as reported by strlen()) => number of sort
	 * keys of that length seen so far. Only filled with --verbose-stats.
	 *
	 * @var array
	 */
	public $sizeHistogram = array();

	/**
	 * Set the script description and register its command line options.
	 */
	public function __construct() {
		parent::__construct();

		global $wgCategoryCollation;
		$this->mDescription = <<<TEXT
This script will find all rows in the categorylinks table whose collation is
out-of-date (cl_collation != '$wgCategoryCollation') and repopulate cl_sortkey
using the page title and cl_sortkey_prefix.  If everything's collation is
up-to-date, it will do nothing.
TEXT;

		$this->addOption( 'force', 'Run on all rows, even if the collation is ' .
			'supposed to be up-to-date.' );
		$this->addOption( 'previous-collation', 'Set the previous value of ' .
			'$wgCategoryCollation here to speed up this script, especially if your ' .
			'categorylinks table is large. This will only update rows with that ' .
			'collation, though, so it may miss out-of-date rows with a different, ' .
			'even older collation.', false, true );
		$this->addOption( 'target-collation', 'Set this to the new collation type to ' .
			'use instead of $wgCategoryCollation. Usually you should not use this, ' .
			'you should just update $wgCategoryCollation in LocalSettings.php.',
			false, true );
		$this->addOption( 'dry-run', 'Don\'t actually change the collations, just ' .
			'compile statistics.' );
		$this->addOption( 'verbose-stats', 'Show more statistics.' );
	}

	/**
	 * Walk the categorylinks table in batches of BATCH_SIZE rows, ordered by
	 * (cl_to, cl_type, cl_from), and rewrite cl_sortkey, cl_sortkey_prefix,
	 * cl_collation and cl_type for every selected row.
	 *
	 * With --force or --dry-run no collation filter is applied, so every row
	 * is visited; with --dry-run nothing is written.
	 */
	public function execute() {
		global $wgCategoryCollation;

		$dbw = $this->getDB( DB_MASTER );
		$force = $this->getOption( 'force' );
		$dryRun = $this->getOption( 'dry-run' );
		$verboseStats = $this->getOption( 'verbose-stats' );
		if ( $this->hasOption( 'target-collation' ) ) {
			$collationName = $this->getOption( 'target-collation' );
			$collation = Collation::factory( $collationName );
		} else {
			$collationName = $wgCategoryCollation;
			$collation = Collation::singleton();
		}

		$options = array(
			'LIMIT' => self::BATCH_SIZE,
			'ORDER BY' => 'cl_to, cl_type, cl_from',
			'STRAIGHT_JOIN',
		);

		// Always start from a defined (empty) condition list; it stays empty
		// for --force and --dry-run so that every row is selected.
		$collationConds = array();
		if ( !$force && !$dryRun ) {
			if ( $this->hasOption( 'previous-collation' ) ) {
				$collationConds['cl_collation'] = $this->getOption( 'previous-collation' );
			} else {
				$collationConds[] = 'cl_collation != ' . $dbw->addQuotes( $collationName );
			}

			$count = $dbw->estimateRowCount(
				'categorylinks',
				'*',
				$collationConds,
				__METHOD__
			);
			// Improve estimate if feasible
			if ( $count < 1000000 ) {
				$count = $dbw->selectField(
					'categorylinks',
					'COUNT(*)',
					$collationConds,
					__METHOD__
				);
			}
			if ( $count == 0 ) {
				$this->output( "Collations up-to-date.\n" );
				return;
			}
			$this->output( "Fixing collation for $count rows.\n" );
		}

		$count = 0;
		$batchCount = 0;
		$batchConds = array();
		do {
			$this->output( "Selecting next " . self::BATCH_SIZE . " rows..." );
			$res = $dbw->select(
				array( 'categorylinks', 'page' ),
				array( 'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
					'cl_sortkey', 'cl_type', 'page_namespace', 'page_title'
				),
				array_merge( $collationConds, $batchConds, array( 'cl_from = page_id' ) ),
				__METHOD__,
				$options
			);
			$this->output( " processing..." );

			if ( !$dryRun ) {
				$dbw->begin( __METHOD__ );
			}
			// Stays false when this batch is empty, so the check after the
			// loop never reads an undefined variable.
			$row = false;
			foreach ( $res as $row ) {
				$title = Title::newFromRow( $row );
				if ( !$row->cl_collation ) {
					# This is an old-style row, so the sortkey needs to be
					# converted.
					if ( $row->cl_sortkey == $title->getText()
						|| $row->cl_sortkey == $title->getPrefixedText() ) {
						$prefix = '';
					} else {
						# Custom sortkey, use it as a prefix
						$prefix = $row->cl_sortkey;
					}
				} else {
					$prefix = $row->cl_sortkey_prefix;
				}
				# cl_type will be wrong for lots of pages if cl_collation is 0,
				# so let's update it while we're here.
				if ( $title->getNamespace() == NS_CATEGORY ) {
					$type = 'subcat';
				} elseif ( $title->getNamespace() == NS_FILE ) {
					$type = 'file';
				} else {
					$type = 'page';
				}
				$newSortKey = $collation->getSortKey(
					$title->getCategorySortkey( $prefix ) );
				if ( $verboseStats ) {
					$this->updateSortKeySizeHistogram( $newSortKey );
				}

				if ( !$dryRun ) {
					$dbw->update(
						'categorylinks',
						array(
							'cl_sortkey' => $newSortKey,
							'cl_sortkey_prefix' => $prefix,
							'cl_collation' => $collationName,
							'cl_type' => $type,
							// Self-assignment; presumably keeps the database
							// from refreshing the timestamp column on update.
							'cl_timestamp = cl_timestamp',
						),
						array( 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ),
						__METHOD__
					);
				}
			}
			if ( !$dryRun ) {
				$dbw->commit( __METHOD__ );
			}

			if ( $row ) {
				// Continue after the last row seen in this batch.
				$batchConds = array( $this->getBatchCondition( $row ) );
			}

			$count += $res->numRows();
			$this->output( "$count done.\n" );

			if ( !$dryRun && ++$batchCount % self::SYNC_INTERVAL == 0 ) {
				$this->output( "Waiting for slaves ... " );
				wfWaitForSlaves();
				$this->output( "done\n" );
			}
			// A short batch means there is nothing left to select.
		} while ( $res->numRows() == self::BATCH_SIZE );

		$this->output( "$count rows processed\n" );

		if ( $verboseStats ) {
			$this->output( "\n" );
			$this->showSortKeySizeHistogram();
		}
	}

	/**
	 * Return an SQL expression selecting the rows which sort above the given
	 * row, assuming an ordering of cl_to, cl_type, cl_from.
	 *
	 * The result has the form
	 *   a > A OR (a = A AND b > B) OR (a = A AND b = B AND c > C)
	 * with every value quoted through the database's addQuotes().
	 *
	 * @param $row object Row providing the cl_to, cl_type and cl_from values
	 * @return string
	 */
	public function getBatchCondition( $row ) {
		$dbw = $this->getDB( DB_MASTER );
		$fields = array( 'cl_to', 'cl_type', 'cl_from' );
		$first = true;
		$cond = false;
		$prefix = false;
		foreach ( $fields as $field ) {
			$encValue = $dbw->addQuotes( $row->$field );
			$inequality = "$field > $encValue";
			$equality = "$field = $encValue";
			if ( $first ) {
				$cond = $inequality;
				$prefix = $equality;
				$first = false;
			} else {
				// $prefix pins all earlier fields to the row's values.
				$cond .= " OR ($prefix AND $inequality)";
				$prefix .= " AND $equality";
			}
		}
		return $cond;
	}

	/**
	 * Count one sort key in the size histogram, bucketed by its strlen().
	 *
	 * @param $key string Sort key to record
	 */
	public function updateSortKeySizeHistogram( $key ) {
		$length = strlen( $key );
		if ( !isset( $this->sizeHistogram[$length] ) ) {
			$this->sizeHistogram[$length] = 0;
		}
		$this->sizeHistogram[$length]++;
	}

	/**
	 * Output the raw per-length counts followed by a 20-bin text histogram
	 * of the recorded sort key sizes. Outputs nothing when no sort key was
	 * recorded or when the longest recorded key has length zero.
	 */
	public function showSortKeySizeHistogram() {
		if ( !$this->sizeHistogram ) {
			// Nothing recorded; max() must not be called on an empty array.
			return;
		}
		$maxLength = max( array_keys( $this->sizeHistogram ) );
		if ( $maxLength == 0 ) {
			return;
		}
		$numBins = 20;
		$coarseHistogram = array_fill( 0, $numBins, 0 );
		// $coarseBoundaries[$i] is the exclusive upper bound of bin $i.
		$coarseBoundaries = array();
		$boundary = 0;
		for ( $i = 0; $i < $numBins - 1; $i++ ) {
			$boundary += $maxLength / $numBins;
			$coarseBoundaries[$i] = round( $boundary );
		}
		$coarseBoundaries[$numBins - 1] = $maxLength + 1;
		$raw = '';
		for ( $i = 0; $i <= $maxLength; $i++ ) {
			if ( $raw !== '' ) {
				$raw .= ', ';
			}
			if ( !isset( $this->sizeHistogram[$i] ) ) {
				$val = 0;
			} else {
				$val = $this->sizeHistogram[$i];
			}
			// Put the count into the first bin whose upper bound exceeds $i.
			for ( $coarseIndex = 0; $coarseIndex < $numBins - 1; $coarseIndex++ ) {
				if ( $coarseBoundaries[$coarseIndex] > $i ) {
					$coarseHistogram[$coarseIndex] += $val;
					break;
				}
			}
			// The search ran off the end: the length belongs to the last bin.
			if ( $coarseIndex == $numBins - 1 ) {
				$coarseHistogram[$coarseIndex] += $val;
			}
			$raw .= $val;
		}

		$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

		$maxBinVal = max( $coarseHistogram );
		// Scale the bars so that the fullest bin is 60 characters wide.
		$scale = 60 / $maxBinVal;
		$prevBoundary = 0;
		for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
			$val = $coarseHistogram[$coarseIndex];
			$boundary = $coarseBoundaries[$coarseIndex];
			$this->output( sprintf( "%-10s %-10d |%s\n",
				$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
				$val,
				// Explicit truncation: str_repeat() expects an integer count.
				str_repeat( '*', (int)( $scale * $val ) ) ) );
			$prevBoundary = $boundary;
		}
	}
}
00304 
// Name of the maintenance class defined in this file.
$maintClass = 'UpdateCollation';
// NOTE(review): RUN_MAINTENANCE_IF_MAIN is not defined in this file; it is
// presumably a path constant set up by Maintenance.php - confirm.
require_once RUN_MAINTENANCE_IF_MAIN;