MediaWiki
REL1_24
|
<?php
/**
 * Maintenance script that finds rows in the categorylinks table whose
 * collation is out-of-date and regenerates their sort keys.
 */

#$optionsWithArgs = array( 'begin', 'max-slave-lag' );

require_once __DIR__ . '/Maintenance.php';

/**
 * Maintenance script that repopulates cl_sortkey (and fixes cl_sortkey_prefix,
 * cl_collation and cl_type) for categorylinks rows, using the page title and
 * the sortkey prefix of each row.
 */
class UpdateCollation extends Maintenance {
	const BATCH_SIZE = 10000; // Number of rows to process in one batch
	const SYNC_INTERVAL = 20; // Wait for slaves after this many batches

	/** @var array Map of sort key length (in bytes) => number of keys seen with that length */
	public $sizeHistogram = array();

	/**
	 * Registers the script description and its command line options.
	 */
	public function __construct() {
		parent::__construct();

		global $wgCategoryCollation;
		$this->mDescription = <<<TEXT
This script will find all rows in the categorylinks table whose collation is
out-of-date (cl_collation != '$wgCategoryCollation') and repopulate cl_sortkey
using the page title and cl_sortkey_prefix.  If all collations are
up-to-date, it will do nothing.
TEXT;

		$this->addOption( 'force', 'Run on all rows, even if the collation is ' .
			'supposed to be up-to-date.' );
		$this->addOption( 'previous-collation', 'Set the previous value of ' .
			'$wgCategoryCollation here to speed up this script, especially if your ' .
			'categorylinks table is large. This will only update rows with that ' .
			'collation, though, so it may miss out-of-date rows with a different, ' .
			'even older collation.', false, true );
		$this->addOption( 'target-collation', 'Set this to the new collation type to ' .
			'use instead of $wgCategoryCollation. Usually you should not use this, ' .
			'you should just update $wgCategoryCollation in LocalSettings.php.',
			false, true );
		$this->addOption( 'dry-run', 'Don\'t actually change the collations, just ' .
			'compile statistics.' );
		$this->addOption( 'verbose-stats', 'Show more statistics.' );
	}

	/**
	 * Walks the categorylinks table in batches of BATCH_SIZE rows (ordered by
	 * cl_to, cl_type, cl_from), recomputes the sort key of every selected row
	 * and, unless --dry-run was given, writes it back inside one transaction
	 * per batch. Every SYNC_INTERVAL batches it waits for the slaves.
	 */
	public function execute() {
		global $wgCategoryCollation;

		$dbw = $this->getDB( DB_MASTER );
		$force = $this->getOption( 'force' );
		$dryRun = $this->getOption( 'dry-run' );
		$verboseStats = $this->getOption( 'verbose-stats' );
		if ( $this->hasOption( 'target-collation' ) ) {
			$collationName = $this->getOption( 'target-collation' );
			$collation = Collation::factory( $collationName );
		} else {
			$collationName = $wgCategoryCollation;
			$collation = Collation::singleton();
		}

		// Collation sanity check: in some cases the constructor will work,
		// but this will raise an exception, breaking all category pages
		$collation->getFirstLetter( 'MediaWiki' );

		$options = array(
			'LIMIT' => self::BATCH_SIZE,
			'ORDER BY' => 'cl_to, cl_type, cl_from',
			'STRAIGHT_JOIN',
		);

		if ( $force || $dryRun ) {
			// No collation filter: every row is visited.
			$collationConds = array();
		} else {
			if ( $this->hasOption( 'previous-collation' ) ) {
				$collationConds = array(
					'cl_collation' => $this->getOption( 'previous-collation' )
				);
			} else {
				$collationConds = array( 0 =>
					'cl_collation != ' . $dbw->addQuotes( $collationName )
				);
			}

			$count = $dbw->estimateRowCount(
				'categorylinks',
				'*',
				$collationConds,
				__METHOD__
			);
			// Improve estimate if feasible
			if ( $count < 1000000 ) {
				$count = $dbw->selectField(
					'categorylinks',
					'COUNT(*)',
					$collationConds,
					__METHOD__
				);
			}
			if ( (int)$count === 0 ) {
				$this->output( "Collations up-to-date.\n" );

				return;
			}
			$this->output( "Fixing collation for $count rows.\n" );
		}

		$count = 0;
		$batchCount = 0;
		$batchConds = array();
		do {
			$this->output( "Selecting next " . self::BATCH_SIZE . " rows..." );
			$res = $dbw->select(
				array( 'categorylinks', 'page' ),
				array( 'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
					'cl_sortkey', 'cl_type', 'page_namespace', 'page_title'
				),
				array_merge( $collationConds, $batchConds, array( 'cl_from = page_id' ) ),
				__METHOD__,
				$options
			);
			$this->output( " processing..." );

			if ( !$dryRun ) {
				$dbw->begin( __METHOD__ );
			}
			$lastRow = null;
			foreach ( $res as $row ) {
				$title = Title::newFromRow( $row );
				if ( !$row->cl_collation ) {
					# This is an old-style row, so the sortkey needs to be
					# converted.
					# Strict comparison: with == two different numeric-looking
					# strings (e.g. '1e3' and '1000') would compare equal and a
					# custom sortkey would wrongly be treated as the default.
					if ( $row->cl_sortkey === $title->getText()
						|| $row->cl_sortkey === $title->getPrefixedText()
					) {
						$prefix = '';
					} else {
						# Custom sortkey, use it as a prefix
						$prefix = $row->cl_sortkey;
					}
				} else {
					$prefix = $row->cl_sortkey_prefix;
				}
				# cl_type will be wrong for lots of pages if cl_collation is 0,
				# so let's update it while we're here.
				$namespace = $title->getNamespace();
				if ( $namespace == NS_CATEGORY ) {
					$type = 'subcat';
				} elseif ( $namespace == NS_FILE ) {
					$type = 'file';
				} else {
					$type = 'page';
				}
				$newSortKey = $collation->getSortKey(
					$title->getCategorySortkey( $prefix ) );
				if ( $verboseStats ) {
					$this->updateSortKeySizeHistogram( $newSortKey );
				}

				if ( !$dryRun ) {
					$dbw->update(
						'categorylinks',
						array(
							'cl_sortkey' => $newSortKey,
							'cl_sortkey_prefix' => $prefix,
							'cl_collation' => $collationName,
							'cl_type' => $type,
							# NOTE(review): presumably keeps cl_timestamp from being
							# bumped automatically by the update - confirm against
							# the table definition.
							'cl_timestamp = cl_timestamp',
						),
						array( 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ),
						__METHOD__
					);
				}
				$lastRow = $row;
			}
			if ( $lastRow ) {
				# Only the last row of the batch determines where the next
				# batch starts, so build the condition once per batch rather
				# than once per row.
				$batchConds = array( $this->getBatchCondition( $lastRow, $dbw ) );
			}
			if ( !$dryRun ) {
				$dbw->commit( __METHOD__ );
			}

			$count += $res->numRows();
			$this->output( "$count done.\n" );

			if ( !$dryRun && ++$batchCount % self::SYNC_INTERVAL == 0 ) {
				$this->output( "Waiting for slaves ... " );
				wfWaitForSlaves();
				$this->output( "done\n" );
			}
		} while ( $res->numRows() == self::BATCH_SIZE );

		$this->output( "$count rows processed\n" );

		if ( $verboseStats ) {
			$this->output( "\n" );
			$this->showSortKeySizeHistogram();
		}
	}

	/**
	 * Return an SQL expression selecting rows which sort above the given row,
	 * assuming an ordering of cl_to, cl_type, cl_from.
	 *
	 * The result has the form
	 *   a > A OR (a = A AND b > B) OR (a = A AND b = B AND c > C)
	 *
	 * @param object $row Result row providing cl_to, cl_type and cl_from
	 * @param object $dbw Database handle, used for addQuotes()
	 * @return string
	 */
	public function getBatchCondition( $row, $dbw ) {
		$fields = array( 'cl_to', 'cl_type', 'cl_from' );
		$first = true;
		$cond = false;
		$prefix = false;
		foreach ( $fields as $field ) {
			$encValue = $dbw->addQuotes( $row->$field );
			$inequality = "$field > $encValue";
			$equality = "$field = $encValue";
			if ( $first ) {
				$cond = $inequality;
				$prefix = $equality;
				$first = false;
			} else {
				# $prefix pins all previous fields to the row's values
				$cond .= " OR ($prefix AND $inequality)";
				$prefix .= " AND $equality";
			}
		}

		return $cond;
	}

	/**
	 * Record the byte length of one sort key in the size histogram.
	 *
	 * @param string $key Sort key
	 */
	public function updateSortKeySizeHistogram( $key ) {
		$length = strlen( $key );
		if ( !isset( $this->sizeHistogram[$length] ) ) {
			$this->sizeHistogram[$length] = 0;
		}
		$this->sizeHistogram[$length]++;
	}

	/**
	 * Output the raw sort key size histogram followed by a coarse, 20-bin
	 * bar chart of it. Outputs nothing if no keys were recorded or if all
	 * recorded keys had length zero.
	 */
	public function showSortKeySizeHistogram() {
		if ( !$this->sizeHistogram ) {
			// No keys recorded (e.g. no rows were processed); max() must not
			// be called on an empty array.
			return;
		}
		$maxLength = max( array_keys( $this->sizeHistogram ) );
		if ( $maxLength == 0 ) {
			return;
		}
		$numBins = 20;
		$coarseHistogram = array_fill( 0, $numBins, 0 );
		$coarseBoundaries = array();
		$boundary = 0;
		for ( $i = 0; $i < $numBins - 1; $i++ ) {
			$boundary += $maxLength / $numBins;
			$coarseBoundaries[$i] = round( $boundary );
		}
		// The last bin is open-ended enough to contain the maximum length
		$coarseBoundaries[$numBins - 1] = $maxLength + 1;
		$raw = '';
		for ( $i = 0; $i <= $maxLength; $i++ ) {
			if ( $raw !== '' ) {
				$raw .= ', ';
			}
			if ( !isset( $this->sizeHistogram[$i] ) ) {
				$val = 0;
			} else {
				$val = $this->sizeHistogram[$i];
			}
			// Add the count to the first bin whose upper boundary exceeds $i
			for ( $coarseIndex = 0; $coarseIndex < $numBins - 1; $coarseIndex++ ) {
				if ( $coarseBoundaries[$coarseIndex] > $i ) {
					$coarseHistogram[$coarseIndex] += $val;
					break;
				}
			}
			// No earlier bin matched: the value belongs to the last bin
			if ( $coarseIndex == $numBins - 1 ) {
				$coarseHistogram[$coarseIndex] += $val;
			}
			$raw .= $val;
		}

		$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

		$maxBinVal = max( $coarseHistogram );
		// Scale the bars so that the fullest bin is 60 characters wide
		$scale = 60 / $maxBinVal;
		$prevBoundary = 0;
		for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
			if ( !isset( $coarseHistogram[$coarseIndex] ) ) {
				$val = 0;
			} else {
				$val = $coarseHistogram[$coarseIndex];
			}
			$boundary = $coarseBoundaries[$coarseIndex];
			$this->output( sprintf( "%-10s %-10d |%s\n",
				$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
				$val,
				// str_repeat() needs an integer count; truncate explicitly
				str_repeat( '*', (int)( $scale * $val ) ) ) );
			$prevBoundary = $boundary;
		}
	}
}

$maintClass = "UpdateCollation";
require_once RUN_MAINTENANCE_IF_MAIN;