MediaWiki
REL1_22
|
<?php
/**
 * Maintenance script that repopulates cl_sortkey in the categorylinks table
 * for rows whose collation is out of date.
 */

#$optionsWithArgs = array( 'begin', 'max-slave-lag' );

require_once __DIR__ . '/Maintenance.php';

/**
 * Finds categorylinks rows whose cl_collation differs from the target
 * collation and rewrites cl_sortkey, cl_sortkey_prefix, cl_collation and
 * cl_type for each of them, working in batches.
 */
class UpdateCollation extends Maintenance {
	const BATCH_SIZE = 10000; // Number of rows to process in one batch
	const SYNC_INTERVAL = 20; // Wait for slaves after this many batches

	/** Map of sort key byte length => number of sort keys seen with that length */
	public $sizeHistogram = array();

	/**
	 * Register the script description and its command line options.
	 */
	public function __construct() {
		parent::__construct();

		global $wgCategoryCollation;
		$this->mDescription = <<<TEXT
This script will find all rows in the categorylinks table whose collation is
out-of-date (cl_collation != '$wgCategoryCollation') and repopulate cl_sortkey
using the page title and cl_sortkey_prefix. If everything's collation is
up-to-date, it will do nothing.
TEXT;

		$this->addOption( 'force', 'Run on all rows, even if the collation is ' .
			'supposed to be up-to-date.' );
		$this->addOption( 'previous-collation', 'Set the previous value of ' .
			'$wgCategoryCollation here to speed up this script, especially if your ' .
			'categorylinks table is large. This will only update rows with that ' .
			'collation, though, so it may miss out-of-date rows with a different, ' .
			'even older collation.', false, true );
		$this->addOption( 'target-collation', 'Set this to the new collation type to ' .
			'use instead of $wgCategoryCollation. Usually you should not use this, ' .
			'you should just update $wgCategoryCollation in LocalSettings.php.',
			false, true );
		$this->addOption( 'dry-run', 'Don\'t actually change the collations, just ' .
			'compile statistics.' );
		$this->addOption( 'verbose-stats', 'Show more statistics.' );
	}

	/**
	 * Walk the categorylinks table in batches of BATCH_SIZE rows, ordered by
	 * (cl_to, cl_type, cl_from), and recompute the sort key of every selected row.
	 *
	 * With --force or --dry-run every row is visited; otherwise only rows
	 * matching the collation condition are. With --dry-run nothing is written.
	 */
	public function execute() {
		global $wgCategoryCollation;

		$dbw = $this->getDB( DB_MASTER );
		$force = $this->getOption( 'force' );
		$dryRun = $this->getOption( 'dry-run' );
		$verboseStats = $this->getOption( 'verbose-stats' );
		if ( $this->hasOption( 'target-collation' ) ) {
			$collationName = $this->getOption( 'target-collation' );
			$collation = Collation::factory( $collationName );
		} else {
			$collationName = $wgCategoryCollation;
			$collation = Collation::singleton();
		}

		// Collation sanity check: in some cases the constructor will work,
		// but this will raise an exception, breaking all category pages
		$collation->getFirstLetter( 'MediaWiki' );

		$options = array(
			'LIMIT' => self::BATCH_SIZE,
			'ORDER BY' => 'cl_to, cl_type, cl_from',
			'STRAIGHT_JOIN',
		);

		if ( $force || $dryRun ) {
			$collationConds = array();
		} else {
			if ( $this->hasOption( 'previous-collation' ) ) {
				$collationConds = array(
					'cl_collation' => $this->getOption( 'previous-collation' )
				);
			} else {
				$collationConds = array( 0 =>
					'cl_collation != ' . $dbw->addQuotes( $collationName )
				);
			}

			$count = $dbw->estimateRowCount(
				'categorylinks',
				'*',
				$collationConds,
				__METHOD__
			);
			// Improve estimate if feasible
			if ( $count < 1000000 ) {
				$count = $dbw->selectField(
					'categorylinks',
					'COUNT(*)',
					$collationConds,
					__METHOD__
				);
			}
			if ( $count == 0 ) {
				$this->output( "Collations up-to-date.\n" );
				return;
			}
			$this->output( "Fixing collation for $count rows.\n" );
		}

		$count = 0;
		$batchCount = 0;
		$batchConds = array();
		do {
			$this->output( "Selecting next " . self::BATCH_SIZE . " rows..." );
			$res = $dbw->select(
				array( 'categorylinks', 'page' ),
				array( 'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
					'cl_sortkey', 'cl_type', 'page_namespace', 'page_title'
				),
				array_merge( $collationConds, $batchConds, array( 'cl_from = page_id' ) ),
				__METHOD__,
				$options
			);
			$this->output( " processing..." );

			if ( !$dryRun ) {
				$dbw->begin( __METHOD__ );
			}

			// Track the last row explicitly so that an empty result set does not
			// leave us reading an undefined loop variable below.
			$lastRow = null;
			foreach ( $res as $row ) {
				$lastRow = $row;
				$title = Title::newFromRow( $row );
				if ( !$row->cl_collation ) {
					# This is an old-style row, so the sortkey needs to be
					# converted.
					if ( $row->cl_sortkey == $title->getText()
						|| $row->cl_sortkey == $title->getPrefixedText() ) {
						$prefix = '';
					} else {
						# Custom sortkey, use it as a prefix
						$prefix = $row->cl_sortkey;
					}
				} else {
					$prefix = $row->cl_sortkey_prefix;
				}
				# cl_type will be wrong for lots of pages if cl_collation is 0,
				# so let's update it while we're here.
				if ( $title->getNamespace() == NS_CATEGORY ) {
					$type = 'subcat';
				} elseif ( $title->getNamespace() == NS_FILE ) {
					$type = 'file';
				} else {
					$type = 'page';
				}
				$newSortKey = $collation->getSortKey(
					$title->getCategorySortkey( $prefix ) );
				if ( $verboseStats ) {
					$this->updateSortKeySizeHistogram( $newSortKey );
				}

				if ( !$dryRun ) {
					$dbw->update(
						'categorylinks',
						array(
							'cl_sortkey' => $newSortKey,
							'cl_sortkey_prefix' => $prefix,
							'cl_collation' => $collationName,
							'cl_type' => $type,
							// Assign the column to itself so the update leaves
							// cl_timestamp at its current value.
							'cl_timestamp = cl_timestamp',
						),
						array( 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ),
						__METHOD__
					);
				}
			}
			if ( !$dryRun ) {
				$dbw->commit( __METHOD__ );
			}

			// The next batch starts strictly after the last row of this one.
			if ( $lastRow ) {
				$batchConds = array( $this->getBatchCondition( $lastRow ) );
			}

			$count += $res->numRows();
			$this->output( "$count done.\n" );

			if ( !$dryRun && ++$batchCount % self::SYNC_INTERVAL == 0 ) {
				$this->output( "Waiting for slaves ... " );
				wfWaitForSlaves();
				$this->output( "done\n" );
			}
		// A short batch means the select ran out of rows.
		} while ( $res->numRows() == self::BATCH_SIZE );

		$this->output( "$count rows processed\n" );

		if ( $verboseStats ) {
			$this->output( "\n" );
			$this->showSortKeySizeHistogram();
		}
	}

	/**
	 * Return an SQL expression selecting rows which sort above the given row,
	 * assuming an ordering of cl_to, cl_type, cl_from.
	 *
	 * The result has the form
	 *   a > A OR (a = A AND b > B) OR (a = A AND b = B AND c > C)
	 *
	 * @param $row stdClass Row object providing cl_to, cl_type and cl_from
	 * @return string SQL condition with all values quoted via addQuotes()
	 */
	public function getBatchCondition( $row ) {
		$dbw = $this->getDB( DB_MASTER );
		$fields = array( 'cl_to', 'cl_type', 'cl_from' );
		$first = true;
		$cond = false;
		$prefix = false;
		foreach ( $fields as $field ) {
			$encValue = $dbw->addQuotes( $row->$field );
			$inequality = "$field > $encValue";
			$equality = "$field = $encValue";
			if ( $first ) {
				$cond = $inequality;
				$prefix = $equality;
				$first = false;
			} else {
				$cond .= " OR ($prefix AND $inequality)";
				$prefix .= " AND $equality";
			}
		}
		return $cond;
	}

	/**
	 * Record the byte length of one sort key in the size histogram.
	 *
	 * @param $key string Sort key
	 */
	public function updateSortKeySizeHistogram( $key ) {
		$length = strlen( $key );
		if ( !isset( $this->sizeHistogram[$length] ) ) {
			$this->sizeHistogram[$length] = 0;
		}
		$this->sizeHistogram[$length]++;
	}

	/**
	 * Output the raw per-length counts followed by a 20-bin bar chart of the
	 * sort key sizes collected by updateSortKeySizeHistogram().
	 *
	 * Outputs nothing when no sort keys were recorded or when the longest
	 * recorded key has length zero.
	 */
	public function showSortKeySizeHistogram() {
		// max() must not be called on an empty array: it warns on older PHP
		// versions and throws on PHP 8.
		if ( !$this->sizeHistogram ) {
			return;
		}
		$maxLength = max( array_keys( $this->sizeHistogram ) );
		if ( $maxLength == 0 ) {
			return;
		}
		$numBins = 20;
		$coarseHistogram = array_fill( 0, $numBins, 0 );
		$coarseBoundaries = array();
		$boundary = 0;
		for ( $i = 0; $i < $numBins - 1; $i++ ) {
			$boundary += $maxLength / $numBins;
			$coarseBoundaries[$i] = round( $boundary );
		}
		// The last bin is closed just past the longest key.
		$coarseBoundaries[$numBins - 1] = $maxLength + 1;
		$raw = '';
		for ( $i = 0; $i <= $maxLength; $i++ ) {
			if ( $raw !== '' ) {
				$raw .= ', ';
			}
			if ( !isset( $this->sizeHistogram[$i] ) ) {
				$val = 0;
			} else {
				$val = $this->sizeHistogram[$i];
			}
			// Add the count to the first bin whose upper boundary exceeds $i.
			for ( $coarseIndex = 0; $coarseIndex < $numBins - 1; $coarseIndex++ ) {
				if ( $coarseBoundaries[$coarseIndex] > $i ) {
					$coarseHistogram[$coarseIndex] += $val;
					break;
				}
			}
			// The loop ran to completion without a match: use the last bin.
			if ( $coarseIndex == $numBins - 1 ) {
				$coarseHistogram[$coarseIndex] += $val;
			}
			$raw .= $val;
		}

		$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

		$maxBinVal = max( $coarseHistogram );
		// Scale so that the fullest bin is drawn about 60 characters wide.
		$scale = 60 / $maxBinVal;
		$prevBoundary = 0;
		for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
			if ( !isset( $coarseHistogram[$coarseIndex] ) ) {
				$val = 0;
			} else {
				$val = $coarseHistogram[$coarseIndex];
			}
			$boundary = $coarseBoundaries[$coarseIndex];
			$this->output( sprintf( "%-10s %-10d |%s\n",
				$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
				$val,
				// str_repeat() expects an integer repeat count.
				str_repeat( '*', (int)( $scale * $val ) ) ) );
			$prevBoundary = $boundary;
		}
	}
}

$maintClass = "UpdateCollation";
require_once RUN_MAINTENANCE_IF_MAIN;