MediaWiki
REL1_21
|
<?php
#$optionsWithArgs = array( 'begin', 'max-slave-lag' );

require_once( __DIR__ . '/Maintenance.php' );

/**
 * Maintenance script that finds rows in the categorylinks table whose
 * collation is out-of-date and repopulates cl_sortkey (along with
 * cl_sortkey_prefix, cl_collation and cl_type) from the page title and the
 * sortkey prefix.
 */
class UpdateCollation extends Maintenance {
	const BATCH_SIZE = 10000; // Number of rows to process in one batch
	const SYNC_INTERVAL = 20; // Wait for slaves after this many batches

	/**
	 * Map of sort key length in bytes (as reported by strlen()) to the number
	 * of generated sort keys of that length. Only filled when --verbose-stats
	 * is given.
	 */
	public $sizeHistogram = array();

	/**
	 * Registers the script description and its command line options.
	 */
	public function __construct() {
		parent::__construct();

		global $wgCategoryCollation;
		$this->mDescription = <<<TEXT
This script will find all rows in the categorylinks table whose collation is
out-of-date (cl_collation != '$wgCategoryCollation') and repopulate cl_sortkey
using the page title and cl_sortkey_prefix. If everything's collation is
up-to-date, it will do nothing.
TEXT;

		$this->addOption( 'force', 'Run on all rows, even if the collation is ' .
			'supposed to be up-to-date.' );
		$this->addOption( 'previous-collation', 'Set the previous value of ' .
			'$wgCategoryCollation here to speed up this script, especially if your ' .
			'categorylinks table is large. This will only update rows with that ' .
			'collation, though, so it may miss out-of-date rows with a different, ' .
			'even older collation.', false, true );
		$this->addOption( 'target-collation', 'Set this to the new collation type to ' .
			'use instead of $wgCategoryCollation. Usually you should not use this, ' .
			'you should just update $wgCategoryCollation in LocalSettings.php.',
			false, true );
		$this->addOption( 'dry-run', 'Don\'t actually change the collations, just ' .
			'compile statistics.' );
		$this->addOption( 'verbose-stats', 'Show more statistics.' );
	}

	/**
	 * Walks the categorylinks table in batches of BATCH_SIZE rows, ordered by
	 * (cl_to, cl_type, cl_from), recomputes the sort key of every selected row
	 * and, unless --dry-run is given, writes it back. Each batch is written in
	 * its own transaction, and every SYNC_INTERVAL batches the script waits
	 * for the slaves.
	 */
	public function execute() {
		global $wgCategoryCollation;

		$dbw = $this->getDB( DB_MASTER );
		$force = $this->getOption( 'force' );
		$dryRun = $this->getOption( 'dry-run' );
		$verboseStats = $this->getOption( 'verbose-stats' );
		if ( $this->hasOption( 'target-collation' ) ) {
			$collationName = $this->getOption( 'target-collation' );
			$collation = Collation::factory( $collationName );
		} else {
			$collationName = $wgCategoryCollation;
			$collation = Collation::singleton();
		}

		$options = array(
			'LIMIT' => self::BATCH_SIZE,
			'ORDER BY' => 'cl_to, cl_type, cl_from',
			'STRAIGHT_JOIN',
		);

		if ( $force || $dryRun ) {
			// Every row is visited: no collation filter and no up-front count.
			$collationConds = array();
		} else {
			if ( $this->hasOption( 'previous-collation' ) ) {
				$collationConds = array(
					'cl_collation' => $this->getOption( 'previous-collation' )
				);
			} else {
				$collationConds = array( 0 =>
					'cl_collation != ' . $dbw->addQuotes( $collationName )
				);
			}

			$count = $dbw->estimateRowCount(
				'categorylinks',
				'*',
				$collationConds,
				__METHOD__
			);
			// Improve estimate if feasible
			if ( $count < 1000000 ) {
				$count = $dbw->selectField(
					'categorylinks',
					'COUNT(*)',
					$collationConds,
					__METHOD__
				);
			}
			if ( $count == 0 ) {
				$this->output( "Collations up-to-date.\n" );
				return;
			}
			$this->output( "Fixing collation for $count rows.\n" );
		}

		$count = 0;
		$batchCount = 0;
		$batchConds = array();
		do {
			$this->output( "Selecting next " . self::BATCH_SIZE . " rows..." );
			$res = $dbw->select(
				array( 'categorylinks', 'page' ),
				array( 'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
					'cl_sortkey', 'cl_type', 'page_namespace', 'page_title'
				),
				array_merge( $collationConds, $batchConds, array( 'cl_from = page_id' ) ),
				__METHOD__,
				$options
			);
			$this->output( " processing..." );

			if ( !$dryRun ) {
				$dbw->begin( __METHOD__ );
			}
			// Remember the last row of this batch explicitly. Relying on the
			// foreach variable would read an undefined variable when the very
			// first batch comes back empty.
			$lastRow = null;
			foreach ( $res as $row ) {
				$lastRow = $row;
				$title = Title::newFromRow( $row );
				if ( !$row->cl_collation ) {
					# This is an old-style row, so the sortkey needs to be
					# converted.
					if ( $row->cl_sortkey == $title->getText()
						|| $row->cl_sortkey == $title->getPrefixedText() ) {
						$prefix = '';
					} else {
						# Custom sortkey, use it as a prefix
						$prefix = $row->cl_sortkey;
					}
				} else {
					$prefix = $row->cl_sortkey_prefix;
				}
				# cl_type will be wrong for lots of pages if cl_collation is 0,
				# so let's update it while we're here.
				if ( $title->getNamespace() == NS_CATEGORY ) {
					$type = 'subcat';
				} elseif ( $title->getNamespace() == NS_FILE ) {
					$type = 'file';
				} else {
					$type = 'page';
				}
				$newSortKey = $collation->getSortKey(
					$title->getCategorySortkey( $prefix ) );
				if ( $verboseStats ) {
					$this->updateSortKeySizeHistogram( $newSortKey );
				}

				if ( !$dryRun ) {
					$dbw->update(
						'categorylinks',
						array(
							'cl_sortkey' => $newSortKey,
							'cl_sortkey_prefix' => $prefix,
							'cl_collation' => $collationName,
							'cl_type' => $type,
							// NOTE(review): presumably this keeps the timestamp
							// column from being bumped by the update - confirm
							// against the table schema.
							'cl_timestamp = cl_timestamp',
						),
						array( 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ),
						__METHOD__
					);
				}
			}
			if ( !$dryRun ) {
				$dbw->commit( __METHOD__ );
			}

			if ( $lastRow !== null ) {
				// Continue after the last row seen in the next batch.
				$batchConds = array( $this->getBatchCondition( $lastRow ) );
			}

			$count += $res->numRows();
			$this->output( "$count done.\n" );

			if ( !$dryRun && ++$batchCount % self::SYNC_INTERVAL == 0 ) {
				$this->output( "Waiting for slaves ... " );
				wfWaitForSlaves();
				$this->output( "done\n" );
			}
			// A short batch means there is nothing left to select.
		} while ( $res->numRows() == self::BATCH_SIZE );

		$this->output( "$count rows processed\n" );

		if ( $verboseStats ) {
			$this->output( "\n" );
			$this->showSortKeySizeHistogram();
		}
	}

	/**
	 * Builds an SQL condition matching the rows that come strictly after the
	 * given row in (cl_to, cl_type, cl_from) order, i.e.
	 *   cl_to > a OR (cl_to = a AND cl_type > b)
	 *     OR (cl_to = a AND cl_type = b AND cl_from > c)
	 *
	 * NOTE(review): cl_type is compared as a quoted string here; if the
	 * backend stores it as an enumerated type, that comparison may not agree
	 * with the ORDER BY used in execute() - verify.
	 *
	 * @param $row object Row with cl_to, cl_type and cl_from properties
	 * @return string SQL fragment (not wrapped in parentheses)
	 */
	public function getBatchCondition( $row ) {
		$dbw = $this->getDB( DB_MASTER );
		$fields = array( 'cl_to', 'cl_type', 'cl_from' );
		$first = true;
		$cond = false;
		$prefix = false;
		foreach ( $fields as $field ) {
			$encValue = $dbw->addQuotes( $row->$field );
			$inequality = "$field > $encValue";
			$equality = "$field = $encValue";
			if ( $first ) {
				$cond = $inequality;
				$prefix = $equality;
				$first = false;
			} else {
				// $prefix pins all earlier fields to the row's values.
				$cond .= " OR ($prefix AND $inequality)";
				$prefix .= " AND $equality";
			}
		}
		return $cond;
	}

	/**
	 * Records the byte length of one sort key in $this->sizeHistogram.
	 *
	 * @param $key string Sort key
	 */
	public function updateSortKeySizeHistogram( $key ) {
		$length = strlen( $key );
		if ( !isset( $this->sizeHistogram[$length] ) ) {
			$this->sizeHistogram[$length] = 0;
		}
		$this->sizeHistogram[$length]++;
	}

	/**
	 * Outputs the collected sort key lengths: first the raw per-length counts,
	 * then a 20-bin text histogram scaled so that the fullest bin is at most
	 * 60 asterisks wide. Outputs nothing if no sort key was recorded or if all
	 * recorded keys have length 0.
	 */
	public function showSortKeySizeHistogram() {
		if ( !$this->sizeHistogram ) {
			// Nothing was recorded; max() must not be called on an empty array.
			return;
		}
		$maxLength = max( array_keys( $this->sizeHistogram ) );
		if ( $maxLength == 0 ) {
			return;
		}
		$numBins = 20;
		$coarseHistogram = array_fill( 0, $numBins, 0 );
		$coarseBoundaries = array();
		$boundary = 0;
		for ( $i = 0; $i < $numBins - 1; $i++ ) {
			$boundary += $maxLength / $numBins;
			$coarseBoundaries[$i] = round( $boundary );
		}
		// The last bin is open-ended so that $maxLength itself falls into it.
		$coarseBoundaries[$numBins - 1] = $maxLength + 1;
		$raw = '';
		for ( $i = 0; $i <= $maxLength; $i++ ) {
			if ( $raw !== '' ) {
				$raw .= ', ';
			}
			if ( !isset( $this->sizeHistogram[$i] ) ) {
				$val = 0;
			} else {
				$val = $this->sizeHistogram[$i];
			}
			// Add to the first bin whose upper boundary lies above $i ...
			for ( $coarseIndex = 0; $coarseIndex < $numBins - 1; $coarseIndex++ ) {
				if ( $coarseBoundaries[$coarseIndex] > $i ) {
					$coarseHistogram[$coarseIndex] += $val;
					break;
				}
			}
			// ... or to the last bin if the loop ran to completion.
			if ( $coarseIndex == $numBins - 1 ) {
				$coarseHistogram[$coarseIndex] += $val;
			}
			$raw .= $val;
		}

		$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

		$maxBinVal = max( $coarseHistogram );
		$scale = 60 / $maxBinVal;
		$prevBoundary = 0;
		for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
			if ( !isset( $coarseHistogram[$coarseIndex] ) ) {
				$val = 0;
			} else {
				$val = $coarseHistogram[$coarseIndex];
			}
			$boundary = $coarseBoundaries[$coarseIndex];
			// str_repeat() expects an integer count; truncate explicitly.
			$this->output( sprintf( "%-10s %-10d |%s\n",
				$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
				$val,
				str_repeat( '*', (int)( $scale * $val ) ) ) );
			$prevBoundary = $boundary;
		}
	}
}

$maintClass = "UpdateCollation";
require_once( RUN_MAINTENANCE_IF_MAIN );