MediaWiki
REL1_20
|
<?php
/**
 * Maintenance script: finds rows in the categorylinks table whose collation
 * is out of date and repopulates cl_sortkey from the page title and
 * cl_sortkey_prefix.
 */

#$optionsWithArgs = array( 'begin', 'max-slave-lag' );

require_once( __DIR__ . '/Maintenance.php' );

/**
 * Recomputes category sort keys in batches of BATCH_SIZE rows, optionally
 * collecting a histogram of the generated sort key sizes.
 */
class UpdateCollation extends Maintenance {
	const BATCH_SIZE = 50; // Number of rows to process in one batch
	const SYNC_INTERVAL = 20; // Wait for slaves after this many batches

	/** Map of sort key length in bytes (strlen) => number of keys seen with that length */
	public $sizeHistogram = array();

	/**
	 * Sets the script description and registers the command line options.
	 */
	public function __construct() {
		parent::__construct();

		global $wgCategoryCollation;
		$this->mDescription = <<<TEXT
This script will find all rows in the categorylinks table whose collation is
out-of-date (cl_collation != '$wgCategoryCollation') and repopulate cl_sortkey
using the page title and cl_sortkey_prefix. If everything's collation is
up-to-date, it will do nothing.
TEXT;

		$this->addOption( 'force', 'Run on all rows, even if the collation is ' .
			'supposed to be up-to-date.' );
		$this->addOption( 'previous-collation', 'Set the previous value of ' .
			'$wgCategoryCollation here to speed up this script, especially if your ' .
			'categorylinks table is large. This will only update rows with that ' .
			'collation, though, so it may miss out-of-date rows with a different, ' .
			'even older collation.', false, true );
		$this->addOption( 'target-collation', 'Set this to the new collation type to ' .
			'use instead of $wgCategoryCollation. Usually you should not use this, ' .
			'you should just update $wgCategoryCollation in LocalSettings.php.',
			false, true );
		$this->addOption( 'dry-run', 'Don\'t actually change the collations, just ' .
			'compile statistics.' );
		$this->addOption( 'verbose-stats', 'Show more statistics.' );
	}

	/**
	 * Selects categorylinks rows batch by batch, recomputes their sort key
	 * with the target collation and, unless --dry-run is given, writes the
	 * new sort key, prefix, collation name and cl_type back to the table.
	 */
	public function execute() {
		global $wgCategoryCollation, $wgMiserMode;

		$dbw = $this->getDB( DB_MASTER );
		$force = $this->getOption( 'force' );
		$dryRun = $this->getOption( 'dry-run' );
		$verboseStats = $this->getOption( 'verbose-stats' );
		if ( $this->hasOption( 'target-collation' ) ) {
			$collationName = $this->getOption( 'target-collation' );
			$collation = Collation::factory( $collationName );
		} else {
			$collationName = $wgCategoryCollation;
			$collation = Collation::singleton();
		}

		$options = array( 'LIMIT' => self::BATCH_SIZE, 'STRAIGHT_JOIN' );

		if ( $force || $dryRun ) {
			// Every row is visited, so paging needs a stable order; see the
			// $batchConds computation at the end of each batch.
			$options['ORDER BY'] = 'cl_from, cl_to';
			$collationConds = array();
		} else {
			if ( $this->hasOption( 'previous-collation' ) ) {
				$collationConds = array(
					'cl_collation' => $this->getOption( 'previous-collation' )
				);
			} else {
				$collationConds = array( 0 =>
					'cl_collation != ' . $dbw->addQuotes( $collationName )
				);
			}

			if ( !$wgMiserMode ) {
				$count = $dbw->selectField(
					'categorylinks',
					'COUNT(*)',
					$collationConds,
					__METHOD__
				);
			} else {
				$count = $dbw->estimateRowCount(
					'categorylinks',
					'*',
					$collationConds,
					__METHOD__
				);
			}
			if ( $count == 0 ) {
				$this->output( "Collations up-to-date.\n" );
				return;
			}
			$this->output( "Fixing collation for $count rows.\n" );
		}

		$count = 0;
		$batchCount = 0;
		$batchConds = array();
		do {
			$this->output( "Selecting next " . self::BATCH_SIZE . " rows..." );
			$res = $dbw->select(
				array( 'categorylinks', 'page' ),
				array( 'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
					'cl_sortkey', 'page_namespace', 'page_title'
				),
				array_merge( $collationConds, $batchConds, array( 'cl_from = page_id' ) ),
				__METHOD__,
				$options
			);
			$this->output( " processing..." );

			if ( !$dryRun ) {
				$dbw->begin( __METHOD__ );
			}
			// Reset so that an empty result set leaves a defined, falsy $row
			// for the paging check below instead of an undefined variable.
			$row = false;
			foreach ( $res as $row ) {
				$title = Title::newFromRow( $row );
				if ( !$row->cl_collation ) {
					# This is an old-style row, so the sortkey needs to be
					# converted.
					if ( $row->cl_sortkey == $title->getText()
						|| $row->cl_sortkey == $title->getPrefixedText() ) {
						$prefix = '';
					} else {
						# Custom sortkey, use it as a prefix
						$prefix = $row->cl_sortkey;
					}
				} else {
					$prefix = $row->cl_sortkey_prefix;
				}
				# cl_type will be wrong for lots of pages if cl_collation is 0,
				# so let's update it while we're here.
				if ( $title->getNamespace() == NS_CATEGORY ) {
					$type = 'subcat';
				} elseif ( $title->getNamespace() == NS_FILE ) {
					$type = 'file';
				} else {
					$type = 'page';
				}
				$newSortKey = $collation->getSortKey(
					$title->getCategorySortkey( $prefix ) );
				if ( $verboseStats ) {
					$this->updateSortKeySizeHistogram( $newSortKey );
				}

				if ( !$dryRun ) {
					$dbw->update(
						'categorylinks',
						array(
							'cl_sortkey' => $newSortKey,
							'cl_sortkey_prefix' => $prefix,
							'cl_collation' => $collationName,
							'cl_type' => $type,
							// NOTE(review): presumably keeps the column from
							// being auto-bumped by the UPDATE - confirm
							// against the table schema.
							'cl_timestamp = cl_timestamp',
						),
						array( 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ),
						__METHOD__
					);
				}
			}
			if ( !$dryRun ) {
				$dbw->commit( __METHOD__ );
			}

			if ( ( $force || $dryRun ) && $row ) {
				// Continue after the last (cl_from, cl_to) pair of this batch.
				$encFrom = $dbw->addQuotes( $row->cl_from );
				$encTo = $dbw->addQuotes( $row->cl_to );
				$batchConds = array(
					"(cl_from = $encFrom AND cl_to > $encTo) " .
					" OR cl_from > $encFrom" );
			}

			$count += $res->numRows();
			$this->output( "$count done.\n" );

			if ( !$dryRun && ++$batchCount % self::SYNC_INTERVAL == 0 ) {
				$this->output( "Waiting for slaves ... " );
				wfWaitForSlaves();
				$this->output( "done\n" );
			}
		} while ( $res->numRows() == self::BATCH_SIZE );

		$this->output( "$count rows processed\n" );

		if ( $verboseStats ) {
			$this->output( "\n" );
			$this->showSortKeySizeHistogram();
		}
	}

	/**
	 * Records one sort key in the size histogram.
	 *
	 * @param $key string Sort key whose byte length (strlen) is counted
	 */
	public function updateSortKeySizeHistogram( $key ) {
		$length = strlen( $key );
		if ( !isset( $this->sizeHistogram[$length] ) ) {
			$this->sizeHistogram[$length] = 0;
		}
		$this->sizeHistogram[$length]++;
	}

	/**
	 * Outputs the collected sort key sizes: first the raw per-length counts,
	 * then a 20-bin bar chart scaled so the fullest bin is 60 characters wide.
	 * Outputs nothing if no key was recorded or all keys had length 0.
	 */
	public function showSortKeySizeHistogram() {
		if ( !$this->sizeHistogram ) {
			// max() on an empty array warns (and throws on PHP 8).
			return;
		}
		$maxLength = max( array_keys( $this->sizeHistogram ) );
		if ( $maxLength == 0 ) {
			return;
		}
		$numBins = 20;
		$coarseHistogram = array_fill( 0, $numBins, 0 );
		$coarseBoundaries = array();
		$boundary = 0;
		for ( $i = 0; $i < $numBins - 1; $i++ ) {
			$boundary += $maxLength / $numBins;
			$coarseBoundaries[$i] = round( $boundary );
		}
		// The last bin is closed explicitly so that $maxLength itself fits in.
		$coarseBoundaries[$numBins - 1] = $maxLength + 1;
		$raw = '';
		for ( $i = 0; $i <= $maxLength; $i++ ) {
			if ( $raw !== '' ) {
				$raw .= ', ';
			}
			if ( !isset( $this->sizeHistogram[$i] ) ) {
				$val = 0;
			} else {
				$val = $this->sizeHistogram[$i];
			}
			// Add to the first bin whose upper boundary lies above $i ...
			for ( $coarseIndex = 0; $coarseIndex < $numBins - 1; $coarseIndex++ ) {
				if ( $coarseBoundaries[$coarseIndex] > $i ) {
					$coarseHistogram[$coarseIndex] += $val;
					break;
				}
			}
			// ... or to the last bin if the loop ran to completion without a match.
			if ( $coarseIndex == $numBins - 1 ) {
				$coarseHistogram[$coarseIndex] += $val;
			}
			$raw .= $val;
		}

		$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

		$maxBinVal = max( $coarseHistogram );
		$scale = 60 / $maxBinVal;
		$prevBoundary = 0;
		for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
			if ( !isset( $coarseHistogram[$coarseIndex] ) ) {
				$val = 0;
			} else {
				$val = $coarseHistogram[$coarseIndex];
			}
			$boundary = $coarseBoundaries[$coarseIndex];
			$this->output( sprintf( "%-10s %-10d |%s\n",
				$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
				$val,
				// str_repeat() wants an integer count; truncate explicitly.
				str_repeat( '*', (int)( $scale * $val ) ) ) );
			$prevBoundary = $boundary;
		}
	}
}

$maintClass = "UpdateCollation";
require_once( RUN_MAINTENANCE_IF_MAIN );