[ Index ] |
PHP Cross Reference of MediaWiki-1.24.0 |
[Summary view] [Print] [Text view]
<?php
/**
 * Find all rows in the categorylinks table whose collation is out-of-date
 * (cl_collation != $wgCategoryCollation) and repopulate cl_sortkey
 * using the page title and cl_sortkey_prefix.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Maintenance
 * @author Aryeh Gregor (Simetrical)
 */

require_once __DIR__ . '/Maintenance.php';

/**
 * Maintenance script that will find all rows in the categorylinks table
 * whose collation is out-of-date.
 *
 * @ingroup Maintenance
 */
class UpdateCollation extends Maintenance {
	const BATCH_SIZE = 10000; // Number of rows to process in one batch
	const SYNC_INTERVAL = 20; // Wait for slaves after this many batches

	/**
	 * Sort key size statistics, filled by updateSortKeySizeHistogram():
	 * maps a sort key length (as returned by strlen()) to the number of
	 * sort keys seen with that length.
	 *
	 * @var array
	 */
	public $sizeHistogram = array();

	/**
	 * Set the script description and register its command line options.
	 */
	public function __construct() {
		parent::__construct();

		global $wgCategoryCollation;
		$this->mDescription = <<<TEXT
This script will find all rows in the categorylinks table whose collation is
out-of-date (cl_collation != '$wgCategoryCollation') and repopulate cl_sortkey
using the page title and cl_sortkey_prefix. If all collations are
up-to-date, it will do nothing.
TEXT;

		$this->addOption( 'force', 'Run on all rows, even if the collation is ' .
			'supposed to be up-to-date.' );
		$this->addOption( 'previous-collation', 'Set the previous value of ' .
			'$wgCategoryCollation here to speed up this script, especially if your ' .
			'categorylinks table is large. This will only update rows with that ' .
			'collation, though, so it may miss out-of-date rows with a different, ' .
			'even older collation.', false, true );
		$this->addOption( 'target-collation', 'Set this to the new collation type to ' .
			'use instead of $wgCategoryCollation. Usually you should not use this, ' .
			'you should just update $wgCategoryCollation in LocalSettings.php.',
			false, true );
		$this->addOption( 'dry-run', 'Don\'t actually change the collations, just ' .
			'compile statistics.' );
		$this->addOption( 'verbose-stats', 'Show more statistics.' );
	}

	/**
	 * Walk the categorylinks table in batches of BATCH_SIZE rows (ordered by
	 * cl_to, cl_type, cl_from), recompute the sort key of every selected row
	 * with the target collation and, unless --dry-run is given, write the new
	 * cl_sortkey, cl_sortkey_prefix, cl_collation and cl_type back.
	 *
	 * With --force or --dry-run every row is visited; otherwise only rows
	 * matching --previous-collation (or, without it, rows whose cl_collation
	 * differs from the target collation) are visited.
	 */
	public function execute() {
		global $wgCategoryCollation;

		$dbw = $this->getDB( DB_MASTER );
		$force = $this->getOption( 'force' );
		$dryRun = $this->getOption( 'dry-run' );
		$verboseStats = $this->getOption( 'verbose-stats' );
		if ( $this->hasOption( 'target-collation' ) ) {
			$collationName = $this->getOption( 'target-collation' );
			$collation = Collation::factory( $collationName );
		} else {
			$collationName = $wgCategoryCollation;
			$collation = Collation::singleton();
		}

		// Collation sanity check: in some cases the constructor will work,
		// but this will raise an exception, breaking all category pages
		$collation->getFirstLetter( 'MediaWiki' );

		$options = array(
			'LIMIT' => self::BATCH_SIZE,
			'ORDER BY' => 'cl_to, cl_type, cl_from',
			'STRAIGHT_JOIN',
		);

		if ( $force || $dryRun ) {
			$collationConds = array();
		} else {
			if ( $this->hasOption( 'previous-collation' ) ) {
				$collationConds = array(
					'cl_collation' => $this->getOption( 'previous-collation' ),
				);
			} else {
				$collationConds = array( 0 =>
					'cl_collation != ' . $dbw->addQuotes( $collationName )
				);
			}

			$count = $dbw->estimateRowCount(
				'categorylinks',
				'*',
				$collationConds,
				__METHOD__
			);
			// Improve estimate if feasible
			if ( $count < 1000000 ) {
				$count = $dbw->selectField(
					'categorylinks',
					'COUNT(*)',
					$collationConds,
					__METHOD__
				);
			}
			if ( $count == 0 ) {
				$this->output( "Collations up-to-date.\n" );

				return;
			}
			$this->output( "Fixing collation for $count rows.\n" );
		}

		$count = 0;
		$batchCount = 0;
		$batchConds = array();
		do {
			$this->output( "Selecting next " . self::BATCH_SIZE . " rows..." );
			$res = $dbw->select(
				array( 'categorylinks', 'page' ),
				array( 'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
					'cl_sortkey', 'cl_type', 'page_namespace', 'page_title'
				),
				array_merge( $collationConds, $batchConds, array( 'cl_from = page_id' ) ),
				__METHOD__,
				$options
			);
			$this->output( " processing..." );

			if ( !$dryRun ) {
				$dbw->begin( __METHOD__ );
			}
			$lastRow = null;
			foreach ( $res as $row ) {
				$title = Title::newFromRow( $row );
				if ( !$row->cl_collation ) {
					# This is an old-style row, so the sortkey needs to be
					# converted.
					if ( $row->cl_sortkey == $title->getText()
						|| $row->cl_sortkey == $title->getPrefixedText()
					) {
						$prefix = '';
					} else {
						# Custom sortkey, use it as a prefix
						$prefix = $row->cl_sortkey;
					}
				} else {
					$prefix = $row->cl_sortkey_prefix;
				}
				# cl_type will be wrong for lots of pages if cl_collation is 0,
				# so let's update it while we're here.
				if ( $title->getNamespace() == NS_CATEGORY ) {
					$type = 'subcat';
				} elseif ( $title->getNamespace() == NS_FILE ) {
					$type = 'file';
				} else {
					$type = 'page';
				}
				$newSortKey = $collation->getSortKey(
					$title->getCategorySortkey( $prefix ) );
				if ( $verboseStats ) {
					$this->updateSortKeySizeHistogram( $newSortKey );
				}

				if ( !$dryRun ) {
					$dbw->update(
						'categorylinks',
						array(
							'cl_sortkey' => $newSortKey,
							'cl_sortkey_prefix' => $prefix,
							'cl_collation' => $collationName,
							'cl_type' => $type,
							# NOTE(review): presumably keeps the database from
							# auto-bumping cl_timestamp on update -- confirm
							# against the table definition.
							'cl_timestamp = cl_timestamp',
						),
						array( 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ),
						__METHOD__
					);
				}
				$lastRow = $row;
			}
			if ( $lastRow !== null ) {
				# Only the last row of a batch determines where the next
				# batch starts, so build the condition once per batch.
				$batchConds = array( $this->getBatchCondition( $lastRow, $dbw ) );
			}
			if ( !$dryRun ) {
				$dbw->commit( __METHOD__ );
			}

			$count += $res->numRows();
			$this->output( "$count done.\n" );

			if ( !$dryRun && ++$batchCount % self::SYNC_INTERVAL == 0 ) {
				$this->output( "Waiting for slaves ... " );
				wfWaitForSlaves();
				$this->output( "done\n" );
			}
			# A short batch means the result set is exhausted
		} while ( $res->numRows() == self::BATCH_SIZE );

		$this->output( "$count rows processed\n" );

		if ( $verboseStats ) {
			$this->output( "\n" );
			$this->showSortKeySizeHistogram();
		}
	}

	/**
	 * Return an SQL expression selecting rows which sort above the given row,
	 * assuming an ordering of cl_to, cl_type, cl_from
	 *
	 * The result has the form
	 *   cl_to > A OR (cl_to = A AND cl_type > B)
	 *     OR (cl_to = A AND cl_type = B AND cl_from > C)
	 * with the values taken from $row and quoted through $dbw->addQuotes().
	 *
	 * @param stdClass $row
	 * @param DatabaseBase $dbw
	 * @return string
	 */
	public function getBatchCondition( $row, $dbw ) {
		$fields = array( 'cl_to', 'cl_type', 'cl_from' );
		$first = true;
		$cond = false;
		$prefix = false;
		foreach ( $fields as $field ) {
			$encValue = $dbw->addQuotes( $row->$field );
			$inequality = "$field > $encValue";
			$equality = "$field = $encValue";
			if ( $first ) {
				$cond = $inequality;
				$prefix = $equality;
				$first = false;
			} else {
				$cond .= " OR ($prefix AND $inequality)";
				$prefix .= " AND $equality";
			}
		}

		return $cond;
	}

	/**
	 * Record one sort key in the size histogram, keyed by its strlen().
	 *
	 * @param string $key Sort key whose length is counted
	 */
	public function updateSortKeySizeHistogram( $key ) {
		$length = strlen( $key );
		if ( !isset( $this->sizeHistogram[$length] ) ) {
			$this->sizeHistogram[$length] = 0;
		}
		$this->sizeHistogram[$length]++;
	}

	/**
	 * Output the collected sort key size statistics: first the raw count for
	 * every length from 0 to the maximum seen, then a 20-bin bar chart scaled
	 * so that the fullest bin is 60 characters wide.
	 *
	 * Outputs nothing if no sort key was recorded or if the longest recorded
	 * sort key has length 0.
	 */
	public function showSortKeySizeHistogram() {
		if ( !$this->sizeHistogram ) {
			# No sort key was recorded (e.g. no rows were processed);
			# max() must not be called on an empty array.
			return;
		}
		$maxLength = max( array_keys( $this->sizeHistogram ) );
		if ( $maxLength == 0 ) {
			return;
		}
		$numBins = 20;
		$coarseHistogram = array_fill( 0, $numBins, 0 );
		$coarseBoundaries = array();
		$boundary = 0;
		for ( $i = 0; $i < $numBins - 1; $i++ ) {
			$boundary += $maxLength / $numBins;
			$coarseBoundaries[$i] = round( $boundary );
		}
		$coarseBoundaries[$numBins - 1] = $maxLength + 1;
		$raw = '';
		for ( $i = 0; $i <= $maxLength; $i++ ) {
			if ( $raw !== '' ) {
				$raw .= ', ';
			}
			if ( !isset( $this->sizeHistogram[$i] ) ) {
				$val = 0;
			} else {
				$val = $this->sizeHistogram[$i];
			}
			# Add the count to the first bin whose upper boundary lies above
			# this length...
			for ( $coarseIndex = 0; $coarseIndex < $numBins - 1; $coarseIndex++ ) {
				if ( $coarseBoundaries[$coarseIndex] > $i ) {
					$coarseHistogram[$coarseIndex] += $val;
					break;
				}
			}
			# ...or to the last bin if the loop above ran to completion
			if ( $coarseIndex == $numBins - 1 ) {
				$coarseHistogram[$coarseIndex] += $val;
			}
			$raw .= $val;
		}

		$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

		$maxBinVal = max( $coarseHistogram );
		$scale = 60 / $maxBinVal;
		$prevBoundary = 0;
		for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
			$val = $coarseHistogram[$coarseIndex];
			$boundary = $coarseBoundaries[$coarseIndex];
			$this->output( sprintf( "%-10s %-10d |%s\n",
				$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
				$val,
				# str_repeat() takes an integer count; truncate explicitly
				str_repeat( '*', (int)( $scale * $val ) ) ) );
			$prevBoundary = $boundary;
		}
	}
}

$maintClass = "UpdateCollation";
require_once RUN_MAINTENANCE_IF_MAIN;
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Fri Nov 28 14:03:12 2014 | Cross-referenced by PHPXref 0.7.1 |