[ Index ] |
PHP Cross Reference of MediaWiki-1.24.0 |
[Summary view] [Print] [Text view]
<?php
/**
 * Compress the text of a wiki.
 *
 * Usage:
 *
 * Non-wikimedia
 * php compressOld.php [options...]
 *
 * Wikimedia
 * php compressOld.php <database> [options...]
 *
 * Options are:
 *  -t <type>           set compression type to either:
 *                          gzip: compress revisions independently
 *                          concat: concatenate revisions and compress in chunks (default)
 *  -c <chunk-size>     maximum number of revisions in a concat chunk
 *  -b <begin-date>     earliest date to check for uncompressed revisions
 *  -e <end-date>       latest revision date to compress
 *  -s <startid>        the id to start from (referring to the text table for
 *                      type gzip, and to the page table for type concat)
 *  -n <endid>          the page_id to stop at (only when using concat compression type)
 *  --extdb <cluster>   store specified revisions in an external cluster (untested)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Maintenance ExternalStorage
 */

require_once __DIR__ . '/../Maintenance.php';

/**
 * Maintenance script that compresses the text of a wiki.
 *
 * Two strategies are available: "gzip" deflates every text row on its own,
 * "concat" packs several old revisions of a page into one
 * ConcatenatedGzipHistoryBlob and replaces the other rows with stubs.
 *
 * @ingroup Maintenance ExternalStorage
 */
class CompressOld extends Maintenance {
	/**
	 * Text loading strategies for compressWithConcat().
	 *
	 * LS_INDIVIDUAL: read each text row with its own locking selectRow().
	 * LS_CHUNKED: read the text columns together with the revision rows
	 * in a single revision/text query (the strategy currently hard-wired).
	 */
	const LS_INDIVIDUAL = 0;
	const LS_CHUNKED = 1;

	/**
	 * Register the description and the command line options of the script.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = 'Compress the text of a wiki';
		$this->addOption( 'type', 'Set compression type to either: gzip|concat', false, true, 't' );
		$this->addOption(
			'chunksize',
			'Maximum number of revisions in a concat chunk',
			false,
			true,
			'c'
		);
		$this->addOption(
			'begin-date',
			'Earliest date to check for uncompressed revisions',
			false,
			true,
			'b'
		);
		$this->addOption( 'end-date', 'Latest revision date to compress', false, true, 'e' );
		$this->addOption(
			'startid',
			'The id to start from (gzip -> text table, concat -> page table)',
			false,
			true,
			's'
		);
		$this->addOption(
			'extdb',
			'Store specified revisions in an external cluster (untested)',
			false,
			true
		);
		$this->addOption(
			'endid',
			'The page_id to stop at (only when using concat compression type)',
			false,
			true,
			'n'
		);
	}

	/**
	 * Read the options, validate them and run the selected compression type.
	 */
	public function execute() {
		global $wgDBname;
		if ( !function_exists( "gzdeflate" ) ) {
			$this->error( "You must enable zlib support in PHP to compress old revisions!\n" .
				"Please see http://www.php.net/manual/en/ref.zlib.php\n", true );
		}

		$type = $this->getOption( 'type', 'concat' );
		$chunkSize = $this->getOption( 'chunksize', 20 );
		$startId = $this->getOption( 'startid', 0 );
		$beginDate = $this->getOption( 'begin-date', '' );
		$endDate = $this->getOption( 'end-date', '' );
		$extDB = $this->getOption( 'extdb', '' );
		$endId = $this->getOption( 'endid', false );

		if ( $type != 'concat' && $type != 'gzip' ) {
			# Must be fatal: carrying on would fall into the gzip branch below and
			# rewrite the text table with a compression type the user never asked for.
			$this->error( "Type \"{$type}\" not supported", true );
		}

		if ( $extDB != '' ) {
			$this->output( "Compressing database {$wgDBname} to external cluster {$extDB}\n"
				. str_repeat( '-', 76 ) . "\n\n" );
		} else {
			$this->output( "Compressing database {$wgDBname}\n"
				. str_repeat( '-', 76 ) . "\n\n" );
		}

		$success = true;
		if ( $type == 'concat' ) {
			$success = $this->compressWithConcat( $startId, $chunkSize, $beginDate,
				$endDate, $extDB, $endId );
		} else {
			$this->compressOldPages( $startId, $extDB );
		}

		if ( $success ) {
			$this->output( "Done.\n" );
		}
	}

	/**
	 * Walk the text table in batches of 50 rows, ordered by old_id, and gzip
	 * every row individually via compressPage().
	 *
	 * @param int $start First old_id to look at
	 * @param string $extdb External cluster to store the compressed text in,
	 *   or '' to keep it in the text table
	 */
	private function compressOldPages( $start = 0, $extdb = '' ) {
		# The id comes from the command line and is interpolated into a raw SQL
		# condition below, so force it to an integer first.
		$start = (int)$start;
		$chunksize = 50;
		$this->output( "Starting from old_id $start...\n" );
		$dbw = wfGetDB( DB_MASTER );
		do {
			$res = $dbw->select(
				'text',
				array( 'old_id', 'old_flags', 'old_text' ),
				"old_id>=$start",
				__METHOD__,
				array( 'ORDER BY' => 'old_id', 'LIMIT' => $chunksize, 'FOR UPDATE' )
			);

			if ( $res->numRows() == 0 ) {
				break;
			}

			$last = $start;

			foreach ( $res as $row ) {
				# print "  {$row->old_id} - {$row->old_namespace}:{$row->old_title}\n";
				$this->compressPage( $row, $extdb );
				$last = $row->old_id;
			}

			$start = $last + 1; # Deletion may leave long empty stretches
			$this->output( "$start...\n" );
		} while ( true );
	}

	/**
	 * Deflate one text row and write it back, optionally moving the
	 * compressed data to an external cluster.
	 *
	 * @param stdClass $row Row with old_id, old_flags and old_text
	 * @param string $extdb External cluster name, or '' for local storage
	 * @return bool True if the row was rewritten, false if it was skipped
	 *   or the external store refused the data
	 */
	private function compressPage( $row, $extdb ) {
		# Skip rows that are already compressed. Rows flagged "external" are skipped
		# as well: their old_text is a reference into external storage, not the text
		# itself, and the concat mode of this script excludes them for the same reason.
		if ( false !== strpos( $row->old_flags, 'gzip' )
			|| false !== strpos( $row->old_flags, 'object' )
			|| false !== strpos( $row->old_flags, 'external' )
		) {
			#print "Already compressed row {$row->old_id}\n";
			return false;
		}
		$dbw = wfGetDB( DB_MASTER );
		$flags = $row->old_flags ? "{$row->old_flags},gzip" : "gzip";
		$compress = gzdeflate( $row->old_text );

		# Store in external storage if required
		if ( $extdb !== '' ) {
			$storeObj = new ExternalStoreDB;
			$compress = $storeObj->store( $extdb, $compress );
			if ( $compress === false ) {
				$this->error( "Unable to store object" );

				return false;
			}
			# old_text now holds what store() returned rather than the data, so the
			# row has to be flagged accordingly -- the same flag compressWithConcat()
			# sets when it writes external references.
			$flags .= ',external';
		}

		# Update text row
		$dbw->update( 'text',
			array( /* SET */
				'old_flags' => $flags,
				'old_text' => $compress
			), array( /* WHERE */
				'old_id' => $row->old_id
			), __METHOD__,
			array( 'LIMIT' => 1 )
		);

		return true;
	}

	/**
	 * For every page in the requested id range, pack the non-current revisions
	 * into ConcatenatedGzipHistoryBlob chunks. The first text row of a chunk
	 * receives the blob (or, with $extdb, the blob goes to external storage);
	 * the remaining rows are replaced by stubs or external references.
	 *
	 * @param int $startId First page_id to process
	 * @param int $maxChunkSize Maximum number of revisions per chunk, at least 1
	 * @param string $beginDate Only revisions newer than this 14 digit timestamp, '' for no limit
	 * @param string $endDate Only revisions older than this 14 digit timestamp, '' for no limit
	 * @param string $extdb External cluster name, or '' for local storage
	 * @param bool|int $maxPageId Last page_id to process, false for max(page_id)
	 * @return bool False if an option was invalid or external storage failed
	 */
	private function compressWithConcat( $startId, $maxChunkSize, $beginDate,
		$endDate, $extdb = "", $maxPageId = false
	) {
		# A chunk size below 1 would make the chunk loop below advance by zero
		# revisions per iteration and never terminate.
		if ( (int)$maxChunkSize < 1 ) {
			$this->error( "Invalid chunk size \"$maxChunkSize\"\n" );

			return false;
		}
		$maxChunkSize = (int)$maxChunkSize;
		# Command line values are strings; the page loop below relies on ++
		$startId = (int)$startId;

		$loadStyle = self::LS_CHUNKED;

		$dbr = wfGetDB( DB_SLAVE );
		$dbw = wfGetDB( DB_MASTER );

		# Set up external storage
		if ( $extdb != '' ) {
			$storeObj = new ExternalStoreDB;
		}

		# Get all articles by page_id
		if ( !$maxPageId ) {
			$maxPageId = $dbr->selectField( 'page', 'max(page_id)', '', __METHOD__ );
		}
		$this->output( "Starting from $startId of $maxPageId\n" );
		$pageConds = array();

		/*
		if ( $exclude_ns0 ) {
			print "Excluding main namespace\n";
			$pageConds[] = 'page_namespace<>0';
		}
		if ( $queryExtra ) {
			$pageConds[] = $queryExtra;
		}
		*/

		# For each article, get a list of revisions which fit the criteria

		# No recompression, use a condition on old_flags
		# Don't compress object type entities, because that might produce data loss when
		# overwriting bulk storage concat rows. Don't compress external references, because
		# the script doesn't yet delete rows from external storage.
		$conds = array(
			'old_flags NOT ' . $dbr->buildLike( $dbr->anyString(), 'object', $dbr->anyString() )
			. ' AND old_flags NOT '
			. $dbr->buildLike( $dbr->anyString(), 'external', $dbr->anyString() )
		);

		# The dates end up in raw SQL conditions, hence the strict digit check
		if ( $beginDate ) {
			if ( !preg_match( '/^\d{14}$/', $beginDate ) ) {
				$this->error( "Invalid begin date \"$beginDate\"\n" );

				return false;
			}
			$conds[] = "rev_timestamp>'" . $beginDate . "'";
		}
		if ( $endDate ) {
			if ( !preg_match( '/^\d{14}$/', $endDate ) ) {
				$this->error( "Invalid end date \"$endDate\"\n" );

				return false;
			}
			$conds[] = "rev_timestamp<'" . $endDate . "'";
		}
		if ( $loadStyle == self::LS_CHUNKED ) {
			$tables = array( 'revision', 'text' );
			$fields = array( 'rev_id', 'rev_text_id', 'old_flags', 'old_text' );
			$conds[] = 'rev_text_id=old_id';
			$revLoadOptions = 'FOR UPDATE';
		} else {
			$tables = array( 'revision' );
			$fields = array( 'rev_id', 'rev_text_id' );
			$revLoadOptions = array();
		}

		# Don't work with current revisions
		# Don't lock the page table for update either -- TS 2006-04-04
		#$tables[] = 'page';
		#$conds[] = 'page_id=rev_page AND rev_id != page_latest';

		for ( $pageId = $startId; $pageId <= $maxPageId; $pageId++ ) {
			wfWaitForSlaves();

			# Wake up
			$dbr->ping();

			# Get the page row
			$pageRes = $dbr->select( 'page',
				array( 'page_id', 'page_namespace', 'page_title', 'page_latest' ),
				$pageConds + array( 'page_id' => $pageId ), __METHOD__ );
			if ( $pageRes->numRows() == 0 ) {
				continue;
			}
			$pageRow = $dbr->fetchObject( $pageRes );

			# Display progress
			$titleObj = Title::makeTitle( $pageRow->page_namespace, $pageRow->page_title );
			$this->output( "$pageId\t" . $titleObj->getPrefixedDBkey() . " " );

			# Load revisions
			$revRes = $dbw->select( $tables, $fields,
				array_merge( array(
					'rev_page' => $pageRow->page_id,
					# Don't operate on the current revision
					# Use < instead of <> in case the current revision has changed
					# since the page select, which wasn't locking
					'rev_id < ' . $pageRow->page_latest
				), $conds ),
				__METHOD__,
				$revLoadOptions
			);
			$revs = array();
			foreach ( $revRes as $revRow ) {
				$revs[] = $revRow;
			}
			$revCount = count( $revs );

			if ( $revCount < 2 ) {
				# No revisions matching, no further processing
				$this->output( "\n" );
				continue;
			}

			# For each chunk
			$i = 0;
			while ( $i < $revCount ) {
				if ( $i < $revCount - $maxChunkSize ) {
					$thisChunkSize = $maxChunkSize;
				} else {
					$thisChunkSize = $revCount - $i;
				}

				$chunk = new ConcatenatedGzipHistoryBlob();
				$stubs = array();
				$dbw->begin( __METHOD__ );
				$usedChunk = false;
				$primaryOldid = $revs[$i]->rev_text_id;

				// @codingStandardsIgnoreStart Ignore avoid function calls in a FOR loop test part warning
				# Get the text of each revision and add it to the object
				for ( $j = 0; $j < $thisChunkSize && $chunk->isHappy(); $j++ ) {
					// @codingStandardsIgnoreEnd
					$oldid = $revs[$i + $j]->rev_text_id;

					# Get text
					if ( $loadStyle == self::LS_INDIVIDUAL ) {
						$textRow = $dbw->selectRow( 'text',
							array( 'old_flags', 'old_text' ),
							array( 'old_id' => $oldid ),
							__METHOD__,
							'FOR UPDATE'
						);
						$text = Revision::getRevisionText( $textRow );
					} else {
						$text = Revision::getRevisionText( $revs[$i + $j] );
					}

					if ( $text === false ) {
						# NOTE(review): processing continues with $text === false after
						# this message; confirm that is intended for the first revision
						# of a chunk, whose row is overwritten further down.
						$this->error( "\nError, unable to get text in old_id $oldid" );
						#$dbw->delete( 'old', array( 'old_id' => $oldid ) );
					}

					if ( $extdb == "" && $j == 0 ) {
						# Locally stored chunks keep their first text in the blob itself
						$chunk->setText( $text );
						$this->output( '.' );
					} else {
						# Don't make a stub if it's going to be longer than the article
						# Stubs are typically about 100 bytes
						if ( strlen( $text ) < 120 ) {
							$stub = false;
							$this->output( 'x' );
						} else {
							$stub = new HistoryBlobStub( $chunk->addItem( $text ) );
							$stub->setLocation( $primaryOldid );
							$stub->setReferrer( $oldid );
							$this->output( '.' );
							$usedChunk = true;
						}
						$stubs[$j] = $stub;
					}
				}
				# The loop may have stopped early because the blob was full
				$thisChunkSize = $j;

				# If we couldn't actually use any stubs because the pages were too small, do nothing
				if ( $usedChunk ) {
					if ( $extdb != "" ) {
						# Move blob objects to External Storage
						$stored = $storeObj->store( $extdb, serialize( $chunk ) );
						if ( $stored === false ) {
							$this->error( "Unable to store object" );
							# Don't leave the transaction opened above dangling
							$dbw->rollback( __METHOD__ );

							return false;
						}
						# Store External Storage URLs instead of Stub placeholders
						foreach ( $stubs as $stub ) {
							if ( $stub === false ) {
								continue;
							}
							# $stored should provide base path to a BLOB
							$url = $stored . "/" . $stub->getHash();
							$dbw->update( 'text',
								array( /* SET */
									'old_text' => $url,
									'old_flags' => 'external,utf-8',
								), array( /* WHERE */
									'old_id' => $stub->getReferrer(),
								),
								__METHOD__
							);
						}
					} else {
						# Store the main object locally
						$dbw->update( 'text',
							array( /* SET */
								'old_text' => serialize( $chunk ),
								'old_flags' => 'object,utf-8',
							), array( /* WHERE */
								'old_id' => $primaryOldid
							),
							__METHOD__
						);

						# Store the stub objects
						for ( $j = 1; $j < $thisChunkSize; $j++ ) {
							# Skip if not compressing and don't overwrite the first revision
							if ( $stubs[$j] !== false && $revs[$i + $j]->rev_text_id != $primaryOldid ) {
								$dbw->update( 'text',
									array( /* SET */
										'old_text' => serialize( $stubs[$j] ),
										'old_flags' => 'object,utf-8',
									), array( /* WHERE */
										'old_id' => $revs[$i + $j]->rev_text_id
									),
									__METHOD__
								);
							}
						}
					}
				}
				# Done, next
				$this->output( "/" );
				$dbw->commit( __METHOD__ );
				$i += $thisChunkSize;
				wfWaitForSlaves();
			}
			$this->output( "\n" );
		}

		return true;
	}
}

$maintClass = 'CompressOld';
require_once RUN_MAINTENANCE_IF_MAIN;
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Fri Nov 28 14:03:12 2014 | Cross-referenced by PHPXref 0.7.1 |