MediaWiki  REL1_20
dumpIterator.php
Go to the documentation of this file.
00001 <?php
00029 require_once( __DIR__ . '/Maintenance.php' );
00030 
/**
 * Base class for maintenance scripts that feed revisions, one at a time,
 * to a subclass-provided processRevision() method.
 *
 * Revisions come either from a single text file (--file) or from an XML
 * dump supplied on standard input (--dump -). Subclasses must implement
 * processRevision() and may override checkOptions() and conclusions().
 */
abstract class DumpIterator extends Maintenance {

	/** Number of revisions counted so far; restarted at 1 when the --from title is reached. */
	private $count = 0;

	/** Value of microtime( true ) taken just before the dump import starts. */
	private $startTime;

	/**
	 * Value of the 'from' option while revisions are still being skipped,
	 * or null when every revision should be processed.
	 *
	 * Declared explicitly: it used to be created on the fly as a dynamic
	 * property the first time execute() assigned it.
	 */
	protected $from = null;

	/**
	 * Registers the 'file', 'dump' and 'from' command line options.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = "Does something with a dump";
		$this->addOption( 'file', 'File with text to run.', false, true );
		$this->addOption( 'dump', 'XML dump to execute all revisions.', false, true );
		$this->addOption( 'from', 'Article from XML dump to start from.', false, true );
	}

	/**
	 * Entry point: runs handleRevision() on the single file given with
	 * --file, or on every revision of the dump read from stdin (--dump -),
	 * then reports timing and peak memory through error().
	 *
	 * NOTE(review): the calls to error() with a second argument of true are
	 * assumed to terminate the script (see Maintenance::error); an explicit
	 * return follows each of them so the method stays safe even if not.
	 */
	public function execute() {
		// Exactly one of --file / --dump has to be present.
		if ( (bool)$this->hasOption( 'file' ) === (bool)$this->hasOption( 'dump' ) ) {
			$this->error( "You must provide a file or dump", true );
			return;
		}

		$this->checkOptions();

		if ( $this->hasOption( 'file' ) ) {
			$path = $this->getOption( 'file' );

			// file_get_contents() returns false on failure; do not feed that
			// to the revision as if it were page text.
			$text = file_get_contents( $path );
			if ( $text === false ) {
				$this->error( "Could not read file '$path'", true );
				return;
			}

			$revision = new WikiRevision;
			$revision->setText( $text );
			// The page title is derived from the file name: drop a trailing
			// ".txt" and undo URL-encoding.
			$revision->setTitle( Title::newFromText( rawurldecode( basename( $path, '.txt' ) ) ) );
			$this->handleRevision( $revision );
			return;
		}

		$this->startTime = microtime( true );

		// Only "-" (read the dump from stdin) is supported for --dump.
		if ( $this->getOption( 'dump' ) !== '-' ) {
			$this->error( "Sorry, I don't support dump filenames yet. Use - and provide it on stdin on the meantime.", true );
			return;
		}

		$source = new ImportStreamSource( $this->getStdin() );
		$importer = new WikiImporter( $source );

		// Objects are passed by handle, so no reference to $this is needed
		// (and taking one is restricted in newer PHP versions).
		$importer->setRevisionCallback( array( $this, 'handleRevision' ) );

		$this->from = $this->getOption( 'from', null );
		$this->count = 0;
		$importer->doImport();

		$this->conclusions();

		$delta = microtime( true ) - $this->startTime;
		$this->error( "Done {$this->count} revisions in " . round( $delta, 2 ) . " seconds " );
		if ( $delta > 0 ) {
			$this->error( round( $this->count / $delta, 2 ) . " pages/sec" );
		}

		# Perform the memory_get_peak_usage() when all the other data has been output so there's no damage if it dies.
		# It is only available since 5.2.0 (since 5.2.1 if you haven't compiled with --enable-memory-limit)
		$this->error( "Memory peak usage of " . memory_get_peak_usage() . " bytes\n" );
	}

	/**
	 * When the script declares that it needs no database
	 * (getDbType() == Maintenance::DB_NONE), switches off database-backed
	 * messages, uses the 'LCStore_Null' localisation cache store and
	 * registers disableInterwikis() on the InterwikiLoadPrefix hook.
	 */
	public function finalSetup() {
		parent::finalSetup();

		if ( $this->getDbType() == Maintenance::DB_NONE ) {
			global $wgUseDatabaseMessages, $wgLocalisationCacheConf, $wgHooks;
			$wgUseDatabaseMessages = false;
			$wgLocalisationCacheConf['storeClass'] = 'LCStore_Null';
			$wgHooks['InterwikiLoadPrefix'][] = 'DumpIterator::disableInterwikis';
		}
	}

	/**
	 * InterwikiLoadPrefix hook handler that always returns false.
	 *
	 * @param $prefix Unused.
	 * @param $data Unused; kept by reference to match the hook signature.
	 * @return bool Always false.
	 */
	public static function disableInterwikis( $prefix, &$data ) {
		# Title::newFromText will check on each namespaced article if it's an interwiki.
		# We always answer that it is not.

		return false;
	}

	/**
	 * Callback invoked for each revision. Counts it, skips everything that
	 * comes before the --from title (when one was given) and passes the
	 * rest to processRevision().
	 *
	 * @param $rev Revision object providing getTitle().
	 */
	public function handleRevision( $rev ) {
		$title = $rev->getTitle();
		if ( !$title ) {
			$this->error( "Got bogus revision with null title!" );
			return;
		}

		$this->count++;
		if ( $this->from !== null ) {
			// Compare as strings, strictly: a loose comparison treats
			// numeric-looking titles such as "1e3" and "1000" as equal.
			if ( (string)$this->from !== (string)$title ) {
				return;
			}
			$this->output( "Skipped " . ( $this->count - 1 ) . " pages\n" );

			// Start counting again from the first processed revision and
			// stop filtering from now on.
			$this->count = 1;
			$this->from = null;
		}

		$this->processRevision( $rev );
	}

	/* Stub function for processing additional options */
	public function checkOptions() {
		return;
	}

	/* Stub function for giving data about what was computed */
	public function conclusions() {
		return;
	}

	/* Core function which does whatever the maintenance script is designed to do */
	abstract public function processRevision( $rev );
}
00149 
/**
 * Maintenance script that reports every revision whose text matches the
 * regular expression given with the 'regex' option.
 */
class SearchDump extends DumpIterator {

	/**
	 * Adds the 'regex' option on top of the ones registered by the parent.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = "Runs a regex in the revisions from a dump";
		$this->addOption( 'regex', 'Searching regex', true, true );
	}

	/**
	 * @return int Maintenance::DB_NONE
	 */
	public function getDbType() {
		return Maintenance::DB_NONE;
	}

	/**
	 * Prints the title and timestamp of the revision when its text matches
	 * the 'regex' option; prints nothing otherwise.
	 *
	 * @param $rev Revision object providing getText(), getTitle() and getTimestamp().
	 */
	public function processRevision( $rev ) {
		$pattern = $this->getOption( 'regex' );
		$text = $rev->getText();

		// preg_match() yields 0 for "no match" and false on error; both are skipped.
		if ( !preg_match( $pattern, $text ) ) {
			return;
		}

		$title = $rev->getTitle();
		$timestamp = $rev->getTimestamp();
		$this->output( $title . " matches at edit from " . $timestamp . "\n" );
	}
}
00176 
// Name the class to run, then include the file RUN_MAINTENANCE_IF_MAIN points to
// (presumably the runner that instantiates $maintClass — confirm in Maintenance.php).
$maintClass = 'SearchDump';
require_once RUN_MAINTENANCE_IF_MAIN;