MediaWiki  REL1_20
refreshLinks.php
Go to the documentation of this file.
00001 <?php
00024 require_once( __DIR__ . '/Maintenance.php' );
00025 
/**
 * Maintenance script whose description is "Refresh link tables".
 *
 * Depending on the options given it re-parses pages to rebuild their link
 * table rows, repairs rows of the redirect table, and finally removes rows
 * from the link tables whose source page_id no longer exists in `page`.
 *
 * @ingroup Maintenance
 */
class RefreshLinks extends Maintenance {
	/**
	 * Register the command line options and the default batch size.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = "Refresh link tables";
		$this->addOption( 'dfn-only', 'Delete links from nonexistent articles only' );
		$this->addOption( 'new-only', 'Only affect articles with just a single edit' );
		$this->addOption( 'redirects-only', 'Only fix redirects, not all links' );
		$this->addOption( 'old-redirects-only', 'Only fix redirects with no redirect table entry' );
		$this->addOption( 'm', 'Maximum replication lag', false, true );
		$this->addOption( 'e', 'Last page id to refresh', false, true );
		$this->addArg( 'start', 'Page_id to start from, default 1', false );
		$this->setBatchSize( 100 );
	}

	/**
	 * Entry point: refresh links unless --dfn-only was given, then always
	 * purge link rows that point from nonexistent pages.
	 */
	public function execute() {
		$max = $this->getOption( 'm', 0 );
		if ( !$this->hasOption( 'dfn-only' ) ) {
			$start = $this->getArg( 0, 1 );
			$new = $this->getOption( 'new-only', false );
			$end = $this->getOption( 'e', 0 );
			$redir = $this->getOption( 'redirects-only', false );
			$oldRedir = $this->getOption( 'old-redirects-only', false );
			$this->doRefreshLinks( $start, $new, $max, $end, $redir, $oldRedir );
		}
		$this->deleteLinksFromNonexistent( $max, $this->mBatchSize );
	}

	/**
	 * Build the SQL condition restricting page_id to the requested range.
	 *
	 * Both bounds are cast to integers here because the result is a raw SQL
	 * fragment built by string interpolation.
	 *
	 * @param $start int|string First page_id to include
	 * @param $end int|string Last page_id to include; 0 means "no upper bound"
	 * @return string SQL condition on page_id
	 */
	private function pageIdRangeCond( $start, $end ) {
		$start = intval( $start );
		$end = intval( $end );

		if ( $end == 0 ) {
			return "page_id >= $start";
		}
		return "page_id BETWEEN $start AND $end";
	}

	/**
	 * Do the actual link refreshing.
	 *
	 * Three mutually exclusive modes, checked in this order:
	 *  - $oldRedirectsOnly: fix pages flagged as redirects that have no row
	 *    in the redirect table;
	 *  - $newOnly: process pages with page_is_new = 1;
	 *  - otherwise: walk every id from $start to $end, fixing redirects and,
	 *    unless $redirectsOnly is set, the links of each page.
	 *
	 * @param $start int Page_id to start from
	 * @param $newOnly bool Only process pages flagged page_is_new
	 * @param $maxLag int|bool Accepted for interface compatibility; not used
	 *   by this method
	 * @param $end int Page_id to stop at; 0 means no explicit upper bound
	 *   (the full walk then uses the highest id found in page/redirect)
	 * @param $redirectsOnly bool Only fix redirects, not all links
	 * @param $oldRedirectsOnly bool Only fix redirects without a redirect
	 *   table entry
	 */
	private function doRefreshLinks( $start, $newOnly = false, $maxLag = false,
						$end = 0, $redirectsOnly = false, $oldRedirectsOnly = false ) {
		global $wgParser, $wgUseTidy;

		$reportingInterval = 100;
		$dbr = wfGetDB( DB_SLAVE );

		// Both bounds come from the command line and end up inside SQL
		// conditions and loop comparisons, so normalise them to integers.
		$start = intval( $start );
		$end = intval( $end );

		// Give extensions a chance to optimize settings
		wfRunHooks( 'MaintenanceRefreshLinksInit', array( $this ) );

		# Don't generate extension images (e.g. Timeline)
		$wgParser->clearTagHooks();

		# Don't use HTML tidy
		$wgUseTidy = false;

		$what = $redirectsOnly ? "redirects" : "links";

		if ( $oldRedirectsOnly ) {
			// Redirect pages for which the LEFT JOIN found no redirect row.
			$conds = array(
				"page_is_redirect=1",
				"rd_from IS NULL",
				$this->pageIdRangeCond( $start, $end ),
			);

			$res = $dbr->select(
				array( 'page', 'redirect' ),
				'page_id',
				$conds,
				__METHOD__,
				array(),
				array( 'redirect' => array( "LEFT JOIN", "page_id=rd_from" ) )
			);
			$num = $dbr->numRows( $res );
			$this->output( "Refreshing $num old redirects from $start...\n" );

			$i = 0;

			foreach ( $res as $row ) {
				if ( !( ++$i % $reportingInterval ) ) {
					$this->output( "$i\n" );
					wfWaitForSlaves();
				}
				$this->fixRedirect( $row->page_id );
			}
		} elseif ( $newOnly ) {
			$this->output( "Refreshing $what from " );
			// The range condition also applies the -e upper bound here, so
			// all three modes treat "Last page id to refresh" the same way.
			$res = $dbr->select( 'page',
				array( 'page_id' ),
				array(
					'page_is_new' => 1,
					$this->pageIdRangeCond( $start, $end ) ),
				__METHOD__
			);
			$num = $dbr->numRows( $res );
			$this->output( "$num new articles...\n" );

			$i = 0;
			foreach ( $res as $row ) {
				if ( !( ++$i % $reportingInterval ) ) {
					$this->output( "$i\n" );
					wfWaitForSlaves();
				}
				if ( $redirectsOnly ) {
					$this->fixRedirect( $row->page_id );
				} else {
					self::fixLinksFromArticle( $row->page_id );
				}
			}
		} else {
			if ( !$end ) {
				// No explicit end: cover every id present in either table,
				// so stale redirect rows beyond the last page are reached too.
				$maxPage = $dbr->selectField( 'page', 'max(page_id)', false, __METHOD__ );
				$maxRD = $dbr->selectField( 'redirect', 'max(rd_from)', false, __METHOD__ );
				$end = max( intval( $maxPage ), intval( $maxRD ) );
			}
			$this->output( "Refreshing redirects table.\n" );
			$this->output( "Starting from page_id $start of $end.\n" );

			for ( $id = $start; $id <= $end; $id++ ) {

				if ( !( $id % $reportingInterval ) ) {
					$this->output( "$id\n" );
					wfWaitForSlaves();
				}
				$this->fixRedirect( $id );
			}

			if ( !$redirectsOnly ) {
				$this->output( "Refreshing links table.\n" );
				$this->output( "Starting from page_id $start of $end.\n" );

				for ( $id = $start; $id <= $end; $id++ ) {

					if ( !( $id % $reportingInterval ) ) {
						$this->output( "$id\n" );
						wfWaitForSlaves();
					}
					self::fixLinksFromArticle( $id );
				}
			}
		}
	}

	/**
	 * Bring the redirect table in line with the page that has the given id.
	 *
	 * The redirect row for $id is deleted when the page does not exist or
	 * when it has no redirect target.
	 * NOTE(review): for a real redirect nothing is written here explicitly;
	 * presumably WikiPage::getRedirectTarget() creates a missing redirect
	 * row as a side effect - confirm against WikiPage.
	 *
	 * @param $id int The page_id to check
	 */
	private function fixRedirect( $id ) {
		$page = WikiPage::newFromID( $id );
		$dbw = wfGetDB( DB_MASTER );

		// Short-circuit keeps the original order: the target is only looked
		// up when the page exists.
		if ( $page === null || $page->getRedirectTarget() === null ) {
			// Either the page doesn't exist (any more) or it is not a
			// redirect: delete any redirect table entry for it
			$dbw->delete( 'redirect', array( 'rd_from' => $id ),
				__METHOD__ );
		}
	}

	/**
	 * Re-parse the current text of a page and run a LinksUpdate for it
	 * inside a master transaction.
	 *
	 * Returns without doing anything (other than clearing the LinkCache)
	 * when the page does not exist or its raw text cannot be loaded.
	 *
	 * @param $id int The page_id of the page to refresh
	 */
	public static function fixLinksFromArticle( $id ) {
		global $wgParser, $wgContLang;

		$page = WikiPage::newFromID( $id );

		// Cleared on every call, even for missing pages.
		LinkCache::singleton()->clear();

		if ( $page === null ) {
			return;
		}

		$text = $page->getRawText();
		if ( $text === false ) {
			return;
		}

		$dbw = wfGetDB( DB_MASTER );
		$dbw->begin( __METHOD__ );

		$options = ParserOptions::newFromUserAndLang( new User, $wgContLang );
		$parserOutput = $wgParser->parse( $text, $page->getTitle(), $options, true, true, $page->getLatest() );
		$update = new LinksUpdate( $page->getTitle(), $parserOutput, false );
		$update->doUpdate();

		$dbw->commit( __METHOD__ );
	}

	/**
	 * Remove rows from the link tables (and redirect / page_props) whose
	 * source page id has no matching row in the page table.
	 *
	 * Ids are read unbuffered from a dedicated slave connection and deleted
	 * from the master in batches, waiting for slaves between batches.
	 *
	 * @param $maxLag int Accepted for interface compatibility; not used by
	 *   this method
	 * @param $batchSize int Number of ids to delete per query; values below
	 *   1 are treated as 1
	 */
	private function deleteLinksFromNonexistent( $maxLag = 0, $batchSize = 100 ) {
		// A batch size below 1 would make the modulo below divide by zero.
		$batchSize = max( 1, intval( $batchSize ) );

		wfWaitForSlaves();

		$dbw = wfGetDB( DB_MASTER );

		// Separate load balancer so that turning off result buffering does
		// not affect the shared slave connection.
		$lb = wfGetLBFactory()->newMainLB();
		$dbr = $lb->getConnection( DB_SLAVE );
		$dbr->bufferResults( false );

		$linksTables = array( // table name => page_id field
			'pagelinks' => 'pl_from',
			'imagelinks' => 'il_from',
			'categorylinks' => 'cl_from',
			'templatelinks' => 'tl_from',
			'externallinks' => 'el_from',
			'iwlinks' => 'iwl_from',
			'langlinks' => 'll_from',
			'redirect' => 'rd_from',
			'page_props' => 'pp_page',
		);

		foreach ( $linksTables as $table => $field ) {
			$this->output( "Retrieving illegal entries from $table... " );

			// SELECT DISTINCT( $field ) FROM $table LEFT JOIN page ON $field=page_id WHERE page_id IS NULL;
			$results = $dbr->select( array( $table, 'page' ),
				$field,
				array( 'page_id' => null ),
				__METHOD__,
				'DISTINCT',
				array( 'page' => array( 'LEFT JOIN', "$field=page_id" ) )
			);

			$counter = 0;
			$list = array();
			$this->output( "0.." );
			foreach ( $results as $row ) {
				$counter++;
				$list[] = $row->$field;
				if ( ( $counter % $batchSize ) == 0 ) {
					// Full batch collected: flush it and report progress.
					wfWaitForSlaves();
					$dbw->delete( $table, array( $field => $list ), __METHOD__ );

					$this->output( $counter . ".." );
					$list = array();
				}
			}
			$this->output( $counter );
			// Flush the final, partially filled batch.
			if ( count( $list ) > 0 ) {
				$dbw->delete( $table, array( $field => $list ), __METHOD__ );
			}
			$this->output( "\n" );
			wfWaitForSlaves();
		}
		$lb->closeAll();
	}
}
00301 
// Name the script class, then load the runner referenced by
// RUN_MAINTENANCE_IF_MAIN (presumably defined by Maintenance.php - confirm).
$maintClass = 'RefreshLinks';
require_once RUN_MAINTENANCE_IF_MAIN;