MediaWiki  REL1_22
refreshLinks.php
Go to the documentation of this file.
00001 <?php
00024 require_once __DIR__ . '/Maintenance.php';
00025 
/**
 * Maintenance script that refreshes the redirect table and the link tables
 * for a range of page ids, and afterwards deletes link-table rows whose
 * source page no longer exists in the page table.
 */
class RefreshLinks extends Maintenance {
    /**
     * Register the script description, its command-line options, the
     * optional "start" argument and the default batch size.
     */
    public function __construct() {
        parent::__construct();
        $this->mDescription = "Refresh link tables";
        $this->addOption( 'dfn-only', 'Delete links from nonexistent articles only' );
        $this->addOption( 'new-only', 'Only affect articles with just a single edit' );
        $this->addOption( 'redirects-only', 'Only fix redirects, not all links' );
        $this->addOption( 'old-redirects-only', 'Only fix redirects with no redirect table entry' );
        $this->addOption( 'm', 'Maximum replication lag', false, true );
        $this->addOption( 'e', 'Last page id to refresh', false, true );
        $this->addArg( 'start', 'Page_id to start from, default 1', false );
        $this->setBatchSize( 100 );
    }

    /**
     * Script entry point.
     *
     * Unless --dfn-only was given, refresh redirects/links for the requested
     * page range first. The cleanup of links from nonexistent pages runs in
     * every mode.
     */
    public function execute() {
        $maxLag = $this->getOption( 'm', 0 );

        if ( !$this->hasOption( 'dfn-only' ) ) {
            $this->doRefreshLinks(
                $this->getArg( 0, 1 ),
                $this->getOption( 'new-only', false ),
                $maxLag,
                $this->getOption( 'e', 0 ),
                $this->getOption( 'redirects-only', false ),
                $this->getOption( 'old-redirects-only', false )
            );
        }

        $this->deleteLinksFromNonexistent( $maxLag, $this->mBatchSize );
    }

    /**
     * Refresh the redirect table and/or link tables for a range of pages.
     *
     * Three mutually exclusive modes, checked in this order:
     *  - $oldRedirectsOnly: fix pages flagged as redirects that have no
     *    redirect table row;
     *  - $newOnly: fix pages with page_is_new = 1;
     *  - otherwise: walk every page id from $start to $end.
     *
     * @param int|string $start First page id to process (coerced to int)
     * @param bool $newOnly Only process pages flagged as new
     * @param int|bool $maxLag Not read by this method; kept so existing
     *   callers keep working
     * @param int|string $end Last page id to process (coerced to int);
     *   0 means no upper bound
     * @param bool $redirectsOnly Only fix redirects, leave link tables alone
     * @param bool $oldRedirectsOnly Only fix redirects lacking a redirect
     *   table entry
     */
    private function doRefreshLinks( $start, $newOnly = false, $maxLag = false,
        $end = 0, $redirectsOnly = false, $oldRedirectsOnly = false
    ) {
        global $wgParser, $wgUseTidy;

        $reportingInterval = 100;
        $dbr = wfGetDB( DB_SLAVE );

        // Both bounds come from the command line and end up inside raw SQL
        // fragments, so force them to integers before any use.
        $start = intval( $start );
        $end = intval( $end );

        // Give extensions a chance to optimize settings
        wfRunHooks( 'MaintenanceRefreshLinksInit', array( $this ) );

        # Don't generate extension images (e.g. Timeline)
        $wgParser->clearTagHooks();

        # Don't use HTML tidy
        $wgUseTidy = false;

        if ( $oldRedirectsOnly ) {
            // Pages marked as redirects for which the LEFT JOIN finds no
            // matching redirect row.
            $res = $dbr->select(
                array( 'page', 'redirect' ),
                'page_id',
                array(
                    "page_is_redirect=1",
                    "rd_from IS NULL",
                    $this->pageIdRangeCond( $start, $end ),
                ),
                __METHOD__,
                array(),
                array( 'redirect' => array( "LEFT JOIN", "page_id=rd_from" ) )
            );
            $num = $res->numRows();
            $this->output( "Refreshing $num old redirects from $start...\n" );

            $done = 0;
            foreach ( $res as $row ) {
                $this->reportProgress( ++$done, $reportingInterval );
                $this->fixRedirect( $row->page_id );
            }
        } elseif ( $newOnly ) {
            $what = $redirectsOnly ? "redirects" : "links";
            $this->output( "Refreshing $what from " );

            // The upper bound is honoured here too, so that -e limits the
            // run in --new-only mode as its description promises.
            $res = $dbr->select(
                'page',
                array( 'page_id' ),
                array(
                    'page_is_new' => 1,
                    $this->pageIdRangeCond( $start, $end ),
                ),
                __METHOD__
            );
            $num = $res->numRows();
            $this->output( "$num new articles...\n" );

            $done = 0;
            foreach ( $res as $row ) {
                $this->reportProgress( ++$done, $reportingInterval );
                if ( $redirectsOnly ) {
                    $this->fixRedirect( $row->page_id );
                } else {
                    self::fixLinksFromArticle( $row->page_id );
                }
            }
        } else {
            if ( !$end ) {
                // No explicit end: go up to the highest id present in either
                // table, so stale redirect rows past the last page are
                // visited as well.
                $maxPage = $dbr->selectField( 'page', 'max(page_id)', false, __METHOD__ );
                $maxRD = $dbr->selectField( 'redirect', 'max(rd_from)', false, __METHOD__ );
                $end = intval( max( $maxPage, $maxRD ) );
            }

            $this->output( "Refreshing redirects table.\n" );
            $this->output( "Starting from page_id $start of $end.\n" );

            for ( $id = $start; $id <= $end; $id++ ) {
                $this->reportProgress( $id, $reportingInterval );
                $this->fixRedirect( $id );
            }

            if ( !$redirectsOnly ) {
                $this->output( "Refreshing links tables.\n" );
                $this->output( "Starting from page_id $start of $end.\n" );

                for ( $id = $start; $id <= $end; $id++ ) {
                    $this->reportProgress( $id, $reportingInterval );
                    self::fixLinksFromArticle( $id );
                }
            }
        }
    }

    /**
     * Build the raw SQL condition restricting page_id to the requested range.
     *
     * @param int $start First page id (inclusive)
     * @param int $end Last page id (inclusive); 0 means no upper bound
     * @return string SQL fragment built from integer-cast values only
     */
    private function pageIdRangeCond( $start, $end ) {
        $start = intval( $start );
        $end = intval( $end );

        if ( $end === 0 ) {
            return "page_id >= $start";
        }

        return "page_id BETWEEN $start AND $end";
    }

    /**
     * Print the counter and call wfWaitForSlaves() each time the counter
     * reaches a multiple of the reporting interval.
     *
     * @param int $count Rows processed so far, or the current page id
     * @param int $interval Reporting interval
     */
    private function reportProgress( $count, $interval ) {
        if ( $count % $interval === 0 ) {
            $this->output( "$count\n" );
            wfWaitForSlaves();
        }
    }

    /**
     * Bring the redirect table row and the page_is_redirect flag of a single
     * page in line with the page's current content.
     *
     * If the page does not exist, any redirect row for the id is deleted and
     * nothing else is touched.
     *
     * @param int $id The page id to check
     */
    private function fixRedirect( $id ) {
        $page = WikiPage::newFromID( $id );
        $dbw = wfGetDB( DB_MASTER );

        if ( $page === null ) {
            // This page doesn't exist (any more)
            // Delete any redirect table entry for it
            $dbw->delete( 'redirect', array( 'rd_from' => $id ), __METHOD__ );
            return;
        }

        $content = $page->getContent( Revision::RAW );
        $target = ( $content !== null ) ? $content->getUltimateRedirectTarget() : null;

        if ( $target === null ) {
            // The page is not a redirect
            // Delete any redirect table entry for it
            $dbw->delete( 'redirect', array( 'rd_from' => $id ), __METHOD__ );
            $isRedirect = 0;
        } else {
            $page->insertRedirectEntry( $target );
            $isRedirect = 1;
        }

        // Update the page table to be sure it is in a consistent state
        $dbw->update(
            'page',
            array( 'page_is_redirect' => $isRedirect ),
            array( 'page_id' => $id ),
            __METHOD__
        );
    }

    /**
     * Run the secondary data updates of a single page's current content
     * inside one master-database transaction.
     *
     * The link cache is cleared on every call, including calls that return
     * early. Nothing else happens when the page does not exist or has no
     * content.
     *
     * @param int $id The page id
     */
    public static function fixLinksFromArticle( $id ) {
        $page = WikiPage::newFromID( $id );

        LinkCache::singleton()->clear();

        if ( $page === null ) {
            return;
        }

        $content = $page->getContent( Revision::RAW );
        if ( $content === null ) {
            return;
        }

        $dbw = wfGetDB( DB_MASTER );
        $dbw->begin( __METHOD__ );

        $updates = $content->getSecondaryDataUpdates( $page->getTitle() );
        DataUpdate::runUpdates( $updates );

        $dbw->commit( __METHOD__ );
    }

    /**
     * Delete rows from the link tables (and redirect/page_props) whose source
     * page id has no matching row in the page table.
     *
     * Offending ids are read through a separate, unbuffered connection and
     * deleted on the master in batches.
     *
     * @param int $maxLag Not read by this method; kept so existing callers
     *   keep working
     * @param int $batchSize Number of ids deleted per query; values below 1
     *   are treated as 1
     */
    private function deleteLinksFromNonexistent( $maxLag = 0, $batchSize = 100 ) {
        // Guard the modulo below against a zero or negative batch size.
        $batchSize = max( 1, intval( $batchSize ) );

        wfWaitForSlaves();

        $dbw = wfGetDB( DB_MASTER );

        // Use a load balancer of our own so that switching off result
        // buffering applies only to the connection used for reading here.
        $lb = wfGetLBFactory()->newMainLB();
        $dbr = $lb->getConnection( DB_SLAVE );
        $dbr->bufferResults( false );

        $linksTables = array( // table name => page_id field
            'pagelinks' => 'pl_from',
            'imagelinks' => 'il_from',
            'categorylinks' => 'cl_from',
            'templatelinks' => 'tl_from',
            'externallinks' => 'el_from',
            'iwlinks' => 'iwl_from',
            'langlinks' => 'll_from',
            'redirect' => 'rd_from',
            'page_props' => 'pp_page',
        );

        foreach ( $linksTables as $table => $field ) {
            $this->output( "Retrieving illegal entries from $table... " );

            // SELECT DISTINCT( $field ) FROM $table LEFT JOIN page ON $field=page_id WHERE page_id IS NULL;
            $results = $dbr->select(
                array( $table, 'page' ),
                $field,
                array( 'page_id' => null ),
                __METHOD__,
                'DISTINCT',
                array( 'page' => array( 'LEFT JOIN', "$field=page_id" ) )
            );

            $counter = 0;
            $pending = array();
            $this->output( "0.." );
            foreach ( $results as $row ) {
                $counter++;
                $pending[] = $row->$field;
                if ( $counter % $batchSize === 0 ) {
                    // A full batch has been collected: flush it.
                    wfWaitForSlaves();
                    $dbw->delete( $table, array( $field => $pending ), __METHOD__ );

                    $this->output( $counter . ".." );
                    $pending = array();
                }
            }
            $this->output( $counter );

            // Flush whatever is left over from the last partial batch.
            if ( count( $pending ) > 0 ) {
                $dbw->delete( $table, array( $field => $pending ), __METHOD__ );
            }
            $this->output( "\n" );
            wfWaitForSlaves();
        }

        $lb->closeAll();
    }
}
00317 
// Name the maintenance class declared in this file, then include the file
// referenced by RUN_MAINTENANCE_IF_MAIN (a constant defined outside this
// file; judging by its name it runs the script when this file is the one
// being executed -- confirm against Maintenance.php).
$maintClass = 'RefreshLinks';
require_once RUN_MAINTENANCE_IF_MAIN;