MediaWiki  REL1_21
refreshLinks.php
Go to the documentation of this file.
00001 <?php
00024 require_once( __DIR__ . '/Maintenance.php' );
00025 
/**
 * Maintenance script that refreshes the redirect and link tables from the
 * current page content, and removes link-table rows whose source page no
 * longer exists.
 */
class RefreshLinks extends Maintenance {
	/**
	 * Register the script description, its command-line options, the optional
	 * positional "start" argument and a default batch size of 100.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = "Refresh link tables";
		$this->addOption( 'dfn-only', 'Delete links from nonexistent articles only' );
		$this->addOption( 'new-only', 'Only affect articles with just a single edit' );
		$this->addOption( 'redirects-only', 'Only fix redirects, not all links' );
		$this->addOption( 'old-redirects-only', 'Only fix redirects with no redirect table entry' );
		$this->addOption( 'm', 'Maximum replication lag', false, true );
		$this->addOption( 'e', 'Last page id to refresh', false, true );
		$this->addArg( 'start', 'Page_id to start from, default 1', false );
		$this->setBatchSize( 100 );
	}

	/**
	 * Script entry point.
	 *
	 * Unless --dfn-only was given, refreshes links/redirects first; the
	 * cleanup of rows pointing from nonexistent pages always runs afterwards.
	 */
	public function execute() {
		$max = $this->getOption( 'm', 0 );
		if ( !$this->hasOption( 'dfn-only' ) ) {
			$start = $this->getArg( 0, 1 );
			$new = $this->getOption( 'new-only', false );
			$end = $this->getOption( 'e', 0 );
			$redir = $this->getOption( 'redirects-only', false );
			$oldRedir = $this->getOption( 'old-redirects-only', false );
			$this->doRefreshLinks( $start, $new, $max, $end, $redir, $oldRedir );
		}
		$this->deleteLinksFromNonexistent( $max, $this->mBatchSize );
	}

	/**
	 * Refresh the redirect table and (optionally) the links tables for a
	 * range of page ids.
	 *
	 * Exactly one of three modes runs:
	 *  - $oldRedirectsOnly: fix pages flagged as redirects that have no
	 *    redirect table row;
	 *  - $newOnly: process pages with page_is_new = 1;
	 *  - otherwise: walk every id from $start to $end.
	 *
	 * @param int|string $start First page id to process (normalised with intval())
	 * @param bool $newOnly Only process pages flagged as new
	 * @param int|bool $maxLag Accepted for signature compatibility; not used
	 *   by this method
	 * @param int|string $end Last page id to process; 0 means "no upper bound"
	 *   (normalised with intval())
	 * @param bool $redirectsOnly Only fix redirects, do not refresh links
	 * @param bool $oldRedirectsOnly Only fix redirects lacking a redirect row
	 */
	private function doRefreshLinks( $start, $newOnly = false, $maxLag = false,
		$end = 0, $redirectsOnly = false, $oldRedirectsOnly = false
	) {
		global $wgParser, $wgUseTidy;

		$reportingInterval = 100;
		$dbr = wfGetDB( DB_SLAVE );

		// Both bounds originate from the command line and are interpolated
		// into SQL conditions below, so force them to integers up front.
		$start = intval( $start );
		$end = intval( $end );

		// Give extensions a chance to optimize settings
		wfRunHooks( 'MaintenanceRefreshLinksInit', array( $this ) );

		# Don't generate extension images (e.g. Timeline)
		$wgParser->clearTagHooks();

		# Don't use HTML tidy
		$wgUseTidy = false;

		$what = $redirectsOnly ? "redirects" : "links";

		if ( $oldRedirectsOnly ) {
			// Pages marked as redirects for which the LEFT JOIN finds no
			// matching redirect row (rd_from IS NULL).
			$conds = array(
				"page_is_redirect=1",
				"rd_from IS NULL",
				$this->pageIdCondition( $start, $end ),
			);

			$res = $dbr->select(
				array( 'page', 'redirect' ),
				'page_id',
				$conds,
				__METHOD__,
				array(),
				array( 'redirect' => array( "LEFT JOIN", "page_id=rd_from" ) )
			);
			$num = $res->numRows();
			$this->output( "Refreshing $num old redirects from $start...\n" );

			$i = 0;

			foreach ( $res as $row ) {
				// Report progress and wait for slaves every $reportingInterval rows
				if ( !( ++$i % $reportingInterval ) ) {
					$this->output( "$i\n" );
					wfWaitForSlaves();
				}
				$this->fixRedirect( $row->page_id );
			}
		} elseif ( $newOnly ) {
			$this->output( "Refreshing $what from " );
			// Honour the -e upper bound here as well, consistently with the
			// other two modes.
			$res = $dbr->select( 'page',
				array( 'page_id' ),
				array(
					'page_is_new' => 1,
					$this->pageIdCondition( $start, $end ),
				),
				__METHOD__
			);
			$num = $res->numRows();
			$this->output( "$num new articles...\n" );

			$i = 0;
			foreach ( $res as $row ) {
				if ( !( ++$i % $reportingInterval ) ) {
					$this->output( "$i\n" );
					wfWaitForSlaves();
				}
				if ( $redirectsOnly ) {
					$this->fixRedirect( $row->page_id );
				} else {
					self::fixLinksFromArticle( $row->page_id );
				}
			}
		} else {
			if ( !$end ) {
				// No explicit end: cover the highest id known to either table,
				// so stale redirect rows beyond the last page are also visited.
				$maxPage = $dbr->selectField( 'page', 'max(page_id)', false );
				$maxRD = $dbr->selectField( 'redirect', 'max(rd_from)', false );
				$end = max( $maxPage, $maxRD );
			}
			$this->output( "Refreshing redirects table.\n" );
			$this->output( "Starting from page_id $start of $end.\n" );

			for ( $id = $start; $id <= $end; $id++ ) {

				if ( !( $id % $reportingInterval ) ) {
					$this->output( "$id\n" );
					wfWaitForSlaves();
				}
				$this->fixRedirect( $id );
			}

			if ( !$redirectsOnly ) {
				$this->output( "Refreshing links table.\n" );
				$this->output( "Starting from page_id $start of $end.\n" );

				for ( $id = $start; $id <= $end; $id++ ) {

					if ( !( $id % $reportingInterval ) ) {
						$this->output( "$id\n" );
						wfWaitForSlaves();
					}
					self::fixLinksFromArticle( $id );
				}
			}
		}
	}

	/**
	 * Build the SQL condition restricting page_id to the requested range.
	 *
	 * @param int|string $start First page id (inclusive)
	 * @param int|string $end Last page id (inclusive); 0 means no upper bound
	 * @return string SQL fragment containing only integer literals
	 */
	private function pageIdCondition( $start, $end ) {
		$start = intval( $start );
		$end = intval( $end );

		if ( $end === 0 ) {
			return "page_id >= $start";
		}

		return "page_id BETWEEN $start AND $end";
	}

	/**
	 * Bring the redirect table row and the page_is_redirect flag of one page
	 * in line with the page's current content.
	 *
	 * If no page with this id can be loaded, any redirect row for the id is
	 * deleted and nothing else is touched.
	 *
	 * @param int $id The page id to check
	 */
	private function fixRedirect( $id ) {
		$page = WikiPage::newFromID( $id );
		$dbw = wfGetDB( DB_MASTER );

		if ( $page === null ) {
			// This page doesn't exist (any more)
			// Delete any redirect table entry for it
			$dbw->delete( 'redirect', array( 'rd_from' => $id ),
				__METHOD__ );
			return;
		}

		$rt = null;
		$content = $page->getContent( Revision::RAW );
		if ( $content !== null ) {
			$rt = $content->getUltimateRedirectTarget();
		}

		if ( $rt === null ) {
			// The page is not a redirect
			// Delete any redirect table entry for it
			$dbw->delete( 'redirect', array( 'rd_from' => $id ), __METHOD__ );
			$fieldValue = 0;
		} else {
			$page->insertRedirectEntry( $rt );
			$fieldValue = 1;
		}

		// Update the page table to be sure it is in a consistent state
		$dbw->update( 'page', array( 'page_is_redirect' => $fieldValue ),
			array( 'page_id' => $id ), __METHOD__ );
	}

	/**
	 * Run the secondary data updates of one page's current content inside a
	 * master-database transaction.
	 *
	 * Returns silently when the page or its content cannot be loaded. The
	 * LinkCache singleton is cleared on every call, including those early
	 * returns.
	 *
	 * @param int $id The page id
	 */
	public static function fixLinksFromArticle( $id ) {
		$page = WikiPage::newFromID( $id );

		LinkCache::singleton()->clear();

		if ( $page === null ) {
			return;
		}

		$content = $page->getContent( Revision::RAW );
		if ( $content === null ) {
			return;
		}

		$dbw = wfGetDB( DB_MASTER );
		$dbw->begin( __METHOD__ );

		$updates = $content->getSecondaryDataUpdates( $page->getTitle() );
		DataUpdate::runUpdates( $updates );

		$dbw->commit( __METHOD__ );
	}

	/**
	 * Delete rows from the link tables (and redirect / page_props) whose
	 * source page id has no matching row in the page table.
	 *
	 * Ids are collected from an unbuffered read on a separately created load
	 * balancer connection and deleted on the master in batches.
	 *
	 * @param int $maxLag Accepted for signature compatibility; not used by
	 *   this method
	 * @param int $batchSize Number of ids per DELETE; values below 1 are
	 *   treated as 1
	 */
	private function deleteLinksFromNonexistent( $maxLag = 0, $batchSize = 100 ) {
		// A batch size of 0 (e.g. from a non-numeric --batch-size) would make
		// the modulo below divide by zero.
		$batchSize = max( 1, intval( $batchSize ) );

		wfWaitForSlaves();

		$dbw = wfGetDB( DB_MASTER );

		// NOTE(review): a dedicated load balancer is presumably used so the
		// unbuffered read does not share a connection with other queries —
		// confirm.
		$lb = wfGetLBFactory()->newMainLB();
		$dbr = $lb->getConnection( DB_SLAVE );
		$dbr->bufferResults( false );

		$linksTables = array( // table name => page_id field
			'pagelinks' => 'pl_from',
			'imagelinks' => 'il_from',
			'categorylinks' => 'cl_from',
			'templatelinks' => 'tl_from',
			'externallinks' => 'el_from',
			'iwlinks' => 'iwl_from',
			'langlinks' => 'll_from',
			'redirect' => 'rd_from',
			'page_props' => 'pp_page',
		);

		foreach ( $linksTables as $table => $field ) {
			$this->output( "Retrieving illegal entries from $table... " );

			// SELECT DISTINCT( $field ) FROM $table LEFT JOIN page ON $field=page_id WHERE page_id IS NULL;
			$results = $dbr->select( array( $table, 'page' ),
				$field,
				array( 'page_id' => null ),
				__METHOD__,
				'DISTINCT',
				array( 'page' => array( 'LEFT JOIN', "$field=page_id" ) )
			);

			$counter = 0;
			$list = array();
			$this->output( "0.." );
			foreach ( $results as $row ) {
				$counter++;
				$list[] = $row->$field;
				if ( ( $counter % $batchSize ) == 0 ) {
					// Full batch collected: delete it, then start a new one
					wfWaitForSlaves();
					$dbw->delete( $table, array( $field => $list ), __METHOD__ );

					$this->output( $counter . ".." );
					$list = array();
				}
			}
			$this->output( $counter );
			// Delete whatever is left over from the last, partial batch
			if ( count( $list ) > 0 ) {
				$dbw->delete( $table, array( $field => $list ), __METHOD__ );
			}
			$this->output( "\n" );
			wfWaitForSlaves();
		}
		$lb->closeAll();
	}
}
00316 
// Name of the Maintenance subclass defined in this file.
// NOTE(review): presumably read by the file included below to decide which
// class to instantiate and run — confirm against Maintenance.php.
$maintClass = 'RefreshLinks';

// RUN_MAINTENANCE_IF_MAIN is a constant defined outside this file; it is
// expected to hold the path of the script runner.
require_once RUN_MAINTENANCE_IF_MAIN;