MediaWiki  REL1_24
BacklinkCache.php
Go to the documentation of this file.
00001 <?php
/**
 * Caches information about the pages that link to a given title.
 *
 * For one Title, this class can return the linking pages (getLinks()),
 * count them (getNumLinks() / hasLinks()) and split them into ranges of
 * page IDs of a requested size (partition()). Results are kept in
 * per-object arrays and, for counts and partitions, in $wgMemc.
 *
 * Instances are normally obtained through BacklinkCache::get().
 */
class BacklinkCache {
    /**
     * Process cache holding the BacklinkCache object handed out by get().
     * Created lazily with a capacity of one entry.
     *
     * @var ProcessCacheLRU
     */
    protected static $cache;

    /**
     * Partition information, indexed first by table name and then by batch
     * size. Each entry is an array with two keys:
     *  - 'numRows': number of rows counted for the table
     *  - 'batches': list of array( $start, $end ) page ID ranges
     *
     * @var array
     */
    protected $partitionCache = array();

    /**
     * Complete (unbounded, unlimited) query results, indexed by table name.
     * Values are the result objects returned by the database select().
     *
     * @var array
     */
    protected $fullResultCache = array();

    /**
     * Database handle used for queries. Set through setDB() or lazily by
     * getDB(); null when no handle has been chosen yet.
     *
     * @var mixed
     */
    protected $db;

    /**
     * The title whose backlinks this object describes.
     *
     * @var Title
     */
    protected $title;

    /**
     * Expiry passed to $wgMemc->set() for counts and partitions, and the
     * maximum age accepted for the object cached by get().
     * Presumably in seconds (i.e. one hour) -- confirm against the cache APIs.
     */
    const CACHE_EXPIRY = 3600;

    /**
     * Create a new BacklinkCache for the given title.
     *
     * @param Title $title Title to track backlinks for
     */
    public function __construct( Title $title ) {
        $this->title = $title;
    }

    /**
     * Get the BacklinkCache object for a title, reusing the instance held in
     * the process cache when it is keyed by the same prefixed DB key and is
     * not older than CACHE_EXPIRY.
     *
     * @param Title $title Title to get a BacklinkCache for
     * @return BacklinkCache
     */
    public static function get( Title $title ) {
        if ( !self::$cache ) {
            // Lazily create the process cache on first use
            self::$cache = new ProcessCacheLRU( 1 );
        }

        $dbKey = $title->getPrefixedDBkey();
        if ( !self::$cache->has( $dbKey, 'obj', self::CACHE_EXPIRY ) ) {
            self::$cache->set( $dbKey, 'obj', new self( $title ) );
        }

        return self::$cache->get( $dbKey, 'obj' );
    }

    /**
     * List the properties to include when this object is serialized.
     * The database handle ($db) is not part of the list.
     *
     * @return array Property names
     */
    public function __sleep() {
        return array( 'partitionCache', 'fullResultCache', 'title' );
    }

    /**
     * Drop the in-object caches and forget the database handle.
     * Values stored in $wgMemc are not touched.
     */
    public function clear() {
        $this->partitionCache = array();
        $this->fullResultCache = array();
        // Assign null rather than unset() so the declared property keeps
        // existing; getDB() only checks isset(), which is false either way.
        $this->db = null;
    }

    /**
     * Set the database handle to use for subsequent queries.
     *
     * @param mixed $db Database handle; it must provide select() and makeList()
     */
    public function setDB( $db ) {
        $this->db = $db;
    }

    /**
     * Get the database handle, falling back to wfGetDB( DB_SLAVE ) when none
     * has been set.
     *
     * @return mixed Database handle
     */
    protected function getDB() {
        if ( !isset( $this->db ) ) {
            $this->db = wfGetDB( DB_SLAVE );
        }

        return $this->db;
    }

    /**
     * Get the pages linking to the title through the given table.
     *
     * @param string $table Link table name (see getPrefix())
     * @param int|bool $startId Lowest linking page ID to include, or false
     * @param int|bool $endId Highest linking page ID to include, or false
     * @param int|float $max Maximum number of rows, or INF for no limit
     * @return mixed Whatever TitleArray::newFromResult() builds from the rows
     */
    public function getLinks( $table, $startId = false, $endId = false, $max = INF ) {
        return TitleArray::newFromResult( $this->queryLinks( $table, $startId, $endId, $max ) );
    }

    /**
     * Run (or fetch from the full result cache) the backlink query.
     *
     * @param string $table Link table name (see getPrefix())
     * @param int|bool $startId Lowest linking page ID to include, or false
     * @param int|bool $endId Highest linking page ID to include, or false
     * @param int|float $max Maximum number of rows, or INF for no limit
     * @param string $select 'all' to join with the page table and select
     *   namespace, title and ID; 'ids' to select only the linking page IDs
     * @return mixed Result object of the database select()
     * @throws MWException If the table is not recognized
     */
    protected function queryLinks( $table, $startId, $endId, $max, $select = 'all' ) {
        wfProfileIn( __METHOD__ );

        $fromField = $this->getPrefix( $table ) . '_from';

        if ( !$startId && !$endId && is_infinite( $max )
            && isset( $this->fullResultCache[$table] )
        ) {
            wfDebug( __METHOD__ . ": got results from cache\n" );
            $res = $this->fullResultCache[$table];
        } else {
            wfDebug( __METHOD__ . ": got results from DB\n" );
            $conds = $this->getConditions( $table );
            // Use the from field in the condition rather than the joined page_id,
            // because databases are stupid and don't necessarily propagate indexes.
            if ( $startId ) {
                $conds[] = "$fromField >= " . intval( $startId );
            }
            if ( $endId ) {
                $conds[] = "$fromField <= " . intval( $endId );
            }
            $options = array( 'ORDER BY' => $fromField );
            if ( is_finite( $max ) && $max > 0 ) {
                $options['LIMIT'] = $max;
            }

            if ( $select === 'ids' ) {
                // Just select from the backlink table and ignore the page JOIN.
                // The join clause is recognized by its mention of page_id and
                // stripped from the conditions (kind of janky).
                $res = $this->getDB()->select(
                    $table,
                    array( "$fromField AS page_id" ),
                    array_filter( $conds, function ( $clause ) {
                        return !preg_match( '/(\b|=)page_id(\b|=)/', $clause );
                    } ),
                    __METHOD__,
                    $options
                );
            } else {
                // Select from the backlink table and JOIN with page title information
                $res = $this->getDB()->select(
                    array( $table, 'page' ),
                    array( 'page_namespace', 'page_title', 'page_id' ),
                    $conds,
                    __METHOD__,
                    array_merge( array( 'STRAIGHT_JOIN' ), $options )
                );
            }

            if ( $select === 'all' && !$startId && !$endId && $res->numRows() < $max ) {
                // The full results fit within the limit, so cache them
                $this->fullResultCache[$table] = $res;
            } else {
                wfDebug( __METHOD__ . ": results from DB were uncacheable\n" );
            }
        }

        wfProfileOut( __METHOD__ );

        return $res;
    }

    /**
     * Get the field name prefix for a given link table.
     *
     * Tables not known here are offered to the 'BacklinkCacheGetPrefix' hook.
     *
     * @param string $table Link table name
     * @return string Field prefix without the trailing underscore
     * @throws MWException If no prefix is known for the table
     */
    protected function getPrefix( $table ) {
        static $prefixes = array(
            'pagelinks' => 'pl',
            'imagelinks' => 'il',
            'categorylinks' => 'cl',
            'templatelinks' => 'tl',
            'redirect' => 'rd',
        );

        if ( isset( $prefixes[$table] ) ) {
            return $prefixes[$table];
        }

        $prefix = null;
        wfRunHooks( 'BacklinkCacheGetPrefix', array( $table, &$prefix ) );
        if ( !$prefix ) {
            throw new MWException( "Invalid table \"$table\" in " . __CLASS__ );
        }

        return $prefix;
    }

    /**
     * Get the SQL conditions that select the rows of a link table pointing
     * at this object's title, including the clause joining the page table.
     *
     * Tables not known here are offered to the 'BacklinkCacheGetConditions'
     * hook.
     *
     * @param string $table Link table name
     * @return array Conditions in the format taken by the database select()
     * @throws MWException If no conditions are known for the table
     */
    protected function getConditions( $table ) {
        $prefix = $this->getPrefix( $table );

        switch ( $table ) {
            case 'pagelinks':
            case 'templatelinks':
                $conds = array(
                    "{$prefix}_namespace" => $this->title->getNamespace(),
                    "{$prefix}_title" => $this->title->getDBkey(),
                    "page_id={$prefix}_from"
                );
                break;
            case 'redirect':
                $conds = array(
                    "{$prefix}_namespace" => $this->title->getNamespace(),
                    "{$prefix}_title" => $this->title->getDBkey(),
                    // Only rows whose interwiki field is empty or NULL
                    $this->getDB()->makeList( array(
                        "{$prefix}_interwiki" => '',
                        "{$prefix}_interwiki IS NULL",
                    ), LIST_OR ),
                    "page_id={$prefix}_from"
                );
                break;
            case 'imagelinks':
            case 'categorylinks':
                $conds = array(
                    "{$prefix}_to" => $this->title->getDBkey(),
                    "page_id={$prefix}_from"
                );
                break;
            default:
                $conds = null;
                wfRunHooks( 'BacklinkCacheGetConditions', array( $table, $this->title, &$conds ) );
                if ( !$conds ) {
                    throw new MWException( "Invalid table \"$table\" in " . __CLASS__ );
                }
        }

        return $conds;
    }

    /**
     * Check whether at least one page links to the title through the table.
     *
     * @param string $table Link table name
     * @return bool
     */
    public function hasLinks( $table ) {
        return ( $this->getNumLinks( $table, 1 ) > 0 );
    }

    /**
     * Get the number of pages linking to the title through the table,
     * capped at $max.
     *
     * @param string $table Link table name
     * @param int|float $max Upper bound for the returned count; INF for none
     * @return int
     */
    public function getNumLinks( $table, $max = INF ) {
        global $wgMemc, $wgUpdateRowsPerJob;

        // 1) try partition cache ...
        if ( isset( $this->partitionCache[$table] ) ) {
            // Any batch size will do: every entry carries the row count
            $entry = reset( $this->partitionCache[$table] );

            return min( $max, $entry['numRows'] );
        }

        // 2) ... then try full result cache ...
        if ( isset( $this->fullResultCache[$table] ) ) {
            return min( $max, $this->fullResultCache[$table]->numRows() );
        }

        $memcKey = wfMemcKey( 'numbacklinks', md5( $this->title->getPrefixedDBkey() ), $table );

        // 3) ... fallback to memcached ...
        // Note: a falsy value, including a stored count of 0, is handled like
        // a miss and falls through to the database.
        $count = $wgMemc->get( $memcKey );
        if ( $count ) {
            return min( $max, $count );
        }

        // 4) fetch from the database ...
        if ( is_infinite( $max ) ) { // no limit at all
            // Use partition() since it will batch the query and skip the JOIN.
            // Use $wgUpdateRowsPerJob just to encourage cache reuse for jobs.
            $this->partition( $table, $wgUpdateRowsPerJob ); // updates $this->partitionCache
            return $this->partitionCache[$table][$wgUpdateRowsPerJob]['numRows'];
        }

        // Probably some sane limit.
        // Fetch the full title info, since the caller will likely need it next
        $count = $this->getLinks( $table, false, false, $max )->count();
        if ( $count < $max ) {
            // Fewer rows than the limit, so this is the full count
            $wgMemc->set( $memcKey, $count, self::CACHE_EXPIRY );
        }

        return min( $max, $count );
    }

    /**
     * Split the backlinks of the title into ranges of linking page IDs that
     * each cover about $batchSize rows.
     *
     * Each range is array( $start, $end ) with inclusive bounds. The first
     * range has $start = false and the last has $end = false.
     *
     * @param string $table Link table name
     * @param int $batchSize Number of rows per range; must be at least 1
     * @return array List of array( $start, $end ) ranges
     * @throws MWException If $batchSize is smaller than 1
     */
    public function partition( $table, $batchSize ) {
        global $wgMemc;

        // A batch size below 1 would divide by zero below and could keep the
        // chunked select loop from ever advancing, so reject it up front.
        if ( $batchSize < 1 ) {
            throw new MWException( __METHOD__ . ": Invalid batch size \"$batchSize\"" );
        }

        // 1) try partition cache ...
        if ( isset( $this->partitionCache[$table][$batchSize] ) ) {
            wfDebug( __METHOD__ . ": got from partition cache\n" );

            return $this->partitionCache[$table][$batchSize]['batches'];
        }

        // The partition cache is only written once an entry is fully known,
        // so a failure further down cannot leave a broken entry behind.

        // 2) ... then try full result cache ...
        if ( isset( $this->fullResultCache[$table] ) ) {
            $cacheEntry = $this->partitionResult( $this->fullResultCache[$table], $batchSize );
            $this->partitionCache[$table][$batchSize] = $cacheEntry;
            wfDebug( __METHOD__ . ": got from full result cache\n" );

            return $cacheEntry['batches'];
        }

        $memcKey = wfMemcKey(
            'backlinks',
            md5( $this->title->getPrefixedDBkey() ),
            $table,
            $batchSize
        );

        // 3) ... fallback to memcached ...
        $memcValue = $wgMemc->get( $memcKey );
        if ( is_array( $memcValue ) ) {
            $this->partitionCache[$table][$batchSize] = $memcValue;
            wfDebug( __METHOD__ . ": got from memcached $memcKey\n" );

            return $memcValue['batches'];
        }

        // 4) ... finally fetch from the slow database :(
        $cacheEntry = array( 'numRows' => 0, 'batches' => array() ); // final result
        // Do the selects in batches to avoid client-side OOMs (bug 43452).
        // Use a LIMIT that plays well with $batchSize to keep equal sized partitions.
        $maxSelectSize = 200000;
        $selectSize = max( $batchSize, $maxSelectSize - ( $maxSelectSize % $batchSize ) );
        $start = false;
        do {
            $res = $this->queryLinks( $table, $start, false, $selectSize, 'ids' );
            $partitions = $this->partitionResult( $res, $batchSize, false );
            // Merge the link count and range partitions for this chunk
            $cacheEntry['numRows'] += $partitions['numRows'];
            $cacheEntry['batches'] = array_merge( $cacheEntry['batches'], $partitions['batches'] );
            if ( count( $partitions['batches'] ) ) {
                list( , $lEnd ) = end( $partitions['batches'] );
                $start = $lEnd + 1; // pick up after this inclusive range
            }
        } while ( $partitions['numRows'] >= $selectSize );
        // Make sure the first range has start=false and the last one has end=false
        $batchCount = count( $cacheEntry['batches'] );
        if ( $batchCount ) {
            $cacheEntry['batches'][0][0] = false;
            $cacheEntry['batches'][$batchCount - 1][1] = false;
        }

        $this->partitionCache[$table][$batchSize] = $cacheEntry;

        // Save partitions to memcached
        $wgMemc->set( $memcKey, $cacheEntry, self::CACHE_EXPIRY );

        // Save backlink count to memcached
        $memcKey = wfMemcKey( 'numbacklinks', md5( $this->title->getPrefixedDBkey() ), $table );
        $wgMemc->set( $memcKey, $cacheEntry['numRows'], self::CACHE_EXPIRY );

        wfDebug( __METHOD__ . ": got from database\n" );

        return $cacheEntry['batches'];
    }

    /**
     * Cut a query result into ranges of page IDs of $batchSize rows each.
     *
     * The result must expose numRows(), seek() and fetchObject(), and its
     * rows must carry a page_id field in ascending order.
     *
     * @param mixed $res Query result object
     * @param int $batchSize Number of rows per range; expected to be >= 1
     * @param bool $isComplete Whether $res holds all rows. If true, the first
     *   range gets start = false and the last gets end = false; otherwise
     *   both bounds are always concrete page IDs.
     * @return array Array with 'numRows' (int) and 'batches' (list of
     *   array( $start, $end ) ranges)
     * @throws MWException If a range starts after it ends
     */
    protected function partitionResult( $res, $batchSize, $isComplete = true ) {
        $batches = array();
        $numRows = $res->numRows();
        // ceil() yields a float; cast so the index checks below can be strict
        $numBatches = (int)ceil( $numRows / $batchSize );
        $lastBatch = $numBatches - 1;

        for ( $i = 0; $i < $numBatches; $i++ ) {
            if ( $i === 0 && $isComplete ) {
                $start = false;
            } else {
                // First row of this batch
                $res->seek( $i * $batchSize );
                $row = $res->fetchObject();
                $start = (int)$row->page_id;
            }

            if ( $i === $lastBatch && $isComplete ) {
                $end = false;
            } else {
                // Last row of this batch, clamped to the last row overall
                $res->seek( min( $numRows - 1, ( $i + 1 ) * $batchSize - 1 ) );
                $row = $res->fetchObject();
                $end = (int)$row->page_id;
            }

            # Sanity check order
            if ( $start && $end && $start > $end ) {
                throw new MWException( __METHOD__ . ': Internal error: query result out of order' );
            }

            $batches[] = array( $start, $end );
        }

        return array( 'numRows' => $numRows, 'batches' => $batches );
    }
}