MediaWiki  REL1_20
generateSitemap.php
Go to the documentation of this file.
00001 <?php
00029 require_once( __DIR__ . '/Maintenance.php' );
00030 
/**
 * Maintenance script that writes one sitemap file series per namespace plus
 * a sitemap index file that references every generated sitemap file.
 */
class GenerateSitemap extends Maintenance {
	/** Key in $priorities holding the fallback priority for subject namespaces. */
	const GS_MAIN = -2;
	/** Key in $priorities holding the fallback priority for all other namespaces. */
	const GS_TALK = -1;

	/**
	 * Number of pages after which a new sitemap file is started
	 * (set to 50000 in execute()).
	 *
	 * @var int
	 */
	public $url_limit;

	/**
	 * Byte budget for one sitemap file, measured on the data before it is
	 * handed to the (optionally compressing) writer (set to 10 * 2^20 in
	 * execute()).
	 *
	 * @var int
	 */
	public $size_limit;

	/**
	 * Directory the files are written to, including the trailing directory
	 * separator.
	 *
	 * @var string
	 */
	public $fspath;

	/**
	 * Prefix put in front of each filename in the index; either empty or
	 * ending in a slash.
	 *
	 * @var string
	 */
	public $urlpath;

	/**
	 * Whether sitemap files are written through the gz* functions.
	 *
	 * @var bool
	 */
	public $compress;

	/**
	 * Whether rows flagged page_is_redirect are left out.
	 *
	 * @var bool
	 */
	public $skipRedirects;

	/**
	 * Byte lengths computed by generateLimit():
	 * [0] file header, [1] longest expected entry, [2] file footer.
	 *
	 * @var array
	 */
	public $limit = array();

	/**
	 * Map of namespace index => priority string.
	 *
	 * @var array
	 */
	public $priorities = array();

	/**
	 * Namespace indexes a sitemap is generated for.
	 *
	 * @var array
	 */
	public $namespaces = array();

	/**
	 * ISO 8601 timestamp taken in execute(); used as <lastmod> of every
	 * index entry.
	 *
	 * @var string
	 */
	public $timestamp;

	/**
	 * Database connection returned by wfGetDB( DB_SLAVE ).
	 */
	public $dbr;

	/**
	 * Handle of the sitemap index file.
	 *
	 * @var resource
	 */
	public $findex;

	/**
	 * Handle of the sitemap file currently being written, or false when no
	 * file is open.
	 *
	 * @var resource|bool
	 */
	public $file;

	/**
	 * Site identifier embedded in all generated filenames.
	 *
	 * @var string
	 */
	private $identifier;

	/**
	 * Register the script description and its command line options.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = "Creates a sitemap for the site";
		$this->addOption( 'fspath', 'The file system path to save to, e.g. /tmp/sitemap; defaults to current directory', false, true );
		$this->addOption( 'urlpath', 'The URL path corresponding to --fspath, prepended to filenames in the index; defaults to an empty string', false, true );
		$this->addOption( 'compress', 'Compress the sitemap files, can take value yes|no, default yes', false, true );
		$this->addOption( 'skip-redirects', 'Do not include redirecting articles in the sitemap' );
		$this->addOption( 'identifier', 'What site identifier to use for the wiki, defaults to $wgDBname', false, true );
	}

	/**
	 * Read the options, prepare the state, open the index file and run main().
	 */
	public function execute() {
		$this->setNamespacePriorities();
		$this->url_limit = 50000;
		$this->size_limit = pow( 2, 20 ) * 10;
		$this->fspath = self::init_path( $this->getOption( 'fspath', getcwd() ) );
		$this->urlpath = $this->getOption( 'urlpath', "" );
		// A non-empty URL path must end in a slash because filenames are appended to it.
		if ( $this->urlpath !== "" && substr( $this->urlpath, -1 ) !== '/' ) {
			$this->urlpath .= '/';
		}
		$this->identifier = $this->getOption( 'identifier', wfWikiID() );
		$this->compress = $this->getOption( 'compress', 'yes' ) !== 'no';
		$this->skipRedirects = $this->getOption( 'skip-redirects', false ) !== false;
		$this->dbr = wfGetDB( DB_SLAVE );
		$this->generateNamespaces();
		$this->timestamp = wfTimestamp( TS_ISO_8601, wfTimestampNow() );

		$indexPath = "{$this->fspath}sitemap-index-{$this->identifier}.xml";
		$this->findex = fopen( $indexPath, 'wb' );
		// Fail here, in the same way open() does, rather than later on fwrite( false, ... ).
		if ( $this->findex === false ) {
			wfDebugDieBacktrace( __METHOD__ . " error opening file $indexPath with flags wb. Check permissions?" );
		}
		$this->main();
	}

	/**
	 * Fill $this->priorities with the built-in defaults, then apply the
	 * overrides from $wgSitemapNamespacesPriorities (clamped to 0.0 - 1.0).
	 */
	private function setNamespacePriorities() {
		global $wgSitemapNamespacesPriorities;

		// Custom main namespaces
		$this->priorities[self::GS_MAIN] = '0.5';
		// Custom talk namespaces
		$this->priorities[self::GS_TALK] = '0.1';
		// MediaWiki standard namespaces
		$this->priorities[NS_MAIN] = '1.0';
		$this->priorities[NS_TALK] = '0.1';
		$this->priorities[NS_USER] = '0.5';
		$this->priorities[NS_USER_TALK] = '0.1';
		$this->priorities[NS_PROJECT] = '0.5';
		$this->priorities[NS_PROJECT_TALK] = '0.1';
		$this->priorities[NS_FILE] = '0.5';
		$this->priorities[NS_FILE_TALK] = '0.1';
		$this->priorities[NS_MEDIAWIKI] = '0.0';
		$this->priorities[NS_MEDIAWIKI_TALK] = '0.1';
		$this->priorities[NS_TEMPLATE] = '0.0';
		$this->priorities[NS_TEMPLATE_TALK] = '0.1';
		$this->priorities[NS_HELP] = '0.5';
		$this->priorities[NS_HELP_TALK] = '0.1';
		$this->priorities[NS_CATEGORY] = '0.5';
		$this->priorities[NS_CATEGORY_TALK] = '0.1';

		// Custom priorities. is_array() also covers a null/unset setting,
		// which must not reach foreach.
		if ( is_array( $wgSitemapNamespacesPriorities ) ) {
			foreach ( $wgSitemapNamespacesPriorities as $namespace => $priority ) {
				$float = floatval( $priority );
				if ( $float > 1.0 ) {
					$priority = '1.0';
				} elseif ( $float < 0.0 ) {
					$priority = '0.0';
				}
				// In-range values are stored exactly as configured.
				$this->priorities[$namespace] = $priority;
			}
		}
	}

	/**
	 * Create the target directory if necessary and return its resolved path
	 * with a trailing directory separator.
	 *
	 * @param string|null $fspath Directory given on the command line
	 * @return string|null Resolved path ending in DIRECTORY_SEPARATOR, or
	 *  null when $fspath is not set
	 */
	private static function init_path( $fspath ) {
		if ( !isset( $fspath ) ) {
			return null;
		}
		# Create directory if needed
		if ( $fspath && !is_dir( $fspath ) ) {
			wfMkdirParents( $fspath, null, __METHOD__ ) or die( "Can not create directory $fspath.\n" );
		}

		$resolved = realpath( $fspath );
		// realpath() yields false on failure; appending the separator to it
		// would silently point all output at the filesystem root.
		if ( $resolved === false ) {
			die( "Can not resolve directory $fspath.\n" );
		}

		return $resolved . DIRECTORY_SEPARATOR;
	}

	/**
	 * Populate $this->namespaces, either from $wgSitemapNamespaces or from
	 * the namespaces that occur in the page table.
	 */
	public function generateNamespaces() {
		// Only generate for specific namespaces if $wgSitemapNamespaces is an array.
		global $wgSitemapNamespaces;
		if ( is_array( $wgSitemapNamespaces ) ) {
			$this->namespaces = $wgSitemapNamespaces;
			return;
		}

		$res = $this->dbr->select( 'page',
			array( 'page_namespace' ),
			array(),
			__METHOD__,
			array(
				'GROUP BY' => 'page_namespace',
				'ORDER BY' => 'page_namespace',
			)
		);

		foreach ( $res as $row ) {
			$this->namespaces[] = $row->page_namespace;
		}
	}

	/**
	 * Get the priority of a namespace.
	 *
	 * @param int $namespace Namespace index
	 * @return string Configured priority, or the guessed one when the
	 *  namespace has no entry in $this->priorities
	 */
	public function priority( $namespace ) {
		return isset( $this->priorities[$namespace] ) ? $this->priorities[$namespace] : $this->guessPriority( $namespace );
	}

	/**
	 * Pick a fallback priority for a namespace without an explicit one:
	 * the GS_MAIN value when MWNamespace::isSubject() accepts the namespace,
	 * the GS_TALK value otherwise.
	 *
	 * @param int $namespace Namespace index
	 * @return string
	 */
	public function guessPriority( $namespace ) {
		return MWNamespace::isSubject( $namespace ) ? $this->priorities[self::GS_MAIN] : $this->priorities[self::GS_TALK];
	}

	/**
	 * Select namespace, title, touched timestamp and redirect flag of every
	 * page in a namespace.
	 *
	 * @param int $namespace Namespace index
	 * @return mixed Result of the select() call; main() iterates over it
	 */
	public function getPageRes( $namespace ) {
		return $this->dbr->select( 'page',
			array(
				'page_namespace',
				'page_title',
				'page_touched',
				'page_is_redirect'
			),
			array( 'page_namespace' => $namespace ),
			__METHOD__
		);
	}

	/**
	 * Write the sitemap files of every namespace and the index that lists them.
	 */
	public function main() {
		global $wgContLang;

		fwrite( $this->findex, $this->openIndex() );

		foreach ( $this->namespaces as $namespace ) {
			$res = $this->getPageRes( $namespace );
			$this->file = false;
			$this->generateLimit( $namespace );
			$length = $this->limit[0];
			$i = $smcount = 0;
			// The priority is the same for every page of the namespace.
			$priority = $this->priority( $namespace );

			$fns = $wgContLang->getFormattedNsText( $namespace );
			$this->output( "$namespace ($fns)\n" );
			$skippedRedirects = 0;  // Number of redirects skipped for that namespace
			foreach ( $res as $row ) {
				if ( $this->skipRedirects && $row->page_is_redirect ) {
					$skippedRedirects++;
					continue;
				}

				// Start a new file for the first page, once url_limit pages
				// have been written to the current file, or when the longest
				// expected entry plus the footer would exceed size_limit.
				// NOTE: only a single entry is budgeted; the additional
				// language variant entries written below are neither
				// budgeted by this check nor counted by $i.
				if ( $i++ === 0 || $i === $this->url_limit + 1 || $length + $this->limit[1] + $this->limit[2] > $this->size_limit ) {
					if ( $this->file !== false ) {
						$this->write( $this->file, $this->closeFile() );
						$this->close( $this->file );
					}
					$filename = $this->sitemapFilename( $namespace, $smcount++ );
					$this->file = $this->open( $this->fspath . $filename, 'wb' );
					$this->write( $this->file, $this->openFile() );
					fwrite( $this->findex, $this->indexEntry( $filename ) );
					$this->output( "\t$this->fspath$filename\n" );
					$length = $this->limit[0];
					$i = 1;
				}
				$title = Title::makeTitle( $row->page_namespace, $row->page_title );
				$date = wfTimestamp( TS_ISO_8601, $row->page_touched );
				$entry = $this->fileEntry( $title->getCanonicalURL(), $date, $priority );
				$length += strlen( $entry );
				$this->write( $this->file, $entry );
				// generate pages for language variants
				if ( $wgContLang->hasVariants() ) {
					$variants = $wgContLang->getVariants();
					foreach ( $variants as $vCode ) {
						if ( $vCode == $wgContLang->getCode() ) {
							continue; // we don't want default variant
						}
						$entry = $this->fileEntry( $title->getCanonicalURL( '', $vCode ), $date, $priority );
						$length += strlen( $entry );
						$this->write( $this->file, $entry );
					}
				}
			}

			if ( $this->skipRedirects && $skippedRedirects > 0 ) {
				$this->output( "  skipped $skippedRedirects redirect(s)\n" );
			}

			// Finish the last file of this namespace, if any page was written.
			if ( $this->file ) {
				$this->write( $this->file, $this->closeFile() );
				$this->close( $this->file );
			}
		}
		fwrite( $this->findex, $this->closeIndex() );
		fclose( $this->findex );
	}

	/**
	 * Open a sitemap file with gzopen() or fopen(), depending on
	 * $this->compress.
	 *
	 * @param string $file Path of the file
	 * @param string $flags Mode passed to gzopen()/fopen()
	 * @return resource
	 */
	public function open( $file, $flags ) {
		$resource = $this->compress ? gzopen( $file, $flags ) : fopen( $file, $flags );
		if ( $resource === false ) {
			wfDebugDieBacktrace( __METHOD__ . " error opening file $file with flags $flags. Check permissions?" );
		}
		return $resource;
	}

	/**
	 * Write a string with gzwrite() or fwrite(), depending on
	 * $this->compress.
	 *
	 * @param resource $handle Handle returned by open()
	 * @param string $str Data to write
	 */
	public function write( &$handle, $str ) {
		if ( $handle === true || $handle === false ) {
			wfDebugDieBacktrace( __METHOD__ . " was passed a boolean as a file handle.\n" );
		}
		if ( $this->compress ) {
			gzwrite( $handle, $str );
		} else {
			fwrite( $handle, $str );
		}
	}

	/**
	 * Close a handle with gzclose() or fclose(), depending on
	 * $this->compress.
	 *
	 * @param resource $handle Handle returned by open()
	 */
	public function close( &$handle ) {
		if ( $this->compress ) {
			gzclose( $handle );
		} else {
			fclose( $handle );
		}
	}

	/**
	 * Build the filename of a sitemap file.
	 *
	 * @param int $namespace Namespace index
	 * @param int $count Running number of the file within the namespace
	 * @return string
	 */
	public function sitemapFilename( $namespace, $count ) {
		$ext = $this->compress ? '.gz' : '';
		return "sitemap-{$this->identifier}-NS_$namespace-$count.xml$ext";
	}

	/**
	 * Return the XML declaration line.
	 *
	 * @return string
	 */
	public function xmlHead() {
		return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
	}

	/**
	 * Return the XML namespace URI used by both the index and the sitemap
	 * files.
	 *
	 * @return string
	 */
	public function xmlSchema() {
		return 'http://www.sitemaps.org/schemas/sitemap/0.9';
	}

	/**
	 * Return the opening markup of the sitemap index file.
	 *
	 * @return string
	 */
	public function openIndex() {
		return $this->xmlHead() . '<sitemapindex xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the index entry for one sitemap file.
	 *
	 * @param string $filename Filename as returned by sitemapFilename()
	 * @return string
	 */
	public function indexEntry( $filename ) {
		// Escaped so that characters such as '&' in the URL path or the
		// identifier cannot produce malformed XML.
		$loc = htmlspecialchars( "{$this->urlpath}$filename" );
		return
			"\t<sitemap>\n" .
			"\t\t<loc>$loc</loc>\n" .
			"\t\t<lastmod>{$this->timestamp}</lastmod>\n" .
			"\t</sitemap>\n";
	}

	/**
	 * Return the closing markup of the sitemap index file.
	 *
	 * @return string
	 */
	public function closeIndex() {
		return "</sitemapindex>\n";
	}

	/**
	 * Return the opening markup of a sitemap file.
	 *
	 * @return string
	 */
	public function openFile() {
		return $this->xmlHead() . '<urlset xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the sitemap entry for one URL.
	 *
	 * @param string $url Unescaped URL of the page
	 * @param string $date Value for <lastmod>
	 * @param string $priority Value for <priority>
	 * @return string
	 */
	public function fileEntry( $url, $date, $priority ) {
		// The URL may contain characters such as '&' (e.g. in a query
		// string), which must be entity-escaped inside XML.
		$loc = htmlspecialchars( $url );
		return
			"\t<url>\n" .
			"\t\t<loc>$loc</loc>\n" .
			"\t\t<lastmod>$date</lastmod>\n" .
			"\t\t<priority>$priority</priority>\n" .
			"\t</url>\n";
	}

	/**
	 * Return the closing markup of a sitemap file.
	 *
	 * @return string
	 */
	public function closeFile() {
		return "</urlset>\n";
	}

	/**
	 * Fill $this->limit with the byte lengths of the file header, of an
	 * entry for a maximum-length title in the namespace, and of the file
	 * footer. main() uses them to decide when to start a new file.
	 *
	 * @param int $namespace Namespace index
	 */
	public function generateLimit( $namespace ) {
		// bug 17961: make a title with the longest possible URL in this namespace
		$title = Title::makeTitle( $namespace, str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83" );

		$this->limit = array(
			strlen( $this->openFile() ),
			strlen( $this->fileEntry( $title->getCanonicalURL(), wfTimestamp( TS_ISO_8601, wfTimestamp() ), $this->priority( $namespace ) ) ),
			strlen( $this->closeFile() )
		);
	}
}
00517 
// Name of the maintenance class defined in this file.
$maintClass = "GenerateSitemap";
// Pull in the file named by RUN_MAINTENANCE_IF_MAIN (defined outside this
// file); presumably it runs $maintClass when this script is the entry
// point - confirm against Maintenance.php.
require_once RUN_MAINTENANCE_IF_MAIN;