MediaWiki  REL1_22
generateSitemap.php
Go to the documentation of this file.
00001 <?php
00029 require_once __DIR__ . '/Maintenance.php';
00030 
/**
 * Maintenance script that writes a sitemap index file plus one or more
 * sitemap files per namespace, in the sitemaps.org 0.9 XML format.
 */
class GenerateSitemap extends Maintenance {
    /** Key in $priorities holding the fallback priority for subject namespaces. */
    const GS_MAIN = -2;
    /** Key in $priorities holding the fallback priority for talk namespaces. */
    const GS_TALK = -1;

    /**
     * Maximum number of URLs written to a single sitemap file.
     *
     * @var int
     */
    public $url_limit;

    /**
     * Maximum size of a single sitemap file, in bytes (measured before
     * compression).
     *
     * @var int
     */
    public $size_limit;

    /**
     * Directory the files are written to, with a trailing directory separator.
     *
     * @var string
     */
    public $fspath;

    /**
     * URL prefix put in front of each sitemap filename in the index file;
     * either empty or ending in a slash.
     *
     * @var string
     */
    public $urlpath;

    /**
     * Whether the sitemap files are written through the gz* functions.
     *
     * @var bool
     */
    public $compress;

    /**
     * Whether pages flagged as redirects are left out of the sitemap.
     *
     * @var bool
     */
    public $skipRedirects;

    /**
     * Byte lengths computed by generateLimit():
     * [0] file header, [1] longest possible single entry, [2] file footer.
     *
     * @var array
     */
    public $limit = array();

    /**
     * Map of namespace id => priority string.
     *
     * @var array
     */
    public $priorities = array();

    /**
     * Namespace ids to generate sitemaps for.
     *
     * @var array
     */
    public $namespaces = array();

    /**
     * ISO 8601 timestamp of this run, used as <lastmod> in the index file.
     *
     * @var string
     */
    public $timestamp;

    /**
     * Database connection obtained with DB_SLAVE.
     *
     * @var object
     */
    public $dbr;

    /**
     * Handle of the sitemap index file.
     *
     * @var resource
     */
    public $findex;

    /**
     * Handle of the sitemap file currently being written, or false if none
     * has been opened yet for the current namespace.
     *
     * @var resource|bool
     */
    public $file;

    /**
     * Site identifier used in the generated filenames.
     *
     * @var string
     */
    private $identifier;

    /**
     * Registers the script description and its command line options.
     */
    public function __construct() {
        parent::__construct();
        $this->mDescription = "Creates a sitemap for the site";
        $this->addOption( 'fspath', 'The file system path to save to, e.g. /tmp/sitemap; defaults to current directory', false, true );
        $this->addOption( 'urlpath', 'The URL path corresponding to --fspath, prepended to filenames in the index; defaults to an empty string', false, true );
        $this->addOption( 'compress', 'Compress the sitemap files, can take value yes|no, default yes', false, true );
        $this->addOption( 'skip-redirects', 'Do not include redirecting articles in the sitemap' );
        $this->addOption( 'identifier', 'What site identifier to use for the wiki, defaults to $wgDBname', false, true );
    }

    /**
     * Reads the options, prepares the output directory, opens the index file
     * and hands over to main().
     *
     * @throws MWException If the index file cannot be opened for writing
     */
    public function execute() {
        $this->setNamespacePriorities();
        $this->url_limit = 50000;
        $this->size_limit = pow( 2, 20 ) * 10;
        $this->fspath = self::init_path( $this->getOption( 'fspath', getcwd() ) );
        $this->urlpath = $this->getOption( 'urlpath', "" );
        // A non-empty URL path must end in a slash so filenames can be appended directly.
        if ( $this->urlpath !== "" && substr( $this->urlpath, -1 ) !== '/' ) {
            $this->urlpath .= '/';
        }
        $this->identifier = $this->getOption( 'identifier', wfWikiID() );
        $this->compress = $this->getOption( 'compress', 'yes' ) !== 'no';
        $this->skipRedirects = $this->getOption( 'skip-redirects', false ) !== false;
        $this->dbr = wfGetDB( DB_SLAVE );
        $this->generateNamespaces();
        $this->timestamp = wfTimestamp( TS_ISO_8601, wfTimestampNow() );

        $indexPath = "{$this->fspath}sitemap-index-{$this->identifier}.xml";
        $this->findex = fopen( $indexPath, 'wb' );
        // Fail here rather than letting every later fwrite() run against false.
        if ( $this->findex === false ) {
            throw new MWException( __METHOD__ . " error opening file $indexPath with flags wb. Check permissions?" );
        }
        $this->main();
    }

    /**
     * Fills $priorities with the built-in defaults and then applies the
     * overrides from $wgSitemapNamespacesPriorities, clamped to 0.0 - 1.0.
     */
    private function setNamespacePriorities() {
        global $wgSitemapNamespacesPriorities;

        // Custom main namespaces
        $this->priorities[self::GS_MAIN] = '0.5';
        // Custom talk namespaces
        $this->priorities[self::GS_TALK] = '0.1';
        // MediaWiki standard namespaces
        $this->priorities[NS_MAIN] = '1.0';
        $this->priorities[NS_TALK] = '0.1';
        $this->priorities[NS_USER] = '0.5';
        $this->priorities[NS_USER_TALK] = '0.1';
        $this->priorities[NS_PROJECT] = '0.5';
        $this->priorities[NS_PROJECT_TALK] = '0.1';
        $this->priorities[NS_FILE] = '0.5';
        $this->priorities[NS_FILE_TALK] = '0.1';
        $this->priorities[NS_MEDIAWIKI] = '0.0';
        $this->priorities[NS_MEDIAWIKI_TALK] = '0.1';
        $this->priorities[NS_TEMPLATE] = '0.0';
        $this->priorities[NS_TEMPLATE_TALK] = '0.1';
        $this->priorities[NS_HELP] = '0.5';
        $this->priorities[NS_HELP_TALK] = '0.1';
        $this->priorities[NS_CATEGORY] = '0.5';
        $this->priorities[NS_CATEGORY_TALK] = '0.1';

        // Custom priorities; anything that is not an array (false, null, ...)
        // means "no overrides" and must not reach foreach.
        if ( is_array( $wgSitemapNamespacesPriorities ) ) {
            foreach ( $wgSitemapNamespacesPriorities as $namespace => $priority ) {
                $float = floatval( $priority );
                if ( $float > 1.0 ) {
                    $priority = '1.0';
                } elseif ( $float < 0.0 ) {
                    $priority = '0.0';
                }
                $this->priorities[$namespace] = $priority;
            }
        }
    }

    /**
     * Creates the output directory if needed and returns its resolved path.
     *
     * @param string|null $fspath Directory to write to
     * @return string|null Resolved path with a trailing directory separator,
     *  or null if $fspath is not set
     */
    private static function init_path( $fspath ) {
        if ( !isset( $fspath ) ) {
            return null;
        }
        # Create directory if needed
        if ( $fspath && !is_dir( $fspath ) ) {
            wfMkdirParents( $fspath, null, __METHOD__ ) or die( "Can not create directory $fspath.\n" );
        }

        return realpath( $fspath ) . DIRECTORY_SEPARATOR;
    }

    /**
     * Fills $namespaces, either from $wgSitemapNamespaces (when it is an
     * array) or with every namespace that occurs in the page table.
     */
    public function generateNamespaces() {
        // Only generate for specific namespaces if $wgSitemapNamespaces is an array.
        global $wgSitemapNamespaces;
        if ( is_array( $wgSitemapNamespaces ) ) {
            $this->namespaces = $wgSitemapNamespaces;
            return;
        }

        $res = $this->dbr->select( 'page',
            array( 'page_namespace' ),
            array(),
            __METHOD__,
            array(
                'GROUP BY' => 'page_namespace',
                'ORDER BY' => 'page_namespace',
            )
        );

        foreach ( $res as $row ) {
            $this->namespaces[] = $row->page_namespace;
        }
    }

    /**
     * Returns the priority configured for a namespace, falling back to
     * guessPriority() when there is no explicit entry.
     *
     * @param int $namespace Namespace id
     * @return string
     */
    public function priority( $namespace ) {
        return isset( $this->priorities[$namespace] ) ? $this->priorities[$namespace] : $this->guessPriority( $namespace );
    }

    /**
     * Returns the GS_MAIN priority for subject namespaces and the GS_TALK
     * priority for all others, as decided by MWNamespace::isSubject().
     *
     * @param int $namespace Namespace id
     * @return string
     */
    public function guessPriority( $namespace ) {
        return MWNamespace::isSubject( $namespace ) ? $this->priorities[self::GS_MAIN] : $this->priorities[self::GS_TALK];
    }

    /**
     * Selects namespace, title, touched timestamp and redirect flag of every
     * page in the given namespace.
     *
     * @param int $namespace Namespace id
     * @return mixed Result of the select() call, iterated by main()
     */
    public function getPageRes( $namespace ) {
        return $this->dbr->select( 'page',
            array(
                'page_namespace',
                'page_title',
                'page_touched',
                'page_is_redirect'
            ),
            array( 'page_namespace' => $namespace ),
            __METHOD__
        );
    }

    /**
     * Writes the index file and all per-namespace sitemap files.
     *
     * A new sitemap file is started whenever adding the next page (its own
     * entry plus one entry per non-default language variant) would push the
     * current file over $url_limit URLs or $size_limit bytes. No file is
     * created for a namespace that yields no entries.
     */
    public function main() {
        global $wgContLang;

        fwrite( $this->findex, $this->openIndex() );

        // The extra variants are the same for every page, so collect them once.
        $extraVariants = array();
        if ( $wgContLang->hasVariants() ) {
            $defaultCode = $wgContLang->getCode();
            foreach ( $wgContLang->getVariants() as $vCode ) {
                if ( $vCode == $defaultCode ) {
                    continue; // we don't want default variant
                }
                $extraVariants[] = $vCode;
            }
        }

        foreach ( $this->namespaces as $namespace ) {
            $res = $this->getPageRes( $namespace );
            $this->file = false;
            $this->generateLimit( $namespace );
            $headerLength = $this->limit[0];
            $footerLength = $this->limit[2];
            $priority = $this->priority( $namespace );
            $length = $headerLength; // bytes written to the current file so far
            $urlCount = 0; // <url> entries written to the current file so far
            $smcount = 0; // running number of the next file for this namespace

            $fns = $wgContLang->getFormattedNsText( $namespace );
            $this->output( "$namespace ($fns)\n" );
            $skippedRedirects = 0; // Number of redirects skipped for that namespace
            foreach ( $res as $row ) {
                if ( $this->skipRedirects && $row->page_is_redirect ) {
                    $skippedRedirects++;
                    continue;
                }

                // Build everything this page contributes first, so the limits
                // are checked against the real number of URLs and bytes.
                $title = Title::makeTitle( $row->page_namespace, $row->page_title );
                $date = wfTimestamp( TS_ISO_8601, $row->page_touched );
                $entries = $this->fileEntry( $title->getCanonicalURL(), $date, $priority );
                $entryCount = 1;
                // generate pages for language variants
                foreach ( $extraVariants as $vCode ) {
                    $entries .= $this->fileEntry( $title->getCanonicalURL( '', $vCode ), $date, $priority );
                    $entryCount++;
                }
                $entriesLength = strlen( $entries );

                if ( $this->file === false
                    || $urlCount + $entryCount > $this->url_limit
                    || $length + $entriesLength + $footerLength > $this->size_limit
                ) {
                    if ( $this->file !== false ) {
                        $this->write( $this->file, $this->closeFile() );
                        $this->close( $this->file );
                    }
                    $filename = $this->sitemapFilename( $namespace, $smcount++ );
                    $this->file = $this->open( $this->fspath . $filename, 'wb' );
                    $this->write( $this->file, $this->openFile() );
                    fwrite( $this->findex, $this->indexEntry( $filename ) );
                    $this->output( "\t$this->fspath$filename\n" );
                    $length = $headerLength;
                    $urlCount = 0;
                }

                $this->write( $this->file, $entries );
                $length += $entriesLength;
                $urlCount += $entryCount;
            }

            if ( $this->skipRedirects && $skippedRedirects > 0 ) {
                $this->output( "  skipped $skippedRedirects redirect(s)\n" );
            }

            if ( $this->file ) {
                $this->write( $this->file, $this->closeFile() );
                $this->close( $this->file );
            }
        }
        fwrite( $this->findex, $this->closeIndex() );
        fclose( $this->findex );
    }

    /**
     * Opens a sitemap file with gzopen() or fopen(), depending on $compress.
     *
     * @param string $file Path of the file to open
     * @param string $flags Mode string passed to gzopen()/fopen()
     * @return resource
     * @throws MWException If the file cannot be opened
     */
    public function open( $file, $flags ) {
        $resource = $this->compress ? gzopen( $file, $flags ) : fopen( $file, $flags );
        if ( $resource === false ) {
            throw new MWException( __METHOD__ . " error opening file $file with flags $flags. Check permissions?" );
        }
        return $resource;
    }

    /**
     * Writes a string to a handle returned by open().
     *
     * @param resource $handle
     * @param string $str
     * @throws MWException If $handle is a boolean or the write fails
     */
    public function write( &$handle, $str ) {
        if ( $handle === true || $handle === false ) {
            throw new MWException( __METHOD__ . " was passed a boolean as a file handle.\n" );
        }
        if ( $this->compress ) {
            $written = gzwrite( $handle, $str );
        } else {
            $written = fwrite( $handle, $str );
        }
        // Both functions report the number of bytes written; false, or zero
        // bytes for a non-empty string, means nothing reached the file.
        if ( $written === false || ( $written === 0 && strlen( $str ) > 0 ) ) {
            throw new MWException( __METHOD__ . " error writing to file handle.\n" );
        }
    }

    /**
     * Closes a handle returned by open().
     *
     * @param resource $handle
     */
    public function close( &$handle ) {
        if ( $this->compress ) {
            gzclose( $handle );
        } else {
            fclose( $handle );
        }
    }

    /**
     * Builds the name of a sitemap file; ".gz" is appended when compressing.
     *
     * @param int $namespace Namespace id
     * @param int $count Running number of the file within the namespace
     * @return string
     */
    public function sitemapFilename( $namespace, $count ) {
        $ext = $this->compress ? '.gz' : '';
        return "sitemap-{$this->identifier}-NS_$namespace-$count.xml$ext";
    }

    /**
     * Returns the XML declaration line.
     *
     * @return string
     */
    public function xmlHead() {
        return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
    }

    /**
     * Returns the XML namespace URI used for both the index and the sitemaps.
     *
     * @return string
     */
    public function xmlSchema() {
        return 'http://www.sitemaps.org/schemas/sitemap/0.9';
    }

    /**
     * Returns the XML declaration followed by the opening <sitemapindex> tag.
     *
     * @return string
     */
    public function openIndex() {
        return $this->xmlHead() . '<sitemapindex xmlns="' . $this->xmlSchema() . '">' . "\n";
    }

    /**
     * Returns the <sitemap> element of the index file for one sitemap file.
     *
     * @param string $filename Name of the sitemap file
     * @return string
     */
    public function indexEntry( $filename ) {
        // Escape like fileEntry() does: the URL path and the identifier inside
        // the filename come from the command line and may contain e.g. "&".
        $loc = htmlspecialchars( $this->urlpath . $filename );
        return
            "\t<sitemap>\n" .
            "\t\t<loc>$loc</loc>\n" .
            "\t\t<lastmod>{$this->timestamp}</lastmod>\n" .
            "\t</sitemap>\n";
    }

    /**
     * Returns the closing tag of the index file.
     *
     * @return string
     */
    public function closeIndex() {
        return "</sitemapindex>\n";
    }

    /**
     * Returns the XML declaration followed by the opening <urlset> tag.
     *
     * @return string
     */
    public function openFile() {
        return $this->xmlHead() . '<urlset xmlns="' . $this->xmlSchema() . '">' . "\n";
    }

    /**
     * Returns the <url> element for one page.
     *
     * @param string $url URL of the page; escaped here
     * @param string $date Value for <lastmod>
     * @param string $priority Value for <priority>
     * @return string
     */
    public function fileEntry( $url, $date, $priority ) {
        return
            "\t<url>\n" .
            // bug 34666: $url may contain bad characters such as ampersands.
            "\t\t<loc>" . htmlspecialchars( $url ) . "</loc>\n" .
            "\t\t<lastmod>$date</lastmod>\n" .
            "\t\t<priority>$priority</priority>\n" .
            "\t</url>\n";
    }

    /**
     * Returns the closing tag of a sitemap file.
     *
     * @return string
     */
    public function closeFile() {
        return "</urlset>\n";
    }

    /**
     * Fills $limit with the byte lengths of the file header, of an entry for
     * a 255-byte title in the given namespace, and of the file footer.
     *
     * @param int $namespace Namespace id
     */
    public function generateLimit( $namespace ) {
        // bug 17961: make a title with the longest possible URL in this namespace
        $title = Title::makeTitle( $namespace, str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83" );

        $this->limit = array(
            strlen( $this->openFile() ),
            strlen( $this->fileEntry( $title->getCanonicalURL(), wfTimestamp( TS_ISO_8601, wfTimestamp() ), $this->priority( $namespace ) ) ),
            strlen( $this->closeFile() )
        );
    }
}
00523 
// Name the maintenance class, then include the runner named by RUN_MAINTENANCE_IF_MAIN.
$maintClass = 'GenerateSitemap';
require_once RUN_MAINTENANCE_IF_MAIN;