MediaWiki  REL1_19
generateSitemap.php
Go to the documentation of this file.
00001 <?php
00029 require_once( dirname( __FILE__ ) . '/Maintenance.php' );
00030 
/**
 * Maintenance script that writes a sitemap index file plus one or more
 * sitemap files per namespace, optionally gzip-compressed.
 *
 * Output files are named "sitemap-index-<identifier>.xml" and
 * "sitemap-<identifier>-NS_<namespace>-<n>.xml[.gz]" and are written to the
 * directory given by --fspath.
 */
class GenerateSitemap extends Maintenance {
	/**
	 * Key in $priorities holding the fallback priority for main (non-talk)
	 * namespaces that have no explicit priority.
	 */
	const GS_MAIN = -2;

	/**
	 * Key in $priorities holding the fallback priority for talk namespaces
	 * that have no explicit priority.
	 */
	const GS_TALK = -1;

	/**
	 * Maximum number of <url> entries written to a single sitemap file.
	 *
	 * @var int
	 */
	public $url_limit;

	/**
	 * Maximum size in bytes of a single sitemap file, measured on the
	 * uncompressed XML that is written.
	 *
	 * @var int
	 */
	public $size_limit;

	/**
	 * Directory the files are written to, including a trailing directory
	 * separator.
	 *
	 * @var string
	 */
	public $fspath;

	/**
	 * URL prefix prepended to sitemap filenames in the index file; either
	 * empty or ending in '/'.
	 *
	 * @var string
	 */
	public $urlpath;

	/**
	 * Whether sitemap files are written through the gz* functions.
	 *
	 * @var bool
	 */
	public $compress;

	/**
	 * Byte lengths used for size accounting, filled by generateLimit():
	 * [0] file header, [1] estimated largest single entry, [2] file footer.
	 *
	 * @var array
	 */
	public $limit = array();

	/**
	 * Map of namespace id (or GS_MAIN / GS_TALK) to priority string.
	 *
	 * @var array
	 */
	public $priorities = array();

	/**
	 * Namespace ids to generate sitemaps for.
	 *
	 * @var array
	 */
	public $namespaces = array();

	/**
	 * ISO 8601 timestamp taken when the script starts; used as <lastmod>
	 * for every entry of the index file.
	 *
	 * @var string
	 */
	public $timestamp;

	/**
	 * Database connection obtained via wfGetDB( DB_SLAVE ).
	 */
	public $dbr;

	/**
	 * Handle of the sitemap index file.
	 *
	 * @var resource
	 */
	public $findex;

	/**
	 * Handle of the sitemap file currently being written, or false if none
	 * is open.
	 *
	 * @var resource|bool
	 */
	public $file;

	/**
	 * Site identifier used in the generated filenames.
	 *
	 * @var string
	 */
	private $identifier;

	/**
	 * Registers the script description and its command line options.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = "Creates a sitemap for the site";
		$this->addOption( 'fspath', 'The file system path to save to, e.g. /tmp/sitemap; defaults to current directory', false, true );
		$this->addOption( 'urlpath', 'The URL path corresponding to --fspath, prepended to filenames in the index; defaults to an empty string', false, true );
		$this->addOption( 'compress', 'Compress the sitemap files, can take value yes|no, default yes', false, true );
		$this->addOption( 'identifier', 'What site identifier to use for the wiki, defaults to $wgDBname', false, true );
	}

	/**
	 * Reads the options, prepares state, opens the index file and runs main().
	 */
	public function execute() {
		$this->setNamespacePriorities();
		$this->url_limit = 50000;
		$this->size_limit = pow( 2, 20 ) * 10;
		$this->fspath = self::init_path( $this->getOption( 'fspath', getcwd() ) );

		$this->urlpath = $this->getOption( 'urlpath', "" );
		if ( $this->urlpath !== "" && substr( $this->urlpath, -1 ) !== '/' ) {
			$this->urlpath .= '/';
		}

		$this->identifier = $this->getOption( 'identifier', wfWikiID() );
		// Anything other than an explicit "no" keeps compression enabled.
		$this->compress = $this->getOption( 'compress', 'yes' ) !== 'no';
		$this->dbr = wfGetDB( DB_SLAVE );
		$this->generateNamespaces();
		$this->timestamp = wfTimestamp( TS_ISO_8601, wfTimestampNow() );

		$indexPath = "{$this->fspath}sitemap-index-{$this->identifier}.xml";
		$this->findex = fopen( $indexPath, 'wb' );
		if ( $this->findex === false ) {
			// Fail here instead of letting main() write to a boolean handle.
			wfDebugDieBacktrace( __METHOD__ . " error opening index file $indexPath. Check permissions?" );
		}

		$this->main();
	}

	/**
	 * Fills $priorities with the built-in defaults and then applies the
	 * overrides from $wgSitemapNamespacesPriorities, clamped to 0.0 - 1.0.
	 */
	private function setNamespacePriorities() {
		global $wgSitemapNamespacesPriorities;

		// Custom main namespaces
		$this->priorities[self::GS_MAIN] = '0.5';
		// Custom talk namespaces
		$this->priorities[self::GS_TALK] = '0.1';
		// MediaWiki standard namespaces
		$this->priorities[NS_MAIN] = '1.0';
		$this->priorities[NS_TALK] = '0.1';
		$this->priorities[NS_USER] = '0.5';
		$this->priorities[NS_USER_TALK] = '0.1';
		$this->priorities[NS_PROJECT] = '0.5';
		$this->priorities[NS_PROJECT_TALK] = '0.1';
		$this->priorities[NS_FILE] = '0.5';
		$this->priorities[NS_FILE_TALK] = '0.1';
		$this->priorities[NS_MEDIAWIKI] = '0.0';
		$this->priorities[NS_MEDIAWIKI_TALK] = '0.1';
		$this->priorities[NS_TEMPLATE] = '0.0';
		$this->priorities[NS_TEMPLATE_TALK] = '0.1';
		$this->priorities[NS_HELP] = '0.5';
		$this->priorities[NS_HELP_TALK] = '0.1';
		$this->priorities[NS_CATEGORY] = '0.5';
		$this->priorities[NS_CATEGORY_TALK] = '0.1';

		// Custom priorities. Only iterate when the setting really is an
		// array, so that false, null or any other scalar is simply ignored.
		if ( is_array( $wgSitemapNamespacesPriorities ) ) {
			foreach ( $wgSitemapNamespacesPriorities as $namespace => $priority ) {
				$float = floatval( $priority );
				if ( $float > 1.0 ) {
					$priority = '1.0';
				} elseif ( $float < 0.0 ) {
					$priority = '0.0';
				}
				$this->priorities[$namespace] = $priority;
			}
		}
	}

	/**
	 * Creates the target directory if needed and returns its resolved path
	 * with a trailing directory separator.
	 *
	 * @param $fspath string|null Directory to write to
	 * @return string|null null when $fspath is not set
	 */
	private static function init_path( $fspath ) {
		if ( !isset( $fspath ) ) {
			return null;
		}

		# Create directory if needed
		if ( $fspath && !is_dir( $fspath ) ) {
			if ( !wfMkdirParents( $fspath, null, __METHOD__ ) ) {
				die( "Can not create directory $fspath.\n" );
			}
		}

		return realpath( $fspath ) . DIRECTORY_SEPARATOR;
	}

	/**
	 * Fills $namespaces, either from $wgSitemapNamespaces (when it is an
	 * array) or with every namespace that occurs in the page table.
	 */
	public function generateNamespaces() {
		// Only generate for specific namespaces if $wgSitemapNamespaces is an array.
		global $wgSitemapNamespaces;
		if ( is_array( $wgSitemapNamespaces ) ) {
			$this->namespaces = $wgSitemapNamespaces;
			return;
		}

		$res = $this->dbr->select( 'page',
			array( 'page_namespace' ),
			array(),
			__METHOD__,
			array(
				'GROUP BY' => 'page_namespace',
				'ORDER BY' => 'page_namespace',
			)
		);

		foreach ( $res as $row ) {
			$this->namespaces[] = $row->page_namespace;
		}
	}

	/**
	 * Returns the priority configured for a namespace, falling back to
	 * guessPriority() when none is set.
	 *
	 * @param $namespace int Namespace id
	 * @return string
	 */
	public function priority( $namespace ) {
		if ( isset( $this->priorities[$namespace] ) ) {
			return $this->priorities[$namespace];
		}
		return $this->guessPriority( $namespace );
	}

	/**
	 * Returns the generic fallback priority for a namespace: the GS_MAIN
	 * value when MWNamespace::isMain() reports true for it, the GS_TALK
	 * value otherwise.
	 *
	 * @param $namespace int Namespace id
	 * @return string
	 */
	public function guessPriority( $namespace ) {
		$key = MWNamespace::isMain( $namespace ) ? self::GS_MAIN : self::GS_TALK;
		return $this->priorities[$key];
	}

	/**
	 * Selects namespace, title and touched timestamp of every page in the
	 * given namespace.
	 *
	 * @param $namespace int Namespace id
	 * @return mixed Result of the select() call, iterated row by row in main()
	 */
	public function getPageRes( $namespace ) {
		return $this->dbr->select( 'page',
			array(
				'page_namespace',
				'page_title',
				'page_touched',
			),
			array( 'page_namespace' => $namespace ),
			__METHOD__
		);
	}

	/**
	 * Writes the index file and all sitemap files.
	 *
	 * A new sitemap file is started for each namespace, and again whenever
	 * adding the entries of the next page would exceed $url_limit or
	 * $size_limit. Every page contributes one entry for its canonical URL
	 * plus one entry per non-default language variant, so both limits are
	 * checked against the number of entries a page produces, not against the
	 * number of pages.
	 */
	public function main() {
		global $wgContLang;

		// The variant list does not depend on the page, so build it once.
		$variantCodes = array();
		if ( $wgContLang->hasVariants() ) {
			$defaultCode = $wgContLang->getCode();
			foreach ( $wgContLang->getVariants() as $vCode ) {
				if ( $vCode == $defaultCode ) {
					continue; // we don't want default variant
				}
				$variantCodes[] = $vCode;
			}
		}
		$entriesPerRow = 1 + count( $variantCodes );

		fwrite( $this->findex, $this->openIndex() );

		foreach ( $this->namespaces as $namespace ) {
			$res = $this->getPageRes( $namespace );
			$this->file = false;
			$this->generateLimit( $namespace );
			$priority = $this->priority( $namespace );
			$length = $this->limit[0];
			$urlCount = 0;
			$smcount = 0;

			$fns = $wgContLang->getFormattedNsText( $namespace );
			$this->output( "$namespace ($fns)\n" );

			foreach ( $res as $row ) {
				// limit[1] is an estimate of the largest entry, so the size
				// check is conservative rather than exact.
				$needNewFile = $this->file === false
					|| $urlCount + $entriesPerRow > $this->url_limit
					|| $length + $entriesPerRow * $this->limit[1] + $this->limit[2] > $this->size_limit;

				if ( $needNewFile ) {
					if ( $this->file !== false ) {
						$this->write( $this->file, $this->closeFile() );
						$this->close( $this->file );
					}
					$filename = $this->sitemapFilename( $namespace, $smcount++ );
					$this->file = $this->open( $this->fspath . $filename, 'wb' );
					$this->write( $this->file, $this->openFile() );
					fwrite( $this->findex, $this->indexEntry( $filename ) );
					$this->output( "\t$this->fspath$filename\n" );
					$length = $this->limit[0];
					$urlCount = 0;
				}

				$title = Title::makeTitle( $row->page_namespace, $row->page_title );
				$date = wfTimestamp( TS_ISO_8601, $row->page_touched );

				$entry = $this->fileEntry( $title->getCanonicalURL(), $date, $priority );
				$length += strlen( $entry );
				$this->write( $this->file, $entry );

				// generate pages for language variants
				foreach ( $variantCodes as $vCode ) {
					$entry = $this->fileEntry( $title->getCanonicalURL( '', $vCode ), $date, $priority );
					$length += strlen( $entry );
					$this->write( $this->file, $entry );
				}

				$urlCount += $entriesPerRow;
			}

			if ( $this->file ) {
				$this->write( $this->file, $this->closeFile() );
				$this->close( $this->file );
			}
		}

		fwrite( $this->findex, $this->closeIndex() );
		fclose( $this->findex );
	}

	/**
	 * Opens a sitemap file with gzopen() or fopen(), depending on $compress.
	 * Calls wfDebugDieBacktrace() when the file cannot be opened.
	 *
	 * @param $file string Path of the file to open
	 * @param $flags string Mode string handed to gzopen() / fopen()
	 * @return resource
	 */
	public function open( $file, $flags ) {
		if ( $this->compress ) {
			$resource = gzopen( $file, $flags );
		} else {
			$resource = fopen( $file, $flags );
		}
		if ( $resource === false ) {
			wfDebugDieBacktrace( __METHOD__ . " error opening file $file with flags $flags. Check permissions?" );
		}
		return $resource;
	}

	/**
	 * Writes a string with gzwrite() or fwrite(), depending on $compress.
	 *
	 * @param $handle resource Handle returned by open()
	 * @param $str string Data to write
	 */
	public function write( &$handle, $str ) {
		if ( $handle === true || $handle === false ) {
			wfDebugDieBacktrace( __METHOD__ . " was passed a boolean as a file handle.\n" );
		}
		if ( $this->compress ) {
			gzwrite( $handle, $str );
		} else {
			fwrite( $handle, $str );
		}
	}

	/**
	 * Closes a handle with gzclose() or fclose(), depending on $compress.
	 *
	 * @param $handle resource Handle returned by open()
	 */
	public function close( &$handle ) {
		if ( $this->compress ) {
			gzclose( $handle );
		} else {
			fclose( $handle );
		}
	}

	/**
	 * Builds the filename of a sitemap file.
	 *
	 * @param $namespace int Namespace id
	 * @param $count int Sequence number of the file within the namespace
	 * @return string
	 */
	public function sitemapFilename( $namespace, $count ) {
		$ext = $this->compress ? '.gz' : '';
		return "sitemap-{$this->identifier}-NS_$namespace-$count.xml$ext";
	}

	/**
	 * Returns the XML declaration line.
	 *
	 * @return string
	 */
	public function xmlHead() {
		return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
	}

	/**
	 * Returns the XML namespace URI used for both index and sitemap files.
	 *
	 * @return string
	 */
	public function xmlSchema() {
		return 'http://www.sitemaps.org/schemas/sitemap/0.9';
	}

	/**
	 * Returns the header of the index file.
	 *
	 * @return string
	 */
	public function openIndex() {
		return $this->xmlHead() . '<sitemapindex xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Returns the <sitemap> element for one sitemap file in the index.
	 *
	 * @param $filename string Filename as produced by sitemapFilename()
	 * @return string
	 */
	public function indexEntry( $filename ) {
		// The location is XML text content, so characters such as '&' in
		// --urlpath have to be entity-escaped.
		$loc = htmlspecialchars( $this->urlpath . $filename );
		return
			"\t<sitemap>\n" .
			"\t\t<loc>$loc</loc>\n" .
			"\t\t<lastmod>{$this->timestamp}</lastmod>\n" .
			"\t</sitemap>\n";
	}

	/**
	 * Returns the footer of the index file.
	 *
	 * @return string
	 */
	public function closeIndex() {
		return "</sitemapindex>\n";
	}

	/**
	 * Returns the header of a sitemap file.
	 *
	 * @return string
	 */
	public function openFile() {
		return $this->xmlHead() . '<urlset xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Returns the <url> element for one page.
	 *
	 * @param $url string Unescaped URL of the page
	 * @param $date string Value for <lastmod>
	 * @param $priority string Value for <priority>
	 * @return string
	 */
	public function fileEntry( $url, $date, $priority ) {
		// URLs may contain ampersands (for example in a query string),
		// which must be entity-escaped to keep the XML well-formed.
		$loc = htmlspecialchars( $url );
		return
			"\t<url>\n" .
			"\t\t<loc>$loc</loc>\n" .
			"\t\t<lastmod>$date</lastmod>\n" .
			"\t\t<priority>$priority</priority>\n" .
			"\t</url>\n";
	}

	/**
	 * Returns the footer of a sitemap file.
	 *
	 * @return string
	 */
	public function closeFile() {
		return "</urlset>\n";
	}

	/**
	 * Fills $limit with the byte lengths of the file header, of an entry
	 * for a maximum-length title in the given namespace, and of the footer.
	 *
	 * @param $namespace int Namespace id
	 */
	public function generateLimit( $namespace ) {
		// bug 17961: make a title with the longest possible URL in this namespace
		// (63 four-byte characters plus one three-byte character = 255 bytes)
		$title = Title::makeTitle( $namespace, str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83" );
		$longestEntry = $this->fileEntry(
			$title->getCanonicalURL(),
			wfTimestamp( TS_ISO_8601, wfTimestamp() ),
			$this->priority( $namespace )
		);

		$this->limit = array(
			strlen( $this->openFile() ),
			strlen( $longestEntry ),
			strlen( $this->closeFile() )
		);
	}
}
00491 
// Name the maintenance class defined in this file, then include the file
// referenced by RUN_MAINTENANCE_IF_MAIN.
// NOTE(review): presumably that file reads $maintClass and runs the script
// when this file is the entry point; confirm against Maintenance.php.
00492 $maintClass = "GenerateSitemap";
00493 require_once( RUN_MAINTENANCE_IF_MAIN );