MediaWiki  REL1_24
generateSitemap.php
Go to the documentation of this file.
00001 <?php
00029 require_once __DIR__ . '/Maintenance.php';
00030 
/**
 * Maintenance script that writes a sitemap index file plus one or more
 * sitemap files per namespace, optionally gzip-compressed.
 */
class GenerateSitemap extends Maintenance {
    /** Key in $priorities holding the fallback priority for subject namespaces */
    const GS_MAIN = -2;
    /** Key in $priorities holding the fallback priority for non-subject (talk) namespaces */
    const GS_TALK = -1;

    /**
     * Maximum number of <url> entries written to a single sitemap file.
     *
     * @var int
     */
    public $url_limit;

    /**
     * Maximum (uncompressed) size of a single sitemap file, in bytes.
     *
     * @var int
     */
    public $size_limit;

    /**
     * Resolved file system directory the files are written to, including
     * a trailing directory separator.
     *
     * @var string
     */
    public $fspath;

    /**
     * URL prefix put in front of sitemap file names in the index; either
     * empty or ending in a slash.
     *
     * @var string
     */
    public $urlpath;

    /**
     * Whether sitemap files are written through the gz* functions.
     *
     * @var bool
     */
    public $compress;

    /**
     * Whether pages flagged as redirects are left out.
     *
     * @var bool
     */
    public $skipRedirects;

    /**
     * Byte lengths used for the size check, filled in by generateLimit():
     * [0] file header, [1] longest expected entry, [2] file footer.
     *
     * @var array
     */
    public $limit = array();

    /**
     * Map of namespace id (or GS_MAIN / GS_TALK) to priority string.
     *
     * @var array
     */
    public $priorities = array();

    /**
     * Namespace ids a sitemap is generated for.
     *
     * @var array
     */
    public $namespaces = array();

    /**
     * ISO 8601 timestamp taken in execute(), used as <lastmod> in the index.
     *
     * @var string
     */
    public $timestamp;

    /**
     * Database connection obtained with wfGetDB( DB_SLAVE ).
     *
     * @var object
     */
    public $dbr;

    /**
     * Handle of the sitemap index file.
     *
     * @var resource
     */
    public $findex;

    /**
     * Handle of the sitemap file currently being written, or false when
     * none is open.
     *
     * @var resource|bool
     */
    public $file;

    /**
     * Site identifier embedded in every generated file name.
     *
     * @var string
     */
    private $identifier;

    /**
     * Declare the script description and its command line options.
     */
    public function __construct() {
        parent::__construct();
        $this->mDescription = "Creates a sitemap for the site";
        $this->addOption(
            'fspath',
            'The file system path to save to, e.g. /tmp/sitemap; defaults to current directory',
            false,
            true
        );
        $this->addOption(
            'urlpath',
            'The URL path corresponding to --fspath, prepended to filenames in the index; '
                . 'defaults to an empty string',
            false,
            true
        );
        $this->addOption(
            'compress',
            'Compress the sitemap files, can take value yes|no, default yes',
            false,
            true
        );
        $this->addOption( 'skip-redirects', 'Do not include redirecting articles in the sitemap' );
        $this->addOption(
            'identifier',
            'What site identifier to use for the wiki, defaults to $wgDBname',
            false,
            true
        );
    }

    /**
     * Read the options, open the index file and generate all sitemaps.
     *
     * @throws MWException If the target directory or the index file cannot
     *  be prepared
     */
    public function execute() {
        $this->setNamespacePriorities();
        $this->url_limit = 50000;
        // 10 MiB
        $this->size_limit = pow( 2, 20 ) * 10;
        $this->fspath = self::init_path( $this->getOption( 'fspath', getcwd() ) );
        $this->urlpath = $this->getOption( 'urlpath', "" );
        if ( $this->urlpath !== "" && substr( $this->urlpath, -1 ) !== '/' ) {
            $this->urlpath .= '/';
        }
        $this->identifier = $this->getOption( 'identifier', wfWikiID() );
        $this->compress = $this->getOption( 'compress', 'yes' ) !== 'no';
        $this->skipRedirects = $this->getOption( 'skip-redirects', false ) !== false;
        $this->dbr = wfGetDB( DB_SLAVE );
        $this->generateNamespaces();
        $this->timestamp = wfTimestamp( TS_ISO_8601, wfTimestampNow() );

        $indexPath = "{$this->fspath}sitemap-index-{$this->identifier}.xml";
        $this->findex = fopen( $indexPath, 'wb' );
        if ( $this->findex === false ) {
            // Fail here rather than on the first fwrite() to a non-handle.
            throw new MWException( __METHOD__
                . " error opening index file $indexPath. Check permissions?" );
        }
        $this->main();
    }

    /**
     * Fill $priorities with the built-in defaults and then apply the
     * overrides from $wgSitemapNamespacesPriorities, if it is an array.
     */
    private function setNamespacePriorities() {
        global $wgSitemapNamespacesPriorities;

        // Custom main namespaces
        $this->priorities[self::GS_MAIN] = '0.5';
        // Custom talk namespaces
        $this->priorities[self::GS_TALK] = '0.1';
        // MediaWiki standard namespaces
        $this->priorities[NS_MAIN] = '1.0';
        $this->priorities[NS_TALK] = '0.1';
        $this->priorities[NS_USER] = '0.5';
        $this->priorities[NS_USER_TALK] = '0.1';
        $this->priorities[NS_PROJECT] = '0.5';
        $this->priorities[NS_PROJECT_TALK] = '0.1';
        $this->priorities[NS_FILE] = '0.5';
        $this->priorities[NS_FILE_TALK] = '0.1';
        $this->priorities[NS_MEDIAWIKI] = '0.0';
        $this->priorities[NS_MEDIAWIKI_TALK] = '0.1';
        $this->priorities[NS_TEMPLATE] = '0.0';
        $this->priorities[NS_TEMPLATE_TALK] = '0.1';
        $this->priorities[NS_HELP] = '0.5';
        $this->priorities[NS_HELP_TALK] = '0.1';
        $this->priorities[NS_CATEGORY] = '0.5';
        $this->priorities[NS_CATEGORY_TALK] = '0.1';

        // Custom priorities. is_array() rather than "!== false" so that an
        // unset or scalar setting is ignored instead of reaching foreach.
        if ( is_array( $wgSitemapNamespacesPriorities ) ) {
            foreach ( $wgSitemapNamespacesPriorities as $namespace => $priority ) {
                // Clamp out-of-range values to the 0.0 - 1.0 interval.
                $float = floatval( $priority );
                if ( $float > 1.0 ) {
                    $priority = '1.0';
                } elseif ( $float < 0.0 ) {
                    $priority = '0.0';
                }
                $this->priorities[$namespace] = $priority;
            }
        }
    }

    /**
     * Create the target directory if needed and return its resolved path
     * with a trailing directory separator.
     *
     * @param string $fspath Directory to write to
     * @return string
     * @throws MWException If the directory cannot be created or resolved
     */
    private static function init_path( $fspath ) {
        # Create directory if needed
        if ( $fspath && !is_dir( $fspath ) && !wfMkdirParents( $fspath, null, __METHOD__ ) ) {
            // Throw instead of die( "..." ): die() with a string argument
            // ends the script with exit status 0.
            throw new MWException( "Can not create directory $fspath." );
        }

        $resolved = realpath( $fspath );
        if ( $resolved === false ) {
            // Without this check, false . DIRECTORY_SEPARATOR would yield
            // just the separator as the target path.
            throw new MWException( __METHOD__ . " could not resolve path $fspath." );
        }

        return $resolved . DIRECTORY_SEPARATOR;
    }

    /**
     * Fill $namespaces: from $wgSitemapNamespaces when that is an array,
     * otherwise with every namespace that occurs in the page table.
     */
    public function generateNamespaces() {
        // Only generate for specific namespaces if $wgSitemapNamespaces is an array.
        global $wgSitemapNamespaces;
        if ( is_array( $wgSitemapNamespaces ) ) {
            $this->namespaces = $wgSitemapNamespaces;

            return;
        }

        $res = $this->dbr->select( 'page',
            array( 'page_namespace' ),
            array(),
            __METHOD__,
            array(
                'GROUP BY' => 'page_namespace',
                'ORDER BY' => 'page_namespace',
            )
        );

        foreach ( $res as $row ) {
            $this->namespaces[] = $row->page_namespace;
        }
    }

    /**
     * Get the priority of a namespace, falling back to guessPriority()
     * when none is configured for it.
     *
     * @param int $namespace Namespace id
     * @return string
     */
    public function priority( $namespace ) {
        return isset( $this->priorities[$namespace] )
            ? $this->priorities[$namespace]
            : $this->guessPriority( $namespace );
    }

    /**
     * Pick the GS_MAIN fallback priority when MWNamespace::isSubject()
     * accepts the namespace, the GS_TALK fallback otherwise.
     *
     * @param int $namespace Namespace id
     * @return string
     */
    public function guessPriority( $namespace ) {
        return MWNamespace::isSubject( $namespace )
            ? $this->priorities[self::GS_MAIN]
            : $this->priorities[self::GS_TALK];
    }

    /**
     * Select namespace, title, touched timestamp and redirect flag of
     * every page in a namespace.
     *
     * @param int $namespace Namespace id
     * @return mixed Result of the select() call, iterated row by row in main()
     */
    public function getPageRes( $namespace ) {
        return $this->dbr->select( 'page',
            array(
                'page_namespace',
                'page_title',
                'page_touched',
                'page_is_redirect'
            ),
            array( 'page_namespace' => $namespace ),
            __METHOD__
        );
    }

    /**
     * Write the index file and, for each namespace, as many sitemap files
     * as the URL and size limits require.
     */
    public function main() {
        global $wgContLang;

        fwrite( $this->findex, $this->openIndex() );

        // Every page produces one entry plus one per additional language
        // variant, so both limits must be checked against that count.
        $variantCodes = $this->getExtraVariantCodes();
        $entriesPerPage = 1 + count( $variantCodes );

        foreach ( $this->namespaces as $namespace ) {
            $res = $this->getPageRes( $namespace );
            $this->file = false;
            $this->generateLimit( $namespace );
            $length = $this->limit[0];
            $urlCount = 0;
            $smcount = 0;

            $fns = $wgContLang->getFormattedNsText( $namespace );
            $this->output( "$namespace ($fns)\n" );
            $skippedRedirects = 0; // Number of redirects skipped for that namespace
            foreach ( $res as $row ) {
                if ( $this->skipRedirects && $row->page_is_redirect ) {
                    $skippedRedirects++;
                    continue;
                }

                // Start a new file for the first page of the namespace, or
                // when this page's entries would push the current file past
                // the URL or size limit. limit[1] is measured on the plain
                // canonical URL and is used as the estimate for variant
                // entries as well.
                if ( $this->file === false
                    || $urlCount + $entriesPerPage > $this->url_limit
                    || $length + $entriesPerPage * $this->limit[1] + $this->limit[2]
                        > $this->size_limit
                ) {
                    if ( $this->file !== false ) {
                        $this->write( $this->file, $this->closeFile() );
                        $this->close( $this->file );
                    }
                    $filename = $this->sitemapFilename( $namespace, $smcount++ );
                    $this->file = $this->open( $this->fspath . $filename, 'wb' );
                    $this->write( $this->file, $this->openFile() );
                    fwrite( $this->findex, $this->indexEntry( $filename ) );
                    $this->output( "\t$this->fspath$filename\n" );
                    $length = $this->limit[0];
                    $urlCount = 0;
                }

                $entries = $this->pageEntries( $row, $namespace, $variantCodes );
                $length += strlen( $entries );
                $urlCount += $entriesPerPage;
                $this->write( $this->file, $entries );
            }

            if ( $this->skipRedirects && $skippedRedirects > 0 ) {
                $this->output( "  skipped $skippedRedirects redirect(s)\n" );
            }

            if ( $this->file ) {
                $this->write( $this->file, $this->closeFile() );
                $this->close( $this->file );
            }
        }
        fwrite( $this->findex, $this->closeIndex() );
        fclose( $this->findex );
    }

    /**
     * List the content language's variant codes other than its own code.
     *
     * @return array Variant codes; empty when the language has no variants
     */
    private function getExtraVariantCodes() {
        global $wgContLang;

        $codes = array();
        if ( $wgContLang->hasVariants() ) {
            foreach ( $wgContLang->getVariants() as $vCode ) {
                // we don't want default variant
                if ( $vCode != $wgContLang->getCode() ) {
                    $codes[] = $vCode;
                }
            }
        }

        return $codes;
    }

    /**
     * Build the <url> entries for one page: the canonical URL followed by
     * one entry per additional language variant.
     *
     * @param object $row Row from getPageRes()
     * @param int $namespace Namespace id used to look up the priority
     * @param array $variantCodes Codes from getExtraVariantCodes()
     * @return string Concatenated XML entries
     */
    private function pageEntries( $row, $namespace, array $variantCodes ) {
        $title = Title::makeTitle( $row->page_namespace, $row->page_title );
        $date = wfTimestamp( TS_ISO_8601, $row->page_touched );
        $priority = $this->priority( $namespace );

        $entries = $this->fileEntry( $title->getCanonicalURL(), $date, $priority );
        // generate pages for language variants
        foreach ( $variantCodes as $vCode ) {
            $entries .= $this->fileEntry(
                $title->getCanonicalURL( '', $vCode ),
                $date,
                $priority
            );
        }

        return $entries;
    }

    /**
     * Open a sitemap file with gzopen() or fopen(), depending on $compress.
     *
     * @param string $file File name
     * @param string $flags Mode passed to gzopen() / fopen()
     * @return resource
     * @throws MWException If the file cannot be opened
     */
    public function open( $file, $flags ) {
        $resource = $this->compress ? gzopen( $file, $flags ) : fopen( $file, $flags );
        if ( $resource === false ) {
            throw new MWException( __METHOD__
                . " error opening file $file with flags $flags. Check permissions?" );
        }

        return $resource;
    }

    /**
     * Write a string with gzwrite() or fwrite(), depending on $compress.
     *
     * @param resource $handle Handle returned by open()
     * @param string $str Data to write
     * @throws MWException If $handle is a boolean or the write fails or is short
     */
    public function write( &$handle, $str ) {
        if ( $handle === true || $handle === false ) {
            throw new MWException( __METHOD__ . " was passed a boolean as a file handle.\n" );
        }

        $written = $this->compress ? gzwrite( $handle, $str ) : fwrite( $handle, $str );
        // A failed or short write (e.g. full disk) would otherwise leave a
        // truncated sitemap behind without any error.
        if ( $written === false || $written < strlen( $str ) ) {
            throw new MWException( __METHOD__ . " could not write all data to the file handle.\n" );
        }
    }

    /**
     * Close a handle with gzclose() or fclose(), depending on $compress.
     *
     * @param resource $handle Handle returned by open()
     */
    public function close( &$handle ) {
        if ( $this->compress ) {
            gzclose( $handle );
        } else {
            fclose( $handle );
        }
    }

    /**
     * Build the name of a sitemap file from the site identifier, the
     * namespace and a running number; ".gz" is appended when compressing.
     *
     * @param int $namespace Namespace id
     * @param int $count Running number of the file within the namespace
     * @return string
     */
    public function sitemapFilename( $namespace, $count ) {
        $ext = $this->compress ? '.gz' : '';

        return "sitemap-{$this->identifier}-NS_$namespace-$count.xml$ext";
    }

    /**
     * Return the XML declaration line.
     *
     * @return string
     */
    public function xmlHead() {
        return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
    }

    /**
     * Return the XML namespace URI used for both index and sitemap files.
     *
     * @return string
     */
    public function xmlSchema() {
        return 'http://www.sitemaps.org/schemas/sitemap/0.9';
    }

    /**
     * Return the XML declaration plus the opening tag of the index file.
     *
     * @return string
     */
    public function openIndex() {
        return $this->xmlHead() . '<sitemapindex xmlns="' . $this->xmlSchema() . '">' . "\n";
    }

    /**
     * Return the index entry for one sitemap file.
     *
     * @param string $filename Name of the sitemap file
     * @return string
     */
    public function indexEntry( $filename ) {
        // Escaped like the URL in fileEntry(): --urlpath and --identifier
        // are free-form and may contain characters such as ampersands.
        $loc = htmlspecialchars( $this->urlpath . $filename );

        return
            "\t<sitemap>\n" .
            "\t\t<loc>$loc</loc>\n" .
            "\t\t<lastmod>{$this->timestamp}</lastmod>\n" .
            "\t</sitemap>\n";
    }

    /**
     * Return the closing tag of the index file.
     *
     * @return string
     */
    public function closeIndex() {
        return "</sitemapindex>\n";
    }

    /**
     * Return the XML declaration plus the opening tag of a sitemap file.
     *
     * @return string
     */
    public function openFile() {
        return $this->xmlHead() . '<urlset xmlns="' . $this->xmlSchema() . '">' . "\n";
    }

    /**
     * Return the sitemap entry for one URL.
     *
     * @param string $url Page URL; escaped here
     * @param string $date Value for <lastmod>
     * @param string $priority Value for <priority>
     * @return string
     */
    public function fileEntry( $url, $date, $priority ) {
        return
            "\t<url>\n" .
            // bug 34666: $url may contain bad characters such as ampersands.
            "\t\t<loc>" . htmlspecialchars( $url ) . "</loc>\n" .
            "\t\t<lastmod>$date</lastmod>\n" .
            "\t\t<priority>$priority</priority>\n" .
            "\t</url>\n";
    }

    /**
     * Return the closing tag of a sitemap file.
     *
     * @return string
     */
    public function closeFile() {
        return "</urlset>\n";
    }

    /**
     * Fill $limit with the byte lengths of the file header, of an entry
     * for a maximum-length title in the given namespace, and of the file
     * footer.
     *
     * @param int $namespace Namespace id
     */
    public function generateLimit( $namespace ) {
        // bug 17961: make a title with the longest possible URL in this namespace
        // (63 four-byte characters plus one three-byte character: 255 bytes)
        $title = Title::makeTitle( $namespace, str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83" );

        $this->limit = array(
            strlen( $this->openFile() ),
            strlen( $this->fileEntry(
                $title->getCanonicalURL(),
                wfTimestamp( TS_ISO_8601, wfTimestamp() ),
                $this->priority( $namespace )
            ) ),
            strlen( $this->closeFile() )
        );
    }
}
00569 
// Name of the Maintenance subclass defined in this file; presumably picked up
// by the script that RUN_MAINTENANCE_IF_MAIN points to (defined outside this file).
$maintClass = 'GenerateSitemap';
require_once RUN_MAINTENANCE_IF_MAIN;