[ Index ]

PHP Cross Reference of MediaWiki-1.24.0

title

Body

[close]

/maintenance/ -> generateSitemap.php (source)

   1  <?php
   2  /**
   3   * Creates a sitemap for the site.
   4   *
   5   * Copyright © 2005, Ævar Arnfjörð Bjarmason, Jens Frank <[email protected]> and
   6   * Brion Vibber <[email protected]>
   7   *
   8   * This program is free software; you can redistribute it and/or modify
   9   * it under the terms of the GNU General Public License as published by
  10   * the Free Software Foundation; either version 2 of the License, or
  11   * (at your option) any later version.
  12   *
  13   * This program is distributed in the hope that it will be useful,
  14   * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16   * GNU General Public License for more details.
  17   *
  18   * You should have received a copy of the GNU General Public License along
  19   * with this program; if not, write to the Free Software Foundation, Inc.,
  20   * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21   * http://www.gnu.org/copyleft/gpl.html
  22   *
  23   * @file
  24   * @ingroup Maintenance
  25   * @see http://www.sitemaps.org/
  26   * @see http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd
  27   */
  28  
  29  require_once  __DIR__ . '/Maintenance.php';
  30  
  /**
   * Maintenance script that generates a sitemap for the site.
   *
   * Writes one sitemap index file plus, for every namespace, one or more
   * sitemap files (optionally gzip-compressed) into the --fspath directory.
   *
   * @ingroup Maintenance
   */
  class GenerateSitemap extends Maintenance {
      /** Key in $priorities holding the fallback priority for subject namespaces */
      const GS_MAIN = -2;
      /** Key in $priorities holding the fallback priority for talk namespaces */
      const GS_TALK = -1;

      /**
       * The maximum amount of urls in a sitemap file
       *
       * @link http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd
       *
       * @var int
       */
      public $url_limit;

      /**
       * The maximum size of a sitemap file, in bytes
       *
       * @link http://www.sitemaps.org/faq.php#faq_sitemap_size
       *
       * @var int
       */
      public $size_limit;

      /**
       * The path to prepend to the filename (always ends with a directory separator)
       *
       * @var string
       */
      public $fspath;

      /**
       * The URL path to prepend to filenames in the index;
       * should resolve to the same directory as $fspath.
       *
       * @var string
       */
      public $urlpath;

      /**
       * Whether or not to use compression
       *
       * @var bool
       */
      public $compress;

      /**
       * Whether or not to leave redirect pages out of the sitemap
       *
       * @var bool
       */
      public $skipRedirects;

      /**
       * Byte lengths used for the file size check, filled by generateLimit():
       * [0] the sitemap file header, [1] an entry built from a maximum-length
       * title in the current namespace, [2] the sitemap file footer.
       *
       * @var array
       */
      public $limit = array();

      /**
       * Key => value entries of namespaces and their priorities
       *
       * @var array
       */
      public $priorities = array();

      /**
       * A one-dimensional array of namespaces in the wiki
       *
       * @var array
       */
      public $namespaces = array();

      /**
       * When this sitemap batch was generated (ISO 8601)
       *
       * @var string
       */
      public $timestamp;

      /**
       * A database slave object
       *
       * @var object
       */
      public $dbr;

      /**
       * A resource pointing to the sitemap index file
       *
       * @var resource
       */
      public $findex;

      /**
       * A resource pointing to the sitemap file currently being written,
       * or false when no file is open
       *
       * @var resource|bool
       */
      public $file;

      /**
       * Identifier to use in filenames, default wfWikiID()
       *
       * @var string
       */
      private $identifier;

      /**
       * Constructor: registers the command line options of the script.
       */
      public function __construct() {
          parent::__construct();
          $this->mDescription = "Creates a sitemap for the site";
          $this->addOption(
              'fspath',
              'The file system path to save to, e.g. /tmp/sitemap; defaults to current directory',
              false,
              true
          );
          $this->addOption(
              'urlpath',
              'The URL path corresponding to --fspath, prepended to filenames in the index; '
                  . 'defaults to an empty string',
              false,
              true
          );
          $this->addOption(
              'compress',
              'Compress the sitemap files, can take value yes|no, default yes',
              false,
              true
          );
          $this->addOption( 'skip-redirects', 'Do not include redirecting articles in the sitemap' );
          $this->addOption(
              'identifier',
              'What site identifier to use for the wiki, defaults to $wgDBname',
              false,
              true
          );
      }

      /**
       * Execute: read the options, open the index file and run main().
       *
       * @throws MWException If the sitemap index file can not be opened
       */
      public function execute() {
          $this->setNamespacePriorities();
          $this->url_limit = 50000;
          $this->size_limit = pow( 2, 20 ) * 10;
          $this->fspath = self::init_path( $this->getOption( 'fspath', getcwd() ) );
          $this->urlpath = $this->getOption( 'urlpath', "" );
          // A non-empty URL path always gets a trailing slash so filenames can be appended
          if ( $this->urlpath !== "" && substr( $this->urlpath, -1 ) !== '/' ) {
              $this->urlpath .= '/';
          }
          $this->identifier = $this->getOption( 'identifier', wfWikiID() );
          $this->compress = $this->getOption( 'compress', 'yes' ) !== 'no';
          $this->skipRedirects = $this->getOption( 'skip-redirects', false ) !== false;
          $this->dbr = wfGetDB( DB_SLAVE );
          $this->generateNamespaces();
          $this->timestamp = wfTimestamp( TS_ISO_8601, wfTimestampNow() );

          $indexPath = "{$this->fspath}sitemap-index-{$this->identifier}.xml";
          $this->findex = fopen( $indexPath, 'wb' );
          if ( $this->findex === false ) {
              // Fail the same way open() does instead of calling fwrite() on false later
              throw new MWException( __METHOD__
                  . " error opening file $indexPath with flags wb. Check permissions?" );
          }
          $this->main();
      }

      /**
       * Fill $this->priorities with the built-in defaults and then apply the
       * overrides from $wgSitemapNamespacesPriorities, clamped to 0.0 - 1.0.
       */
      private function setNamespacePriorities() {
          global $wgSitemapNamespacesPriorities;

          // Custom main namespaces
          $this->priorities[self::GS_MAIN] = '0.5';
          // Custom talk namespaces
          $this->priorities[self::GS_TALK] = '0.1';
          // MediaWiki standard namespaces
          $this->priorities[NS_MAIN] = '1.0';
          $this->priorities[NS_TALK] = '0.1';
          $this->priorities[NS_USER] = '0.5';
          $this->priorities[NS_USER_TALK] = '0.1';
          $this->priorities[NS_PROJECT] = '0.5';
          $this->priorities[NS_PROJECT_TALK] = '0.1';
          $this->priorities[NS_FILE] = '0.5';
          $this->priorities[NS_FILE_TALK] = '0.1';
          $this->priorities[NS_MEDIAWIKI] = '0.0';
          $this->priorities[NS_MEDIAWIKI_TALK] = '0.1';
          $this->priorities[NS_TEMPLATE] = '0.0';
          $this->priorities[NS_TEMPLATE_TALK] = '0.1';
          $this->priorities[NS_HELP] = '0.5';
          $this->priorities[NS_HELP_TALK] = '0.1';
          $this->priorities[NS_CATEGORY] = '0.5';
          $this->priorities[NS_CATEGORY_TALK] = '0.1';

          // Custom priorities; anything that is not an array (false, null, ...) means "none"
          if ( is_array( $wgSitemapNamespacesPriorities ) ) {
              foreach ( $wgSitemapNamespacesPriorities as $namespace => $priority ) {
                  $float = floatval( $priority );
                  if ( $float > 1.0 ) {
                      $priority = '1.0';
                  } elseif ( $float < 0.0 ) {
                      $priority = '0.0';
                  }
                  $this->priorities[$namespace] = $priority;
              }
          }
      }

      /**
       * Create directory if it does not exist and return pathname with a trailing slash.
       * Terminates the script if the directory can not be created or resolved.
       *
       * @param string $fspath
       * @return string
       */
      private static function init_path( $fspath ) {
          # Create directory if needed
          if ( $fspath && !is_dir( $fspath ) ) {
              wfMkdirParents( $fspath, null, __METHOD__ ) or die( "Can not create directory $fspath.\n" );
          }

          $resolved = realpath( $fspath );
          if ( $resolved === false ) {
              // false . DIRECTORY_SEPARATOR would silently point at the filesystem root
              die( "Can not resolve directory $fspath.\n" );
          }

          return $resolved . DIRECTORY_SEPARATOR;
      }

      /**
       * Generate a one-dimensional array of existing namespaces.
       *
       * Uses $wgSitemapNamespaces when it is an array, otherwise every
       * namespace that occurs in the page table.
       */
      public function generateNamespaces() {
          // Only generate for specific namespaces if $wgSitemapNamespaces is an array.
          global $wgSitemapNamespaces;
          if ( is_array( $wgSitemapNamespaces ) ) {
              $this->namespaces = $wgSitemapNamespaces;

              return;
          }

          $res = $this->dbr->select( 'page',
              array( 'page_namespace' ),
              array(),
              __METHOD__,
              array(
                  'GROUP BY' => 'page_namespace',
                  'ORDER BY' => 'page_namespace',
              )
          );

          foreach ( $res as $row ) {
              $this->namespaces[] = $row->page_namespace;
          }
      }

      /**
       * Get the priority of a given namespace
       *
       * @param int $namespace The namespace to get the priority for
       * @return string
       */
      public function priority( $namespace ) {
          return isset( $this->priorities[$namespace] )
              ? $this->priorities[$namespace]
              : $this->guessPriority( $namespace );
      }

      /**
       * If the namespace isn't listed on the priority list return the
       * default priority for the namespace, varies depending on whether it's
       * a talkpage or not.
       *
       * @param int $namespace The namespace to get the priority for
       * @return string
       */
      public function guessPriority( $namespace ) {
          return MWNamespace::isSubject( $namespace )
              ? $this->priorities[self::GS_MAIN]
              : $this->priorities[self::GS_TALK];
      }

      /**
       * Return a database result with all the pages in a given namespace
       *
       * @param int $namespace Limit the query to this namespace
       * @return Resource
       */
      public function getPageRes( $namespace ) {
          return $this->dbr->select( 'page',
              array(
                  'page_namespace',
                  'page_title',
                  'page_touched',
                  'page_is_redirect'
              ),
              array( 'page_namespace' => $namespace ),
              __METHOD__
          );
      }

      /**
       * Main loop: write the index file and the sitemap files of every namespace.
       *
       * A new sitemap file is started whenever the next page would push the
       * current file over the url limit or the size limit. All entries of one
       * page (the page itself plus its language variants) stay in the same file.
       */
      public function main() {
          global $wgContLang;

          fwrite( $this->findex, $this->openIndex() );

          // Variants other than the default one; each adds one <url> entry per page
          $extraVariants = array();
          if ( $wgContLang->hasVariants() ) {
              $defaultCode = $wgContLang->getCode();
              foreach ( $wgContLang->getVariants() as $vCode ) {
                  if ( $vCode != $defaultCode ) {
                      $extraVariants[] = $vCode;
                  }
              }
          }
          $urlsPerPage = count( $extraVariants ) + 1;

          foreach ( $this->namespaces as $namespace ) {
              $res = $this->getPageRes( $namespace );
              $this->file = false;
              $this->generateLimit( $namespace );
              $length = $this->limit[0];
              $urlCount = 0; // <url> entries written to the current file
              $smcount = 0; // sitemap files started for this namespace
              $priority = $this->priority( $namespace );

              $fns = $wgContLang->getFormattedNsText( $namespace );
              $this->output( "$namespace ($fns)\n" );
              $skippedRedirects = 0; // Number of redirects skipped for that namespace
              foreach ( $res as $row ) {
                  if ( $this->skipRedirects && $row->page_is_redirect ) {
                      $skippedRedirects++;
                      continue;
                  }

                  $title = Title::makeTitle( $row->page_namespace, $row->page_title );
                  $date = wfTimestamp( TS_ISO_8601, $row->page_touched );
                  $pageXml = $this->fileEntry( $title->getCanonicalURL(), $date, $priority );
                  // generate pages for language variants
                  foreach ( $extraVariants as $vCode ) {
                      $pageXml .= $this->fileEntry(
                          $title->getCanonicalURL( '', $vCode ),
                          $date,
                          $priority
                      );
                  }

                  // Reserve at least the worst-case size of one entry (the check used
                  // when there are no variants), or the real size when it is larger.
                  $needed = max( strlen( $pageXml ), $this->limit[1] );

                  if ( $this->file === false
                      || $urlCount + $urlsPerPage > $this->url_limit
                      || $length + $needed + $this->limit[2] > $this->size_limit
                  ) {
                      if ( $this->file !== false ) {
                          $this->write( $this->file, $this->closeFile() );
                          $this->close( $this->file );
                      }
                      $filename = $this->sitemapFilename( $namespace, $smcount++ );
                      $this->file = $this->open( $this->fspath . $filename, 'wb' );
                      $this->write( $this->file, $this->openFile() );
                      fwrite( $this->findex, $this->indexEntry( $filename ) );
                      $this->output( "\t$this->fspath$filename\n" );
                      $length = $this->limit[0];
                      $urlCount = 0;
                  }

                  $this->write( $this->file, $pageXml );
                  $length += strlen( $pageXml );
                  $urlCount += $urlsPerPage;
              }

              if ( $this->skipRedirects && $skippedRedirects > 0 ) {
                  $this->output( "  skipped $skippedRedirects redirect(s)\n" );
              }

              if ( $this->file ) {
                  $this->write( $this->file, $this->closeFile() );
                  $this->close( $this->file );
              }
          }
          fwrite( $this->findex, $this->closeIndex() );
          fclose( $this->findex );
      }

      /**
       * gzopen() / fopen() wrapper
       *
       * @param string $file
       * @param string $flags
       * @throws MWException If the file can not be opened
       * @return resource
       */
      public function open( $file, $flags ) {
          $resource = $this->compress ? gzopen( $file, $flags ) : fopen( $file, $flags );
          if ( $resource === false ) {
              throw new MWException( __METHOD__
                  . " error opening file $file with flags $flags. Check permissions?" );
          }

          return $resource;
      }

      /**
       * gzwrite() / fwrite() wrapper
       *
       * @param resource $handle
       * @param string $str
       * @throws MWException If $handle is a boolean instead of a resource
       */
      public function write( &$handle, $str ) {
          if ( $handle === true || $handle === false ) {
              throw new MWException( __METHOD__ . " was passed a boolean as a file handle.\n" );
          }
          if ( $this->compress ) {
              gzwrite( $handle, $str );
          } else {
              fwrite( $handle, $str );
          }
      }

      /**
       * gzclose() / fclose() wrapper
       *
       * @param resource $handle
       */
      public function close( &$handle ) {
          if ( $this->compress ) {
              gzclose( $handle );
          } else {
              fclose( $handle );
          }
      }

      /**
       * Get a sitemap filename
       *
       * @param int $namespace The namespace
       * @param int $count The count
       * @return string
       */
      public function sitemapFilename( $namespace, $count ) {
          $ext = $this->compress ? '.gz' : '';

          return "sitemap-{$this->identifier}-NS_$namespace-$count.xml$ext";
      }

      /**
       * Return the XML required to open an XML file
       *
       * @return string
       */
      public function xmlHead() {
          return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
      }

      /**
       * Return the XML schema being used
       *
       * @return string
       */
      public function xmlSchema() {
          return 'http://www.sitemaps.org/schemas/sitemap/0.9';
      }

      /**
       * Return the XML required to open a sitemap index file
       *
       * @return string
       */
      public function openIndex() {
          return $this->xmlHead() . '<sitemapindex xmlns="' . $this->xmlSchema() . '">' . "\n";
      }

      /**
       * Return the XML for a single sitemap indexfile entry
       *
       * @param string $filename The filename of the sitemap file
       * @return string
       */
      public function indexEntry( $filename ) {
          return
              "\t<sitemap>\n" .
              // Escaped like the <loc> in fileEntry(): --urlpath and --identifier
              // are free-form and may contain ampersands or angle brackets.
              "\t\t<loc>" . htmlspecialchars( $this->urlpath . $filename ) . "</loc>\n" .
              "\t\t<lastmod>{$this->timestamp}</lastmod>\n" .
              "\t</sitemap>\n";
      }

      /**
       * Return the XML required to close a sitemap index file
       *
       * @return string
       */
      public function closeIndex() {
          return "</sitemapindex>\n";
      }

      /**
       * Return the XML required to open a sitemap file
       *
       * @return string
       */
      public function openFile() {
          return $this->xmlHead() . '<urlset xmlns="' . $this->xmlSchema() . '">' . "\n";
      }

      /**
       * Return the XML for a single sitemap entry
       *
       * @param string $url An RFC 2396 compliant URL
       * @param string $date A ISO 8601 date
       * @param string $priority A priority indicator, 0.0 - 1.0 inclusive with a 0.1 stepsize
       * @return string
       */
      public function fileEntry( $url, $date, $priority ) {
          return
              "\t<url>\n" .
              // bug 34666: $url may contain bad characters such as ampersands.
              "\t\t<loc>" . htmlspecialchars( $url ) . "</loc>\n" .
              "\t\t<lastmod>$date</lastmod>\n" .
              "\t\t<priority>$priority</priority>\n" .
              "\t</url>\n";
      }

      /**
       * Return the XML required to close sitemap file
       *
       * @return string
       */
      public function closeFile() {
          return "</urlset>\n";
      }

      /**
       * Populate $this->limit with the byte lengths of the file header, of an
       * entry for a maximum-length title in the given namespace, and of the
       * file footer.
       *
       * @param int $namespace
       */
      public function generateLimit( $namespace ) {
          // bug 17961: make a title with the longest possible URL in this namespace
          $title = Title::makeTitle( $namespace, str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83" );

          $this->limit = array(
              strlen( $this->openFile() ),
              strlen( $this->fileEntry(
                  $title->getCanonicalURL(),
                  wfTimestamp( TS_ISO_8601, wfTimestamp() ),
                  $this->priority( $namespace )
              ) ),
              strlen( $this->closeFile() )
          );
      }
  }
 569  
 570  $maintClass = "GenerateSitemap"; // name of the class declared above; presumably read by the file included on the next line — confirm in Maintenance.php
 571  require_once RUN_MAINTENANCE_IF_MAIN; // RUN_MAINTENANCE_IF_MAIN is not defined in this file (presumably by Maintenance.php, required at the top)


Generated: Fri Nov 28 14:03:12 2014 Cross-referenced by PHPXref 0.7.1