MediaWiki
REL1_24
|
<?php
/**
 * Maintenance script that creates a sitemap for the site: one sitemap index
 * file plus one or more sitemap files per namespace.
 */

require_once __DIR__ . '/Maintenance.php';

/**
 * Maintenance script that generates a sitemap index and per-namespace
 * sitemap files (optionally gzip-compressed) for the wiki.
 */
class GenerateSitemap extends Maintenance {
	/** Key in $priorities holding the default priority for custom subject namespaces */
	const GS_MAIN = -2;

	/** Key in $priorities holding the default priority for custom talk namespaces */
	const GS_TALK = -1;

	/**
	 * Maximum number of URLs written to a single sitemap file
	 *
	 * @var int
	 */
	public $url_limit;

	/**
	 * Maximum size, in bytes, of a single (uncompressed) sitemap file
	 *
	 * @var int
	 */
	public $size_limit;

	/**
	 * File system directory the files are written to, with a trailing separator
	 *
	 * @var string
	 */
	public $fspath;

	/**
	 * URL path prepended to sitemap filenames in the index; either empty or
	 * ending in a slash
	 *
	 * @var string
	 */
	public $urlpath;

	/**
	 * Whether the sitemap files are written through the gz* functions
	 *
	 * @var bool
	 */
	public $compress;

	/**
	 * Whether redirect pages are left out of the sitemap
	 *
	 * @var bool
	 */
	public $skipRedirects;

	/**
	 * Byte lengths computed by generateLimit():
	 * [0] file header, [1] longest expected URL entry, [2] file footer
	 *
	 * @var array
	 */
	public $limit = array();

	/**
	 * Map of namespace id => priority string
	 *
	 * @var array
	 */
	public $priorities = array();

	/**
	 * Namespace ids for which sitemap files are generated
	 *
	 * @var array
	 */
	public $namespaces = array();

	/**
	 * ISO 8601 timestamp taken when the script runs; used as <lastmod> in the index
	 *
	 * @var string
	 */
	public $timestamp;

	/**
	 * Database connection obtained via wfGetDB( DB_SLAVE )
	 *
	 * @var object
	 */
	public $dbr;

	/**
	 * Handle of the sitemap index file
	 *
	 * @var resource
	 */
	public $findex;

	/**
	 * Handle of the sitemap file currently being written, or false if none
	 *
	 * @var resource|bool
	 */
	public $file;

	/**
	 * Site identifier used in the generated filenames
	 *
	 * @var string
	 */
	private $identifier;

	/**
	 * Register the script description and its command line options.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = "Creates a sitemap for the site";
		$this->addOption(
			'fspath',
			'The file system path to save to, e.g. /tmp/sitemap; defaults to current directory',
			false,
			true
		);
		$this->addOption(
			'urlpath',
			'The URL path corresponding to --fspath, prepended to filenames in the index; '
				. 'defaults to an empty string',
			false,
			true
		);
		$this->addOption(
			'compress',
			'Compress the sitemap files, can take value yes|no, default yes',
			false,
			true
		);
		$this->addOption( 'skip-redirects', 'Do not include redirecting articles in the sitemap' );
		$this->addOption(
			'identifier',
			'What site identifier to use for the wiki, defaults to $wgDBname',
			false,
			true
		);
	}

	/**
	 * Read the options, open the index file and generate all sitemap files.
	 *
	 * @throws MWException If the index file or a sitemap file cannot be opened
	 */
	public function execute() {
		$this->setNamespacePriorities();
		$this->url_limit = 50000;
		$this->size_limit = pow( 2, 20 ) * 10;
		$this->fspath = self::init_path( $this->getOption( 'fspath', getcwd() ) );
		$this->urlpath = $this->getOption( 'urlpath', "" );
		// Make sure a non-empty URL path ends in a slash so filenames can be appended
		if ( $this->urlpath !== "" && substr( $this->urlpath, -1 ) !== '/' ) {
			$this->urlpath .= '/';
		}
		$this->identifier = $this->getOption( 'identifier', wfWikiID() );
		$this->compress = $this->getOption( 'compress', 'yes' ) !== 'no';
		$this->skipRedirects = $this->getOption( 'skip-redirects', false ) !== false;
		$this->dbr = wfGetDB( DB_SLAVE );
		$this->generateNamespaces();
		$this->timestamp = wfTimestamp( TS_ISO_8601, wfTimestampNow() );

		$indexPath = "{$this->fspath}sitemap-index-{$this->identifier}.xml";
		$this->findex = fopen( $indexPath, 'wb' );
		// Fail early instead of passing false to fwrite()/fclose() later on
		if ( $this->findex === false ) {
			throw new MWException( __METHOD__
				. " error opening index file $indexPath. Check permissions?" );
		}

		$this->main();
	}

	/**
	 * Fill $this->priorities with the built-in defaults, then apply the
	 * overrides from $wgSitemapNamespacesPriorities (clamped to 0.0 - 1.0).
	 */
	private function setNamespacePriorities() {
		global $wgSitemapNamespacesPriorities;

		// Custom main namespaces
		$this->priorities[self::GS_MAIN] = '0.5';
		// Custom talk namespaces
		$this->priorities[self::GS_TALK] = '0.1';
		// MediaWiki standard namespaces
		$this->priorities[NS_MAIN] = '1.0';
		$this->priorities[NS_TALK] = '0.1';
		$this->priorities[NS_USER] = '0.5';
		$this->priorities[NS_USER_TALK] = '0.1';
		$this->priorities[NS_PROJECT] = '0.5';
		$this->priorities[NS_PROJECT_TALK] = '0.1';
		$this->priorities[NS_FILE] = '0.5';
		$this->priorities[NS_FILE_TALK] = '0.1';
		$this->priorities[NS_MEDIAWIKI] = '0.0';
		$this->priorities[NS_MEDIAWIKI_TALK] = '0.1';
		$this->priorities[NS_TEMPLATE] = '0.0';
		$this->priorities[NS_TEMPLATE_TALK] = '0.1';
		$this->priorities[NS_HELP] = '0.5';
		$this->priorities[NS_HELP_TALK] = '0.1';
		$this->priorities[NS_CATEGORY] = '0.5';
		$this->priorities[NS_CATEGORY_TALK] = '0.1';

		// Custom priorities
		if ( $wgSitemapNamespacesPriorities !== false ) {
			foreach ( $wgSitemapNamespacesPriorities as $namespace => $priority ) {
				$float = floatval( $priority );
				if ( $float > 1.0 ) {
					$priority = '1.0';
				} elseif ( $float < 0.0 ) {
					$priority = '0.0';
				}
				$this->priorities[$namespace] = $priority;
			}
		}
	}

	/**
	 * Create the target directory if needed and return its resolved path
	 * with a trailing directory separator.
	 *
	 * @param string $fspath Directory to write the sitemap files to
	 * @return string
	 */
	private static function init_path( $fspath ) {
		# Create directory if needed
		if ( $fspath && !is_dir( $fspath ) ) {
			wfMkdirParents( $fspath, null, __METHOD__ ) or die( "Can not create directory $fspath.\n" );
		}

		return realpath( $fspath ) . DIRECTORY_SEPARATOR;
	}

	/**
	 * Populate $this->namespaces, either from $wgSitemapNamespaces or with
	 * every namespace that occurs in the page table.
	 */
	public function generateNamespaces() {
		// Only generate for specific namespaces if $wgSitemapNamespaces is an array.
		global $wgSitemapNamespaces;
		if ( is_array( $wgSitemapNamespaces ) ) {
			$this->namespaces = $wgSitemapNamespaces;

			return;
		}

		$res = $this->dbr->select( 'page',
			array( 'page_namespace' ),
			array(),
			__METHOD__,
			array(
				'GROUP BY' => 'page_namespace',
				'ORDER BY' => 'page_namespace',
			)
		);

		foreach ( $res as $row ) {
			$this->namespaces[] = $row->page_namespace;
		}
	}

	/**
	 * Get the priority configured for a namespace, falling back to
	 * guessPriority() when none is set.
	 *
	 * @param int $namespace Namespace id
	 * @return string
	 */
	public function priority( $namespace ) {
		return isset( $this->priorities[$namespace] )
			? $this->priorities[$namespace]
			: $this->guessPriority( $namespace );
	}

	/**
	 * Pick a default priority for a namespace without an explicit one: the
	 * GS_MAIN default when MWNamespace::isSubject() says it is a subject
	 * namespace, the GS_TALK default otherwise.
	 *
	 * @param int $namespace Namespace id
	 * @return string
	 */
	public function guessPriority( $namespace ) {
		return MWNamespace::isSubject( $namespace )
			? $this->priorities[self::GS_MAIN]
			: $this->priorities[self::GS_TALK];
	}

	/**
	 * Select namespace, title, touched timestamp and redirect flag of every
	 * page in the given namespace.
	 *
	 * @param int $namespace Namespace id
	 * @return mixed Result of the select() call on $this->dbr
	 */
	public function getPageRes( $namespace ) {
		return $this->dbr->select( 'page',
			array(
				'page_namespace',
				'page_title',
				'page_touched',
				'page_is_redirect'
			),
			array( 'page_namespace' => $namespace ),
			__METHOD__
		);
	}

	/**
	 * Write the sitemap index and, for each namespace, as many sitemap files
	 * as needed to stay within $url_limit and $size_limit.
	 *
	 * Every URL written (including the extra URLs for non-default language
	 * variants) counts towards the per-file limits.
	 */
	public function main() {
		global $wgContLang;

		// Variants other than the default one; each adds one more URL per page.
		$variantCodes = array();
		if ( $wgContLang->hasVariants() ) {
			$defaultCode = $wgContLang->getCode();
			foreach ( $wgContLang->getVariants() as $vCode ) {
				if ( $vCode != $defaultCode ) {
					$variantCodes[] = $vCode;
				}
			}
		}
		$urlsPerPage = 1 + count( $variantCodes );

		fwrite( $this->findex, $this->openIndex() );

		foreach ( $this->namespaces as $namespace ) {
			$res = $this->getPageRes( $namespace );
			$this->file = false;
			$this->generateLimit( $namespace );
			$length = $this->limit[0];
			$urlCount = 0; // URLs written to the current file
			$smcount = 0; // Files started for this namespace
			$priority = $this->priority( $namespace );

			$fns = $wgContLang->getFormattedNsText( $namespace );
			$this->output( "$namespace ($fns)\n" );
			$skippedRedirects = 0; // Number of redirects skipped for that namespace
			foreach ( $res as $row ) {
				if ( $this->skipRedirects && $row->page_is_redirect ) {
					$skippedRedirects++;
					continue;
				}

				// Start a new file for the first page, or when adding all URLs of
				// this page would exceed the URL count or the size limit.
				// NOTE(review): limit[1] is measured on a non-variant URL, so the size
				// estimate for variant URLs is approximate.
				if ( $this->file === false
					|| $urlCount + $urlsPerPage > $this->url_limit
					|| $length + $urlsPerPage * $this->limit[1] + $this->limit[2] > $this->size_limit
				) {
					if ( $this->file !== false ) {
						$this->write( $this->file, $this->closeFile() );
						$this->close( $this->file );
					}
					$filename = $this->sitemapFilename( $namespace, $smcount++ );
					$this->file = $this->open( $this->fspath . $filename, 'wb' );
					$this->write( $this->file, $this->openFile() );
					fwrite( $this->findex, $this->indexEntry( $filename ) );
					$this->output( "\t$this->fspath$filename\n" );
					$length = $this->limit[0];
					$urlCount = 0;
				}

				$title = Title::makeTitle( $row->page_namespace, $row->page_title );
				$date = wfTimestamp( TS_ISO_8601, $row->page_touched );

				// The page itself, followed by one URL per non-default language variant
				$urls = array( $title->getCanonicalURL() );
				foreach ( $variantCodes as $vCode ) {
					$urls[] = $title->getCanonicalURL( '', $vCode );
				}

				foreach ( $urls as $url ) {
					$entry = $this->fileEntry( $url, $date, $priority );
					$length += strlen( $entry );
					$this->write( $this->file, $entry );
					$urlCount++;
				}
			}

			if ( $this->skipRedirects && $skippedRedirects > 0 ) {
				$this->output( " skipped $skippedRedirects redirect(s)\n" );
			}

			if ( $this->file ) {
				$this->write( $this->file, $this->closeFile() );
				$this->close( $this->file );
			}
		}
		fwrite( $this->findex, $this->closeIndex() );
		fclose( $this->findex );
	}

	/**
	 * Open a sitemap file with gzopen() or fopen(), depending on $this->compress.
	 *
	 * @param string $file Path of the file to open
	 * @param string $flags Mode passed to gzopen()/fopen()
	 * @return resource
	 * @throws MWException If the file cannot be opened
	 */
	public function open( $file, $flags ) {
		$resource = $this->compress ? gzopen( $file, $flags ) : fopen( $file, $flags );
		if ( $resource === false ) {
			throw new MWException( __METHOD__
				. " error opening file $file with flags $flags. Check permissions?" );
		}

		return $resource;
	}

	/**
	 * Write a string to a handle with gzwrite() or fwrite(), depending on
	 * $this->compress.
	 *
	 * @param resource $handle Handle returned by open()
	 * @param string $str Data to write
	 * @throws MWException If a boolean is passed instead of a handle
	 */
	public function write( &$handle, $str ) {
		if ( $handle === true || $handle === false ) {
			throw new MWException( __METHOD__ . " was passed a boolean as a file handle.\n" );
		}
		if ( $this->compress ) {
			gzwrite( $handle, $str );
		} else {
			fwrite( $handle, $str );
		}
	}

	/**
	 * Close a handle with gzclose() or fclose(), depending on $this->compress.
	 *
	 * @param resource $handle Handle returned by open()
	 */
	public function close( &$handle ) {
		if ( $this->compress ) {
			gzclose( $handle );
		} else {
			fclose( $handle );
		}
	}

	/**
	 * Build the filename of a sitemap file.
	 *
	 * @param int $namespace Namespace id
	 * @param int $count Sequence number of the file within the namespace
	 * @return string
	 */
	public function sitemapFilename( $namespace, $count ) {
		$ext = $this->compress ? '.gz' : '';

		return "sitemap-{$this->identifier}-NS_$namespace-$count.xml$ext";
	}

	/**
	 * Return the XML declaration line.
	 *
	 * @return string
	 */
	public function xmlHead() {
		return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
	}

	/**
	 * Return the XML namespace URI used for the sitemap documents.
	 *
	 * @return string
	 */
	public function xmlSchema() {
		return 'http://www.sitemaps.org/schemas/sitemap/0.9';
	}

	/**
	 * Return the opening of the sitemap index document.
	 *
	 * @return string
	 */
	public function openIndex() {
		return $this->xmlHead() . '<sitemapindex xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the index entry for one sitemap file.
	 *
	 * @param string $filename Filename of the sitemap file
	 * @return string
	 */
	public function indexEntry( $filename ) {
		return
			"\t<sitemap>\n" .
			// Escape like fileEntry() does: the URL path may contain characters
			// such as ampersands that are not allowed verbatim in XML.
			"\t\t<loc>" . htmlspecialchars( $this->urlpath . $filename ) . "</loc>\n" .
			"\t\t<lastmod>{$this->timestamp}</lastmod>\n" .
			"\t</sitemap>\n";
	}

	/**
	 * Return the closing tag of the sitemap index document.
	 *
	 * @return string
	 */
	public function closeIndex() {
		return "</sitemapindex>\n";
	}

	/**
	 * Return the opening of a sitemap file.
	 *
	 * @return string
	 */
	public function openFile() {
		return $this->xmlHead() . '<urlset xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the <url> entry for a single URL.
	 *
	 * @param string $url URL of the page; escaped before it is written
	 * @param string $date Value for <lastmod>
	 * @param string $priority Value for <priority>
	 * @return string
	 */
	public function fileEntry( $url, $date, $priority ) {
		return
			"\t<url>\n" .
			// bug 34666: $url may contain bad characters such as ampersands.
			"\t\t<loc>" . htmlspecialchars( $url ) . "</loc>\n" .
			"\t\t<lastmod>$date</lastmod>\n" .
			"\t\t<priority>$priority</priority>\n" .
			"\t</url>\n";
	}

	/**
	 * Return the closing tag of a sitemap file.
	 *
	 * @return string
	 */
	public function closeFile() {
		return "</urlset>\n";
	}

	/**
	 * Fill $this->limit with the byte lengths of the file header, of an entry
	 * for a deliberately long title in the given namespace, and of the file
	 * footer.
	 *
	 * @param int $namespace Namespace id
	 */
	public function generateLimit( $namespace ) {
		// bug 17961: make a title with the longest possible URL in this namespace
		$title = Title::makeTitle( $namespace, str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83" );

		$this->limit = array(
			strlen( $this->openFile() ),
			strlen( $this->fileEntry(
				$title->getCanonicalURL(),
				wfTimestamp( TS_ISO_8601, wfTimestamp() ),
				$this->priority( $namespace )
			) ),
			strlen( $this->closeFile() )
		);
	}
}

$maintClass = "GenerateSitemap";
require_once RUN_MAINTENANCE_IF_MAIN;