MediaWiki
REL1_23
|
<?php

require_once __DIR__ . '/Maintenance.php';

/**
 * Maintenance script that writes a sitemap index file plus one or more
 * sitemap files per namespace into a target directory.
 */
class GenerateSitemap extends Maintenance {
	/** Key in $priorities holding the fallback priority for subject namespaces. */
	const GS_MAIN = -2;
	/** Key in $priorities holding the fallback priority for talk namespaces. */
	const GS_TALK = -1;

	/**
	 * Maximum number of URLs written to a single sitemap file.
	 *
	 * @var int
	 */
	public $url_limit;

	/**
	 * Maximum (uncompressed) size in bytes of a single sitemap file.
	 *
	 * @var int
	 */
	public $size_limit;

	/**
	 * Directory the files are written to, including a trailing separator.
	 *
	 * @var string|null
	 */
	public $fspath;

	/**
	 * URL prefix prepended to the sitemap filenames listed in the index.
	 *
	 * @var string
	 */
	public $urlpath;

	/**
	 * Whether sitemap files are written through the gz* functions.
	 *
	 * @var bool
	 */
	public $compress;

	/**
	 * Whether rows flagged page_is_redirect are left out.
	 *
	 * @var bool
	 */
	public $skipRedirects;

	/**
	 * Byte lengths computed by generateLimit():
	 * [0] file header, [1] longest expected single entry, [2] file footer.
	 *
	 * @var array
	 */
	public $limit = array();

	/**
	 * Map of namespace id (or GS_MAIN / GS_TALK) to sitemap priority.
	 *
	 * @var array
	 */
	public $priorities = array();

	/**
	 * Namespace ids to generate sitemaps for.
	 *
	 * @var array
	 */
	public $namespaces = array();

	/**
	 * ISO 8601 timestamp used as <lastmod> for every index entry.
	 *
	 * @var string
	 */
	public $timestamp;

	/**
	 * Database connection obtained from wfGetDB( DB_SLAVE ).
	 *
	 * @var object
	 */
	public $dbr;

	/**
	 * Handle of the sitemap index file.
	 *
	 * @var resource
	 */
	public $findex;

	/**
	 * Handle of the sitemap file currently being written, or false if none.
	 *
	 * @var resource|bool
	 */
	public $file;

	/**
	 * Site identifier embedded in the generated file names.
	 *
	 * @var string
	 */
	private $identifier;

	/**
	 * Register the script description and its command line options.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = "Creates a sitemap for the site";
		$this->addOption( 'fspath', 'The file system path to save to, e.g. /tmp/sitemap; defaults to current directory', false, true );
		$this->addOption( 'urlpath', 'The URL path corresponding to --fspath, prepended to filenames in the index; defaults to an empty string', false, true );
		$this->addOption( 'compress', 'Compress the sitemap files, can take value yes|no, default yes', false, true );
		$this->addOption( 'skip-redirects', 'Do not include redirecting articles in the sitemap' );
		$this->addOption( 'identifier', 'What site identifier to use for the wiki, defaults to $wgDBname', false, true );
	}

	/**
	 * Read the options, set up limits and handles, then run main().
	 *
	 * @throws MWException If the sitemap index file cannot be opened
	 */
	public function execute() {
		$this->setNamespacePriorities();
		$this->url_limit = 50000;
		$this->size_limit = pow( 2, 20 ) * 10;
		$this->fspath = self::init_path( $this->getOption( 'fspath', getcwd() ) );
		$this->urlpath = $this->getOption( 'urlpath', "" );
		// A non-empty URL path always ends with a slash so filenames can be appended.
		if ( $this->urlpath !== "" && substr( $this->urlpath, -1 ) !== '/' ) {
			$this->urlpath .= '/';
		}
		$this->identifier = $this->getOption( 'identifier', wfWikiID() );
		$this->compress = $this->getOption( 'compress', 'yes' ) !== 'no';
		$this->skipRedirects = $this->getOption( 'skip-redirects', false ) !== false;
		$this->dbr = wfGetDB( DB_SLAVE );
		$this->generateNamespaces();
		$this->timestamp = wfTimestamp( TS_ISO_8601, wfTimestampNow() );

		$indexPath = "{$this->fspath}sitemap-index-{$this->identifier}.xml";
		$this->findex = fopen( $indexPath, 'wb' );
		// fopen() returns false on failure; stop here instead of handing
		// false to fwrite() later on.
		if ( $this->findex === false ) {
			throw new MWException( __METHOD__ . " error opening index file $indexPath. Check permissions?" );
		}
		$this->main();
	}

	/**
	 * Fill $this->priorities with the built-in defaults, then apply the
	 * overrides from $wgSitemapNamespacesPriorities, clamped to 0.0 - 1.0.
	 */
	private function setNamespacePriorities() {
		global $wgSitemapNamespacesPriorities;

		// Custom main namespaces
		$this->priorities[self::GS_MAIN] = '0.5';
		// Custom talk namespaces
		$this->priorities[self::GS_TALK] = '0.1';
		// MediaWiki standard namespaces
		$this->priorities[NS_MAIN] = '1.0';
		$this->priorities[NS_TALK] = '0.1';
		$this->priorities[NS_USER] = '0.5';
		$this->priorities[NS_USER_TALK] = '0.1';
		$this->priorities[NS_PROJECT] = '0.5';
		$this->priorities[NS_PROJECT_TALK] = '0.1';
		$this->priorities[NS_FILE] = '0.5';
		$this->priorities[NS_FILE_TALK] = '0.1';
		$this->priorities[NS_MEDIAWIKI] = '0.0';
		$this->priorities[NS_MEDIAWIKI_TALK] = '0.1';
		$this->priorities[NS_TEMPLATE] = '0.0';
		$this->priorities[NS_TEMPLATE_TALK] = '0.1';
		$this->priorities[NS_HELP] = '0.5';
		$this->priorities[NS_HELP_TALK] = '0.1';
		$this->priorities[NS_CATEGORY] = '0.5';
		$this->priorities[NS_CATEGORY_TALK] = '0.1';

		// Custom priorities. Only iterate when the setting really is an
		// array, so that null or a stray scalar cannot reach foreach.
		if ( is_array( $wgSitemapNamespacesPriorities ) ) {
			foreach ( $wgSitemapNamespacesPriorities as $namespace => $priority ) {
				$float = floatval( $priority );
				if ( $float > 1.0 ) {
					$priority = '1.0';
				} elseif ( $float < 0.0 ) {
					$priority = '0.0';
				}
				$this->priorities[$namespace] = $priority;
			}
		}
	}

	/**
	 * Create the target directory if needed and return its resolved path
	 * with a trailing directory separator.
	 *
	 * @param string $fspath Directory to write to
	 * @return string|null Resolved path, or null when $fspath is not set
	 * @throws MWException If the path cannot be resolved
	 */
	private static function init_path( $fspath ) {
		if ( !isset( $fspath ) ) {
			return null;
		}
		# Create directory if needed
		if ( $fspath && !is_dir( $fspath ) ) {
			wfMkdirParents( $fspath, null, __METHOD__ ) or die( "Can not create directory $fspath.\n" );
		}

		$resolved = realpath( $fspath );
		// realpath() returns false on failure; concatenating that with the
		// separator would silently yield just the separator.
		if ( $resolved === false ) {
			throw new MWException( __METHOD__ . " could not resolve path $fspath. Check permissions?" );
		}

		return $resolved . DIRECTORY_SEPARATOR;
	}

	/**
	 * Populate $this->namespaces, either from $wgSitemapNamespaces or from
	 * the distinct page_namespace values found in the page table.
	 */
	function generateNamespaces() {
		// Only generate for specific namespaces if $wgSitemapNamespaces is an array.
		global $wgSitemapNamespaces;
		if ( is_array( $wgSitemapNamespaces ) ) {
			$this->namespaces = $wgSitemapNamespaces;
			return;
		}

		$res = $this->dbr->select( 'page',
			array( 'page_namespace' ),
			array(),
			__METHOD__,
			array(
				'GROUP BY' => 'page_namespace',
				'ORDER BY' => 'page_namespace',
			)
		);

		foreach ( $res as $row ) {
			$this->namespaces[] = $row->page_namespace;
		}
	}

	/**
	 * Get the priority of a namespace, falling back to guessPriority()
	 * when no explicit entry exists in $this->priorities.
	 *
	 * @param int $namespace Namespace id
	 * @return string
	 */
	function priority( $namespace ) {
		return isset( $this->priorities[$namespace] )
			? $this->priorities[$namespace]
			: $this->guessPriority( $namespace );
	}

	/**
	 * Pick the GS_MAIN priority when MWNamespace::isSubject() accepts the
	 * namespace, and the GS_TALK priority otherwise.
	 *
	 * @param int $namespace Namespace id
	 * @return string
	 */
	function guessPriority( $namespace ) {
		return MWNamespace::isSubject( $namespace )
			? $this->priorities[self::GS_MAIN]
			: $this->priorities[self::GS_TALK];
	}

	/**
	 * Select namespace, title, touched timestamp and redirect flag of every
	 * page row in the given namespace.
	 *
	 * @param int $namespace Namespace id
	 * @return mixed Result of the select() call, iterated by main()
	 */
	function getPageRes( $namespace ) {
		return $this->dbr->select( 'page',
			array(
				'page_namespace',
				'page_title',
				'page_touched',
				'page_is_redirect'
			),
			array( 'page_namespace' => $namespace ),
			__METHOD__
		);
	}

	/**
	 * Write the sitemap index and the per-namespace sitemap files.
	 *
	 * All entries belonging to one page (its canonical URL plus one URL per
	 * non-default language variant) are built first, so that the rollover
	 * decision is based on the real number of URLs and bytes about to be
	 * written rather than on a single-entry estimate.
	 */
	public function main() {
		global $wgContLang;

		fwrite( $this->findex, $this->openIndex() );

		// The variant list does not depend on the row, so compute it once.
		$variantCodes = array();
		if ( $wgContLang->hasVariants() ) {
			$defaultCode = $wgContLang->getCode();
			foreach ( $wgContLang->getVariants() as $vCode ) {
				if ( $vCode !== $defaultCode ) { // we don't want default variant
					$variantCodes[] = $vCode;
				}
			}
		}

		foreach ( $this->namespaces as $namespace ) {
			$res = $this->getPageRes( $namespace );
			$this->file = false;
			$this->generateLimit( $namespace );
			$length = $this->limit[0];
			$urlCount = $smcount = 0;

			$fns = $wgContLang->getFormattedNsText( $namespace );
			$this->output( "$namespace ($fns)\n" );
			$skippedRedirects = 0; // Number of redirects skipped for that namespace
			$priority = $this->priority( $namespace );

			foreach ( $res as $row ) {
				if ( $this->skipRedirects && $row->page_is_redirect ) {
					$skippedRedirects++;
					continue;
				}

				$title = Title::makeTitle( $row->page_namespace, $row->page_title );
				$date = wfTimestamp( TS_ISO_8601, $row->page_touched );

				// Build every entry for this page before touching the file.
				$entries = array( $this->fileEntry( $title->getCanonicalURL(), $date, $priority ) );
				foreach ( $variantCodes as $vCode ) {
					$entries[] = $this->fileEntry( $title->getCanonicalURL( '', $vCode ), $date, $priority );
				}
				$chunk = implode( '', $entries );
				$chunkLength = strlen( $chunk );
				$chunkUrls = count( $entries );

				// Start a new file when none is open yet, or when adding this
				// page would exceed the URL limit or (including the closing
				// tag, limit[2]) the size limit.
				if ( $this->file === false
					|| $urlCount + $chunkUrls > $this->url_limit
					|| $length + $chunkLength + $this->limit[2] > $this->size_limit
				) {
					if ( $this->file !== false ) {
						$this->write( $this->file, $this->closeFile() );
						$this->close( $this->file );
					}
					$filename = $this->sitemapFilename( $namespace, $smcount++ );
					$this->file = $this->open( $this->fspath . $filename, 'wb' );
					$this->write( $this->file, $this->openFile() );
					fwrite( $this->findex, $this->indexEntry( $filename ) );
					$this->output( "\t$this->fspath$filename\n" );
					$length = $this->limit[0];
					$urlCount = 0;
				}

				$this->write( $this->file, $chunk );
				$length += $chunkLength;
				$urlCount += $chunkUrls;
			}

			if ( $this->skipRedirects && $skippedRedirects > 0 ) {
				$this->output( " skipped $skippedRedirects redirect(s)\n" );
			}

			if ( $this->file !== false ) {
				$this->write( $this->file, $this->closeFile() );
				$this->close( $this->file );
			}
		}
		fwrite( $this->findex, $this->closeIndex() );
		fclose( $this->findex );
	}

	/**
	 * Open a file with gzopen() or fopen(), depending on $this->compress.
	 *
	 * @param string $file Path of the file
	 * @param string $flags Mode passed to gzopen()/fopen()
	 * @return resource
	 * @throws MWException If the file cannot be opened
	 */
	function open( $file, $flags ) {
		$resource = $this->compress ? gzopen( $file, $flags ) : fopen( $file, $flags );
		if ( $resource === false ) {
			throw new MWException( __METHOD__ . " error opening file $file with flags $flags. Check permissions?" );
		}
		return $resource;
	}

	/**
	 * Write a string with gzwrite() or fwrite(), depending on $this->compress.
	 *
	 * @param resource $handle Handle returned by open()
	 * @param string $str Data to write
	 * @throws MWException If a boolean is passed instead of a handle
	 */
	function write( &$handle, $str ) {
		if ( $handle === true || $handle === false ) {
			throw new MWException( __METHOD__ . " was passed a boolean as a file handle.\n" );
		}
		if ( $this->compress ) {
			gzwrite( $handle, $str );
		} else {
			fwrite( $handle, $str );
		}
	}

	/**
	 * Close a handle with gzclose() or fclose(), depending on $this->compress.
	 *
	 * @param resource $handle Handle returned by open()
	 */
	function close( &$handle ) {
		if ( $this->compress ) {
			gzclose( $handle );
		} else {
			fclose( $handle );
		}
	}

	/**
	 * Build the name of a sitemap file.
	 *
	 * @param int $namespace Namespace id
	 * @param int $count Running number of the file within the namespace
	 * @return string
	 */
	function sitemapFilename( $namespace, $count ) {
		$ext = $this->compress ? '.gz' : '';
		return "sitemap-{$this->identifier}-NS_$namespace-$count.xml$ext";
	}

	/**
	 * Return the XML declaration line.
	 *
	 * @return string
	 */
	function xmlHead() {
		return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
	}

	/**
	 * Return the XML namespace URI used for the root elements.
	 *
	 * @return string
	 */
	function xmlSchema() {
		return 'http://www.sitemaps.org/schemas/sitemap/0.9';
	}

	/**
	 * Return the opening part of the sitemap index file.
	 *
	 * @return string
	 */
	function openIndex() {
		return $this->xmlHead() . '<sitemapindex xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the <sitemap> element for one sitemap file in the index.
	 *
	 * @param string $filename Name of the sitemap file
	 * @return string
	 */
	function indexEntry( $filename ) {
		return
			"\t<sitemap>\n" .
			// Escape like fileEntry() does: the URL path is supplied on the
			// command line and may contain ampersands.
			"\t\t<loc>" . htmlspecialchars( "{$this->urlpath}$filename" ) . "</loc>\n" .
			"\t\t<lastmod>{$this->timestamp}</lastmod>\n" .
			"\t</sitemap>\n";
	}

	/**
	 * Return the closing part of the sitemap index file.
	 *
	 * @return string
	 */
	function closeIndex() {
		return "</sitemapindex>\n";
	}

	/**
	 * Return the opening part of a sitemap file.
	 *
	 * @return string
	 */
	function openFile() {
		return $this->xmlHead() . '<urlset xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the <url> element for one page.
	 *
	 * @param string $url Page URL (escaped here)
	 * @param string $date Value for <lastmod>
	 * @param string $priority Value for <priority>
	 * @return string
	 */
	function fileEntry( $url, $date, $priority ) {
		return
			"\t<url>\n" .
			// bug 34666: $url may contain bad characters such as ampersands.
			"\t\t<loc>" . htmlspecialchars( $url ) . "</loc>\n" .
			"\t\t<lastmod>$date</lastmod>\n" .
			"\t\t<priority>$priority</priority>\n" .
			"\t</url>\n";
	}

	/**
	 * Return the closing part of a sitemap file.
	 *
	 * @return string
	 */
	function closeFile() {
		return "</urlset>\n";
	}

	/**
	 * Populate $this->limit with the byte lengths of the file header, of an
	 * entry for a very long title in this namespace, and of the file footer.
	 *
	 * @param int $namespace Namespace id
	 */
	function generateLimit( $namespace ) {
		// bug 17961: make a title with the longest possible URL in this namespace
		$title = Title::makeTitle( $namespace, str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83" );

		$this->limit = array(
			strlen( $this->openFile() ),
			strlen( $this->fileEntry( $title->getCanonicalURL(), wfTimestamp( TS_ISO_8601, wfTimestamp() ), $this->priority( $namespace ) ) ),
			strlen( $this->closeFile() )
		);
	}
}

$maintClass = "GenerateSitemap";
require_once RUN_MAINTENANCE_IF_MAIN;