MediaWiki
REL1_22
|
<?php
/**
 * Maintenance script that writes XML sitemap files (one or more per
 * namespace) together with a sitemap index file that references them.
 */

require_once __DIR__ . '/Maintenance.php';

/**
 * Generates sitemap files for every namespace that has pages (or for the
 * namespaces listed in $wgSitemapNamespaces) and an index file listing them.
 */
class GenerateSitemap extends Maintenance {
	/** Key in $priorities holding the fallback priority for subject namespaces */
	const GS_MAIN = -2;

	/** Key in $priorities holding the fallback priority for talk namespaces */
	const GS_TALK = -1;

	/**
	 * Maximum number of URLs written to a single sitemap file
	 * (set to 50000 in execute()).
	 *
	 * @var int
	 */
	public $url_limit;

	/**
	 * Maximum size of a single sitemap file, in bytes counted before any
	 * compression (set to 10 * 2^20 in execute()).
	 *
	 * @var int
	 */
	public $size_limit;

	/**
	 * Directory (with trailing separator) the files are written to.
	 *
	 * @var string
	 */
	public $fspath;

	/**
	 * URL prefix prepended to file names inside the index file; either an
	 * empty string or a string ending in '/'.
	 *
	 * @var string
	 */
	public $urlpath;

	/**
	 * Whether sitemap files are written through gzopen()/gzwrite().
	 *
	 * @var bool
	 */
	public $compress;

	/**
	 * Whether rows flagged page_is_redirect are left out of the sitemap.
	 *
	 * @var bool
	 */
	public $skipRedirects;

	/**
	 * Byte lengths filled in by generateLimit():
	 * [0] file header, [1] longest expected <url> entry, [2] file footer.
	 *
	 * @var array
	 */
	public $limit = array();

	/**
	 * Map of namespace id (or GS_MAIN / GS_TALK) to priority string.
	 *
	 * @var array
	 */
	public $priorities = array();

	/**
	 * Namespace ids to generate sitemaps for.
	 *
	 * @var array
	 */
	public $namespaces = array();

	/**
	 * ISO 8601 timestamp of this run, used as <lastmod> in the index.
	 *
	 * @var string
	 */
	public $timestamp;

	/**
	 * Database connection obtained via wfGetDB( DB_SLAVE ).
	 *
	 * @var object
	 */
	public $dbr;

	/**
	 * Handle of the sitemap index file.
	 *
	 * @var resource
	 */
	public $findex;

	/**
	 * Handle of the sitemap file currently being written, or false if none.
	 *
	 * @var resource|bool
	 */
	public $file;

	/**
	 * Site identifier used in generated file names.
	 *
	 * @var string
	 */
	private $identifier;

	/**
	 * Registers the command line options of the script.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = "Creates a sitemap for the site";
		$this->addOption( 'fspath', 'The file system path to save to, e.g. /tmp/sitemap; defaults to current directory', false, true );
		$this->addOption( 'urlpath', 'The URL path corresponding to --fspath, prepended to filenames in the index; defaults to an empty string', false, true );
		$this->addOption( 'compress', 'Compress the sitemap files, can take value yes|no, default yes', false, true );
		$this->addOption( 'skip-redirects', 'Do not include redirecting articles in the sitemap' );
		$this->addOption( 'identifier', 'What site identifier to use for the wiki, defaults to $wgDBname', false, true );
	}

	/**
	 * Reads the options, opens the index file and runs main().
	 *
	 * @throws MWException if the index file cannot be opened for writing
	 */
	public function execute() {
		$this->setNamespacePriorities();
		$this->url_limit = 50000;
		$this->size_limit = pow( 2, 20 ) * 10;
		$this->fspath = self::init_path( $this->getOption( 'fspath', getcwd() ) );
		$this->urlpath = $this->getOption( 'urlpath', "" );
		if ( $this->urlpath !== "" && substr( $this->urlpath, -1 ) !== '/' ) {
			$this->urlpath .= '/';
		}
		$this->identifier = $this->getOption( 'identifier', wfWikiID() );
		$this->compress = $this->getOption( 'compress', 'yes' ) !== 'no';
		$this->skipRedirects = $this->getOption( 'skip-redirects', false ) !== false;
		$this->dbr = wfGetDB( DB_SLAVE );
		$this->generateNamespaces();
		$this->timestamp = wfTimestamp( TS_ISO_8601, wfTimestampNow() );

		$indexPath = "{$this->fspath}sitemap-index-{$this->identifier}.xml";
		$this->findex = fopen( $indexPath, 'wb' );
		// Fail here instead of letting main() fwrite()/fclose() a boolean.
		if ( $this->findex === false ) {
			throw new MWException( __METHOD__ . " error opening file $indexPath with flags wb. Check permissions?" );
		}
		$this->main();
	}

	/**
	 * Fills $this->priorities with the built-in defaults and then applies
	 * the overrides from $wgSitemapNamespacesPriorities, clamping numeric
	 * values to the range 0.0 - 1.0.
	 */
	private function setNamespacePriorities() {
		global $wgSitemapNamespacesPriorities;

		// Custom main namespaces
		$this->priorities[self::GS_MAIN] = '0.5';
		// Custom talk namespaces
		$this->priorities[self::GS_TALK] = '0.1';
		// MediaWiki standard namespaces
		$this->priorities[NS_MAIN] = '1.0';
		$this->priorities[NS_TALK] = '0.1';
		$this->priorities[NS_USER] = '0.5';
		$this->priorities[NS_USER_TALK] = '0.1';
		$this->priorities[NS_PROJECT] = '0.5';
		$this->priorities[NS_PROJECT_TALK] = '0.1';
		$this->priorities[NS_FILE] = '0.5';
		$this->priorities[NS_FILE_TALK] = '0.1';
		$this->priorities[NS_MEDIAWIKI] = '0.0';
		$this->priorities[NS_MEDIAWIKI_TALK] = '0.1';
		$this->priorities[NS_TEMPLATE] = '0.0';
		$this->priorities[NS_TEMPLATE_TALK] = '0.1';
		$this->priorities[NS_HELP] = '0.5';
		$this->priorities[NS_HELP_TALK] = '0.1';
		$this->priorities[NS_CATEGORY] = '0.5';
		$this->priorities[NS_CATEGORY_TALK] = '0.1';

		// Custom priorities; only iterate when the setting really is an array
		// so that an unset/null value does not reach foreach.
		if ( is_array( $wgSitemapNamespacesPriorities ) ) {
			foreach ( $wgSitemapNamespacesPriorities as $namespace => $priority ) {
				$float = floatval( $priority );
				if ( $float > 1.0 ) {
					$priority = '1.0';
				} elseif ( $float < 0.0 ) {
					$priority = '0.0';
				}
				$this->priorities[$namespace] = $priority;
			}
		}
	}

	/**
	 * Creates the target directory if needed and returns its resolved path
	 * with a trailing directory separator.
	 *
	 * @param string|null $fspath Directory to write to
	 * @return string|null Resolved path ending in DIRECTORY_SEPARATOR, or
	 *  null when $fspath is not set
	 */
	private static function init_path( $fspath ) {
		if ( !isset( $fspath ) ) {
			return null;
		}
		# Create directory if needed
		if ( $fspath && !is_dir( $fspath ) ) {
			wfMkdirParents( $fspath, null, __METHOD__ ) or die( "Can not create directory $fspath.\n" );
		}

		$resolved = realpath( $fspath );
		// realpath() yields false on failure; concatenating that would turn
		// the target into the bare separator, i.e. the filesystem root.
		if ( $resolved === false ) {
			die( "Can not resolve directory $fspath.\n" );
		}

		return $resolved . DIRECTORY_SEPARATOR;
	}

	/**
	 * Populates $this->namespaces, either from $wgSitemapNamespaces (when it
	 * is an array) or with every namespace that occurs in the page table.
	 */
	public function generateNamespaces() {
		// Only generate for specific namespaces if $wgSitemapNamespaces is an array.
		global $wgSitemapNamespaces;
		if ( is_array( $wgSitemapNamespaces ) ) {
			$this->namespaces = $wgSitemapNamespaces;
			return;
		}

		$res = $this->dbr->select( 'page',
			array( 'page_namespace' ),
			array(),
			__METHOD__,
			array(
				'GROUP BY' => 'page_namespace',
				'ORDER BY' => 'page_namespace',
			)
		);

		foreach ( $res as $row ) {
			$this->namespaces[] = $row->page_namespace;
		}
	}

	/**
	 * Returns the priority configured for a namespace, falling back to
	 * guessPriority() when none is configured.
	 *
	 * @param int $namespace Namespace id
	 * @return string
	 */
	public function priority( $namespace ) {
		return isset( $this->priorities[$namespace] )
			? $this->priorities[$namespace]
			: $this->guessPriority( $namespace );
	}

	/**
	 * Returns the GS_MAIN priority for subject namespaces and the GS_TALK
	 * priority for all others, as decided by MWNamespace::isSubject().
	 *
	 * @param int $namespace Namespace id
	 * @return string
	 */
	public function guessPriority( $namespace ) {
		return MWNamespace::isSubject( $namespace )
			? $this->priorities[self::GS_MAIN]
			: $this->priorities[self::GS_TALK];
	}

	/**
	 * Selects namespace, title, touched timestamp and redirect flag of all
	 * pages in the given namespace.
	 *
	 * @param int $namespace Namespace id
	 * @return mixed Result of the select() call, iterated row by row in main()
	 */
	public function getPageRes( $namespace ) {
		return $this->dbr->select( 'page',
			array(
				'page_namespace',
				'page_title',
				'page_touched',
				'page_is_redirect'
			),
			array( 'page_namespace' => $namespace ),
			__METHOD__
		);
	}

	/**
	 * Writes the sitemap files for all namespaces and the index file.
	 *
	 * A new sitemap file is started for the first page of a namespace and
	 * whenever adding the next page (including its language variant URLs)
	 * would exceed $url_limit or $size_limit.
	 */
	public function main() {
		global $wgContLang;

		fwrite( $this->findex, $this->openIndex() );

		// The set of non-default variants does not change during the run,
		// so determine it once instead of once per page.
		$variantCodes = array();
		if ( $wgContLang->hasVariants() ) {
			$defaultCode = $wgContLang->getCode();
			foreach ( $wgContLang->getVariants() as $vCode ) {
				if ( $vCode === $defaultCode ) {
					continue; // we don't want default variant
				}
				$variantCodes[] = $vCode;
			}
		}
		// Every page contributes its own URL plus one URL per extra variant.
		$urlsPerPage = 1 + count( $variantCodes );

		foreach ( $this->namespaces as $namespace ) {
			$res = $this->getPageRes( $namespace );
			$this->file = false;
			$this->generateLimit( $namespace );
			$length = $this->limit[0];
			$urlCount = 0; // URLs written to the current file
			$smcount = 0; // Number of files started for this namespace
			$priority = $this->priority( $namespace );

			$fns = $wgContLang->getFormattedNsText( $namespace );
			$this->output( "$namespace ($fns)\n" );
			$skippedRedirects = 0; // Number of redirects skipped for that namespace
			foreach ( $res as $row ) {
				if ( $this->skipRedirects && $row->page_is_redirect ) {
					$skippedRedirects++;
					continue;
				}

				// Build all entries of this page first so the limit checks
				// below see the real number of URLs and bytes to be added.
				$title = Title::makeTitle( $row->page_namespace, $row->page_title );
				$date = wfTimestamp( TS_ISO_8601, $row->page_touched );
				$entries = array( $this->fileEntry( $title->getCanonicalURL(), $date, $priority ) );
				// generate pages for language variants
				foreach ( $variantCodes as $vCode ) {
					$entries[] = $this->fileEntry( $title->getCanonicalURL( '', $vCode ), $date, $priority );
				}
				$entriesLength = 0;
				foreach ( $entries as $entry ) {
					$entriesLength += strlen( $entry );
				}
				// Never reserve less than the worst-case single entry, which
				// keeps the rotation points of wikis without variants as before.
				$reserve = max( $this->limit[1], $entriesLength );

				if ( $this->file === false
					|| $urlCount + $urlsPerPage > $this->url_limit
					|| $length + $reserve + $this->limit[2] > $this->size_limit
				) {
					if ( $this->file !== false ) {
						$this->write( $this->file, $this->closeFile() );
						$this->close( $this->file );
					}
					$filename = $this->sitemapFilename( $namespace, $smcount++ );
					$this->file = $this->open( $this->fspath . $filename, 'wb' );
					$this->write( $this->file, $this->openFile() );
					fwrite( $this->findex, $this->indexEntry( $filename ) );
					$this->output( "\t$this->fspath$filename\n" );
					$length = $this->limit[0];
					$urlCount = 0;
				}

				foreach ( $entries as $entry ) {
					$this->write( $this->file, $entry );
				}
				$length += $entriesLength;
				$urlCount += $urlsPerPage;
			}

			if ( $this->skipRedirects && $skippedRedirects > 0 ) {
				$this->output( "  skipped $skippedRedirects redirect(s)\n" );
			}

			if ( $this->file ) {
				$this->write( $this->file, $this->closeFile() );
				$this->close( $this->file );
			}
		}
		fwrite( $this->findex, $this->closeIndex() );
		fclose( $this->findex );
	}

	/**
	 * Opens a sitemap file with gzopen() or fopen(), depending on $compress.
	 *
	 * @param string $file Path of the file
	 * @param string $flags Mode string passed to gzopen()/fopen()
	 * @return resource
	 * @throws MWException if the file cannot be opened
	 */
	public function open( $file, $flags ) {
		$resource = $this->compress ? gzopen( $file, $flags ) : fopen( $file, $flags );
		if ( $resource === false ) {
			throw new MWException( __METHOD__ . " error opening file $file with flags $flags. Check permissions?" );
		}
		return $resource;
	}

	/**
	 * Writes a string to a handle returned by open().
	 *
	 * @param resource $handle
	 * @param string $str
	 * @throws MWException if $handle is a boolean
	 */
	public function write( &$handle, $str ) {
		if ( $handle === true || $handle === false ) {
			throw new MWException( __METHOD__ . " was passed a boolean as a file handle.\n" );
		}
		if ( $this->compress ) {
			gzwrite( $handle, $str );
		} else {
			fwrite( $handle, $str );
		}
	}

	/**
	 * Closes a handle returned by open().
	 *
	 * @param resource $handle
	 */
	public function close( &$handle ) {
		if ( $this->compress ) {
			gzclose( $handle );
		} else {
			fclose( $handle );
		}
	}

	/**
	 * Builds the file name of a sitemap file.
	 *
	 * @param int $namespace Namespace id
	 * @param int $count Zero-based number of the file within the namespace
	 * @return string
	 */
	public function sitemapFilename( $namespace, $count ) {
		$ext = $this->compress ? '.gz' : '';
		return "sitemap-{$this->identifier}-NS_$namespace-$count.xml$ext";
	}

	/**
	 * Returns the XML declaration line.
	 *
	 * @return string
	 */
	public function xmlHead() {
		return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
	}

	/**
	 * Returns the XML namespace URI used for sitemap and index files.
	 *
	 * @return string
	 */
	public function xmlSchema() {
		return 'http://www.sitemaps.org/schemas/sitemap/0.9';
	}

	/**
	 * Returns the opening part of the index file.
	 *
	 * @return string
	 */
	public function openIndex() {
		return $this->xmlHead() . '<sitemapindex xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Returns the index file entry for one sitemap file.
	 *
	 * @param string $filename Name of the sitemap file
	 * @return string
	 */
	public function indexEntry( $filename ) {
		return
			"\t<sitemap>\n" .
			// Escape like fileEntry() does: --urlpath is free-form input and
			// may contain characters such as ampersands.
			"\t\t<loc>" . htmlspecialchars( $this->urlpath . $filename ) . "</loc>\n" .
			"\t\t<lastmod>{$this->timestamp}</lastmod>\n" .
			"\t</sitemap>\n";
	}

	/**
	 * Returns the closing part of the index file.
	 *
	 * @return string
	 */
	public function closeIndex() {
		return "</sitemapindex>\n";
	}

	/**
	 * Returns the opening part of a sitemap file.
	 *
	 * @return string
	 */
	public function openFile() {
		return $this->xmlHead() . '<urlset xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Returns the sitemap entry for one URL.
	 *
	 * @param string $url Page URL (escaped here)
	 * @param string $date Value for <lastmod>
	 * @param string $priority Value for <priority>
	 * @return string
	 */
	public function fileEntry( $url, $date, $priority ) {
		return
			"\t<url>\n" .
			// bug 34666: $url may contain bad characters such as ampersands.
			"\t\t<loc>" . htmlspecialchars( $url ) . "</loc>\n" .
			"\t\t<lastmod>$date</lastmod>\n" .
			"\t\t<priority>$priority</priority>\n" .
			"\t</url>\n";
	}

	/**
	 * Returns the closing part of a sitemap file.
	 *
	 * @return string
	 */
	public function closeFile() {
		return "</urlset>\n";
	}

	/**
	 * Fills $this->limit with the byte lengths of the file header, of an
	 * entry for a very long title in the given namespace, and of the footer.
	 *
	 * @param int $namespace Namespace id
	 */
	public function generateLimit( $namespace ) {
		// bug 17961: make a title with the longest possible URL in this namespace
		$title = Title::makeTitle( $namespace, str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83" );

		$this->limit = array(
			strlen( $this->openFile() ),
			strlen( $this->fileEntry( $title->getCanonicalURL(), wfTimestamp( TS_ISO_8601, wfTimestamp() ), $this->priority( $namespace ) ) ),
			strlen( $this->closeFile() )
		);
	}
}

$maintClass = "GenerateSitemap";
require_once RUN_MAINTENANCE_IF_MAIN;