[ Index ] |
PHP Cross Reference of MediaWiki-1.24.0 |
[Summary view] [Print] [Text view]
<?php
/**
 * Helper class for the --prefetch option of dumpTextPass.php
 *
 * Copyright © 2005 Brion Vibber <[email protected]>
 * https://www.mediawiki.org/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Maintenance
 */

/**
 * Readahead helper for making large MediaWiki data dumps;
 * reads in a previous XML dump to sequentially prefetch text
 * records already normalized and decompressed.
 *
 * This can save load on the external database servers, hopefully.
 *
 * Assumes that dumps will be recorded in the canonical order:
 * - ascending by page_id
 * - ascending by rev_id within each page
 * - text contents are immutable and should not change once
 *   recorded, so the previous dump is a reliable source
 *
 * The reader only ever moves forward, so callers must request
 * pages and revisions in that same ascending order.
 *
 * @ingroup Maintenance
 */
class BaseDump {
	/** @var XMLReader Forward-only reader over the current input file */
	protected $reader = null;

	/** @var bool True once the current input is exhausted, closed or could not be opened */
	protected $atEnd = false;

	/** @var bool True once nextRev() found no further <revision> in the current page */
	protected $atPageEnd = false;

	/** @var int ID of the page most recently read from the dump */
	protected $lastPage = 0;

	/** @var int ID of the revision most recently read within the current page */
	protected $lastRev = 0;

	/** @var array Input files that have not been opened yet, in reading order */
	protected $infiles = null;

	/**
	 * @param string $infile One or more dump file names/URIs, separated by ';'.
	 *   They are read one after another, in the order given.
	 */
	public function __construct( $infile ) {
		$this->infiles = explode( ';', $infile );
		$this->reader = new XMLReader();
		$this->openNextInfile();
	}

	/**
	 * Takes the next queued input file and opens it in the reader.
	 *
	 * Every file is opened the same way (with LIBXML_PARSEHUGE when the
	 * constant is available). If the file name is empty or the file cannot
	 * be opened, the remaining queue is dropped and the dump is marked as
	 * finished, so that prefetch() simply returns null from then on.
	 *
	 * @access private
	 * @return bool True if a file was opened, false otherwise
	 */
	protected function openNextInfile() {
		$infile = array_shift( $this->infiles );

		$opened = false;
		if ( $infile !== null && $infile !== '' ) {
			// LIBXML_PARSEHUGE is missing from older libxml/PHP builds.
			if ( defined( 'LIBXML_PARSEHUGE' ) ) {
				$opened = $this->reader->open( $infile, null, LIBXML_PARSEHUGE );
			} else {
				$opened = $this->reader->open( $infile );
			}
		}

		if ( !$opened ) {
			$this->debug( "BaseDump::openNextInfile was unable to open xml file: $infile" );
			$this->infiles = array();
			$this->atEnd = true;

			return false;
		}

		$this->atEnd = false;

		return true;
	}

	/**
	 * Attempts to fetch the text of a particular page revision
	 * from the dump stream. May return null if the page is
	 * unavailable.
	 *
	 * Advances the reader to the requested page, then to the requested
	 * revision within it. Null is returned when the stream is already past
	 * the requested page or revision, when the input is exhausted, or when
	 * the revision carries no <text> element.
	 *
	 * @param int $page ID number of page to read
	 * @param int $rev ID number of revision to read
	 * @return string|null
	 */
	function prefetch( $page, $rev ) {
		$page = intval( $page );
		$rev = intval( $rev );
		while ( $this->lastPage < $page && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
			$this->nextPage();
		}
		if ( $this->lastPage > $page || $this->atEnd ) {
			$this->debug( "BaseDump::prefetch already past page $page "
				. "looking for rev $rev  [$this->lastPage, $this->lastRev]" );

			return null;
		}
		while ( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, "
				. "looking for $page, $rev" );
			$this->nextRev();
		}
		if ( $this->lastRev == $rev && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );

			return $this->nextText();
		} else {
			$this->debug( "BaseDump::prefetch already past rev $rev on page $page "
				. "[$this->lastPage, $this->lastRev]" );

			return null;
		}
	}

	/**
	 * Sends a line to the MediaWiki debug log.
	 *
	 * @param string $str Message, without trailing newline
	 */
	function debug( $str ) {
		wfDebug( $str . "\n" );
		// global $dumper;
		// $dumper->progress( $str );
	}

	/**
	 * Moves to the next <page> and records its ID, resetting the
	 * per-page revision state. When the current file has no further
	 * pages it is closed and the next queued file, if any, is opened;
	 * the caller's loop then calls this method again to read from it.
	 *
	 * @access private
	 */
	function nextPage() {
		if ( $this->skipTo( 'page', 'mediawiki' ) ) {
			if ( $this->skipTo( 'id' ) ) {
				$this->lastPage = intval( $this->nodeContents() );
				$this->lastRev = 0;
				$this->atPageEnd = false;
			}
		} else {
			$this->close();
			if ( count( $this->infiles ) ) {
				$this->openNextInfile();
			}
		}
	}

	/**
	 * Moves to the next <revision> of the current page and records its ID.
	 * Sets $atPageEnd when the page has no further revisions.
	 *
	 * @access private
	 */
	function nextRev() {
		if ( $this->skipTo( 'revision' ) ) {
			if ( $this->skipTo( 'id' ) ) {
				$this->lastRev = intval( $this->nodeContents() );
			}
		} else {
			$this->atPageEnd = true;
		}
	}

	/**
	 * Reads the <text> of the revision the reader is currently inside.
	 *
	 * The search stops at </revision>: if the revision has no <text>
	 * element, null is returned instead of reading the contents of
	 * whatever element happens to follow.
	 *
	 * @access private
	 * @return string|null
	 */
	function nextText() {
		if ( !$this->skipTo( 'text', 'revision' ) ) {
			return null;
		}

		return strval( $this->nodeContents() );
	}

	/**
	 * Reads forward until an opening element called $name is found.
	 *
	 * @access private
	 * @param string $name Element to stop on
	 * @param string $parent Enclosing element; reaching its end tag aborts the search
	 * @return bool|null True if found; false if already at the end or if
	 *   </$parent> was reached first; null if the input ran out (the
	 *   reader is closed in that case)
	 */
	function skipTo( $name, $parent = 'page' ) {
		if ( $this->atEnd ) {
			return false;
		}
		while ( $this->reader->read() ) {
			if ( $this->reader->nodeType == XMLReader::ELEMENT
				&& $this->reader->name == $name
			) {
				return true;
			}
			if ( $this->reader->nodeType == XMLReader::END_ELEMENT
				&& $this->reader->name == $parent
			) {
				$this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );

				return false;
			}
		}

		return $this->close();
	}

	/**
	 * Shouldn't something like this be built-in to XMLReader?
	 * Fetches text contents of the current element, assuming
	 * no sub-elements or such scary things.
	 *
	 * @return string|null Collected text; "" for an empty element; null if
	 *   already at the end or if the input ran out before the end tag
	 * @access private
	 */
	function nodeContents() {
		if ( $this->atEnd ) {
			return null;
		}
		if ( $this->reader->isEmptyElement ) {
			return "";
		}
		$buffer = "";
		while ( $this->reader->read() ) {
			switch ( $this->reader->nodeType ) {
				// Plain WHITESPACE nodes are not collected
				// (that case was disabled in the original code).
				case XMLReader::TEXT:
				case XMLReader::SIGNIFICANT_WHITESPACE:
					$buffer .= $this->reader->value;
					break;
				case XMLReader::END_ELEMENT:
					return $buffer;
			}
		}

		return $this->close();
	}

	/**
	 * Closes the reader and marks the dump as finished.
	 *
	 * @access private
	 * @return null
	 */
	function close() {
		$this->reader->close();
		$this->atEnd = true;

		return null;
	}
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Fri Nov 28 14:03:12 2014 | Cross-referenced by PHPXref 0.7.1 |