MediaWiki  master
checkStorage.php
Go to the documentation of this file.
1 <?php
// Command-line entry point: only runs when this file is executed directly,
// i.e. when MediaWiki has not already been bootstrapped by another script.
if ( !defined( 'MEDIAWIKI' ) ) {
	// "--fix" is a boolean switch; it must be declared before commandLine.inc
	// is loaded so that it is not parsed as an option taking a value.
	$optionsWithoutArgs = [ 'fix' ];
	require_once __DIR__ . '/../commandLine.inc';

	// Whether detected problems should be repaired where possible.
	$fix = isset( $options['fix'] );
	// Optional first positional argument: the XML dump to restore text from.
	$xml = isset( $args[0] ) ? $args[0] : false;

	$cs = new CheckStorage;
	$cs->check( $fix, $xml );
}
37 
38 // ----------------------------------------------------------------------------------
39 
/**
 * Maintenance script to do various checks on external storage.
 *
 * Walks the revision table in chunks, validates the flags and contents of the
 * text rows the revisions point at (including external-storage URLs and
 * serialized history blob objects), reports the problems found and, when
 * asked to, fixes what can be fixed automatically or restores damaged text
 * from an XML dump.
 */
class CheckStorage {
	/** Serialized-object prefix expected at the start of a concatenated gzip history blob */
	const CONCAT_HEADER = 'O:27:"concatenatedgziphistoryblob"';

	/** @var array Map of rev_id => rev_text_id for the chunk currently being checked */
	public $oldIdMap;

	/** @var array Map of error category => array keyed by the affected rev_id */
	public $errors;

	/** @var ExternalStoreDB|null External store accessor, created on first use */
	public $dbStore = null;

	/** @var array Human-readable description of each error category */
	public $errorDescriptions = [
		'restore text' => 'Damaged text, need to be restored from a backup',
		'restore revision' => 'Damaged revision row, need to be restored from a backup',
		'unfixable' => 'Unexpected errors with no automated fixing method',
		'fixed' => 'Errors already fixed',
		'fixable' => 'Errors which would already be fixed if --fix was specified',
	];

	/**
	 * Check every revision's text storage and print a report.
	 *
	 * @param bool $fix Whether to fix the errors that can be fixed automatically
	 * @param string|bool $xml Path of an XML dump used to restore damaged text;
	 *   an empty string or false means no dump is available
	 */
	public function check( $fix = false, $xml = '' ) {
		$dbr = wfGetDB( DB_SLAVE );
		if ( $fix ) {
			print "Checking, will fix errors if possible...\n";
		} else {
			print "Checking...\n";
		}
		$maxRevId = $dbr->selectField( 'revision', 'MAX(rev_id)', false, __METHOD__ );
		$chunkSize = 1000;
		$flagStats = [];
		$objectStats = [];
		$knownFlags = [ 'external', 'gzip', 'object', 'utf-8' ];
		$this->errors = [
			'restore text' => [],
			'restore revision' => [],
			'unfixable' => [],
			'fixed' => [],
			'fixable' => [],
		];

		// "<=" rather than "<": the chunk that starts exactly at the highest
		// rev_id (e.g. 1 or 1001) must still be visited, otherwise the newest
		// revision is silently skipped.
		for ( $chunkStart = 1; $chunkStart <= $maxRevId; $chunkStart += $chunkSize ) {
			$chunkEnd = $chunkStart + $chunkSize - 1;
			// print "$chunkStart of $maxRevId\n";

			// Fetch revision rows
			$this->oldIdMap = [];
			$dbr->ping();
			$res = $dbr->select( 'revision', [ 'rev_id', 'rev_text_id' ],
				[ "rev_id BETWEEN $chunkStart AND $chunkEnd" ], __METHOD__ );
			foreach ( $res as $row ) {
				$this->oldIdMap[$row->rev_id] = $row->rev_text_id;
			}
			$dbr->freeResult( $res );

			if ( !count( $this->oldIdMap ) ) {
				continue;
			}

			// Fetch old_flags
			// Start by assuming every text row is missing (old_id => rev_id);
			// entries are removed as the rows are actually found.
			$missingTextRows = array_flip( $this->oldIdMap );
			$externalRevs = [];
			$objectRevs = [];
			$res = $dbr->select( 'text', [ 'old_id', 'old_flags' ],
				'old_id IN (' . implode( ',', $this->oldIdMap ) . ')', __METHOD__ );
			foreach ( $res as $row ) {
				$flags = $row->old_flags;
				$id = $row->old_id;

				// Create flagStats row if it doesn't exist, then increment it
				if ( !isset( $flagStats[$flags] ) ) {
					$flagStats[$flags] = 0;
				}
				$flagStats[$flags]++;

				// Not missing
				unset( $missingTextRows[$row->old_id] );

				// Check for external or object
				if ( $flags == '' ) {
					$flagArray = [];
				} else {
					$flagArray = explode( ',', $flags );
				}
				if ( in_array( 'external', $flagArray ) ) {
					$externalRevs[] = $id;
				} elseif ( in_array( 'object', $flagArray ) ) {
					$objectRevs[] = $id;
				}

				// Check for unrecognised flags
				if ( $flags == '0' ) {
					// This is a known bug from 2004
					// It's safe to just erase the old_flags field
					if ( $fix ) {
						$this->error( 'fixed', "Warning: old_flags set to 0", $id );
						$dbw = wfGetDB( DB_MASTER );
						$dbw->ping();
						$dbw->update( 'text', [ 'old_flags' => '' ],
							[ 'old_id' => $id ], __METHOD__ );
						echo "Fixed\n";
					} else {
						$this->error( 'fixable', "Warning: old_flags set to 0", $id );
					}
				} elseif ( count( array_diff( $flagArray, $knownFlags ) ) ) {
					$this->error( 'unfixable', "Error: invalid flags field \"$flags\"", $id );
				}
			}
			$dbr->freeResult( $res );

			// Output errors for any missing text rows
			foreach ( $missingTextRows as $oldId => $revId ) {
				$this->error( 'restore revision', "Error: missing text row", $oldId );
			}

			// Verify external revisions
			// Both maps have the shape cluster => blob id => list of old_id.
			$externalConcatBlobs = [];
			$externalNormalBlobs = [];
			if ( count( $externalRevs ) ) {
				$res = $dbr->select( 'text', [ 'old_id', 'old_flags', 'old_text' ],
					[ 'old_id IN (' . implode( ',', $externalRevs ) . ')' ], __METHOD__ );
				foreach ( $res as $row ) {
					$urlParts = explode( '://', $row->old_text, 2 );
					if ( count( $urlParts ) !== 2 || $urlParts[1] == '' ) {
						$this->error( 'restore text', "Error: invalid URL \"{$row->old_text}\"", $row->old_id );
						continue;
					}
					list( $proto, ) = $urlParts;
					if ( $proto != 'DB' ) {
						$this->error( 'restore text', "Error: invalid external protocol \"$proto\"", $row->old_id );
						continue;
					}
					// "DB://cluster/id[/item]" splits into [ 'DB:', '', cluster, id, item ]
					$path = explode( '/', $row->old_text );
					if ( !isset( $path[3] ) || $path[2] === '' || $path[3] === '' ) {
						// Without both a cluster and a blob id there is nothing to
						// look up; report it instead of building a broken query.
						$this->error( 'restore text', "Error: invalid URL \"{$row->old_text}\"", $row->old_id );
						continue;
					}
					$cluster = $path[2];
					$id = $path[3];
					if ( isset( $path[4] ) ) {
						$externalConcatBlobs[$cluster][$id][] = $row->old_id;
					} else {
						$externalNormalBlobs[$cluster][$id][] = $row->old_id;
					}
				}
				$dbr->freeResult( $res );
			}

			// Check external concat blobs for the right header
			$this->checkExternalConcatBlobs( $externalConcatBlobs );

			// Check external normal blobs for existence
			if ( count( $externalNormalBlobs ) ) {
				if ( is_null( $this->dbStore ) ) {
					$this->dbStore = new ExternalStoreDB;
				}
				// Iterate the *normal* blobs here; the concat blobs were already
				// handled by checkExternalConcatBlobs() above.
				foreach ( $externalNormalBlobs as $cluster => $xBlobIds ) {
					$blobIds = array_keys( $xBlobIds );
					$extDb = $this->dbStore->getSlave( $cluster );
					$blobsTable = $this->dbStore->getTable( $extDb );
					$res = $extDb->select( $blobsTable,
						[ 'blob_id' ],
						[ 'blob_id IN( ' . implode( ',', $blobIds ) . ')' ], __METHOD__ );
					foreach ( $res as $row ) {
						unset( $xBlobIds[$row->blob_id] );
					}
					$extDb->freeResult( $res );
					// Print errors for missing blobs rows
					foreach ( $xBlobIds as $blobId => $oldIds ) {
						$this->error( 'restore text', "Error: missing target $blobId for one-part ES URL", $oldIds );
					}
				}
			}

			// Check local objects
			$dbr->ping();
			$concatBlobs = [];
			// Collected for historyblobcurstub rows; not verified any further here.
			$curIds = [];
			if ( count( $objectRevs ) ) {
				// Only the start of the text is fetched; a stub whose serialized
				// form fills the whole window is treated as overlong below.
				$headerLength = 300;
				$res = $dbr->select(
					'text',
					[ 'old_id', 'old_flags', "LEFT(old_text, $headerLength) AS header" ],
					[ 'old_id IN (' . implode( ',', $objectRevs ) . ')' ],
					__METHOD__
				);
				foreach ( $res as $row ) {
					$oldId = $row->old_id;
					$matches = [];
					if ( !preg_match( '/^O:(\d+):"(\w+)"/', $row->header, $matches ) ) {
						$this->error( 'restore text', "Error: invalid object header", $oldId );
						continue;
					}

					$className = strtolower( $matches[2] );
					if ( strlen( $className ) != $matches[1] ) {
						$this->error(
							'restore text',
							"Error: invalid object header, wrong class name length",
							$oldId
						);
						continue;
					}

					if ( !isset( $objectStats[$className] ) ) {
						$objectStats[$className] = 0;
					}
					$objectStats[$className]++;

					switch ( $className ) {
						case 'concatenatedgziphistoryblob':
							// Good
							break;
						case 'historyblobstub':
						case 'historyblobcurstub':
							if ( strlen( $row->header ) == $headerLength ) {
								$this->error( 'unfixable', "Error: overlong stub header", $oldId );
								// "continue 2" targets the foreach; a bare "continue"
								// inside a switch only acts like "break".
								continue 2;
							}
							// NOTE(review): unserialize() on database content can
							// instantiate arbitrary classes; acceptable only as long
							// as the text table is trusted.
							$stubObj = unserialize( $row->header );
							if ( !is_object( $stubObj ) ) {
								$this->error( 'restore text', "Error: unable to unserialize stub object", $oldId );
								continue 2;
							}
							if ( $className == 'historyblobstub' ) {
								$concatBlobs[$stubObj->mOldId][] = $oldId;
							} else {
								$curIds[$stubObj->mCurId][] = $oldId;
							}
							break;
						default:
							$this->error( 'unfixable', "Error: unrecognised object class \"$className\"", $oldId );
					}
				}
				$dbr->freeResult( $res );
			}

			// Check local concat blob validity
			$externalConcatBlobs = [];
			if ( count( $concatBlobs ) ) {
				$headerLength = 300;
				$res = $dbr->select(
					'text',
					[ 'old_id', 'old_flags', "LEFT(old_text, $headerLength) AS header" ],
					[ 'old_id IN (' . implode( ',', array_keys( $concatBlobs ) ) . ')' ],
					__METHOD__
				);
				foreach ( $res as $row ) {
					$flags = explode( ',', $row->old_flags );
					if ( in_array( 'external', $flags ) ) {
						// Concat blob is in external storage?
						if ( in_array( 'object', $flags ) ) {
							$urlParts = explode( '/', $row->header );
							if ( $urlParts[0] != 'DB:' ) {
								$this->error(
									'unfixable',
									"Error: unrecognised external storage type \"{$urlParts[0]}\"",
									$row->old_id
								);
							} else {
								$cluster = $urlParts[2];
								$id = $urlParts[3];
								if ( !isset( $externalConcatBlobs[$cluster][$id] ) ) {
									$externalConcatBlobs[$cluster][$id] = [];
								}
								$externalConcatBlobs[$cluster][$id] = array_merge(
									$externalConcatBlobs[$cluster][$id], $concatBlobs[$row->old_id]
								);
							}
						} else {
							$this->error(
								'unfixable',
								"Error: invalid flags \"{$row->old_flags}\" on concat bulk row {$row->old_id}",
								$concatBlobs[$row->old_id] );
						}
					} elseif ( strcasecmp(
						substr( $row->header, 0, strlen( self::CONCAT_HEADER ) ),
						self::CONCAT_HEADER
					) ) {
						$this->error(
							'restore text',
							"Error: Incorrect object header for concat bulk row {$row->old_id}",
							$concatBlobs[$row->old_id]
						);
					} # else good

					unset( $concatBlobs[$row->old_id] );
				}
				$dbr->freeResult( $res );
			}

			// Check targets of unresolved stubs
			$this->checkExternalConcatBlobs( $externalConcatBlobs );
			// next chunk
		}

		print "\n\nErrors:\n";
		foreach ( $this->errors as $name => $errors ) {
			if ( count( $errors ) ) {
				$description = $this->errorDescriptions[$name];
				echo "$description: " . implode( ',', array_keys( $errors ) ) . "\n";
			}
		}

		if ( count( $this->errors['restore text'] ) && $fix ) {
			if ( (string)$xml !== '' ) {
				$this->restoreText( array_keys( $this->errors['restore text'] ), $xml );
			} else {
				echo "Can't fix text, no XML backup specified\n";
			}
		}

		print "\nFlag statistics:\n";
		$total = array_sum( $flagStats );
		foreach ( $flagStats as $flag => $count ) {
			printf( "%-30s %10d %5.2f%%\n", $flag, $count, $count / $total * 100 );
		}
		print "\nLocal object statistics:\n";
		$total = array_sum( $objectStats );
		foreach ( $objectStats as $className => $count ) {
			printf( "%-30s %10d %5.2f%%\n", $className, $count, $count / $total * 100 );
		}
	}

	/**
	 * Print an error message and record the affected revisions.
	 *
	 * The revisions are looked up in $this->oldIdMap, i.e. among the revisions
	 * of the chunk currently being checked.
	 *
	 * @param string $type Error category, a key of $this->errors
	 * @param string $msg Message to print; the affected IDs are appended to it
	 * @param int|string|array $ids Text row ID (old_id) or list of such IDs
	 */
	public function error( $type, $msg, $ids ) {
		// A one-element list is reported in the same way as a scalar
		if ( is_array( $ids ) && count( $ids ) == 1 ) {
			$ids = reset( $ids );
		}
		if ( is_array( $ids ) ) {
			$revIds = [];
			foreach ( $ids as $id ) {
				$revIds = array_merge( $revIds, array_keys( $this->oldIdMap, $id ) );
			}
			print "$msg in text rows " . implode( ', ', $ids ) .
				", revisions " . implode( ', ', $revIds ) . "\n";
		} else {
			$id = $ids;
			$revIds = array_keys( $this->oldIdMap, $id );
			if ( count( $revIds ) == 1 ) {
				print "$msg in old_id $id, rev_id {$revIds[0]}\n";
			} else {
				print "$msg in old_id $id, revisions " . implode( ', ', $revIds ) . "\n";
			}
		}
		// The revision IDs become keys; "+" keeps entries that already exist
		$this->errors[$type] = $this->errors[$type] + array_flip( $revIds );
	}

	/**
	 * Check that the external blobs referenced by two-part ES URLs exist and
	 * start with the concatenated history blob header.
	 *
	 * @param array $externalConcatBlobs Map of cluster => blob id => list of
	 *   text row IDs (old_id) that reference the blob
	 */
	public function checkExternalConcatBlobs( $externalConcatBlobs ) {
		if ( !count( $externalConcatBlobs ) ) {
			return;
		}

		if ( is_null( $this->dbStore ) ) {
			$this->dbStore = new ExternalStoreDB;
		}

		foreach ( $externalConcatBlobs as $cluster => $oldIds ) {
			$blobIds = array_keys( $oldIds );
			$extDb = $this->dbStore->getSlave( $cluster );
			$blobsTable = $this->dbStore->getTable( $extDb );
			$headerLength = strlen( self::CONCAT_HEADER );
			$res = $extDb->select( $blobsTable,
				[ 'blob_id', "LEFT(blob_text, $headerLength) AS header" ],
				[ 'blob_id IN( ' . implode( ',', $blobIds ) . ')' ], __METHOD__ );
			foreach ( $res as $row ) {
				if ( strcasecmp( $row->header, self::CONCAT_HEADER ) ) {
					$this->error(
						'restore text',
						"Error: invalid header on target $cluster/{$row->blob_id} of two-part ES URL",
						$oldIds[$row->blob_id]
					);
				}
				// Found, so not missing
				unset( $oldIds[$row->blob_id] );
			}
			$extDb->freeResult( $res );

			// Print errors for missing blobs rows
			foreach ( $oldIds as $blobId => $oldIds2 ) {
				$this->error(
					'restore text',
					"Error: missing target $cluster/$blobId for two-part ES URL",
					$oldIds2
				);
			}
		}
	}

	/**
	 * Restore the text of the given revisions from an XML dump.
	 *
	 * The dump is first filtered down to the wanted revisions with the external
	 * "mwdumper" program, then the filtered dump is imported, with
	 * importRevision() handling each revision.
	 *
	 * @param array $revIds Revision IDs whose text should be restored
	 * @param string $xml Path of the XML dump
	 */
	public function restoreText( $revIds, $xml ) {
		global $wgDBname;
		$tmpDir = wfTempDir();

		if ( !count( $revIds ) ) {
			return;
		}

		print "Restoring text from XML backup...\n";

		$revFileName = "$tmpDir/broken-revlist-$wgDBname";
		$filteredXmlFileName = "$tmpDir/filtered-$wgDBname.xml";

		// Write revision list
		if ( !file_put_contents( $revFileName, implode( "\n", $revIds ) ) ) {
			echo "Error writing revision list, can't restore text\n";

			return;
		}

		// Run mwdumper
		echo "Filtering XML dump...\n";
		$exitStatus = 0;
		// Every argument goes through wfEscapeShellArg() so that paths
		// containing spaces or shell metacharacters are passed on intact.
		passthru( 'mwdumper ' .
			wfEscapeShellArg(
				"--output=file:$filteredXmlFileName",
				"--filter=revlist:$revFileName",
				$xml
			), $exitStatus
		);

		if ( $exitStatus ) {
			echo "mwdumper died with exit status $exitStatus\n";

			return;
		}

		$file = fopen( $filteredXmlFileName, 'r' );
		if ( !$file ) {
			echo "Unable to open filtered XML file\n";

			return;
		}

		// Ping both connections before the import starts
		$dbr = wfGetDB( DB_SLAVE );
		$dbw = wfGetDB( DB_MASTER );
		$dbr->ping();
		$dbw->ping();

		$source = new ImportStreamSource( $file );
		$importer = new WikiImporter(
			$source,
			ConfigFactory::getDefaultInstance()->makeConfig( 'main' )
		);
		$importer->setRevisionCallback( [ $this, 'importRevision' ] );
		$importer->doImport();
	}

	/**
	 * Revision callback for the importer: overwrite the text row of one
	 * revision with the content found in the dump.
	 *
	 * @param object &$revision Revision read from the dump; must provide
	 *   getID() and getContent()
	 * @param object &$importer The importer invoking the callback (unused)
	 */
	public function importRevision( &$revision, &$importer ) {
		$id = $revision->getID();
		$content = $revision->getContent( Revision::RAW );
		$id = $id ? $id : '';

		if ( $content === null ) {
			echo "Revision $id is broken, we have no content available\n";

			return;
		}

		$text = $content->serialize();
		if ( $text === '' ) {
			// This is what happens if the revision was broken at the time the
			// dump was made. Unfortunately, it also happens if the revision was
			// legitimately blank, so there's no way to tell the difference. To
			// be safe, we'll skip it and leave it broken

			echo "Revision $id is blank in the dump, may have been broken before export\n";

			return;
		}

		if ( !$id ) {
			// No ID, can't import
			echo "No id tag in revision, can't import\n";

			return;
		}

		// Find text row again
		$dbr = wfGetDB( DB_SLAVE );
		$oldId = $dbr->selectField( 'revision', 'rev_text_id', [ 'rev_id' => $id ], __METHOD__ );
		if ( !$oldId ) {
			echo "Missing revision row for rev_id $id\n";

			return;
		}

		// Compress the text
		// $text is passed by reference; the return value is the flags string
		// stored alongside it in old_flags.
		$flags = Revision::compressRevisionText( $text );

		// Update the text row
		$dbw = wfGetDB( DB_MASTER );
		$dbw->update( 'text',
			[ 'old_flags' => $flags, 'old_text' => $text ],
			[ 'old_id' => $oldId ],
			__METHOD__, [ 'LIMIT' => 1 ]
		);

		// Remove it from the unfixed list and add it to the fixed list
		unset( $this->errors['restore text'][$id] );
		$this->errors['fixed'][$id] = true;
	}
}
const CONCAT_HEADER
deferred txt A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user For updating the view updating the linked to tables after a etc PHP does not yet have any way to tell the server to actually return and disconnect while still running these but it might have such a feature in the future We handle these by creating a deferred update object and putting those objects on a global list
Definition: deferred.txt:11
wfGetDB($db, $groups=[], $wiki=false)
Get a Database object.
checkExternalConcatBlobs($externalConcatBlobs)
XML file reader for the page data importer.
$source
this hook is for auditing only RecentChangesLinked and Watchlist RecentChangesLinked and Watchlist e g Watchlist removed from all revisions and log entries to which it was applied This gives extensions a chance to take it off their books as the deletion has already been partly carried out by this point or something similar the user will be unable to create the tag set and then return false from the hook function Ensure you consume the ChangeTagAfterDelete hook to carry out custom deletion actions as context called by AbstractContent::getParserOutput May be used to override the normal model specific rendering of page content as context $revId
Definition: hooks.txt:1020
it s the revision text itself In either if gzip is the revision text is gzipped $flags
Definition: hooks.txt:2588
when a variable name is used in a it is silently declared as a new local masking the global
Definition: design.txt:93
if($line===false) $args
Definition: cdb.php:64
check($fix=false, $xml= '')
Maintenance script to do various checks on external storage.
wfTempDir()
Tries to get the system directory for temporary files.
unserialize($serialized)
Definition: ApiMessage.php:102
this hook is for auditing only RecentChangesLinked and Watchlist RecentChangesLinked and Watchlist e g Watchlist removed from all revisions and log entries to which it was applied This gives extensions a chance to take it off their books as the deletion has already been partly carried out by this point or something similar the user will be unable to create the tag set and then return false from the hook function Ensure you consume the ChangeTagAfterDelete hook to carry out custom deletion actions as context called by AbstractContent::getParserOutput May be used to override the normal model specific rendering of page content as context as context $options
Definition: hooks.txt:1020
$res
Definition: database.txt:21
static compressRevisionText(&$text)
If $wgCompressRevisions is enabled, we will compress data.
Definition: Revision.php:1293
Imports a XML dump from a file (either from file upload, files on disk, or HTTP)
const DB_SLAVE
Definition: Defines.php:46
const RAW
Definition: Revision.php:85
This document is intended to provide useful advice for parties seeking to redistribute MediaWiki to end users It s targeted particularly at maintainers for Linux since it s been observed that distribution packages of MediaWiki often break We ve consistently had to recommend that users seeking support use official tarballs instead of their distribution s and this often solves whatever problem the user is having It would be nice if this could such as
Definition: distributors.txt:9
error($type, $msg, $ids)
if the prop value should be in the metadata multi language array can modify can modify indexed by page_id indexed by prefixed DB keys can modify can modify can modify this should be populated with an alert message to that effect to be fed to an HTMLForm object and populate $result with the reason in the form of error messages should be plain text with no special etc to show that they re errors
Definition: hooks.txt:1581
static getDefaultInstance()
injection txt This is an overview of how MediaWiki makes use of dependency injection The design described here grew from the discussion of RFC T384 The term dependency this means that anything an object needs to operate should be injected from the the object itself should only know narrow no concrete implementation of the logic it relies on The requirement to inject everything typically results in an architecture that based on two main types of and essentially stateless service objects that use other service objects to operate on the value objects As of the beginning MediaWiki is only starting to use the DI approach Much of the code still relies on global state or direct resulting in a highly cyclical dependency which acts as the top level factory for services in MediaWiki which can be used to gain access to default instances of various services MediaWikiServices however also allows new services to be defined and default services to be redefined Services are defined or redefined by providing a callback the instantiator that will return a new instance of the service When it will create an instance of MediaWikiServices and populate it with the services defined in the files listed by thereby bootstrapping the DI framework Per $wgServiceWiringFiles lists includes ServiceWiring php
Definition: injection.txt:35
this hook is for auditing only RecentChangesLinked and Watchlist RecentChangesLinked and Watchlist e g Watchlist removed from all revisions and log entries to which it was applied This gives extensions a chance to take it off their books as the deletion has already been partly carried out by this point or something similar the user will be unable to create the tag set and then return false from the hook function Ensure you consume the ChangeTagAfterDelete hook to carry out custom deletion actions as context called by AbstractContent::getParserOutput May be used to override the normal model specific rendering of page content $content
Definition: hooks.txt:1020
$count
wfEscapeShellArg()
Windows-compatible version of escapeshellarg() Windows doesn't recognise single-quotes in the shell...
const DB_MASTER
Definition: Defines.php:47
importRevision(&$revision, &$importer)
restoreText($revIds, $xml)
controlled by $wgMainCacheType controlled by $wgParserCacheType controlled by $wgMessageCacheType If you set CACHE_NONE to one of the three control default value for MediaWiki still create a but requests to it are no ops and we always fall through to the database If the cache daemon can t be it should also disable itself fairly smoothly By $wgMemc is used but when it is $parserMemc or $messageMemc this is mentioned $wgDBname
Definition: memcached.txt:96
DB accessable external objects.
do that in ParserLimitReportFormat instead use this to modify the parameters of the image and a DIV can begin in one section and end in another Make sure your code can handle that case gracefully See the EditSectionClearerLink extension for an example zero but section is usually empty its values are the globals values before the output is cached one of or reset my talk my contributions etc etc otherwise the built in rate limiting checks are if enabled allows for interception of redirect as a string mapping parameter names to values & $type
Definition: hooks.txt:2376
global $optionsWithoutArgs
Definition: commandLine.inc:28
$matches
Allows to change the fields on the form that will be generated $name
Definition: hooks.txt:310