[ Index ]

PHP Cross Reference of MediaWiki-1.24.0

title

Body

[close]

/extensions/SpamBlacklist/ -> cleanup.php (source)

   1  <?php
   2  
   3  /**
   4   * An aggressive spam cleanup script.
   5   * Searches the database for matching pages, and reverts them to the last non-spammed revision.
   6   * If all revisions contain spam, blanks the page (deleting it was judged too risky)
   7   */
   8  
   9  require_once ( '../../maintenance/commandLine.inc' );
  10  require_once ( 'SpamBlacklist_body.php' );
  11  
  12  /**
  13   * Find the latest revision of the article that does not contain spam and revert to it
  14   */
  15  function cleanupArticle( Revision $rev, $regexes, $match ) {
  16      $title = $rev->getTitle();
  17      $revId = $rev->getId();
  18      while ( $rev ) {
  19          $matches = false;
  20          foreach ( $regexes as $regex ) {
  21              $matches = $matches || preg_match( $regex, $rev->getText() );
  22          }
  23          if ( !$matches ) {
  24              // Didn't find any spam
  25              break;
  26          }
  27          # Revision::getPrevious can't be used in this way before MW 1.6 (Revision.php 1.26)
  28          #$rev = $rev->getPrevious();
  29          $revId = $title->getPreviousRevisionID( $revId );
  30          if ( $revId ) {
  31              $rev = Revision::newFromTitle( $title, $revId );
  32          } else {
  33              $rev = false;
  34          }
  35      }
  36      $dbw = wfGetDB( DB_MASTER );
  37      $dbw->begin();
  38      if ( !$rev ) {
  39          // Didn't find a non-spammy revision, delete the page
  40          /*
  41          print "All revisions are spam, deleting...\n";
  42          $article = new Article( $title );
  43          $article->doDeleteArticle( "All revisions matched the spam blacklist" );
  44          */
  45          // Too scary, blank instead
  46          print "All revisions are spam, blanking...\n";
  47          $text = '';
  48          $comment = "All revisions matched the spam blacklist ($match), blanking";
  49      } else {
  50          // Revert to this revision
  51          $text = $rev->getText();
  52          $comment = "Cleaning up links to $match";
  53      }
  54      $wikiPage = new WikiPage( $title );
  55      $wikiPage->doEdit( $text, $comment );
  56      $dbw->commit();
  57  }
  58  
  59  //------------------------------------------------------------------------------
  60  
  61  $username = 'Spam cleanup script';
  62  $wgUser = User::newFromName( $username );
  63  if ( $wgUser->idForName() == 0 ) {
  64      // Create the user
  65      $status = $wgUser->addToDatabase();
  66      if ( $status === null || $status->isOK() ) {
  67          $dbw = wfGetDB( DB_MASTER );
  68          $dbw->update( 'user', array( 'user_password' => 'nologin' ),
  69              array( 'user_name' => $username ), $username );
  70      }
  71  }
  72  
  73  if ( isset( $options['n'] ) ) {
  74      $dryRun = true;
  75  } else {
  76      $dryRun = false;
  77  }
  78  
  79  $sb = new SpamBlacklist( $wgSpamBlacklistSettings );
  80  if ( $wgSpamBlacklistFiles ) {
  81      $sb->files = $wgSpamBlacklistFiles;
  82  }
  83  $regexes = $sb->getBlacklists();
  84  if ( !$regexes ) {
  85      print "Invalid regex, can't clean up spam\n";
  86      exit( 1 );
  87  }
  88  
  89  $dbr = wfGetDB( DB_SLAVE );
  90  $maxID = $dbr->selectField( 'page', 'MAX(page_id)' );
  91  $reportingInterval = 100;
  92  
  93  print "Regexes are " . implode( ', ', array_map( 'count', $regexes ) ) . " bytes\n";
  94  print "Searching for spam in $maxID pages...\n";
  95  if ( $dryRun ) {
  96      print "Dry run only\n";
  97  }
  98  
  99  for ( $id = 1; $id <= $maxID; $id++ ) {
 100      if ( $id % $reportingInterval == 0 ) {
 101          printf( "%-8d  %-5.2f%%\r", $id, $id / $maxID * 100 );
 102      }
 103      $revision = Revision::loadFromPageId( $dbr, $id );
 104      if ( $revision ) {
 105          $text = $revision->getText();
 106          if ( $text ) {
 107              foreach ( $regexes as $regex ) {
 108                  if ( preg_match( $regex, $text, $matches ) ) {
 109                      $title = $revision->getTitle();
 110                      $titleText = $title->getPrefixedText();
 111                      if ( $dryRun ) {
 112                          print "\nFound spam in [[$titleText]]\n";
 113                      } else {
 114                          print "\nCleaning up links to {$matches[0]} in [[$titleText]]\n";
 115                          $match = str_replace( 'http://', '', $matches[0] );
 116                          cleanupArticle( $revision, $regexes, $match );
 117                      }
 118                  }
 119              }
 120          }
 121      }
 122  }
 123  // Just for satisfaction
 124  printf( "%-8d  %-5.2f%%\n", $id - 1, ( $id - 1 ) / $maxID * 100 );
 125  


Generated: Fri Nov 28 14:03:12 2014 Cross-referenced by PHPXref 0.7.1