[ Index ]

PHP Cross Reference of MediaWiki-1.24.0

title

Body

[close]

/extensions/SpamBlacklist/ -> SpamRegexBatch.php (source)

   1  <?php
   2  
   /**
    * Utility class for working with blacklists: turns lists of regex
    * fragments into batched regular expressions and checks them for validity.
    */
   class SpamRegexBatch {
       /**
        * Build a set of regular expressions matching URLs with the list of regex fragments.
        * Returns an empty list if the input list is empty.
        *
        * Fragments are joined with '|' into batches; every batch is wrapped in the
        * prefix and suffix supplied by $blacklist. Fragments ending in a backslash
        * are skipped, because the trailing backslash would escape the '|' (or the
        * closing part of the regex) that follows it.
        *
        * @param array $lines list of fragments which will match in URLs
        * @param BaseBlacklist $blacklist provides getRegexStart() / getRegexEnd()
        * @param int $batchSize largest allowed batch regex;
        *                       if 0, will produce one regex per line
        * @return array list of complete regular expressions
        */
       public static function buildRegexes( $lines, BaseBlacklist $blacklist, $batchSize = 4096 ) {
           $regexStart = $blacklist->getRegexStart();
           $regexEnd = $blacklist->getRegexEnd( $batchSize );

           $regexes = array();
           $build = false;
           foreach ( $lines as $line ) {
               if ( substr( $line, -1, 1 ) === "\\" ) {
                   // Final \ will break silently on the batched regexes.
                   // Skip it here to avoid breaking the next line;
                   // warnings from getBadLines() will still trigger on
                   // edit to keep new ones from floating in.
                   continue;
               }

               // FIXME: not very robust size check (the '|' separators and the
               // regex prefix/suffix are not counted), but should work.
               if ( $build === false ) {
                   $build = $line;
               } elseif ( strlen( $build ) + strlen( $line ) > $batchSize ) {
                   // Current batch is full: emit it and start a new one.
                   $regexes[] = $regexStart . self::escapeSlashes( $build ) . $regexEnd;
                   $build = $line;
               } else {
                   $build .= '|' . $line;
               }
           }

           if ( $build !== false ) {
               // Emit the last, partially filled batch.
               $regexes[] = $regexStart . self::escapeSlashes( $build ) . $regexEnd;
           }
           return $regexes;
       }

       /**
        * Make every '/' in a batch of fragments appear as exactly '\/'.
        *
        * Any run of backslashes directly in front of a slash is dropped first,
        * so slashes that were already escaped are not escaped twice.
        *
        * @param string $fragment one fragment or several joined with '|'
        * @return string the fragment with all slashes escaped
        */
       private static function escapeSlashes( $fragment ) {
           $normalized = preg_replace( '|\\\*/|u', '/', $fragment );
           if ( $normalized === null ) {
               // With the u modifier preg_replace() returns null when the subject
               // is not valid UTF-8. Using that null would wipe out the whole
               // batch and leave a regex made of nothing but prefix and suffix.
               // The pattern is pure ASCII, so a bytewise retry yields the same
               // result for valid input and still works for invalid input.
               $normalized = preg_replace( '|\\\*/|', '/', $fragment );
           }
           return str_replace( '/', '\/', $normalized );
       }

       /**
        * Confirm that a set of regexes is either empty or valid.
        *
        * Each regex is run against an empty string with warnings suppressed;
        * preg_match() returning false marks it as invalid.
        *
        * @param array $regexes set of regexes
        * @return bool true if ok, false if contains invalid lines
        */
       public static function validateRegexes( $regexes ) {
           foreach ( $regexes as $regex ) {
               wfSuppressWarnings();
               $ok = preg_match( $regex, '' );
               wfRestoreWarnings();

               if ( $ok === false ) {
                   return false;
               }
           }
           return true;
       }

       /**
        * Strip comments and whitespace, then remove blanks
        *
        * A comment runs from '#' to the end of the line. Only lines that are
        * empty after stripping are removed; a line such as "0" is kept.
        * Array keys of the input are preserved.
        *
        * @param array $lines
        * @return array
        */
       public static function stripLines( $lines ) {
           $stripped = array_map( 'trim', preg_replace( '/#.*$/', '', $lines ) );
           // An explicit comparison is needed here: array_filter() without a
           // callback would also throw away the valid fragment "0".
           return array_filter( $stripped, function ( $line ) {
               return $line !== '';
           } );
       }

       /**
        * Do a sanity check on the batch regex.
        *
        * Builds batched regexes first; if any of them is invalid, rebuilds
        * with one regex per line instead.
        *
        * @param array $lines unsanitized input lines
        * @param BaseBlacklist $blacklist
        * @param bool|string $fileName optional for debug reporting
        * @return array of regexes
        */
       public static function buildSafeRegexes( $lines, BaseBlacklist $blacklist, $fileName = false ) {
           $lines = self::stripLines( $lines );
           $regexes = self::buildRegexes( $lines, $blacklist );
           if ( self::validateRegexes( $regexes ) ) {
               return $regexes;
           }

           // _Something_ broke... rebuild line-by-line; it'll be
           // slower if there's a lot of blacklist lines, but one
           // broken line won't take out hundreds of its brothers.
           if ( $fileName ) {
               wfDebugLog( 'SpamBlacklist', "Spam blacklist warning: bogus line in $fileName\n" );
           }
           return self::buildRegexes( $lines, $blacklist, 0 );
       }

       /**
        * Returns an array of invalid lines
        *
        * Lines ending in a backslash are always reported. The remaining lines
        * are only checked one by one when the batched regexes fail to validate.
        *
        * @param array $lines
        * @param BaseBlacklist $blacklist
        * @return array of input lines which produce invalid input, or empty array if no problems
        */
       public static function getBadLines( $lines, BaseBlacklist $blacklist ) {
           $lines = self::stripLines( $lines );

           $badLines = array();
           foreach ( $lines as $line ) {
               if ( substr( $line, -1, 1 ) === "\\" ) {
                   // Final \ will break silently on the batched regexes.
                   $badLines[] = $line;
               }
           }

           if ( self::validateRegexes( self::buildRegexes( $lines, $blacklist ) ) ) {
               // No other problems!
               return $badLines;
           }

           // Something failed in the batch, so check them one by one.
           // Lines with a trailing backslash yield no regex here and are
           // therefore not reported a second time.
           foreach ( $lines as $line ) {
               $single = self::buildRegexes( array( $line ), $blacklist );
               if ( !self::validateRegexes( $single ) ) {
                   $badLines[] = $line;
               }
           }
           return $badLines;
       }

       /**
        * Build a set of regular expressions from the given multiline input text,
        * with empty lines and comments stripped.
        *
        * @param string $source text with one fragment per line ("\n" separated)
        * @param BaseBlacklist $blacklist
        * @param bool|string $fileName optional, for reporting of bad files
        * @return array of regular expressions, potentially empty
        */
       public static function regexesFromText( $source, BaseBlacklist $blacklist, $fileName = false ) {
           $lines = explode( "\n", $source );
           return self::buildSafeRegexes( $lines, $blacklist, $fileName );
       }

       /**
        * Build a set of regular expressions from a MediaWiki message.
        * Will be correctly empty if the message isn't present.
        *
        * @param string $message message key, looked up in the content language
        * @param BaseBlacklist $blacklist
        * @return array of regular expressions, potentially empty
        */
       public static function regexesFromMessage( $message, BaseBlacklist $blacklist ) {
           $source = wfMessage( $message )->inContentLanguage();
           if ( $source->isDisabled() ) {
               return array();
           }
           return self::regexesFromText( $source->plain(), $blacklist );
       }
   }


Generated: Fri Nov 28 14:03:12 2014 Cross-referenced by PHPXref 0.7.1