| [ Index ] |
PHP Cross Reference of MediaWiki-1.24.0 |
[Summary view] [Print] [Text view]
<?php

/**
 * Utility class for working with blacklists
 */
class SpamRegexBatch {
	/**
	 * Build a set of regular expressions matching URLs with the list of regex fragments.
	 * Returns an empty list if the input list is empty.
	 *
	 * Fragments are joined with "|" into batches. A new batch is started whenever
	 * the accumulated batch plus the next fragment would be longer than $batchSize.
	 * Fragments ending in a backslash are skipped (see the comment in the loop).
	 *
	 * @param array $lines list of fragments which will match in URLs
	 * @param BaseBlacklist $blacklist supplies the regex prefix and suffix
	 * @param int $batchSize largest allowed batch regex;
	 *                       if 0, will produce one regex per line
	 * @return array list of complete regexes, possibly empty
	 */
	static function buildRegexes( $lines, BaseBlacklist $blacklist, $batchSize = 4096 ) {
		# Make regex
		# It's faster using the S modifier even though it will usually only be run once
		$regexes = array();
		$regexStart = $blacklist->getRegexStart();
		$regexEnd = $blacklist->getRegexEnd( $batchSize );
		$build = false;
		foreach ( $lines as $line ) {
			if ( substr( $line, -1, 1 ) === "\\" ) {
				// Final \ will break silently on the batched regexes.
				// Skip it here to avoid breaking the next line;
				// warnings from getBadLines() will still trigger on
				// edit to keep new ones from floating in.
				continue;
			}
			// FIXME: not very robust size check, but should work. :)
			if ( $build === false ) {
				$build = $line;
			} elseif ( strlen( $build ) + strlen( $line ) > $batchSize ) {
				// Current batch is full: emit it and start a new one with this line.
				$regexes[] = self::wrapBatch( $build, $regexStart, $regexEnd );
				$build = $line;
			} else {
				$build .= '|' . $line;
			}
		}
		if ( $build !== false ) {
			// Emit the final, partially filled batch.
			$regexes[] = self::wrapBatch( $build, $regexStart, $regexEnd );
		}
		return $regexes;
	}

	/**
	 * Turn one batch of "|"-joined fragments into a complete regex by
	 * normalising its slashes and adding the blacklist's prefix and suffix.
	 *
	 * @param string $build the joined fragments
	 * @param string $regexStart regex prefix, including the opening delimiter
	 * @param string $regexEnd regex suffix, including the closing delimiter
	 * @return string
	 */
	private static function wrapBatch( $build, $regexStart, $regexEnd ) {
		// Collapse any run of backslashes in front of a slash down to a bare
		// slash, so that each slash can then be escaped exactly once below.
		$normalized = preg_replace( '|\\\*/|u', '/', $build );
		if ( $normalized === null ) {
			// With the u modifier preg_replace() returns null when the subject
			// is not valid UTF-8. Using that null would produce a regex with an
			// empty group, which matches every URL. The pattern only contains
			// ASCII characters, so redoing the replacement in byte mode gives
			// the intended result.
			$normalized = preg_replace( '|\\\*/|', '/', $build );
		}
		return $regexStart . str_replace( '/', '\/', $normalized ) . $regexEnd;
	}

	/**
	 * Confirm that a set of regexes is either empty or valid.
	 *
	 * Each regex is run against the empty string with warnings suppressed;
	 * preg_match() returning false marks the regex as invalid.
	 *
	 * @param $regexes array set of regexes
	 * @return bool true if ok, false if contains invalid lines
	 */
	static function validateRegexes( $regexes ) {
		foreach ( $regexes as $regex ) {
			wfSuppressWarnings();
			$ok = preg_match( $regex, '' );
			wfRestoreWarnings();

			if ( $ok === false ) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Strip comments and whitespace, then remove blanks
	 *
	 * Everything from the first "#" to the end of a line is treated as a comment.
	 * Original array keys are preserved, so the result may have gaps in its keys.
	 *
	 * @param $lines array
	 * @return array
	 */
	static function stripLines( $lines ) {
		$withoutComments = preg_replace( '/#.*$/', '', $lines );
		$trimmed = array_map( 'trim', $withoutComments );
		// Filter on length rather than truthiness: array_filter() without a
		// callback would also discard a line consisting of "0".
		return array_filter( $trimmed, 'strlen' );
	}

	/**
	 * Do a sanity check on the batch regex.
	 *
	 * Builds batched regexes first; if any of them is invalid, rebuilds with
	 * one regex per line instead.
	 *
	 * @param $lines array unsanitized input lines
	 * @param $blacklist BaseBlacklist
	 * @param $fileName bool|string optional for debug reporting
	 * @return array of regexes
	 */
	static function buildSafeRegexes( $lines, BaseBlacklist $blacklist, $fileName = false ) {
		$lines = self::stripLines( $lines );
		$regexes = self::buildRegexes( $lines, $blacklist );
		if ( self::validateRegexes( $regexes ) ) {
			return $regexes;
		}

		// _Something_ broke... rebuild line-by-line; it'll be
		// slower if there's a lot of blacklist lines, but one
		// broken line won't take out hundreds of its brothers.
		if ( $fileName ) {
			wfDebugLog( 'SpamBlacklist', "Spam blacklist warning: bogus line in $fileName\n" );
		}
		return self::buildRegexes( $lines, $blacklist, 0 );
	}

	/**
	 * Returns an array of invalid lines
	 *
	 * @param array $lines
	 * @param $blacklist BaseBlacklist
	 * @return array of input lines which produce invalid input, or empty array if no problems
	 */
	static function getBadLines( $lines, BaseBlacklist $blacklist ) {
		$lines = self::stripLines( $lines );

		$badLines = array();
		foreach ( $lines as $line ) {
			if ( substr( $line, -1, 1 ) === "\\" ) {
				// Final \ will break silently on the batched regexes.
				$badLines[] = $line;
			}
		}

		$regexes = self::buildRegexes( $lines, $blacklist );
		if ( self::validateRegexes( $regexes ) ) {
			// No other problems!
			return $badLines;
		}

		// Something failed in the batch, so check them one by one.
		// Lines ending in a backslash are skipped by buildRegexes(), so they
		// produce no regex here and are not reported a second time.
		foreach ( $lines as $line ) {
			$regexes = self::buildRegexes( array( $line ), $blacklist );
			if ( !self::validateRegexes( $regexes ) ) {
				$badLines[] = $line;
			}
		}
		return $badLines;
	}

	/**
	 * Build a set of regular expressions from the given multiline input text,
	 * with empty lines and comments stripped.
	 *
	 * @param $source string
	 * @param $blacklist BaseBlacklist
	 * @param $fileName bool|string optional, for reporting of bad files
	 * @return array of regular expressions, potentially empty
	 */
	static function regexesFromText( $source, BaseBlacklist $blacklist, $fileName = false ) {
		$lines = explode( "\n", $source );
		return self::buildSafeRegexes( $lines, $blacklist, $fileName );
	}

	/**
	 * Build a set of regular expressions from a MediaWiki message.
	 * Will be correctly empty if the message isn't present.
	 *
	 * @param $message string
	 * @param $blacklist BaseBlacklist
	 * @return array of regular expressions, potentially empty
	 */
	static function regexesFromMessage( $message, BaseBlacklist $blacklist ) {
		$source = wfMessage( $message )->inContentLanguage();
		if ( $source->isDisabled() ) {
			return array();
		}
		return self::regexesFromText( $source->plain(), $blacklist );
	}
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
| Generated: Fri Nov 28 14:03:12 2014 | Cross-referenced by PHPXref 0.7.1 |