MediaWiki  REL1_22
CLDRPluralRuleEvaluator.php
Go to the documentation of this file.
00001 <?php
/**
 * Evaluates CLDR plural rules against a number.
 *
 * Rule strings are first converted by CLDRPluralRuleConverter into a
 * space-delimited RPN form, which is then run through a small stack machine.
 */
class CLDRPluralRuleEvaluator {
    /**
     * Compile the given rules and evaluate them against a number in one step.
     *
     * @param mixed $number Value to classify; it is passed through strval()
     * @param array $rules Rule strings, in the order they should be tried
     * @return int Index of the first rule that matches, or count( $rules ) if
     *   none of them do
     */
    public static function evaluate( $number, array $rules ) {
        $rules = self::compile( $rules );
        return self::evaluateCompiled( $number, $rules );
    }

    /**
     * Convert every rule in the array to its RPN form.
     *
     * @param array $rules Rule strings accepted by CLDRPluralRuleConverter::convert()
     * @return array The same keys, each value replaced by the converted rule
     */
    public static function compile( array $rules ) {
        // We can't use array_map() for this because it generates a warning if
        // there is an exception.
        foreach ( $rules as &$rule ) {
            $rule = CLDRPluralRuleConverter::convert( $rule );
        }
        // Break the reference so the last element of the returned array is
        // not left aliased to the loop variable.
        unset( $rule );
        return $rules;
    }

    /**
     * Evaluate already-compiled (RPN) rules against a number.
     *
     * @param mixed $number Value to classify; it is passed through strval() and
     *   must then look like an optionally negative decimal number
     * @param array $rules Compiled rules as returned by compile()
     * @return int Index of the first rule that evaluates to a true value, or
     *   count( $rules ) when no rule matches or the number is not valid
     */
    public static function evaluateCompiled( $number, array $rules ) {
        // Calculate the values of the operand symbols
        $number = strval( $number );
        if ( !preg_match( '/^ -? ( ([0-9]+) (?: \. ([0-9]+) )? )$/x', $number, $m ) ) {
            // NOTE(review): the message is newline-terminated on the assumption
            // that wfDebug() writes its argument verbatim -- confirm against
            // the wfDebug() implementation of this release.
            wfDebug( __METHOD__ . ": invalid number input, returning \"other\"\n" );
            return count( $rules );
        }
        if ( !isset( $m[3] ) ) {
            // No fraction part: n and i are the same integer and all of the
            // fraction-related operands are zero.
            $operandSymbols = array(
                'n' => intval( $m[1] ),
                'i' => intval( $m[1] ),
                'v' => 0,
                'w' => 0,
                'f' => 0,
                't' => 0
            );
        } else {
            $absValStr = $m[1];
            $intStr = $m[2];
            $fracStr = $m[3];
            $operandSymbols = array(
                // Absolute value of the number
                'n' => floatval( $absValStr ),
                // Integer digits
                'i' => intval( $intStr ),
                // Count of fraction digits, with and without trailing zeros
                'v' => strlen( $fracStr ),
                'w' => strlen( rtrim( $fracStr, '0' ) ),
                // Fraction digits as an integer, with and without trailing zeros
                'f' => intval( $fracStr ),
                't' => intval( rtrim( $fracStr, '0' ) ),
            );
        }

        // The byte values bounding the ASCII digits do not depend on the rule,
        // so they are computed once rather than once per rule.
        $zero = ord( '0' );
        $nine = ord( '9' );

        // The compiled form is RPN, with tokens strictly delimited by
        // spaces, so this is a simple RPN evaluator.
        foreach ( $rules as $i => $rule ) {
            $stack = array();
            foreach ( StringUtils::explode( ' ', $rule ) as $token ) {
                $ord = ord( $token );
                if ( isset( $operandSymbols[$token] ) ) {
                    // Operand symbol: push its value for this number
                    $stack[] = $operandSymbols[$token];
                } elseif ( $ord >= $zero && $ord <= $nine ) {
                    // Token starting with a digit: integer literal
                    $stack[] = intval( $token );
                } else {
                    // Anything else is a binary operator
                    $right = array_pop( $stack );
                    $left = array_pop( $stack );
                    $result = self::doOperation( $token, $left, $right );
                    $stack[] = $result;
                }
            }
            if ( $stack[0] ) {
                return $i;
            }
        }
        // None of the provided rules match. The number belongs to category
        // 'other', which comes last.
        return count( $rules );
    }

    /**
     * Apply a single binary RPN operator to its two operands.
     *
     * @param string $token Operator name
     * @param mixed $left Left operand: a number, a boolean or a range object
     * @param mixed $right Right operand: a number, a boolean or a range object
     * @return mixed Boolean for logical and comparison operators, a number for
     *   'mod', a CLDRPluralRuleEvaluator_Range for ',' and '..'
     * @throws CLDRPluralRuleError If the token is not a known operator
     */
    private static function doOperation( $token, $left, $right ) {
        // The membership operators call range methods on the right operand,
        // so a bare number there is wrapped into a single-value range first.
        if ( in_array( $token, array( 'in', 'not-in', 'within', 'not-within' ), true ) ) {
            if ( !( $right instanceof CLDRPluralRuleEvaluator_Range ) ) {
                $right = new CLDRPluralRuleEvaluator_Range( $right );
            }
        }
        switch ( $token ) {
            case 'or':
                return $left || $right;
            case 'and':
                return $left && $right;
            case 'is':
                return $left == $right;
            case 'is-not':
                return $left != $right;
            case 'in':
                return $right->isNumberIn( $left );
            case 'not-in':
                return !$right->isNumberIn( $left );
            case 'within':
                return $right->isNumberWithin( $left );
            case 'not-within':
                return !$right->isNumberWithin( $left );
            case 'mod':
                // Keep an integer result when the left operand is an integer
                if ( is_int( $left ) ) {
                    return (int)fmod( $left, $right );
                }
                return fmod( $left, $right );
            case ',':
                // Extend the left-hand range in place, creating it if the
                // left operand is still a bare number.
                if ( $left instanceof CLDRPluralRuleEvaluator_Range ) {
                    $range = $left;
                } else {
                    $range = new CLDRPluralRuleEvaluator_Range( $left );
                }
                $range->add( $right );
                return $range;
            case '..':
                return new CLDRPluralRuleEvaluator_Range( $left, $right );
            default:
                throw new CLDRPluralRuleError( "Invalid RPN token" );
        }
    }
}
00183 
/**
 * A set of numbers made up of single values and inclusive start..end spans,
 * used by the evaluator for the range operators.
 */
class CLDRPluralRuleEvaluator_Range {
    /**
     * Members of the set. Each entry is either a single number or a
     * two-element array( start, end ).
     */
    public $parts = array();

    /**
     * Create a range holding one value, or one start..end span.
     *
     * @param mixed $start The single value, or the start of the span
     * @param mixed $end The end of the span, or false for a single value
     */
    public function __construct( $start, $end = false ) {
        $this->parts[] = ( $end === false ) ? $start : array( $start, $end );
    }

    /**
     * Test whether a number belongs to this range.
     *
     * @param int|float $number The number to look for
     * @param bool $integerConstraint If true, start..end spans only match
     *   numbers that have an integer value
     * @return bool
     */
    public function isNumberIn( $number, $integerConstraint = true ) {
        foreach ( $this->parts as $member ) {
            if ( !is_array( $member ) ) {
                // Single value: loose equality
                if ( $number == $member ) {
                    return true;
                }
                continue;
            }

            // Span: optionally reject numbers with a fractional part
            if ( $integerConstraint && floor( $number ) !== (float)$number ) {
                continue;
            }
            list( $low, $high ) = $member;
            if ( $number >= $low && $number <= $high ) {
                return true;
            }
        }
        return false;
    }

    /**
     * Like isNumberIn(), but without the integer constraint on spans.
     *
     * @param int|float $number The number to look for
     * @return bool
     */
    public function isNumberWithin( $number ) {
        return $this->isNumberIn( $number, false );
    }

    /**
     * Add a single value, or all parts of another range, to this range.
     *
     * @param mixed $other A CLDRPluralRuleEvaluator_Range or a single value
     */
    public function add( $other ) {
        if ( !( $other instanceof self ) ) {
            $this->parts[] = $other;
            return;
        }
        $this->parts = array_merge( $this->parts, $other->parts );
    }

    /**
     * Render the range in a readable form such as "Range(1..3, 7)".
     *
     * @return string
     */
    public function __toString() {
        $out = 'Range(';
        $separator = '';
        foreach ( $this->parts as $index => $member ) {
            // Only the entry under a falsy key goes without a separator
            $separator = $index ? ', ' : '';
            $out .= $separator;
            $out .= is_array( $member )
                ? $member[0] . '..' . $member[1]
                : $member;
        }
        return $out . ')';
    }

}
00260 
/**
 * Converts a plural rule string into the space-delimited RPN form consumed by
 * CLDRPluralRuleEvaluator, using an operator/operand stack pair (shunting
 * yard) and checking operand types along the way.
 */
class CLDRPluralRuleConverter {
    /**
     * The input rule string
     */
    public $rule;

    /**
     * The current byte offset into the rule string
     */
    public $pos;

    /**
     * The byte length of the rule string
     */
    public $end;

    /**
     * The operator stack (CLDRPluralRuleConverter_Operator objects)
     */
    public $operators = array();

    /**
     * The operand stack (CLDRPluralRuleConverter_Expression objects)
     */
    public $operands = array();

    /**
     * Operator precedence. An operator already on the stack is resolved
     * before a new operator is pushed when its value here is greater than or
     * equal to that of the new operator.
     */
    public static $precedence = array(
        'or' => 2,
        'and' => 3,
        'is' => 4,
        'is-not' => 4,
        'in' => 4,
        'not-in' => 4,
        'within' => 4,
        'not-within' => 4,
        'mod' => 5,
        ',' => 6,
        '..' => 7,
    );

    /**
     * Characters skipped between tokens, as a mask for strspn()
     */
    const WHITESPACE_CLASS = " \t\r\n";

    /**
     * Characters that make up a number token, as a mask for strspn()
     */
    const NUMBER_CLASS = '0123456789';

    /**
     * The single-character operand symbols
     */
    const OPERAND_SYMBOLS = 'nivwft';

    /**
     * Anchored regex matching one word (letters and '@') at a given offset
     */
    const WORD_REGEX = '/[a-zA-Z@]+/A';

    /**
     * Convert a rule string to RPN.
     *
     * @param string $rule The rule to convert
     * @return string The RPN representation of the rule
     * @throws CLDRPluralRuleError If the rule cannot be parsed
     */
    public static function convert( $rule ) {
        $parser = new self( $rule );
        return $parser->doConvert();
    }

    /**
     * Private constructor; use convert().
     *
     * @param string $rule The rule to convert
     */
    protected function __construct( $rule ) {
        $this->rule = $rule;
        $this->pos = 0;
        $this->end = strlen( $rule );
    }

    /**
     * Do the conversion of the rule held by this object.
     *
     * @return string The RPN representation of the rule
     * @throws CLDRPluralRuleError On any syntax or type error
     */
    protected function doConvert() {
        $expectOperator = true;

        // Iterate through all tokens, saving the operators and operands to a
        // stack per Dijkstra's shunting yard algorithm.
        while ( false !== ( $token = $this->nextToken() ) ) {
            // In this grammar, there are only binary operators, so every valid
            // rule string will alternate between operator and operand tokens.
            $expectOperator = !$expectOperator;

            if ( $token instanceof CLDRPluralRuleConverter_Expression ) {
                // Operand
                if ( $expectOperator ) {
                    $token->error( 'unexpected operand' );
                }
                $this->operands[] = $token;
                continue;
            }

            // Operator
            if ( !$expectOperator ) {
                $token->error( 'unexpected operator' );
            }
            // Resolve higher precedence levels
            $lastOp = end( $this->operators );
            while ( $lastOp && self::$precedence[$token->name] <= self::$precedence[$lastOp->name] ) {
                $this->doOperation( $lastOp );
                array_pop( $this->operators );
                $lastOp = end( $this->operators );
            }
            $this->operators[] = $token;
        }

        // Finish off the stack
        while ( $op = array_pop( $this->operators ) ) {
            $this->doOperation( $op );
        }

        // Make sure the result is sane. The first case is possible for an empty
        // string input, the second should be unreachable.
        if ( !count( $this->operands ) ) {
            $this->error( 'condition expected' );
        } elseif ( count( $this->operands ) > 1 ) {
            $this->error( 'missing operator or too many operands' );
        }

        $value = $this->operands[0];
        if ( $value->type !== 'boolean' ) {
            $this->error( 'the result must have a boolean type' );
        }

        return $value->rpn;
    }

    /**
     * Fetch the next token from the rule string and advance the position.
     *
     * @return CLDRPluralRuleConverter_Fragment|bool The token (an Expression
     *   for operands, an Operator for operators), or false at the end of the
     *   input or at the start of a samples section
     * @throws CLDRPluralRuleError On an unexpected character or unknown word
     */
    protected function nextToken() {
        if ( $this->pos >= $this->end ) {
            return false;
        }

        // Whitespace
        $length = strspn( $this->rule, self::WHITESPACE_CLASS, $this->pos );
        $this->pos += $length;

        if ( $this->pos >= $this->end ) {
            return false;
        }

        // Number
        $length = strspn( $this->rule, self::NUMBER_CLASS, $this->pos );
        if ( $length !== 0 ) {
            $token = $this->newNumber( substr( $this->rule, $this->pos, $length ), $this->pos );
            $this->pos += $length;
            return $token;
        }

        // Two-character operators
        $op2 = substr( $this->rule, $this->pos, 2 );
        if ( $op2 === '..' || $op2 === '!=' ) {
            $token = $this->newOperator( $op2, $this->pos, 2 );
            $this->pos += 2;
            return $token;
        }

        // Single-character operators
        $op1 = $this->rule[$this->pos];
        if ( $op1 === ',' || $op1 === '=' || $op1 === '%' ) {
            $token = $this->newOperator( $op1, $this->pos, 1 );
            $this->pos ++;
            return $token;
        }

        // Word
        if ( !preg_match( self::WORD_REGEX, $this->rule, $m, 0, $this->pos ) ) {
            $this->error( 'unexpected character "' . $this->rule[$this->pos] . '"' );
        }
        $word1 = strtolower( $m[0] );
        $word2 = '';
        $nextTokenPos = $this->pos + strlen( $word1 );
        if ( $word1 === 'not' || $word1 === 'is' ) {
            // Look ahead one word
            $nextTokenPos += strspn( $this->rule, self::WHITESPACE_CLASS, $nextTokenPos );
            if ( $nextTokenPos < $this->end
                    && preg_match( self::WORD_REGEX, $this->rule, $m, 0, $nextTokenPos ) )
            {
                $word2 = strtolower( $m[0] );
                $nextTokenPos += strlen( $word2 );
            }
        }

        // Two-word operators like "is not" take precedence over single-word operators like "is"
        if ( $word2 !== '' ) {
            $bothWords = "{$word1}-{$word2}";
            if ( isset( self::$precedence[$bothWords] ) ) {
                $token = $this->newOperator( $bothWords, $this->pos, $nextTokenPos - $this->pos );
                $this->pos = $nextTokenPos;
                return $token;
            }
        }

        // Single-word operators
        if ( isset( self::$precedence[$word1] ) ) {
            $token = $this->newOperator( $word1, $this->pos, strlen( $word1 ) );
            $this->pos += strlen( $word1 );
            return $token;
        }

        // The single-character operand symbols. The length check matters:
        // strpos() alone would also accept multi-letter words that happen to
        // be substrings of the symbol list (e.g. "ni"), while the position
        // below only advances by one byte.
        if ( strlen( $word1 ) === 1 && strpos( self::OPERAND_SYMBOLS, $word1 ) !== false ) {
            $token = $this->newNumber( $word1, $this->pos );
            $this->pos ++;
            return $token;
        }

        // Samples
        if ( $word1 === '@integer' || $word1 === '@decimal' ) {
            // Samples are like comments, they have no effect on rule evaluation.
            // They run from the first sample indicator to the end of the string.
            $this->pos = $this->end;
            return false;
        }

        $this->error( 'unrecognised word' );
    }

    /**
     * Pop the top two operands, combine them with the given operator and push
     * the resulting expression back onto the operand stack.
     *
     * @param CLDRPluralRuleConverter_Operator $op The operator to apply
     * @throws CLDRPluralRuleError If fewer than two operands are available
     */
    protected function doOperation( $op ) {
        if ( count( $this->operands ) < 2 ) {
            $op->error( 'missing operand' );
        }
        $right = array_pop( $this->operands );
        $left = array_pop( $this->operands );
        $result = $op->operate( $left, $right );
        $this->operands[] = $result;
    }

    /**
     * Create a numerical expression object for a number literal or an
     * operand symbol.
     *
     * @param string $text The token text, used verbatim as the RPN
     * @param int $pos Byte offset of the token in the rule string
     * @return CLDRPluralRuleConverter_Expression
     */
    protected function newNumber( $text, $pos ) {
        return new CLDRPluralRuleConverter_Expression( $this, 'number', $text, $pos, strlen( $text ) );
    }

    /**
     * Create a binary operator object.
     *
     * @param string $type The operator name or alias
     * @param int $pos Byte offset of the operator in the rule string
     * @param int $length Byte length of the operator text
     * @return CLDRPluralRuleConverter_Operator
     */
    protected function newOperator( $type, $pos, $length ) {
        return new CLDRPluralRuleConverter_Operator( $this, $type, $pos, $length );
    }

    /**
     * Throw an error that is not tied to a particular token.
     *
     * @param string $message
     * @throws CLDRPluralRuleError Always
     */
    protected function error( $message ) {
        throw new CLDRPluralRuleError( $message );
    }
}
00544 
/**
 * A span of the rule string, identified by the parser that owns the string
 * plus a byte offset and length. Base class for expressions and operators.
 */
class CLDRPluralRuleConverter_Fragment {
    public $parser, $pos, $length, $end;

    /**
     * @param object $parser Object whose "rule" property holds the rule string
     * @param int $pos Byte offset of the fragment
     * @param int $length Byte length of the fragment
     */
    public function __construct( $parser, $pos, $length ) {
        $this->parser = $parser;
        $this->pos = $pos;
        $this->length = $length;
        // Offset of the first byte after the fragment
        $this->end = $pos + $length;
    }

    /**
     * Throw an error that quotes this fragment and its 1-based position.
     *
     * @param string $message
     * @throws CLDRPluralRuleError Always
     */
    public function error( $message ) {
        $text = $this->getText();
        $position = $this->pos + 1;
        throw new CLDRPluralRuleError( "$message at position " . $position . ": \"$text\"" );
    }

    /**
     * Get the part of the rule string covered by this fragment.
     *
     * @return string
     */
    public function getText() {
        $source = $this->parser->rule;
        return substr( $source, $this->pos, $this->length );
    }
}
00568 
/**
 * An operand of the converter: a fragment of the rule string together with a
 * type name and the RPN text generated for it so far.
 */
class CLDRPluralRuleConverter_Expression extends CLDRPluralRuleConverter_Fragment {
    public $type, $rpn;

    /**
     * @param object $parser The parser that owns the rule string
     * @param string $type Type name of the expression
     * @param string $rpn RPN text of the expression
     * @param int $pos Byte offset of the expression in the rule string
     * @param int $length Byte length of the expression
     */
    public function __construct( $parser, $type, $rpn, $pos, $length ) {
        parent::__construct( $parser, $pos, $length );
        $this->rpn = $rpn;
        $this->type = $type;
    }

    /**
     * Check whether this expression can be used where $type is required.
     * An exact type match always qualifies; in addition, a 'number' is
     * accepted wherever a 'range' is required.
     *
     * @param string $type The required type name
     * @return bool
     */
    public function isType( $type ) {
        if ( $type === $this->type ) {
            return true;
        }
        return $type === 'range' && $this->type === 'number';
    }
}
00594 
/**
 * An operator token of the converter. It knows its canonical name and can
 * combine two operand expressions into a new, type-checked expression.
 */
class CLDRPluralRuleConverter_Operator extends CLDRPluralRuleConverter_Fragment {
    /**
     * Canonical operator name, after alias resolution
     */
    public $name;

    /**
     * Type signature of each operator as three letters: left operand type,
     * right operand type, result type. The letters are expanded through
     * $typeSpecMap.
     */
    public static $opTypes = array(
        'or' => 'bbb',
        'and' => 'bbb',
        'is' => 'nnb',
        'is-not' => 'nnb',
        'in' => 'nrb',
        'not-in' => 'nrb',
        'within' => 'nrb',
        'not-within' => 'nrb',
        'mod' => 'nnn',
        ',' => 'rrr',
        '..' => 'nnr',
    );

    /**
     * Map from the one-letter type codes used in $opTypes to type names
     */
    public static $typeSpecMap = array(
        'b' => 'boolean',
        'n' => 'number',
        'r' => 'range',
    );

    /**
     * Map from symbolic operator aliases to canonical operator names
     */
    public static $aliasMap = array(
        '%' => 'mod',
        '!=' => 'not-in',
        '=' => 'in'
    );

    /**
     * @param object $parser The parser that owns the rule string
     * @param string $name The operator name or one of its aliases
     * @param int $pos Byte offset of the operator in the rule string
     * @param int $length Byte length of the operator text
     */
    public function __construct( $parser, $name, $pos, $length ) {
        parent::__construct( $parser, $pos, $length );
        $this->name = isset( self::$aliasMap[$name] ) ? self::$aliasMap[$name] : $name;
    }

    /**
     * Combine two operands with this operator.
     *
     * @param CLDRPluralRuleConverter_Expression $left
     * @param CLDRPluralRuleConverter_Expression $right
     * @return CLDRPluralRuleConverter_Expression Expression spanning both
     *   operands and the operator, whose RPN is "<left> <right> <name>"
     * @throws CLDRPluralRuleError If an operand has the wrong type
     */
    public function operate( $left, $right ) {
        $signature = self::$opTypes[$this->name];
        $leftType = self::$typeSpecMap[$signature[0]];
        $rightType = self::$typeSpecMap[$signature[1]];
        $producedType = self::$typeSpecMap[$signature[2]];

        // The combined expression covers everything from the earliest start
        // to the latest end of the three fragments involved.
        $spanStart = min( $this->pos, $left->pos, $right->pos );
        $spanEnd = max( $this->end, $left->end, $right->end );

        // Built before the type checks so that type errors can quote the
        // whole combined span.
        $combined = new CLDRPluralRuleConverter_Expression(
            $this->parser,
            $producedType,
            "{$left->rpn} {$right->rpn} {$this->name}",
            $spanStart,
            $spanEnd - $spanStart
        );

        if ( !$left->isType( $leftType ) ) {
            $combined->error( "invalid type for left operand: expected $leftType, got {$left->type}" );
        }
        if ( !$right->isType( $rightType ) ) {
            $combined->error( "invalid type for right operand: expected $rightType, got {$right->type}" );
        }

        return $combined;
    }
}
00685 
/**
 * The exception class for all the classes in this file. Every message is
 * prefixed with "CLDR plural rule error: ".
 */
class CLDRPluralRuleError extends MWException {
    /**
     * @param string $message Description of the problem, without the prefix
     */
    public function __construct( $message ) {
        $prefixed = 'CLDR plural rule error: ' . $message;
        parent::__construct( $prefixed );
    }
}
00694 }