MediaWiki  REL1_20
CLDRPluralRuleEvaluator.php
Go to the documentation of this file.
00001 <?php
00015 class CLDRPluralRuleEvaluator {
00024         public static function evaluate( $number, array $rules ) {
00025                 $rules = self::compile( $rules );
00026                 return self::evaluateCompiled( $number, $rules );
00027         }
00028 
00036         public static function compile( array $rules ) {
00037                 // We can't use array_map() for this because it generates a warning if
00038                 // there is an exception.
00039                 foreach ( $rules as &$rule ) {
00040                         $rule = CLDRPluralRuleConverter::convert( $rule );
00041                 }
00042                 return $rules;
00043         }
00044 
00049         public static function evaluateCompiled( $number, array $rules ) {
00050                 // The compiled form is RPN, with tokens strictly delimited by
00051                 // spaces, so this is a simple RPN evaluator.
00052                 foreach ( $rules as $i => $rule  ) {
00053                         $stack = array();
00054                         $zero = ord( '0' );
00055                         $nine = ord( '9' );
00056                         foreach ( StringUtils::explode( ' ', $rule ) as $token ) {
00057                                 $ord = ord( $token );
00058                                 if ( $token === 'n' ) {
00059                                         $stack[] = $number;
00060                                 } elseif ( $ord >= $zero && $ord <= $nine ) {
00061                                         $stack[] = intval( $token );
00062                                 } else {
00063                                         $right = array_pop( $stack );
00064                                         $left = array_pop( $stack );
00065                                         $result = self::doOperation( $token, $left, $right );
00066                                         $stack[] = $result;
00067                                 }
00068                         }
00069                         if ( $stack[0] ) {
00070                                 return $i;
00071                         }
00072                 }
00073                 // None of the provided rules match. The number belongs to caregory
00074                 // 'other' which comes last.
00075                 return count( $rules );
00076         }
00077 
00086         private static function doOperation( $token, $left, $right ) {
00087                 if ( in_array( $token, array( 'in', 'not-in', 'within', 'not-within' ) ) ) {
00088                         if ( !($right instanceof CLDRPluralRuleEvaluator_Range ) ) {
00089                                 $right = new CLDRPluralRuleEvaluator_Range( $right );
00090                         }
00091                 }
00092                 switch ( $token ) {
00093                         case 'or':
00094                                 return $left || $right;
00095                         case 'and':
00096                                 return $left && $right;
00097                         case 'is':
00098                                 return $left == $right;
00099                         case 'is-not':
00100                                 return $left != $right;
00101                         case 'in':
00102                                 return $right->isNumberIn( $left );
00103                         case 'not-in':
00104                                 return !$right->isNumberIn( $left );
00105                         case 'within':
00106                                 return $right->isNumberWithin( $left );
00107                         case 'not-within':
00108                                 return !$right->isNumberWithin( $left );
00109                         case 'mod':
00110                                 if ( is_int( $left ) ) {
00111                                         return (int) fmod( $left, $right );
00112                                 }
00113                                 return fmod( $left, $right );
00114                         case ',':
00115                                 if ( $left instanceof CLDRPluralRuleEvaluator_Range ) {
00116                                         $range = $left;
00117                                 } else {
00118                                         $range = new CLDRPluralRuleEvaluator_Range( $left );
00119                                 }
00120                                 $range->add( $right );
00121                                 return $range;
00122                         case '..':
00123                                 return new CLDRPluralRuleEvaluator_Range( $left, $right );
00124                         default:
00125                                 throw new CLDRPluralRuleError( "Invalid RPN token" );
00126                 }
00127         }
00128 }
00129 
00133 class CLDRPluralRuleEvaluator_Range {
00134         var $parts = array();
00135 
00136         function __construct( $start, $end = false ) {
00137                 if ( $end === false ) {
00138                         $this->parts[] = $start;
00139                 } else {
00140                         $this->parts[] = array( $start, $end );
00141                 }
00142         }
00143 
00149         function isNumberIn( $number, $integerConstraint = true ) {
00150                 foreach ( $this->parts as $part ) {
00151                         if ( is_array( $part ) ) {
00152                                 if ( ( !$integerConstraint || floor( $number ) === (float)$number )
00153                                         && $number >= $part[0] && $number <= $part[1] )
00154                                 {
00155                                         return true;
00156                                 }
00157                         } else {
00158                                 if ( $number == $part ) {
00159                                         return true;
00160                                 }
00161                         }
00162                 }
00163                 return false;
00164         }
00165 
00170         function isNumberWithin( $number ) {
00171                 return $this->isNumberIn( $number, false );
00172         }
00173 
00178         function add( $other ) {
00179                 if ( $other instanceof self ) {
00180                         $this->parts = array_merge( $this->parts, $other->parts );
00181                 } else {
00182                         $this->parts[] = $other;
00183                 }
00184         }
00185 
00189         function __toString() {
00190                 $s = 'Range(';
00191                 foreach ( $this->parts as $i => $part ) {
00192                         if ( $i ) {
00193                                 $s .= ', ';
00194                         }
00195                         if ( is_array( $part ) ) {
00196                                 $s .= $part[0] . '..' . $part[1];
00197                         } else {
00198                                 $s .= $part;
00199                         }
00200                 }
00201                 $s .= ')';
00202                 return $s;
00203         }
00204 
00205 }
00206 
00210 class CLDRPluralRuleConverter {
00211         var $rule, $pos, $end;
00212         var $operators = array();
00213         var $operands = array();
00214 
00220         static $precedence = array(
00221                 'or' => 2,
00222                 'and' => 3,
00223                 'is' => 4,
00224                 'is-not' => 4,
00225                 'in' => 4,
00226                 'not-in' => 4,
00227                 'within' => 4,
00228                 'not-within' => 4,
00229                 'mod' => 5,
00230                 ',' => 6,
00231                 '..' => 7,
00232         );
00233 
00237         const WHITESPACE_CLASS = " \t\r\n";
00238 
00243         const NUMBER_CLASS = '0123456789';
00244 
00248         const WORD_REGEX = '/[a-zA-Z]+/A';
00249 
00253         public static function convert( $rule ) {
00254                 $parser = new self( $rule );
00255                 return $parser->doConvert();
00256         }
00257 
00261         protected function __construct( $rule ) {
00262                 $this->rule = $rule;
00263                 $this->pos = 0;
00264                 $this->end = strlen( $rule );
00265         }
00266 
00270         protected function doConvert() {
00271                 $expectOperator = true;
00272 
00273                 // Iterate through all tokens, saving the operators and operands to a
00274                 // stack per Dijkstra's shunting yard algorithm.
00275                 while ( false !== ( $token = $this->nextToken() ) ) {
00276                         // In this grammar, there are only binary operators, so every valid
00277                         // rule string will alternate between operator and operand tokens.
00278                         $expectOperator = !$expectOperator;
00279 
00280                         if ( $token instanceof CLDRPluralRuleConverter_Expression ) {
00281                                 // Operand
00282                                 if ( $expectOperator ) {
00283                                         $token->error( 'unexpected operand' );
00284                                 }
00285                                 $this->operands[] = $token;
00286                                 continue;
00287                         } else {
00288                                 // Operator
00289                                 if  ( !$expectOperator ) {
00290                                         $token->error( 'unexpected operator' );
00291                                 }
00292                                 // Resolve higher precedence levels
00293                                 $lastOp = end( $this->operators );
00294                                 while ( $lastOp && self::$precedence[$token->name] <= self::$precedence[$lastOp->name] ) {
00295                                         $this->doOperation( $lastOp, $this->operands );
00296                                         array_pop( $this->operators );
00297                                         $lastOp = end( $this->operators );
00298                                 }
00299                                 $this->operators[] = $token;
00300                         }
00301                 }
00302 
00303                 // Finish off the stack
00304                 while ( $op = array_pop( $this->operators ) ) {
00305                         $this->doOperation( $op, $this->operands );
00306                 }
00307 
00308                 // Make sure the result is sane. The first case is possible for an empty
00309                 // string input, the second should be unreachable.
00310                 if ( !count( $this->operands ) ) {
00311                         $this->error( 'condition expected' );
00312                 } elseif ( count( $this->operands ) > 1 ) {
00313                         $this->error( 'missing operator or too many operands' );
00314                 }
00315 
00316                 $value = $this->operands[0];
00317                 if ( $value->type !== 'boolean' ) {
00318                         $this->error( 'the result must have a boolean type' );
00319                 }
00320 
00321                 return $this->operands[0]->rpn;
00322         }
00323 
00328         protected function nextToken() {
00329                 if ( $this->pos >= $this->end ) {
00330                         return false;
00331                 }
00332 
00333                 // Whitespace
00334                 $length = strspn( $this->rule, self::WHITESPACE_CLASS, $this->pos );
00335                 $this->pos += $length;
00336 
00337                 if ( $this->pos >= $this->end ) {
00338                         return false;
00339                 }
00340 
00341                 // Number
00342                 $length = strspn( $this->rule, self::NUMBER_CLASS, $this->pos );
00343                 if ( $length !== 0 ) {
00344                         $token = $this->newNumber( substr( $this->rule, $this->pos, $length ), $this->pos );
00345                         $this->pos += $length;
00346                         return $token;
00347                 }
00348 
00349                 // Comma
00350                 if ( $this->rule[$this->pos] === ',' ) {
00351                         $token = $this->newOperator( ',', $this->pos, 1 );
00352                         $this->pos ++;
00353                         return $token;
00354                 }
00355 
00356                 // Dot dot
00357                 if ( substr( $this->rule, $this->pos, 2 ) === '..' ) {
00358                         $token = $this->newOperator( '..', $this->pos, 2 );
00359                         $this->pos += 2;
00360                         return $token;
00361                 }
00362 
00363                 // Word
00364                 if ( !preg_match( self::WORD_REGEX, $this->rule, $m, 0, $this->pos ) ) {
00365                         $this->error( 'unexpected character "' . $this->rule[$this->pos] . '"'  );
00366                 }
00367                 $word1 = strtolower( $m[0] );
00368                 $word2 = '';
00369                 $nextTokenPos = $this->pos + strlen( $word1 );
00370                 if ( $word1 === 'not' || $word1 === 'is' ) {
00371                         // Look ahead one word
00372                         $nextTokenPos += strspn( $this->rule, self::WHITESPACE_CLASS, $nextTokenPos );
00373                         if ( $nextTokenPos < $this->end
00374                                         && preg_match( self::WORD_REGEX, $this->rule, $m, 0, $nextTokenPos ) )
00375                         {
00376                                 $word2 = strtolower( $m[0] );
00377                                 $nextTokenPos += strlen( $word2 );
00378                         }
00379                 }
00380 
00381                 // Two-word operators like "is not" take precedence over single-word operators like "is"
00382                 if ( $word2 !== '' ) {
00383                         $bothWords = "{$word1}-{$word2}";
00384                         if ( isset( self::$precedence[$bothWords] ) ) {
00385                                 $token = $this->newOperator( $bothWords, $this->pos, $nextTokenPos - $this->pos );
00386                                 $this->pos = $nextTokenPos;
00387                                 return $token;
00388                         }
00389                 }
00390 
00391                 // Single-word operators
00392                 if ( isset( self::$precedence[$word1] ) ) {
00393                         $token = $this->newOperator( $word1, $this->pos, strlen( $word1 ) );
00394                         $this->pos += strlen( $word1 );
00395                         return $token;
00396                 }
00397 
00398                 // The special numerical keyword "n"
00399                 if ( $word1 === 'n' ) {
00400                         $token = $this->newNumber( 'n', $this->pos );
00401                         $this->pos ++;
00402                         return $token;
00403                 }
00404 
00405                 $this->error( 'unrecognised word' );
00406         }
00407 
00413         protected function doOperation( $op ) {
00414                 if ( count( $this->operands ) < 2 ) {
00415                         $op->error( 'missing operand' );
00416                 }
00417                 $right = array_pop( $this->operands );
00418                 $left = array_pop( $this->operands );
00419                 $result = $op->operate( $left, $right );
00420                 $this->operands[] = $result;
00421         }
00422 
00426         protected function newNumber( $text, $pos ) {
00427                 return new CLDRPluralRuleConverter_Expression( $this, 'number', $text, $pos, strlen( $text ) );
00428         }
00429 
00433         protected function newOperator( $type, $pos, $length ) {
00434                 return new CLDRPluralRuleConverter_Operator( $this, $type, $pos, $length );
00435         }
00436 
00440         protected function error( $message ) {
00441                 throw new CLDRPluralRuleError( $message );
00442         }
00443 }
00444 
00449 class CLDRPluralRuleConverter_Fragment {
00450         var $parser, $pos, $length, $end;
00451 
00452         function __construct( $parser, $pos, $length ) {
00453                 $this->parser = $parser;
00454                 $this->pos = $pos;
00455                 $this->length = $length;
00456                 $this->end = $pos + $length;
00457         }
00458 
00459         public function error( $message ) {
00460                 $text = $this->getText();
00461                 throw new CLDRPluralRuleError( "$message at position " . ( $this->pos + 1 ) . ": \"$text\"" );
00462         }
00463 
00464         public function getText() {
00465                 return substr( $this->parser->rule, $this->pos, $this->length );
00466         }
00467 }
00468 
00475 class CLDRPluralRuleConverter_Expression extends CLDRPluralRuleConverter_Fragment {
00476         var $type, $rpn;
00477 
00478         function __construct( $parser, $type, $rpn, $pos, $length ) {
00479                 parent::__construct( $parser, $pos, $length );
00480                 $this->type = $type;
00481                 $this->rpn = $rpn;
00482         }
00483 
00484         public function isType( $type ) {
00485                 if ( $type === 'range' && ( $this->type === 'range' || $this->type === 'number' ) ) {
00486                         return true;
00487                 }
00488                 if ( $type === $this->type ) {
00489                         return true;
00490                 }
00491                 return false;
00492         }
00493 }
00494 
00500 class CLDRPluralRuleConverter_Operator extends CLDRPluralRuleConverter_Fragment {
00501         var $name;
00502 
00512         static $opTypes = array(
00513                 'or' => 'bbb',
00514                 'and' => 'bbb',
00515                 'is' => 'nnb',
00516                 'is-not' => 'nnb',
00517                 'in' => 'nrb',
00518                 'not-in' => 'nrb',
00519                 'within' => 'nrb',
00520                 'not-within' => 'nrb',
00521                 'mod' => 'nnn',
00522                 ',' => 'rrr',
00523                 '..' => 'nnr',
00524         );
00525 
00529         static $typeSpecMap = array(
00530                 'b' => 'boolean',
00531                 'n' => 'number',
00532                 'r' => 'range',
00533         );
00534 
00535         function __construct( $parser, $name, $pos, $length ) {
00536                 parent::__construct( $parser, $pos, $length );
00537                 $this->name = $name;
00538         }
00539 
00540         public function operate( $left, $right ) {
00541                 $typeSpec = self::$opTypes[$this->name];
00542 
00543                 $leftType = self::$typeSpecMap[$typeSpec[0]];
00544                 $rightType = self::$typeSpecMap[$typeSpec[1]];
00545                 $resultType = self::$typeSpecMap[$typeSpec[2]];
00546 
00547                 $start = min( $this->pos, $left->pos, $right->pos );
00548                 $end = max( $this->end, $left->end, $right->end );
00549                 $length = $end - $start;
00550 
00551                 $newExpr = new CLDRPluralRuleConverter_Expression( $this->parser, $resultType,
00552                         "{$left->rpn} {$right->rpn} {$this->name}",
00553                         $start, $length );
00554 
00555                 if ( !$left->isType( $leftType ) ) {
00556                         $newExpr->error( "invalid type for left operand: expected $leftType, got {$left->type}" );
00557                 }
00558 
00559                 if ( !$right->isType( $rightType ) ) {
00560                         $newExpr->error( "invalid type for right operand: expected $rightType, got {$right->type}" );
00561                 }
00562                 return $newExpr;
00563         }
00564 }
00565 
00570 class CLDRPluralRuleError extends MWException {
00571         function __construct( $message ) {
00572                 parent::__construct( 'CLDR plural rule error: ' . $message );
00573         }
00574 }