MediaWiki  REL1_21
CLDRPluralRuleEvaluator.php
Go to the documentation of this file.
00001 <?php
/**
 * Evaluates CLDR plural rules against a number.
 *
 * Rules are first converted from their textual form into space-delimited
 * reverse Polish notation (RPN) by CLDRPluralRuleConverter, and the RPN form
 * is then run through a small stack machine.
 */
class CLDRPluralRuleEvaluator {
	/**
	 * Compile the given rules and evaluate a number against them.
	 *
	 * @param mixed $number The number to be evaluated against the rules
	 * @param array $rules The plural rules in their textual form
	 * @return int The key of the first rule that matches, or count( $rules )
	 *   when no rule matches
	 * @throws CLDRPluralRuleError If a rule cannot be converted or evaluated
	 */
	public static function evaluate( $number, array $rules ) {
		return self::evaluateCompiled( $number, self::compile( $rules ) );
	}

	/**
	 * Convert a set of rules to a compiled (RPN) form which is optimised for
	 * fast evaluation. The result is purely textual.
	 *
	 * @param array $rules The rules to compile
	 * @return array The same keys, each mapped to its RPN string
	 * @throws CLDRPluralRuleError If a rule cannot be converted
	 */
	public static function compile( array $rules ) {
		// We can't use array_map() for this because it generates a warning if
		// there is an exception. Writing back by key (instead of iterating by
		// reference) leaves no dangling reference behind.
		foreach ( $rules as $key => $rule ) {
			$rules[$key] = CLDRPluralRuleConverter::convert( $rule );
		}
		return $rules;
	}

	/**
	 * Evaluate a number against a set of rules already compiled with
	 * compile().
	 *
	 * @param mixed $number The number to be evaluated against the rules
	 * @param array $rules The compiled (RPN) rules
	 * @return int The key of the first rule that matches, or count( $rules )
	 *   when no rule matches
	 * @throws CLDRPluralRuleError If a compiled rule is not well-formed RPN
	 */
	public static function evaluateCompiled( $number, array $rules ) {
		// Loop-invariant bounds used to recognise numeric tokens.
		$zero = ord( '0' );
		$nine = ord( '9' );

		// The compiled form is RPN, with tokens strictly delimited by
		// spaces, so this is a simple RPN evaluator.
		foreach ( $rules as $i => $rule ) {
			$stack = array();
			foreach ( StringUtils::explode( ' ', $rule ) as $token ) {
				// ord() only looks at the first byte of the token.
				$ord = ord( $token );
				if ( $token === 'n' ) {
					$stack[] = $number;
				} elseif ( $ord >= $zero && $ord <= $nine ) {
					$stack[] = intval( $token );
				} else {
					// Every operator is binary. Refuse to continue rather
					// than silently feeding null operands to the operation.
					if ( count( $stack ) < 2 ) {
						throw new CLDRPluralRuleError(
							"Missing operand for RPN token \"$token\"" );
					}
					$right = array_pop( $stack );
					$left = array_pop( $stack );
					$stack[] = self::doOperation( $token, $left, $right );
				}
			}
			// A well-formed expression reduces to exactly one value.
			if ( count( $stack ) !== 1 ) {
				throw new CLDRPluralRuleError( "Malformed RPN expression \"$rule\"" );
			}
			if ( $stack[0] ) {
				return $i;
			}
		}
		// None of the provided rules match. The number belongs to category
		// 'other' which comes last.
		return count( $rules );
	}

	/**
	 * Perform a single binary operation of the RPN machine.
	 *
	 * @param string $token The operator
	 * @param mixed $left The left operand
	 * @param mixed $right The right operand
	 * @return mixed A boolean for the logical, comparison and membership
	 *   operators, a number for 'mod', and a CLDRPluralRuleEvaluator_Range
	 *   for ',' and '..'
	 * @throws CLDRPluralRuleError If the operator is not recognised
	 */
	private static function doOperation( $token, $left, $right ) {
		// Membership operators need a range on the right; promote a bare
		// number to a single-value range.
		if ( in_array( $token, array( 'in', 'not-in', 'within', 'not-within' ), true ) ) {
			if ( !( $right instanceof CLDRPluralRuleEvaluator_Range ) ) {
				$right = new CLDRPluralRuleEvaluator_Range( $right );
			}
		}
		switch ( $token ) {
			case 'or':
				return $left || $right;
			case 'and':
				return $left && $right;
			case 'is':
				// Loose comparison is kept on purpose; operands may mix
				// integers and floats.
				return $left == $right;
			case 'is-not':
				return $left != $right;
			case 'in':
				return $right->isNumberIn( $left );
			case 'not-in':
				return !$right->isNumberIn( $left );
			case 'within':
				return $right->isNumberWithin( $left );
			case 'not-within':
				return !$right->isNumberWithin( $left );
			case 'mod':
				// Keep integer operands integral so later comparisons and
				// range checks see an int rather than a float.
				if ( is_int( $left ) ) {
					return (int)fmod( $left, $right );
				}
				return fmod( $left, $right );
			case ',':
				if ( $left instanceof CLDRPluralRuleEvaluator_Range ) {
					$range = $left;
				} else {
					$range = new CLDRPluralRuleEvaluator_Range( $left );
				}
				$range->add( $right );
				return $range;
			case '..':
				return new CLDRPluralRuleEvaluator_Range( $left, $right );
			default:
				throw new CLDRPluralRuleError( "Invalid RPN token" );
		}
	}
}
00148 
/**
 * A set of numbers used by the evaluator: any mix of single values and
 * inclusive start..end intervals.
 */
class CLDRPluralRuleEvaluator_Range {
	/**
	 * Each entry is either a single value or an array( start, end ) pair.
	 */
	public $parts = array();

	/**
	 * Create a range holding one part.
	 *
	 * @param mixed $start The single value, or the start of the interval
	 * @param mixed $end The end of the interval; false (the default) means
	 *   that $start is stored as a single value
	 */
	public function __construct( $start, $end = false ) {
		$this->parts[] = ( $end === false ) ? $start : array( $start, $end );
	}

	/**
	 * Determine whether the number is covered by any part of the range.
	 *
	 * @param mixed $number The number to look for
	 * @param bool $integerConstraint When true, an interval part only
	 *   matches whole numbers; single-value parts are compared loosely
	 *   regardless of this flag
	 * @return bool
	 */
	public function isNumberIn( $number, $integerConstraint = true ) {
		foreach ( $this->parts as $part ) {
			if ( !is_array( $part ) ) {
				// Single value
				if ( $number == $part ) {
					return true;
				}
				continue;
			}

			// Interval: optionally reject fractional numbers first
			if ( $integerConstraint && floor( $number ) !== (float)$number ) {
				continue;
			}
			list( $low, $high ) = $part;
			if ( $number >= $low && $number <= $high ) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Same as isNumberIn(), but without the whole-number requirement on
	 * interval parts.
	 *
	 * @param mixed $number The number to look for
	 * @return bool
	 */
	public function isNumberWithin( $number ) {
		return $this->isNumberIn( $number, false );
	}

	/**
	 * Extend this range in place.
	 *
	 * @param mixed $other Another range, whose parts are all appended, or a
	 *   single value, which is appended as one part
	 */
	public function add( $other ) {
		if ( !( $other instanceof self ) ) {
			$this->parts[] = $other;
			return;
		}
		$this->parts = array_merge( $this->parts, $other->parts );
	}

	/**
	 * Produce a readable form such as "Range(1, 3..5)", for debugging.
	 *
	 * @return string
	 */
	public function __toString() {
		$pieces = array();
		foreach ( $this->parts as $part ) {
			$pieces[] = is_array( $part )
				? $part[0] . '..' . $part[1]
				: (string)$part;
		}
		return 'Range(' . implode( ', ', $pieces ) . ')';
	}

}
00225 
/**
 * Converts one textual plural rule into space-delimited reverse Polish
 * notation, using an operator stack and an operand stack.
 */
class CLDRPluralRuleConverter {
	/** The rule text being converted */
	public $rule;

	/** Byte offset in $rule of the next character to read */
	public $pos;

	/** Byte length of $rule */
	public $end;

	/** Stack of pending CLDRPluralRuleConverter_Operator objects */
	public $operators = array();

	/** Stack of CLDRPluralRuleConverter_Expression objects */
	public $operands = array();

	/**
	 * Operator precedence. A stacked operator is reduced before an incoming
	 * one is pushed whenever its number is greater than or equal to the
	 * incoming operator's number.
	 */
	public static $precedence = array(
		'or' => 2,
		'and' => 3,
		'is' => 4,
		'is-not' => 4,
		'in' => 4,
		'not-in' => 4,
		'within' => 4,
		'not-within' => 4,
		'mod' => 5,
		',' => 6,
		'..' => 7,
	);

	/**
	 * Characters skipped between tokens, in the form strspn() expects.
	 */
	const WHITESPACE_CLASS = " \t\r\n";

	/**
	 * Characters that make up a number token, in the form strspn() expects.
	 */
	const NUMBER_CLASS = '0123456789';

	/**
	 * Anchored pattern matching one alphabetic word at a given offset.
	 */
	const WORD_REGEX = '/[a-zA-Z]+/A';

	/**
	 * Convert a rule to RPN form.
	 *
	 * @param string $rule The rule text
	 * @return string The RPN form, tokens separated by single spaces
	 */
	public static function convert( $rule ) {
		$converter = new self( $rule );
		return $converter->doConvert();
	}

	/**
	 * Protected constructor; use convert().
	 *
	 * @param string $rule The rule text
	 */
	protected function __construct( $rule ) {
		$this->pos = 0;
		$this->end = strlen( $rule );
		$this->rule = $rule;
	}

	/**
	 * Run the conversion.
	 *
	 * @return string The RPN form of the rule
	 */
	protected function doConvert() {
		// Flipped before each token is inspected, so inside the loop body it
		// says whether the current token has to be an operator. The grammar
		// has only binary operators, so operands and operators alternate.
		$wantOperator = true;

		for ( $token = $this->nextToken(); $token !== false; $token = $this->nextToken() ) {
			$wantOperator = !$wantOperator;

			if ( $token instanceof CLDRPluralRuleConverter_Expression ) {
				// Operand
				if ( $wantOperator ) {
					$token->error( 'unexpected operand' );
				}
				$this->operands[] = $token;
				continue;
			}

			// Operator
			if ( !$wantOperator ) {
				$token->error( 'unexpected operator' );
			}

			// Reduce everything on the stack that binds at least as tightly
			// as the incoming operator, then push the incoming operator.
			$incoming = self::$precedence[$token->name];
			for ( $top = end( $this->operators );
				$top && $incoming <= self::$precedence[$top->name];
				$top = end( $this->operators )
			) {
				$this->doOperation( $top );
				array_pop( $this->operators );
			}
			$this->operators[] = $token;
		}

		// Input exhausted: reduce whatever operators are still pending.
		while ( count( $this->operators ) ) {
			$this->doOperation( array_pop( $this->operators ) );
		}

		// Exactly one operand must remain. Zero is what an empty input
		// produces; more than one should not be reachable.
		$remaining = count( $this->operands );
		if ( $remaining === 0 ) {
			$this->error( 'condition expected' );
		} elseif ( $remaining > 1 ) {
			$this->error( 'missing operator or too many operands' );
		}

		$result = $this->operands[0];
		if ( $result->type !== 'boolean' ) {
			$this->error( 'the result must have a boolean type' );
		}

		return $result->rpn;
	}

	/**
	 * Read the next token from the rule and advance the position past it.
	 *
	 * @return CLDRPluralRuleConverter_Fragment|bool The token, or false at
	 *   the end of the input
	 */
	protected function nextToken() {
		// Skip leading whitespace, if there is any input left at all
		if ( $this->pos < $this->end ) {
			$this->pos += strspn( $this->rule, self::WHITESPACE_CLASS, $this->pos );
		}
		if ( $this->pos >= $this->end ) {
			return false;
		}

		// Number
		$digits = strspn( $this->rule, self::NUMBER_CLASS, $this->pos );
		if ( $digits > 0 ) {
			$start = $this->pos;
			$this->pos += $digits;
			return $this->newNumber( substr( $this->rule, $start, $digits ), $start );
		}

		// Punctuation operators: comma first, then dot dot
		foreach ( array( ',', '..' ) as $symbol ) {
			$width = strlen( $symbol );
			if ( substr( $this->rule, $this->pos, $width ) === $symbol ) {
				$token = $this->newOperator( $symbol, $this->pos, $width );
				$this->pos += $width;
				return $token;
			}
		}

		// Word
		if ( !preg_match( self::WORD_REGEX, $this->rule, $m, 0, $this->pos ) ) {
			$this->error( 'unexpected character "' . $this->rule[$this->pos] . '"'  );
		}
		$word1 = strtolower( $m[0] );
		$word1Length = strlen( $word1 );
		$word2 = '';
		$afterWords = $this->pos + $word1Length;
		if ( $word1 === 'not' || $word1 === 'is' ) {
			// These may start a two-word operator, so peek at the next word
			$afterWords += strspn( $this->rule, self::WHITESPACE_CLASS, $afterWords );
			if ( $afterWords < $this->end
				&& preg_match( self::WORD_REGEX, $this->rule, $m, 0, $afterWords )
			) {
				$word2 = strtolower( $m[0] );
				$afterWords += strlen( $word2 );
			}
		}

		// Two-word operators like "is not" take precedence over single-word
		// operators like "is"
		if ( $word2 !== '' && isset( self::$precedence["{$word1}-{$word2}"] ) ) {
			$token = $this->newOperator( "{$word1}-{$word2}", $this->pos,
				$afterWords - $this->pos );
			$this->pos = $afterWords;
			return $token;
		}

		// Single-word operators
		if ( isset( self::$precedence[$word1] ) ) {
			$token = $this->newOperator( $word1, $this->pos, $word1Length );
			$this->pos += $word1Length;
			return $token;
		}

		// The special numerical keyword "n"
		if ( $word1 === 'n' ) {
			$token = $this->newNumber( 'n', $this->pos );
			$this->pos++;
			return $token;
		}

		$this->error( 'unrecognised word' );
	}

	/**
	 * Apply an operator to the two topmost operands, replacing them on the
	 * operand stack with the combined expression.
	 *
	 * @param CLDRPluralRuleConverter_Operator $op The operator to apply
	 */
	protected function doOperation( $op ) {
		if ( count( $this->operands ) < 2 ) {
			$op->error( 'missing operand' );
		}
		// The operand pushed last is the right-hand side
		$rhs = array_pop( $this->operands );
		$lhs = array_pop( $this->operands );
		$this->operands[] = $op->operate( $lhs, $rhs );
	}

	/**
	 * Create an expression object of type 'number'.
	 *
	 * @param string $text The token text, which is also its RPN form
	 * @param int $pos Offset of the token in the rule
	 * @return CLDRPluralRuleConverter_Expression
	 */
	protected function newNumber( $text, $pos ) {
		$length = strlen( $text );
		return new CLDRPluralRuleConverter_Expression( $this, 'number', $text, $pos, $length );
	}

	/**
	 * Create an operator object.
	 *
	 * @param string $type The operator name, a key of self::$precedence
	 * @param int $pos Offset of the operator in the rule
	 * @param int $length Length of the operator text in the rule
	 * @return CLDRPluralRuleConverter_Operator
	 */
	protected function newOperator( $type, $pos, $length ) {
		return new CLDRPluralRuleConverter_Operator( $this, $type, $pos, $length );
	}

	/**
	 * Throw an error that is not tied to a particular token.
	 *
	 * @param string $message
	 * @throws CLDRPluralRuleError Always
	 */
	protected function error( $message ) {
		throw new CLDRPluralRuleError( $message );
	}
}
00463 
/**
 * Base class for objects that remember which span of the rule text they
 * came from, so that errors can quote that span.
 */
class CLDRPluralRuleConverter_Fragment {
	/** The converter that owns the rule text */
	public $parser;

	/** Offset of the span in the rule text */
	public $pos;

	/** Length of the span */
	public $length;

	/** Offset just past the span ($pos + $length) */
	public $end;

	/**
	 * @param object $parser Object exposing the rule text as ->rule
	 * @param int $pos Offset of the span
	 * @param int $length Length of the span
	 */
	public function __construct( $parser, $pos, $length ) {
		$this->parser = $parser;
		$this->pos = $pos;
		$this->length = $length;
		$this->end = $length + $pos;
	}

	/**
	 * Throw an error that reports the 1-based position of this fragment and
	 * quotes its text.
	 *
	 * @param string $message
	 * @throws CLDRPluralRuleError Always
	 */
	public function error( $message ) {
		$position = $this->pos + 1;
		$text = $this->getText();
		throw new CLDRPluralRuleError( "$message at position " . $position . ": \"$text\"" );
	}

	/**
	 * Get the part of the rule text covered by this fragment.
	 *
	 * @return string
	 */
	public function getText() {
		$source = $this->parser->rule;
		return substr( $source, $this->pos, $this->length );
	}
}
00487 
/**
 * An operand: a fragment of the rule together with its type and its RPN
 * form. The converter creates these for number tokens and for the results
 * of applying operators.
 */
class CLDRPluralRuleConverter_Expression extends CLDRPluralRuleConverter_Fragment {
	/** Type name of the expression, e.g. 'number', 'range' or 'boolean' */
	public $type;

	/** RPN text of the expression */
	public $rpn;

	/**
	 * @param object $parser The owning converter
	 * @param string $type Type name of the expression
	 * @param string $rpn RPN text of the expression
	 * @param int $pos Offset of the expression in the rule
	 * @param int $length Length of the expression in the rule
	 */
	public function __construct( $parser, $type, $rpn, $pos, $length ) {
		parent::__construct( $parser, $pos, $length );
		$this->rpn = $rpn;
		$this->type = $type;
	}

	/**
	 * Check whether this expression is acceptable where the given type is
	 * required. A number is accepted wherever a range is required.
	 *
	 * @param string $type The required type
	 * @return bool
	 */
	public function isType( $type ) {
		if ( $this->type === $type ) {
			return true;
		}
		// The only implicit conversion: number to range
		return $type === 'range' && $this->type === 'number';
	}
}
00513 
/**
 * An operator token. It knows the operand types it accepts and can combine
 * two operand expressions into a new expression.
 */
class CLDRPluralRuleConverter_Operator extends CLDRPluralRuleConverter_Fragment {
	/** Operator name, a key of self::$opTypes */
	public $name;

	/**
	 * Type signature of every operator, as three letters: the left operand
	 * type, the right operand type and the result type. The letters are
	 * expanded through self::$typeSpecMap.
	 */
	public static $opTypes = array(
		'or' => 'bbb',
		'and' => 'bbb',
		'is' => 'nnb',
		'is-not' => 'nnb',
		'in' => 'nrb',
		'not-in' => 'nrb',
		'within' => 'nrb',
		'not-within' => 'nrb',
		'mod' => 'nnn',
		',' => 'rrr',
		'..' => 'nnr',
	);

	/**
	 * Expansion of the one-letter type codes used in self::$opTypes.
	 */
	public static $typeSpecMap = array(
		'b' => 'boolean',
		'n' => 'number',
		'r' => 'range',
	);

	/**
	 * @param object $parser The owning converter
	 * @param string $name Operator name
	 * @param int $pos Offset of the operator in the rule
	 * @param int $length Length of the operator text in the rule
	 */
	public function __construct( $parser, $name, $pos, $length ) {
		parent::__construct( $parser, $pos, $length );
		$this->name = $name;
	}

	/**
	 * Combine two operands with this operator.
	 *
	 * @param CLDRPluralRuleConverter_Expression $left
	 * @param CLDRPluralRuleConverter_Expression $right
	 * @return CLDRPluralRuleConverter_Expression Expression spanning both
	 *   operands and the operator, with RPN "left right operator"
	 */
	public function operate( $left, $right ) {
		list( $leftCode, $rightCode, $resultCode ) = str_split( self::$opTypes[$this->name] );
		$leftType = self::$typeSpecMap[$leftCode];
		$rightType = self::$typeSpecMap[$rightCode];
		$resultType = self::$typeSpecMap[$resultCode];

		// The new expression covers everything from the earliest start to
		// the latest end among the operator and its operands.
		$spanStart = min( $this->pos, $left->pos, $right->pos );
		$spanEnd = max( $this->end, $left->end, $right->end );
		$rpn = "{$left->rpn} {$right->rpn} {$this->name}";

		// Built before the type checks so that a type error can quote the
		// whole combined span.
		$combined = new CLDRPluralRuleConverter_Expression(
			$this->parser, $resultType, $rpn, $spanStart, $spanEnd - $spanStart );

		if ( !$left->isType( $leftType ) ) {
			$combined->error( "invalid type for left operand: expected $leftType, got {$left->type}" );
		}
		if ( !$right->isType( $rightType ) ) {
			$combined->error( "invalid type for right operand: expected $rightType, got {$right->type}" );
		}

		return $combined;
	}
}
00584 
/**
 * The exception thrown for every plural rule conversion or evaluation
 * error in this file.
 */
class CLDRPluralRuleError extends MWException {
	/**
	 * @param string $message Description of the problem; it is prefixed
	 *   with a fixed label before being handed to the parent exception
	 */
	public function __construct( $message ) {
		$labelled = 'CLDR plural rule error: ' . $message;
		parent::__construct( $labelled );
	}
}