MediaWiki  REL1_19
Collation.php
Go to the documentation of this file.
00001 <?php
00002 
00003 abstract class Collation {
00004         static $instance;
00005 
00009         static function singleton() {
00010                 if ( !self::$instance ) {
00011                         global $wgCategoryCollation;
00012                         self::$instance = self::factory( $wgCategoryCollation );
00013                 }
00014                 return self::$instance;
00015         }
00016 
00022         static function factory( $collationName ) {
00023                 switch( $collationName ) {
00024                         case 'uppercase':
00025                                 return new UppercaseCollation;
00026                         case 'identity':
00027                                 return new IdentityCollation;
00028                         case 'uca-default':
00029                                 return new IcuCollation( 'root' );
00030                         default:
00031                                 # Provide a mechanism for extensions to hook in.
00032 
00033                                 $collationObject = null;
00034                                 wfRunHooks( 'Collation::factory', array( $collationName, &$collationObject ) );
00035 
00036                                 if ( $collationObject instanceof Collation ) {
00037                                         return $collationObject;
00038                                 }
00039 
00040                                 // If all else fails...
00041                                 throw new MWException( __METHOD__.": unknown collation type \"$collationName\"" );
00042                 }
00043         }
00044 
00056         abstract function getSortKey( $string );
00057 
00081         abstract function getFirstLetter( $string );
00082 }
00083 
00084 class UppercaseCollation extends Collation {
00085         var $lang;
00086         function __construct() {
00087                 // Get a language object so that we can use the generic UTF-8 uppercase
00088                 // function there
00089                 $this->lang = Language::factory( 'en' );
00090         }
00091 
00092         function getSortKey( $string ) {
00093                 return $this->lang->uc( $string );
00094         }
00095 
00096         function getFirstLetter( $string ) {
00097                 if ( $string[0] == "\0" ) {
00098                         $string = substr( $string, 1 );
00099                 }
00100                 return $this->lang->ucfirst( $this->lang->firstChar( $string ) );
00101         }
00102 }
00103 
00110 class IdentityCollation extends Collation {
00111 
00112         function getSortKey( $string ) {
00113                 return $string;
00114         }
00115 
00116         function getFirstLetter( $string ) {
00117                 global $wgContLang;
00118                 // Copied from UppercaseCollation.
00119                 // I'm kind of unclear on when this could happen...
00120                 if ( $string[0] == "\0" ) {
00121                         $string = substr( $string, 1 );
00122                 }
00123                 return $wgContLang->firstChar( $string );
00124         }
00125 }
00126 
00127 
00128 class IcuCollation extends Collation {
00129         var $primaryCollator, $mainCollator, $locale;
00130         var $firstLetterData;
00131 
00141         static $cjkBlocks = array(
00142                 array( 0x2E80, 0x2EFF ), // CJK Radicals Supplement
00143                 array( 0x2F00, 0x2FDF ), // Kangxi Radicals
00144                 array( 0x2FF0, 0x2FFF ), // Ideographic Description Characters
00145                 array( 0x3000, 0x303F ), // CJK Symbols and Punctuation
00146                 array( 0x31C0, 0x31EF ), // CJK Strokes
00147                 array( 0x3200, 0x32FF ), // Enclosed CJK Letters and Months
00148                 array( 0x3300, 0x33FF ), // CJK Compatibility
00149                 array( 0x3400, 0x4DBF ), // CJK Unified Ideographs Extension A
00150                 array( 0x4E00, 0x9FFF ), // CJK Unified Ideographs
00151                 array( 0xF900, 0xFAFF ), // CJK Compatibility Ideographs
00152                 array( 0xFE30, 0xFE4F ), // CJK Compatibility Forms
00153                 array( 0x20000, 0x2A6DF ), // CJK Unified Ideographs Extension B
00154                 array( 0x2A700, 0x2B73F ), // CJK Unified Ideographs Extension C
00155                 array( 0x2B740, 0x2B81F ), // CJK Unified Ideographs Extension D
00156                 array( 0x2F800, 0x2FA1F ), // CJK Compatibility Ideographs Supplement
00157         );
00158 
00159         const RECORD_LENGTH = 14;
00160 
00161         function __construct( $locale ) {
00162                 if ( !extension_loaded( 'intl' ) ) {
00163                         throw new MWException( 'An ICU collation was requested, ' . 
00164                                 'but the intl extension is not available.' );
00165                 }
00166                 $this->locale = $locale;
00167                 $this->mainCollator = Collator::create( $locale );
00168                 if ( !$this->mainCollator ) {
00169                         throw new MWException( "Invalid ICU locale specified for collation: $locale" );
00170                 }
00171 
00172                 $this->primaryCollator = Collator::create( $locale );
00173                 $this->primaryCollator->setStrength( Collator::PRIMARY );
00174         }
00175 
00176         function getSortKey( $string ) {
00177                 // intl extension produces non null-terminated
00178                 // strings. Appending '' fixes it so that it doesn't generate
00179                 // a warning on each access in debug php.
00180                 wfSuppressWarnings();
00181                 $key = $this->mainCollator->getSortKey( $string ) . '';
00182                 wfRestoreWarnings();
00183                 return $key;
00184         }
00185 
00186         function getPrimarySortKey( $string ) {
00187                 wfSuppressWarnings();
00188                 $key = $this->primaryCollator->getSortKey( $string ) . '';
00189                 wfRestoreWarnings();
00190                 return $key;
00191         }
00192 
00193         function getFirstLetter( $string ) {
00194                 $string = strval( $string );
00195                 if ( $string === '' ) {
00196                         return '';
00197                 }
00198 
00199                 // Check for CJK
00200                 $firstChar = mb_substr( $string, 0, 1, 'UTF-8' );
00201                 if ( ord( $firstChar ) > 0x7f 
00202                         && self::isCjk( utf8ToCodepoint( $firstChar ) ) ) 
00203                 {
00204                         return $firstChar;
00205                 }
00206 
00207                 $sortKey = $this->getPrimarySortKey( $string );
00208 
00209                 // Do a binary search to find the correct letter to sort under
00210                 $min = $this->findLowerBound(
00211                         array( $this, 'getSortKeyByLetterIndex' ),
00212                         $this->getFirstLetterCount(),
00213                         'strcmp',
00214                         $sortKey );
00215 
00216                 if ( $min === false ) {
00217                         // Before the first letter
00218                         return '';
00219                 }
00220                 return $this->getLetterByIndex( $min );
00221         }
00222 
00223         function getFirstLetterData() {
00224                 if ( $this->firstLetterData !== null ) {
00225                         return $this->firstLetterData;
00226                 }
00227 
00228                 $cache = wfGetCache( CACHE_ANYTHING );
00229                 $cacheKey = wfMemcKey( 'first-letters', $this->locale );
00230                 $cacheEntry = $cache->get( $cacheKey );
00231 
00232                 if ( $cacheEntry ) {
00233                         $this->firstLetterData = $cacheEntry;
00234                         return $this->firstLetterData;
00235                 }
00236 
00237                 // Generate data from serialized data file
00238 
00239                 $letters = wfGetPrecompiledData( "first-letters-{$this->locale}.ser" );
00240                 if ( $letters === false ) {
00241                         throw new MWException( "MediaWiki does not support ICU locale " .
00242                                 "\"{$this->locale}\"" );
00243                 }
00244 
00245                 // Sort the letters.
00246                 //
00247                 // It's impossible to have the precompiled data file properly sorted,
00248                 // because the sort order changes depending on ICU version. If the 
00249                 // array is not properly sorted, the binary search will return random 
00250                 // results. 
00251                 //
00252                 // We also take this opportunity to remove primary collisions.
00253                 $letterMap = array();
00254                 foreach ( $letters as $letter ) {
00255                         $key = $this->getPrimarySortKey( $letter );
00256                         if ( isset( $letterMap[$key] ) ) {
00257                                 // Primary collision
00258                                 // Keep whichever one sorts first in the main collator
00259                                 if ( $this->mainCollator->compare( $letter, $letterMap[$key] ) < 0 ) {
00260                                         $letterMap[$key] = $letter;
00261                                 }
00262                         } else {
00263                                 $letterMap[$key] = $letter;
00264                         }
00265                 }
00266                 ksort( $letterMap, SORT_STRING );
00267                 $data = array(
00268                         'chars' => array_values( $letterMap ),
00269                         'keys' => array_keys( $letterMap )
00270                 );
00271 
00272                 // Reduce memory usage before caching
00273                 unset( $letterMap );
00274 
00275                 // Save to cache
00276                 $this->firstLetterData = $data;
00277                 $cache->set( $cacheKey, $data, 86400 * 7 /* 1 week */ );
00278                 return $data;
00279         }
00280 
00281         function getLetterByIndex( $index ) {
00282                 if ( $this->firstLetterData === null ) {
00283                         $this->getFirstLetterData();
00284                 }
00285                 return $this->firstLetterData['chars'][$index];
00286         }
00287 
00288         function getSortKeyByLetterIndex( $index ) {
00289                 if ( $this->firstLetterData === null ) {
00290                         $this->getFirstLetterData();
00291                 }
00292                 return $this->firstLetterData['keys'][$index];
00293         }
00294 
00295         function getFirstLetterCount() {
00296                 if ( $this->firstLetterData === null ) {
00297                         $this->getFirstLetterData();
00298                 }
00299                 return count( $this->firstLetterData['chars'] );
00300         }
00301 
00317         function findLowerBound( $valueCallback, $valueCount, $comparisonCallback, $target ) {
00318                 $min = 0;
00319                 $max = $valueCount - 1;
00320                 do {
00321                         $mid = $min + ( ( $max - $min ) >> 1 );
00322                         $item = call_user_func( $valueCallback, $mid );
00323                         $comparison = call_user_func( $comparisonCallback, $target, $item );
00324                         if ( $comparison > 0 ) {
00325                                 $min = $mid;
00326                         } elseif ( $comparison == 0 ) {
00327                                 $min = $mid;
00328                                 break;
00329                         } else {
00330                                 $max = $mid;
00331                         }
00332                 } while ( $min < $max - 1 );
00333 
00334                 if ( $min == 0 && $max == 0 && $comparison > 0 ) {
00335                         // Before the first item
00336                         return false;
00337                 } else {
00338                         return $min;
00339                 }
00340         }
00341 
00342         static function isCjk( $codepoint ) {
00343                 foreach ( self::$cjkBlocks as $block ) {
00344                         if ( $codepoint >= $block[0] && $codepoint <= $block[1] ) {
00345                                 return true;
00346                         }
00347                 }
00348                 return false;
00349         }
00350 }
00351