MediaWiki  REL1_20
Collation.php
Go to the documentation of this file.
00001 <?php
00023 abstract class Collation {
00024         static $instance;
00025 
00029         static function singleton() {
00030                 if ( !self::$instance ) {
00031                         global $wgCategoryCollation;
00032                         self::$instance = self::factory( $wgCategoryCollation );
00033                 }
00034                 return self::$instance;
00035         }
00036 
00042         static function factory( $collationName ) {
00043                 switch( $collationName ) {
00044                         case 'uppercase':
00045                                 return new UppercaseCollation;
00046                         case 'identity':
00047                                 return new IdentityCollation;
00048                         case 'uca-default':
00049                                 return new IcuCollation( 'root' );
00050                         default:
00051                                 # Provide a mechanism for extensions to hook in.
00052 
00053                                 $collationObject = null;
00054                                 wfRunHooks( 'Collation::factory', array( $collationName, &$collationObject ) );
00055 
00056                                 if ( $collationObject instanceof Collation ) {
00057                                         return $collationObject;
00058                                 }
00059 
00060                                 // If all else fails...
00061                                 throw new MWException( __METHOD__.": unknown collation type \"$collationName\"" );
00062                 }
00063         }
00064 
00076         abstract function getSortKey( $string );
00077 
00101         abstract function getFirstLetter( $string );
00102 }
00103 
00104 class UppercaseCollation extends Collation {
00105         var $lang;
00106         function __construct() {
00107                 // Get a language object so that we can use the generic UTF-8 uppercase
00108                 // function there
00109                 $this->lang = Language::factory( 'en' );
00110         }
00111 
00112         function getSortKey( $string ) {
00113                 return $this->lang->uc( $string );
00114         }
00115 
00116         function getFirstLetter( $string ) {
00117                 if ( $string[0] == "\0" ) {
00118                         $string = substr( $string, 1 );
00119                 }
00120                 return $this->lang->ucfirst( $this->lang->firstChar( $string ) );
00121         }
00122 }
00123 
00130 class IdentityCollation extends Collation {
00131 
00132         function getSortKey( $string ) {
00133                 return $string;
00134         }
00135 
00136         function getFirstLetter( $string ) {
00137                 global $wgContLang;
00138                 // Copied from UppercaseCollation.
00139                 // I'm kind of unclear on when this could happen...
00140                 if ( $string[0] == "\0" ) {
00141                         $string = substr( $string, 1 );
00142                 }
00143                 return $wgContLang->firstChar( $string );
00144         }
00145 }
00146 
00147 
00148 class IcuCollation extends Collation {
00149         var $primaryCollator, $mainCollator, $locale;
00150         var $firstLetterData;
00151 
00161         static $cjkBlocks = array(
00162                 array( 0x2E80, 0x2EFF ), // CJK Radicals Supplement
00163                 array( 0x2F00, 0x2FDF ), // Kangxi Radicals
00164                 array( 0x2FF0, 0x2FFF ), // Ideographic Description Characters
00165                 array( 0x3000, 0x303F ), // CJK Symbols and Punctuation
00166                 array( 0x31C0, 0x31EF ), // CJK Strokes
00167                 array( 0x3200, 0x32FF ), // Enclosed CJK Letters and Months
00168                 array( 0x3300, 0x33FF ), // CJK Compatibility
00169                 array( 0x3400, 0x4DBF ), // CJK Unified Ideographs Extension A
00170                 array( 0x4E00, 0x9FFF ), // CJK Unified Ideographs
00171                 array( 0xF900, 0xFAFF ), // CJK Compatibility Ideographs
00172                 array( 0xFE30, 0xFE4F ), // CJK Compatibility Forms
00173                 array( 0x20000, 0x2A6DF ), // CJK Unified Ideographs Extension B
00174                 array( 0x2A700, 0x2B73F ), // CJK Unified Ideographs Extension C
00175                 array( 0x2B740, 0x2B81F ), // CJK Unified Ideographs Extension D
00176                 array( 0x2F800, 0x2FA1F ), // CJK Compatibility Ideographs Supplement
00177         );
00178 
00179         const RECORD_LENGTH = 14;
00180 
00181         function __construct( $locale ) {
00182                 if ( !extension_loaded( 'intl' ) ) {
00183                         throw new MWException( 'An ICU collation was requested, ' . 
00184                                 'but the intl extension is not available.' );
00185                 }
00186                 $this->locale = $locale;
00187                 $this->mainCollator = Collator::create( $locale );
00188                 if ( !$this->mainCollator ) {
00189                         throw new MWException( "Invalid ICU locale specified for collation: $locale" );
00190                 }
00191 
00192                 $this->primaryCollator = Collator::create( $locale );
00193                 $this->primaryCollator->setStrength( Collator::PRIMARY );
00194         }
00195 
00196         function getSortKey( $string ) {
00197                 // intl extension produces non null-terminated
00198                 // strings. Appending '' fixes it so that it doesn't generate
00199                 // a warning on each access in debug php.
00200                 wfSuppressWarnings();
00201                 $key = $this->mainCollator->getSortKey( $string ) . '';
00202                 wfRestoreWarnings();
00203                 return $key;
00204         }
00205 
00206         function getPrimarySortKey( $string ) {
00207                 wfSuppressWarnings();
00208                 $key = $this->primaryCollator->getSortKey( $string ) . '';
00209                 wfRestoreWarnings();
00210                 return $key;
00211         }
00212 
00213         function getFirstLetter( $string ) {
00214                 $string = strval( $string );
00215                 if ( $string === '' ) {
00216                         return '';
00217                 }
00218 
00219                 // Check for CJK
00220                 $firstChar = mb_substr( $string, 0, 1, 'UTF-8' );
00221                 if ( ord( $firstChar ) > 0x7f 
00222                         && self::isCjk( utf8ToCodepoint( $firstChar ) ) ) 
00223                 {
00224                         return $firstChar;
00225                 }
00226 
00227                 $sortKey = $this->getPrimarySortKey( $string );
00228 
00229                 // Do a binary search to find the correct letter to sort under
00230                 $min = $this->findLowerBound(
00231                         array( $this, 'getSortKeyByLetterIndex' ),
00232                         $this->getFirstLetterCount(),
00233                         'strcmp',
00234                         $sortKey );
00235 
00236                 if ( $min === false ) {
00237                         // Before the first letter
00238                         return '';
00239                 }
00240                 return $this->getLetterByIndex( $min );
00241         }
00242 
00243         function getFirstLetterData() {
00244                 if ( $this->firstLetterData !== null ) {
00245                         return $this->firstLetterData;
00246                 }
00247 
00248                 $cache = wfGetCache( CACHE_ANYTHING );
00249                 $cacheKey = wfMemcKey( 'first-letters', $this->locale );
00250                 $cacheEntry = $cache->get( $cacheKey );
00251 
00252                 if ( $cacheEntry ) {
00253                         $this->firstLetterData = $cacheEntry;
00254                         return $this->firstLetterData;
00255                 }
00256 
00257                 // Generate data from serialized data file
00258 
00259                 $letters = wfGetPrecompiledData( "first-letters-{$this->locale}.ser" );
00260                 if ( $letters === false ) {
00261                         throw new MWException( "MediaWiki does not support ICU locale " .
00262                                 "\"{$this->locale}\"" );
00263                 }
00264 
00265                 // Sort the letters.
00266                 //
00267                 // It's impossible to have the precompiled data file properly sorted,
00268                 // because the sort order changes depending on ICU version. If the 
00269                 // array is not properly sorted, the binary search will return random 
00270                 // results. 
00271                 //
00272                 // We also take this opportunity to remove primary collisions.
00273                 $letterMap = array();
00274                 foreach ( $letters as $letter ) {
00275                         $key = $this->getPrimarySortKey( $letter );
00276                         if ( isset( $letterMap[$key] ) ) {
00277                                 // Primary collision
00278                                 // Keep whichever one sorts first in the main collator
00279                                 if ( $this->mainCollator->compare( $letter, $letterMap[$key] ) < 0 ) {
00280                                         $letterMap[$key] = $letter;
00281                                 }
00282                         } else {
00283                                 $letterMap[$key] = $letter;
00284                         }
00285                 }
00286                 ksort( $letterMap, SORT_STRING );
00287                 $data = array(
00288                         'chars' => array_values( $letterMap ),
00289                         'keys' => array_keys( $letterMap )
00290                 );
00291 
00292                 // Reduce memory usage before caching
00293                 unset( $letterMap );
00294 
00295                 // Save to cache
00296                 $this->firstLetterData = $data;
00297                 $cache->set( $cacheKey, $data, 86400 * 7 /* 1 week */ );
00298                 return $data;
00299         }
00300 
00301         function getLetterByIndex( $index ) {
00302                 if ( $this->firstLetterData === null ) {
00303                         $this->getFirstLetterData();
00304                 }
00305                 return $this->firstLetterData['chars'][$index];
00306         }
00307 
00308         function getSortKeyByLetterIndex( $index ) {
00309                 if ( $this->firstLetterData === null ) {
00310                         $this->getFirstLetterData();
00311                 }
00312                 return $this->firstLetterData['keys'][$index];
00313         }
00314 
00315         function getFirstLetterCount() {
00316                 if ( $this->firstLetterData === null ) {
00317                         $this->getFirstLetterData();
00318                 }
00319                 return count( $this->firstLetterData['chars'] );
00320         }
00321 
00337         function findLowerBound( $valueCallback, $valueCount, $comparisonCallback, $target ) {
00338                 $min = 0;
00339                 $max = $valueCount - 1;
00340                 do {
00341                         $mid = $min + ( ( $max - $min ) >> 1 );
00342                         $item = call_user_func( $valueCallback, $mid );
00343                         $comparison = call_user_func( $comparisonCallback, $target, $item );
00344                         if ( $comparison > 0 ) {
00345                                 $min = $mid;
00346                         } elseif ( $comparison == 0 ) {
00347                                 $min = $mid;
00348                                 break;
00349                         } else {
00350                                 $max = $mid;
00351                         }
00352                 } while ( $min < $max - 1 );
00353 
00354                 if ( $min == 0 && $max == 0 && $comparison > 0 ) {
00355                         // Before the first item
00356                         return false;
00357                 } else {
00358                         return $min;
00359                 }
00360         }
00361 
00362         static function isCjk( $codepoint ) {
00363                 foreach ( self::$cjkBlocks as $block ) {
00364                         if ( $codepoint >= $block[0] && $codepoint <= $block[1] ) {
00365                                 return true;
00366                         }
00367                 }
00368                 return false;
00369         }
00370 }
00371