MediaWiki
REL1_20
|
00001 <?php 00023 abstract class Collation { 00024 static $instance; 00025 00029 static function singleton() { 00030 if ( !self::$instance ) { 00031 global $wgCategoryCollation; 00032 self::$instance = self::factory( $wgCategoryCollation ); 00033 } 00034 return self::$instance; 00035 } 00036 00042 static function factory( $collationName ) { 00043 switch( $collationName ) { 00044 case 'uppercase': 00045 return new UppercaseCollation; 00046 case 'identity': 00047 return new IdentityCollation; 00048 case 'uca-default': 00049 return new IcuCollation( 'root' ); 00050 default: 00051 # Provide a mechanism for extensions to hook in. 00052 00053 $collationObject = null; 00054 wfRunHooks( 'Collation::factory', array( $collationName, &$collationObject ) ); 00055 00056 if ( $collationObject instanceof Collation ) { 00057 return $collationObject; 00058 } 00059 00060 // If all else fails... 00061 throw new MWException( __METHOD__.": unknown collation type \"$collationName\"" ); 00062 } 00063 } 00064 00076 abstract function getSortKey( $string ); 00077 00101 abstract function getFirstLetter( $string ); 00102 } 00103 00104 class UppercaseCollation extends Collation { 00105 var $lang; 00106 function __construct() { 00107 // Get a language object so that we can use the generic UTF-8 uppercase 00108 // function there 00109 $this->lang = Language::factory( 'en' ); 00110 } 00111 00112 function getSortKey( $string ) { 00113 return $this->lang->uc( $string ); 00114 } 00115 00116 function getFirstLetter( $string ) { 00117 if ( $string[0] == "\0" ) { 00118 $string = substr( $string, 1 ); 00119 } 00120 return $this->lang->ucfirst( $this->lang->firstChar( $string ) ); 00121 } 00122 } 00123 00130 class IdentityCollation extends Collation { 00131 00132 function getSortKey( $string ) { 00133 return $string; 00134 } 00135 00136 function getFirstLetter( $string ) { 00137 global $wgContLang; 00138 // Copied from UppercaseCollation. 00139 // I'm kind of unclear on when this could happen... 00140 if ( $string[0] == "\0" ) { 00141 $string = substr( $string, 1 ); 00142 } 00143 return $wgContLang->firstChar( $string ); 00144 } 00145 } 00146 00147 00148 class IcuCollation extends Collation { 00149 var $primaryCollator, $mainCollator, $locale; 00150 var $firstLetterData; 00151 00161 static $cjkBlocks = array( 00162 array( 0x2E80, 0x2EFF ), // CJK Radicals Supplement 00163 array( 0x2F00, 0x2FDF ), // Kangxi Radicals 00164 array( 0x2FF0, 0x2FFF ), // Ideographic Description Characters 00165 array( 0x3000, 0x303F ), // CJK Symbols and Punctuation 00166 array( 0x31C0, 0x31EF ), // CJK Strokes 00167 array( 0x3200, 0x32FF ), // Enclosed CJK Letters and Months 00168 array( 0x3300, 0x33FF ), // CJK Compatibility 00169 array( 0x3400, 0x4DBF ), // CJK Unified Ideographs Extension A 00170 array( 0x4E00, 0x9FFF ), // CJK Unified Ideographs 00171 array( 0xF900, 0xFAFF ), // CJK Compatibility Ideographs 00172 array( 0xFE30, 0xFE4F ), // CJK Compatibility Forms 00173 array( 0x20000, 0x2A6DF ), // CJK Unified Ideographs Extension B 00174 array( 0x2A700, 0x2B73F ), // CJK Unified Ideographs Extension C 00175 array( 0x2B740, 0x2B81F ), // CJK Unified Ideographs Extension D 00176 array( 0x2F800, 0x2FA1F ), // CJK Compatibility Ideographs Supplement 00177 ); 00178 00179 const RECORD_LENGTH = 14; 00180 00181 function __construct( $locale ) { 00182 if ( !extension_loaded( 'intl' ) ) { 00183 throw new MWException( 'An ICU collation was requested, ' . 00184 'but the intl extension is not available.' ); 00185 } 00186 $this->locale = $locale; 00187 $this->mainCollator = Collator::create( $locale ); 00188 if ( !$this->mainCollator ) { 00189 throw new MWException( "Invalid ICU locale specified for collation: $locale" ); 00190 } 00191 00192 $this->primaryCollator = Collator::create( $locale ); 00193 $this->primaryCollator->setStrength( Collator::PRIMARY ); 00194 } 00195 00196 function getSortKey( $string ) { 00197 // intl extension produces non null-terminated 00198 // strings. Appending '' fixes it so that it doesn't generate 00199 // a warning on each access in debug php. 00200 wfSuppressWarnings(); 00201 $key = $this->mainCollator->getSortKey( $string ) . ''; 00202 wfRestoreWarnings(); 00203 return $key; 00204 } 00205 00206 function getPrimarySortKey( $string ) { 00207 wfSuppressWarnings(); 00208 $key = $this->primaryCollator->getSortKey( $string ) . ''; 00209 wfRestoreWarnings(); 00210 return $key; 00211 } 00212 00213 function getFirstLetter( $string ) { 00214 $string = strval( $string ); 00215 if ( $string === '' ) { 00216 return ''; 00217 } 00218 00219 // Check for CJK 00220 $firstChar = mb_substr( $string, 0, 1, 'UTF-8' ); 00221 if ( ord( $firstChar ) > 0x7f 00222 && self::isCjk( utf8ToCodepoint( $firstChar ) ) ) 00223 { 00224 return $firstChar; 00225 } 00226 00227 $sortKey = $this->getPrimarySortKey( $string ); 00228 00229 // Do a binary search to find the correct letter to sort under 00230 $min = $this->findLowerBound( 00231 array( $this, 'getSortKeyByLetterIndex' ), 00232 $this->getFirstLetterCount(), 00233 'strcmp', 00234 $sortKey ); 00235 00236 if ( $min === false ) { 00237 // Before the first letter 00238 return ''; 00239 } 00240 return $this->getLetterByIndex( $min ); 00241 } 00242 00243 function getFirstLetterData() { 00244 if ( $this->firstLetterData !== null ) { 00245 return $this->firstLetterData; 00246 } 00247 00248 $cache = wfGetCache( CACHE_ANYTHING ); 00249 $cacheKey = wfMemcKey( 'first-letters', $this->locale ); 00250 $cacheEntry = $cache->get( $cacheKey ); 00251 00252 if ( $cacheEntry ) { 00253 $this->firstLetterData = $cacheEntry; 00254 return $this->firstLetterData; 00255 } 00256 00257 // Generate data from serialized data file 00258 00259 $letters = wfGetPrecompiledData( "first-letters-{$this->locale}.ser" ); 00260 if ( $letters === false ) { 00261 throw new MWException( "MediaWiki does not support ICU locale " . 00262 "\"{$this->locale}\"" ); 00263 } 00264 00265 // Sort the letters. 00266 // 00267 // It's impossible to have the precompiled data file properly sorted, 00268 // because the sort order changes depending on ICU version. If the 00269 // array is not properly sorted, the binary search will return random 00270 // results. 00271 // 00272 // We also take this opportunity to remove primary collisions. 00273 $letterMap = array(); 00274 foreach ( $letters as $letter ) { 00275 $key = $this->getPrimarySortKey( $letter ); 00276 if ( isset( $letterMap[$key] ) ) { 00277 // Primary collision 00278 // Keep whichever one sorts first in the main collator 00279 if ( $this->mainCollator->compare( $letter, $letterMap[$key] ) < 0 ) { 00280 $letterMap[$key] = $letter; 00281 } 00282 } else { 00283 $letterMap[$key] = $letter; 00284 } 00285 } 00286 ksort( $letterMap, SORT_STRING ); 00287 $data = array( 00288 'chars' => array_values( $letterMap ), 00289 'keys' => array_keys( $letterMap ) 00290 ); 00291 00292 // Reduce memory usage before caching 00293 unset( $letterMap ); 00294 00295 // Save to cache 00296 $this->firstLetterData = $data; 00297 $cache->set( $cacheKey, $data, 86400 * 7 /* 1 week */ ); 00298 return $data; 00299 } 00300 00301 function getLetterByIndex( $index ) { 00302 if ( $this->firstLetterData === null ) { 00303 $this->getFirstLetterData(); 00304 } 00305 return $this->firstLetterData['chars'][$index]; 00306 } 00307 00308 function getSortKeyByLetterIndex( $index ) { 00309 if ( $this->firstLetterData === null ) { 00310 $this->getFirstLetterData(); 00311 } 00312 return $this->firstLetterData['keys'][$index]; 00313 } 00314 00315 function getFirstLetterCount() { 00316 if ( $this->firstLetterData === null ) { 00317 $this->getFirstLetterData(); 00318 } 00319 return count( $this->firstLetterData['chars'] ); 00320 } 00321 00337 function findLowerBound( $valueCallback, $valueCount, $comparisonCallback, $target ) { 00338 $min = 0; 00339 $max = $valueCount - 1; 00340 do { 00341 $mid = $min + ( ( $max - $min ) >> 1 ); 00342 $item = call_user_func( $valueCallback, $mid ); 00343 $comparison = call_user_func( $comparisonCallback, $target, $item ); 00344 if ( $comparison > 0 ) { 00345 $min = $mid; 00346 } elseif ( $comparison == 0 ) { 00347 $min = $mid; 00348 break; 00349 } else { 00350 $max = $mid; 00351 } 00352 } while ( $min < $max - 1 ); 00353 00354 if ( $min == 0 && $max == 0 && $comparison > 0 ) { 00355 // Before the first item 00356 return false; 00357 } else { 00358 return $min; 00359 } 00360 } 00361 00362 static function isCjk( $codepoint ) { 00363 foreach ( self::$cjkBlocks as $block ) { 00364 if ( $codepoint >= $block[0] && $codepoint <= $block[1] ) { 00365 return true; 00366 } 00367 } 00368 return false; 00369 } 00370 } 00371