MediaWiki
REL1_19
|
00001 <?php 00002 00003 abstract class Collation { 00004 static $instance; 00005 00009 static function singleton() { 00010 if ( !self::$instance ) { 00011 global $wgCategoryCollation; 00012 self::$instance = self::factory( $wgCategoryCollation ); 00013 } 00014 return self::$instance; 00015 } 00016 00022 static function factory( $collationName ) { 00023 switch( $collationName ) { 00024 case 'uppercase': 00025 return new UppercaseCollation; 00026 case 'identity': 00027 return new IdentityCollation; 00028 case 'uca-default': 00029 return new IcuCollation( 'root' ); 00030 default: 00031 # Provide a mechanism for extensions to hook in. 00032 00033 $collationObject = null; 00034 wfRunHooks( 'Collation::factory', array( $collationName, &$collationObject ) ); 00035 00036 if ( $collationObject instanceof Collation ) { 00037 return $collationObject; 00038 } 00039 00040 // If all else fails... 00041 throw new MWException( __METHOD__.": unknown collation type \"$collationName\"" ); 00042 } 00043 } 00044 00056 abstract function getSortKey( $string ); 00057 00081 abstract function getFirstLetter( $string ); 00082 } 00083 00084 class UppercaseCollation extends Collation { 00085 var $lang; 00086 function __construct() { 00087 // Get a language object so that we can use the generic UTF-8 uppercase 00088 // function there 00089 $this->lang = Language::factory( 'en' ); 00090 } 00091 00092 function getSortKey( $string ) { 00093 return $this->lang->uc( $string ); 00094 } 00095 00096 function getFirstLetter( $string ) { 00097 if ( $string[0] == "\0" ) { 00098 $string = substr( $string, 1 ); 00099 } 00100 return $this->lang->ucfirst( $this->lang->firstChar( $string ) ); 00101 } 00102 } 00103 00110 class IdentityCollation extends Collation { 00111 00112 function getSortKey( $string ) { 00113 return $string; 00114 } 00115 00116 function getFirstLetter( $string ) { 00117 global $wgContLang; 00118 // Copied from UppercaseCollation. 00119 // I'm kind of unclear on when this could happen... 00120 if ( $string[0] == "\0" ) { 00121 $string = substr( $string, 1 ); 00122 } 00123 return $wgContLang->firstChar( $string ); 00124 } 00125 } 00126 00127 00128 class IcuCollation extends Collation { 00129 var $primaryCollator, $mainCollator, $locale; 00130 var $firstLetterData; 00131 00141 static $cjkBlocks = array( 00142 array( 0x2E80, 0x2EFF ), // CJK Radicals Supplement 00143 array( 0x2F00, 0x2FDF ), // Kangxi Radicals 00144 array( 0x2FF0, 0x2FFF ), // Ideographic Description Characters 00145 array( 0x3000, 0x303F ), // CJK Symbols and Punctuation 00146 array( 0x31C0, 0x31EF ), // CJK Strokes 00147 array( 0x3200, 0x32FF ), // Enclosed CJK Letters and Months 00148 array( 0x3300, 0x33FF ), // CJK Compatibility 00149 array( 0x3400, 0x4DBF ), // CJK Unified Ideographs Extension A 00150 array( 0x4E00, 0x9FFF ), // CJK Unified Ideographs 00151 array( 0xF900, 0xFAFF ), // CJK Compatibility Ideographs 00152 array( 0xFE30, 0xFE4F ), // CJK Compatibility Forms 00153 array( 0x20000, 0x2A6DF ), // CJK Unified Ideographs Extension B 00154 array( 0x2A700, 0x2B73F ), // CJK Unified Ideographs Extension C 00155 array( 0x2B740, 0x2B81F ), // CJK Unified Ideographs Extension D 00156 array( 0x2F800, 0x2FA1F ), // CJK Compatibility Ideographs Supplement 00157 ); 00158 00159 const RECORD_LENGTH = 14; 00160 00161 function __construct( $locale ) { 00162 if ( !extension_loaded( 'intl' ) ) { 00163 throw new MWException( 'An ICU collation was requested, ' . 00164 'but the intl extension is not available.' ); 00165 } 00166 $this->locale = $locale; 00167 $this->mainCollator = Collator::create( $locale ); 00168 if ( !$this->mainCollator ) { 00169 throw new MWException( "Invalid ICU locale specified for collation: $locale" ); 00170 } 00171 00172 $this->primaryCollator = Collator::create( $locale ); 00173 $this->primaryCollator->setStrength( Collator::PRIMARY ); 00174 } 00175 00176 function getSortKey( $string ) { 00177 // intl extension produces non null-terminated 00178 // strings. Appending '' fixes it so that it doesn't generate 00179 // a warning on each access in debug php. 00180 wfSuppressWarnings(); 00181 $key = $this->mainCollator->getSortKey( $string ) . ''; 00182 wfRestoreWarnings(); 00183 return $key; 00184 } 00185 00186 function getPrimarySortKey( $string ) { 00187 wfSuppressWarnings(); 00188 $key = $this->primaryCollator->getSortKey( $string ) . ''; 00189 wfRestoreWarnings(); 00190 return $key; 00191 } 00192 00193 function getFirstLetter( $string ) { 00194 $string = strval( $string ); 00195 if ( $string === '' ) { 00196 return ''; 00197 } 00198 00199 // Check for CJK 00200 $firstChar = mb_substr( $string, 0, 1, 'UTF-8' ); 00201 if ( ord( $firstChar ) > 0x7f 00202 && self::isCjk( utf8ToCodepoint( $firstChar ) ) ) 00203 { 00204 return $firstChar; 00205 } 00206 00207 $sortKey = $this->getPrimarySortKey( $string ); 00208 00209 // Do a binary search to find the correct letter to sort under 00210 $min = $this->findLowerBound( 00211 array( $this, 'getSortKeyByLetterIndex' ), 00212 $this->getFirstLetterCount(), 00213 'strcmp', 00214 $sortKey ); 00215 00216 if ( $min === false ) { 00217 // Before the first letter 00218 return ''; 00219 } 00220 return $this->getLetterByIndex( $min ); 00221 } 00222 00223 function getFirstLetterData() { 00224 if ( $this->firstLetterData !== null ) { 00225 return $this->firstLetterData; 00226 } 00227 00228 $cache = wfGetCache( CACHE_ANYTHING ); 00229 $cacheKey = wfMemcKey( 'first-letters', $this->locale ); 00230 $cacheEntry = $cache->get( $cacheKey ); 00231 00232 if ( $cacheEntry ) { 00233 $this->firstLetterData = $cacheEntry; 00234 return $this->firstLetterData; 00235 } 00236 00237 // Generate data from serialized data file 00238 00239 $letters = wfGetPrecompiledData( "first-letters-{$this->locale}.ser" ); 00240 if ( $letters === false ) { 00241 throw new MWException( "MediaWiki does not support ICU locale " . 00242 "\"{$this->locale}\"" ); 00243 } 00244 00245 // Sort the letters. 00246 // 00247 // It's impossible to have the precompiled data file properly sorted, 00248 // because the sort order changes depending on ICU version. If the 00249 // array is not properly sorted, the binary search will return random 00250 // results. 00251 // 00252 // We also take this opportunity to remove primary collisions. 00253 $letterMap = array(); 00254 foreach ( $letters as $letter ) { 00255 $key = $this->getPrimarySortKey( $letter ); 00256 if ( isset( $letterMap[$key] ) ) { 00257 // Primary collision 00258 // Keep whichever one sorts first in the main collator 00259 if ( $this->mainCollator->compare( $letter, $letterMap[$key] ) < 0 ) { 00260 $letterMap[$key] = $letter; 00261 } 00262 } else { 00263 $letterMap[$key] = $letter; 00264 } 00265 } 00266 ksort( $letterMap, SORT_STRING ); 00267 $data = array( 00268 'chars' => array_values( $letterMap ), 00269 'keys' => array_keys( $letterMap ) 00270 ); 00271 00272 // Reduce memory usage before caching 00273 unset( $letterMap ); 00274 00275 // Save to cache 00276 $this->firstLetterData = $data; 00277 $cache->set( $cacheKey, $data, 86400 * 7 /* 1 week */ ); 00278 return $data; 00279 } 00280 00281 function getLetterByIndex( $index ) { 00282 if ( $this->firstLetterData === null ) { 00283 $this->getFirstLetterData(); 00284 } 00285 return $this->firstLetterData['chars'][$index]; 00286 } 00287 00288 function getSortKeyByLetterIndex( $index ) { 00289 if ( $this->firstLetterData === null ) { 00290 $this->getFirstLetterData(); 00291 } 00292 return $this->firstLetterData['keys'][$index]; 00293 } 00294 00295 function getFirstLetterCount() { 00296 if ( $this->firstLetterData === null ) { 00297 $this->getFirstLetterData(); 00298 } 00299 return count( $this->firstLetterData['chars'] ); 00300 } 00301 00317 function findLowerBound( $valueCallback, $valueCount, $comparisonCallback, $target ) { 00318 $min = 0; 00319 $max = $valueCount - 1; 00320 do { 00321 $mid = $min + ( ( $max - $min ) >> 1 ); 00322 $item = call_user_func( $valueCallback, $mid ); 00323 $comparison = call_user_func( $comparisonCallback, $target, $item ); 00324 if ( $comparison > 0 ) { 00325 $min = $mid; 00326 } elseif ( $comparison == 0 ) { 00327 $min = $mid; 00328 break; 00329 } else { 00330 $max = $mid; 00331 } 00332 } while ( $min < $max - 1 ); 00333 00334 if ( $min == 0 && $max == 0 && $comparison > 0 ) { 00335 // Before the first item 00336 return false; 00337 } else { 00338 return $min; 00339 } 00340 } 00341 00342 static function isCjk( $codepoint ) { 00343 foreach ( self::$cjkBlocks as $block ) { 00344 if ( $codepoint >= $block[0] && $codepoint <= $block[1] ) { 00345 return true; 00346 } 00347 } 00348 return false; 00349 } 00350 } 00351