[ Index ] |
PHP Cross Reference of MediaWiki-1.24.0 |
[Summary view] [Print] [Text view]
1 #!/usr/bin/env python 2 # -*- coding: utf-8 -*- 3 # @author Philip 4 import tarfile as tf 5 import zipfile as zf 6 import os, re, shutil, sys, platform 7 8 pyversion = platform.python_version() 9 islinux = platform.system().lower() == 'linux' 10 11 if pyversion[:3] in ['2.6', '2.7']: 12 import urllib as urllib_request 13 import codecs 14 open = codecs.open 15 _unichr = unichr 16 if sys.maxunicode < 0x10000: 17 def unichr(i): 18 if i < 0x10000: 19 return _unichr(i) 20 else: 21 return _unichr( 0xD7C0 + ( i>>10 ) ) + _unichr( 0xDC00 + ( i & 0x3FF ) ) 22 elif pyversion[:2] == '3.': 23 import urllib.request as urllib_request 24 unichr = chr 25 26 def unichr2( *args ): 27 return [unichr( int( i.split('<')[0][2:], 16 ) ) for i in args] 28 29 def unichr3( *args ): 30 return [unichr( int( i[2:7], 16 ) ) for i in args if i[2:7]] 31 32 # DEFINE 33 UNIHAN_VER = '6.3.0' 34 SF_MIRROR = 'dfn' 35 SCIM_TABLES_VER = '0.5.13' 36 SCIM_PINYIN_VER = '0.5.92' 37 LIBTABE_VER = '0.2.3' 38 # END OF DEFINE 39 40 def download( url, dest ): 41 if os.path.isfile( dest ): 42 print( 'File %s is up to date.' % dest ) 43 return 44 global islinux 45 if islinux: 46 # we use wget instead urlretrieve under Linux, 47 # because wget could display details like download progress 48 os.system( 'wget %s -O %s' % ( url, dest ) ) 49 else: 50 print( 'Downloading from [%s] ...' % url ) 51 urllib_request.urlretrieve( url, dest ) 52 print( 'Download complete.\n' ) 53 return 54 55 def uncompress( fp, member, encoding = 'U8' ): 56 name = member.rsplit( '/', 1 )[-1] 57 print( 'Extracting %s ...' % name ) 58 fp.extract( member ) 59 shutil.move( member, name ) 60 if '/' in member: 61 shutil.rmtree( member.split( '/', 1 )[0] ) 62 if pyversion[:1] in ['2']: 63 fc = open( name, 'rb', encoding, 'ignore' ) 64 else: 65 fc = open( name, 'r', encoding = encoding, errors = 'ignore' ) 66 return fc 67 68 unzip = lambda path, member, encoding = 'U8': \ 69 uncompress( zf.ZipFile( path ), member, encoding ) 70 71 untargz = lambda path, member, encoding = 'U8': \ 72 uncompress( tf.open( path, 'r:gz' ), member, encoding ) 73 74 def parserCore( fp, pos, beginmark = None, endmark = None ): 75 if beginmark and endmark: 76 start = False 77 else: start = True 78 mlist = set() 79 for line in fp: 80 if beginmark and line.startswith( beginmark ): 81 start = True 82 continue 83 elif endmark and line.startswith( endmark ): 84 break 85 if start and not line.startswith( '#' ): 86 elems = line.split() 87 if len( elems ) < 2: 88 continue 89 elif len( elems[0] ) > 1 and \ 90 len( elems[pos] ) > 1: # words only 91 mlist.add( elems[pos] ) 92 return mlist 93 94 def tablesParser( path, name ): 95 """ Read file from scim-tables and parse it. """ 96 global SCIM_TABLES_VER 97 src = 'scim-tables-%s/tables/zh/%s' % ( SCIM_TABLES_VER, name ) 98 fp = untargz( path, src, 'U8' ) 99 return parserCore( fp, 1, 'BEGIN_TABLE', 'END_TABLE' ) 100 101 ezbigParser = lambda path: tablesParser( path, 'EZ-Big.txt.in' ) 102 wubiParser = lambda path: tablesParser( path, 'Wubi.txt.in' ) 103 zrmParser = lambda path: tablesParser( path, 'Ziranma.txt.in' ) 104 105 def phraseParser( path ): 106 """ Read phrase_lib.txt and parse it. """ 107 global SCIM_PINYIN_VER 108 src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER 109 dst = 'phrase_lib.txt' 110 fp = untargz( path, src, 'U8' ) 111 return parserCore( fp, 0 ) 112 113 def tsiParser( path ): 114 """ Read tsi.src and parse it. """ 115 src = 'libtabe/tsi-src/tsi.src' 116 dst = 'tsi.src' 117 fp = untargz( path, src, 'big5hkscs' ) 118 return parserCore( fp, 0 ) 119 120 def unihanParser( path ): 121 """ Read Unihan_Variants.txt and parse it. """ 122 fp = unzip( path, 'Unihan_Variants.txt', 'U8' ) 123 t2s = dict() 124 s2t = dict() 125 for line in fp: 126 if line.startswith( '#' ): 127 continue 128 else: 129 elems = line.split() 130 if len( elems ) < 3: 131 continue 132 type = elems.pop( 1 ) 133 elems = unichr2( *elems ) 134 if type == 'kTraditionalVariant': 135 s2t[elems[0]] = elems[1:] 136 elif type == 'kSimplifiedVariant': 137 t2s[elems[0]] = elems[1:] 138 fp.close() 139 return ( t2s, s2t ) 140 141 def applyExcludes( mlist, path ): 142 """ Apply exclude rules from path to mlist. """ 143 if pyversion[:1] in ['2']: 144 excludes = open( path, 'rb', 'U8' ).read().split() 145 else: 146 excludes = open( path, 'r', encoding = 'U8' ).read().split() 147 excludes = [word.split( '#' )[0].strip() for word in excludes] 148 excludes = '|'.join( excludes ) 149 excptn = re.compile( '.*(?:%s).*' % excludes ) 150 diff = [mword for mword in mlist if excptn.search( mword )] 151 mlist.difference_update( diff ) 152 return mlist 153 154 def charManualTable( path ): 155 fp = open( path, 'r', encoding = 'U8' ) 156 ret = {} 157 for line in fp: 158 elems = line.split( '#' )[0].split( '|' ) 159 elems = unichr3( *elems ) 160 if len( elems ) > 1: 161 ret[elems[0]] = elems[1:] 162 return ret 163 164 def toManyRules( src_table ): 165 tomany = set() 166 if pyversion[:1] in ['2']: 167 for ( f, t ) in src_table.iteritems(): 168 for i in range( 1, len( t ) ): 169 tomany.add( t[i] ) 170 else: 171 for ( f, t ) in src_table.items(): 172 for i in range( 1, len( t ) ): 173 tomany.add( t[i] ) 174 return tomany 175 176 def removeRules( path, table ): 177 fp = open( path, 'r', encoding = 'U8' ) 178 texc = list() 179 for line in fp: 180 elems = line.split( '=>' ) 181 f = t = elems[0].strip() 182 if len( elems ) == 2: 183 t = elems[1].strip() 184 f = f.strip('"').strip("'") 185 t = t.strip('"').strip("'") 186 if f: 187 try: 188 table.pop( f ) 189 except: 190 pass 191 if t: 192 texc.append( t ) 193 texcptn = re.compile( '^(?:%s)$' % '|'.join( texc ) ) 194 if pyversion[:1] in ['2']: 195 for (tmp_f, tmp_t) in table.copy().iteritems(): 196 if texcptn.match( tmp_t ): 197 table.pop( tmp_f ) 198 else: 199 for (tmp_f, tmp_t) in table.copy().items(): 200 if texcptn.match( tmp_t ): 201 table.pop( tmp_f ) 202 return table 203 204 def customRules( path ): 205 fp = open( path, 'r', encoding = 'U8' ) 206 ret = dict() 207 for line in fp: 208 elems = line.split( '#' )[0].split() 209 if len( elems ) > 1: 210 ret[elems[0]] = elems[1] 211 return ret 212 213 def dictToSortedList( src_table, pos ): 214 return sorted( src_table.items(), key = lambda m: m[pos] ) 215 216 def translate( text, conv_table ): 217 i = 0 218 while i < len( text ): 219 for j in range( len( text ) - i, 0, -1 ): 220 f = text[i:][:j] 221 t = conv_table.get( f ) 222 if t: 223 text = text[:i] + t + text[i:][j:] 224 i += len(t) - 1 225 break 226 i += 1 227 return text 228 229 def manualWordsTable( path, conv_table, reconv_table ): 230 fp = open( path, 'r', encoding = 'U8' ) 231 reconv_table = {} 232 wordlist = [line.split( '#' )[0].strip() for line in fp] 233 wordlist = list( set( wordlist ) ) 234 wordlist.sort( key = len, reverse = True ) 235 while wordlist: 236 word = wordlist.pop() 237 new_word = translate( word, conv_table ) 238 rcv_word = translate( word, reconv_table ) 239 if word != rcv_word: 240 reconv_table[word] = word 241 reconv_table[new_word] = word 242 return reconv_table 243 244 def defaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv_table ): 245 wordlist = list( src_wordlist ) 246 wordlist.sort( key = len, reverse = True ) 247 word_conv_table = {} 248 word_reconv_table = {} 249 conv_table = char_conv_table.copy() 250 reconv_table = char_reconv_table.copy() 251 tomanyptn = re.compile( '(?:%s)' % '|'.join( src_tomany ) ) 252 while wordlist: 253 conv_table.update( word_conv_table ) 254 reconv_table.update( word_reconv_table ) 255 word = wordlist.pop() 256 new_word_len = word_len = len( word ) 257 while new_word_len == word_len: 258 add = False 259 test_word = translate( word, reconv_table ) 260 new_word = translate( word, conv_table ) 261 if not reconv_table.get( new_word ) \ 262 and ( test_word != word \ 263 or ( tomanyptn.search( word ) \ 264 and word != translate( new_word, reconv_table ) ) ): 265 word_conv_table[word] = new_word 266 word_reconv_table[new_word] = word 267 try: 268 word = wordlist.pop() 269 except IndexError: 270 break 271 new_word_len = len(word) 272 return word_reconv_table 273 274 def PHPArray( table ): 275 lines = ['\'%s\' => \'%s\',' % (f, t) for (f, t) in table if f and t] 276 return '\n'.join(lines) 277 278 def main(): 279 #Get Unihan.zip: 280 url = 'http://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER 281 han_dest = 'Unihan.zip' 282 download( url, han_dest ) 283 284 # Get scim-tables-$(SCIM_TABLES_VER).tar.gz: 285 url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR, SCIM_TABLES_VER ) 286 tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER 287 download( url, tbe_dest ) 288 289 # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz: 290 url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR, SCIM_PINYIN_VER ) 291 pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER 292 download( url, pyn_dest ) 293 294 # Get libtabe-$(LIBTABE_VER).tgz: 295 url = 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR, LIBTABE_VER ) 296 lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER 297 download( url, lbt_dest ) 298 299 # Unihan.txt 300 ( t2s_1tomany, s2t_1tomany ) = unihanParser( han_dest ) 301 302 t2s_1tomany.update( charManualTable( 'trad2simp.manual' ) ) 303 s2t_1tomany.update( charManualTable( 'simp2trad.manual' ) ) 304 305 if pyversion[:1] in ['2']: 306 t2s_1to1 = dict( [( f, t[0] ) for ( f, t ) in t2s_1tomany.iteritems()] ) 307 s2t_1to1 = dict( [( f, t[0] ) for ( f, t ) in s2t_1tomany.iteritems()] ) 308 else: 309 t2s_1to1 = dict( [( f, t[0] ) for ( f, t ) in t2s_1tomany.items()] ) 310 s2t_1to1 = dict( [( f, t[0] ) for ( f, t ) in s2t_1tomany.items()] ) 311 312 s_tomany = toManyRules( t2s_1tomany ) 313 t_tomany = toManyRules( s2t_1tomany ) 314 315 # noconvert rules 316 t2s_1to1 = removeRules( 'trad2simp_noconvert.manual', t2s_1to1 ) 317 s2t_1to1 = removeRules( 'simp2trad_noconvert.manual', s2t_1to1 ) 318 319 # the supper set for word to word conversion 320 t2s_1to1_supp = t2s_1to1.copy() 321 s2t_1to1_supp = s2t_1to1.copy() 322 t2s_1to1_supp.update( customRules( 'trad2simp_supp_set.manual' ) ) 323 s2t_1to1_supp.update( customRules( 'simp2trad_supp_set.manual' ) ) 324 325 # word to word manual rules 326 t2s_word2word_manual = manualWordsTable( 'simpphrases.manual', s2t_1to1_supp, t2s_1to1_supp ) 327 t2s_word2word_manual.update( customRules( 'toSimp.manual' ) ) 328 s2t_word2word_manual = manualWordsTable( 'tradphrases.manual', t2s_1to1_supp, s2t_1to1_supp ) 329 s2t_word2word_manual.update( customRules( 'toTrad.manual' ) ) 330 331 # word to word rules from input methods 332 t_wordlist = set() 333 s_wordlist = set() 334 t_wordlist.update( ezbigParser( tbe_dest ), 335 tsiParser( lbt_dest ) ) 336 s_wordlist.update( wubiParser( tbe_dest ), 337 zrmParser( tbe_dest ), 338 phraseParser( pyn_dest ) ) 339 340 # exclude 341 s_wordlist = applyExcludes( s_wordlist, 'simpphrases_exclude.manual' ) 342 t_wordlist = applyExcludes( t_wordlist, 'tradphrases_exclude.manual' ) 343 344 s2t_supp = s2t_1to1_supp.copy() 345 s2t_supp.update( s2t_word2word_manual ) 346 t2s_supp = t2s_1to1_supp.copy() 347 t2s_supp.update( t2s_word2word_manual ) 348 349 # parse list to dict 350 t2s_word2word = defaultWordsTable( s_wordlist, s_tomany, s2t_1to1_supp, t2s_supp ) 351 t2s_word2word.update( t2s_word2word_manual ) 352 s2t_word2word = defaultWordsTable( t_wordlist, t_tomany, t2s_1to1_supp, s2t_supp ) 353 s2t_word2word.update( s2t_word2word_manual ) 354 355 # Final tables 356 # sorted list toHans 357 if pyversion[:1] in ['2']: 358 t2s_1to1 = dict( [( f, t ) for ( f, t ) in t2s_1to1.iteritems() if f != t] ) 359 else: 360 t2s_1to1 = dict( [( f, t ) for ( f, t ) in t2s_1to1.items() if f != t] ) 361 toHans = dictToSortedList( t2s_1to1, 0 ) + dictToSortedList( t2s_word2word, 1 ) 362 # sorted list toHant 363 if pyversion[:1] in ['2']: 364 s2t_1to1 = dict( [( f, t ) for ( f, t ) in s2t_1to1.iteritems() if f != t] ) 365 else: 366 s2t_1to1 = dict( [( f, t ) for ( f, t ) in s2t_1to1.items() if f != t] ) 367 toHant = dictToSortedList( s2t_1to1, 0 ) + dictToSortedList( s2t_word2word, 1 ) 368 # sorted list toCN 369 toCN = dictToSortedList( customRules( 'toCN.manual' ), 1 ) 370 # sorted list toHK 371 toHK = dictToSortedList( customRules( 'toHK.manual' ), 1 ) 372 # sorted list toSG 373 toSG = dictToSortedList( customRules( 'toSG.manual' ), 1 ) 374 # sorted list toTW 375 toTW = dictToSortedList( customRules( 'toTW.manual' ), 1 ) 376 377 # Get PHP Array 378 php = '''<?php 379 /** 380 * Simplified / Traditional Chinese conversion tables 381 * 382 * Automatically generated using code and data in maintenance/language/zhtable/ 383 * Do not modify directly! 384 * 385 * @file 386 */ 387 388 $zh2Hant = array(\n''' 389 php += PHPArray( toHant ) \ 390 + '\n);\n\n$zh2Hans = array(\n' \ 391 + PHPArray( toHans ) \ 392 + '\n);\n\n$zh2TW = array(\n' \ 393 + PHPArray( toTW ) \ 394 + '\n);\n\n$zh2HK = array(\n' \ 395 + PHPArray( toHK ) \ 396 + '\n);\n\n$zh2CN = array(\n' \ 397 + PHPArray( toCN ) \ 398 + '\n);\n\n$zh2SG = array(\n' \ 399 + PHPArray( toSG ) \ 400 + '\n);\n' 401 402 if pyversion[:1] in ['2']: 403 f = open( os.path.join( '..', '..', '..', 'includes', 'ZhConversion.php' ), 'wb', encoding = 'utf8' ) 404 else: 405 f = open( os.path.join( '..', '..', '..', 'includes', 'ZhConversion.php' ), 'w', buffering = 4096, encoding = 'utf8' ) 406 print ('Writing ZhConversion.php ... ') 407 f.write( php ) 408 f.close() 409 410 # Remove temporary files 411 print ('Deleting temporary files ... ') 412 os.remove('EZ-Big.txt.in') 413 os.remove('phrase_lib.txt') 414 os.remove('tsi.src') 415 os.remove('Unihan_Variants.txt') 416 os.remove('Wubi.txt.in') 417 os.remove('Ziranma.txt.in') 418 419 420 if __name__ == '__main__': 421 main()
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Fri Nov 28 14:03:12 2014 | Cross-referenced by PHPXref 0.7.1 |