MediaWiki
REL1_24
|
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Generate MediaWiki's includes/ZhConversion.php Simplified/Traditional
# Chinese conversion tables from the Unicode Unihan database and word lists
# shipped with several open-source input methods (scim-tables, scim-pinyin,
# libtabe), combined with local manual rule files.
#
# @author Philip
import tarfile as tf
import zipfile as zf
import os, re, shutil, sys, platform

pyversion = platform.python_version()
islinux = platform.system().lower() == 'linux'

if pyversion[:3] in ['2.6', '2.7']:
    import urllib as urllib_request
    import codecs
    open = codecs.open
    _unichr = unichr
    if sys.maxunicode < 0x10000:
        # Narrow Python 2 build: emulate a wide unichr by emitting a
        # UTF-16 surrogate pair for code points beyond the BMP.
        def unichr(i):
            if i < 0x10000:
                return _unichr(i)
            else:
                return _unichr(0xD7C0 + (i >> 10)) + _unichr(0xDC00 + (i & 0x3FF))
elif pyversion[:2] == '3.':
    import urllib.request as urllib_request
    unichr = chr


def unichr2(*args):
    """Convert 'U+XXXX'-style code point strings to characters.

    Anything after a '<' (e.g. Unihan source tags such as 'U+4E00<kLau')
    is ignored.
    """
    return [unichr(int(i.split('<')[0][2:], 16)) for i in args]


def unichr3(*args):
    """Convert 'U+XXXXX'-style strings (up to 5 hex digits) to characters,
    skipping entries too short to contain a code point."""
    return [unichr(int(i[2:7], 16)) for i in args if i[2:7]]

# DEFINE
UNIHAN_VER = '6.3.0'
SF_MIRROR = 'dfn'
SCIM_TABLES_VER = '0.5.13'
SCIM_PINYIN_VER = '0.5.92'
LIBTABE_VER = '0.2.3'
# END OF DEFINE


def download(url, dest):
    """Download url into file dest, skipping the fetch if dest exists."""
    if os.path.isfile(dest):
        print('File %s is up to date.' % dest)
        return
    if islinux:
        # we use wget instead of urlretrieve under Linux,
        # because wget displays details like download progress
        os.system('wget %s -O %s' % (url, dest))
    else:
        print('Downloading from [%s] ...' % url)
        urllib_request.urlretrieve(url, dest)
    print('Download complete.\n')
    return


def uncompress(fp, member, encoding='U8'):
    """Extract member from an open archive into the current directory and
    return it opened for reading with the given text encoding.

    The extracted file is moved to its bare filename and any directory
    tree created by the extraction is removed.
    """
    name = member.rsplit('/', 1)[-1]
    print('Extracting %s ...' % name)
    fp.extract(member)
    shutil.move(member, name)
    if '/' in member:
        shutil.rmtree(member.split('/', 1)[0])
    if pyversion[:1] in ['2']:
        fc = open(name, 'rb', encoding, 'ignore')
    else:
        fc = open(name, 'r', encoding=encoding, errors='ignore')
    return fc

unzip = lambda path, member, encoding = 'U8': \
        uncompress( zf.ZipFile( path ), member, encoding )

untargz = lambda path, member, encoding = 'U8': \
        uncompress( tf.open( path, 'r:gz' ), member, encoding )


def parserCore(fp, pos, beginmark=None, endmark=None):
    """Collect column pos of each data line of fp into a set.

    Only lines between beginmark and endmark (when both are given) are
    considered; '#' comment lines, short lines, and single-character
    entries are skipped, so the result contains words only.
    """
    if beginmark and endmark:
        start = False
    else:
        start = True
    mlist = set()
    for line in fp:
        if beginmark and line.startswith(beginmark):
            start = True
            continue
        elif endmark and line.startswith(endmark):
            break
        if start and not line.startswith('#'):
            elems = line.split()
            if len(elems) < 2:
                continue
            elif len(elems[0]) > 1 and \
                    len(elems[pos]) > 1:  # words only
                mlist.add(elems[pos])
    return mlist


def tablesParser(path, name):
    """Read file from scim-tables and parse it."""
    src = 'scim-tables-%s/tables/zh/%s' % (SCIM_TABLES_VER, name)
    fp = untargz(path, src, 'U8')
    return parserCore(fp, 1, 'BEGIN_TABLE', 'END_TABLE')

ezbigParser = lambda path: tablesParser( path, 'EZ-Big.txt.in' )
wubiParser = lambda path: tablesParser( path, 'Wubi.txt.in' )
zrmParser = lambda path: tablesParser( path, 'Ziranma.txt.in' )


def phraseParser(path):
    """Read phrase_lib.txt from the scim-pinyin tarball and parse it."""
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    fp = untargz(path, src, 'U8')
    return parserCore(fp, 0)


def tsiParser(path):
    """Read tsi.src from the libtabe tarball and parse it."""
    src = 'libtabe/tsi-src/tsi.src'
    fp = untargz(path, src, 'big5hkscs')
    return parserCore(fp, 0)


def unihanParser(path):
    """Read Unihan_Variants.txt and build the character variant maps.

    Returns (t2s, s2t): one-to-many traditional->simplified and
    simplified->traditional character tables.
    """
    fp = unzip(path, 'Unihan_Variants.txt', 'U8')
    t2s = dict()
    s2t = dict()
    for line in fp:
        if line.startswith('#'):
            continue
        elems = line.split()
        if len(elems) < 3:
            continue
        # Field layout: <code point> <variant kind> <variant(s)...>
        # ('vtype' rather than 'type' to avoid shadowing the builtin.)
        vtype = elems.pop(1)
        elems = unichr2(*elems)
        if vtype == 'kTraditionalVariant':
            s2t[elems[0]] = elems[1:]
        elif vtype == 'kSimplifiedVariant':
            t2s[elems[0]] = elems[1:]
    fp.close()
    return (t2s, s2t)


def applyExcludes(mlist, path):
    """Apply exclude rules from path to mlist.

    Every word matching any exclude pattern is removed from the set,
    which is mutated in place and also returned.
    """
    if pyversion[:1] in ['2']:
        excludes = open(path, 'rb', 'U8').read().split()
    else:
        excludes = open(path, 'r', encoding='U8').read().split()
    excludes = [word.split('#')[0].strip() for word in excludes]
    excludes = '|'.join(excludes)
    excptn = re.compile('.*(?:%s).*' % excludes)
    diff = [mword for mword in mlist if excptn.search(mword)]
    mlist.difference_update(diff)
    return mlist


def charManualTable(path):
    """Read a manual character table ('U+XXXXX|U+XXXXX|...' per line,
    '#' starts a comment) into a one-to-many dict."""
    fp = open(path, 'r', encoding='U8')
    ret = {}
    for line in fp:
        elems = line.split('#')[0].split('|')
        elems = unichr3(*elems)
        if len(elems) > 1:
            ret[elems[0]] = elems[1:]
    return ret


def toManyRules(src_table):
    """Collect every character that appears as a non-first variant,
    i.e. the targets of one-to-many conversions."""
    tomany = set()
    if pyversion[:1] in ['2']:
        for (f, t) in src_table.iteritems():
            for i in range(1, len(t)):
                tomany.add(t[i])
    else:
        for (f, t) in src_table.items():
            for i in range(1, len(t)):
                tomany.add(t[i])
    return tomany


def removeRules(path, table):
    """Drop rules listed in path from table (mutated in place).

    Each line is '"f"' (remove the mapping for f) or '"f" => "t"'
    (additionally remove every mapping whose target equals t).
    """
    fp = open(path, 'r', encoding='U8')
    texc = list()
    for line in fp:
        elems = line.split('=>')
        f = t = elems[0].strip()
        if len(elems) == 2:
            t = elems[1].strip()
        f = f.strip('"').strip("'")
        t = t.strip('"').strip("'")
        if f:
            # f may legitimately be absent from the table already.
            try:
                table.pop(f)
            except KeyError:
                pass
        if t:
            texc.append(t)
    texcptn = re.compile('^(?:%s)$' % '|'.join(texc))
    if pyversion[:1] in ['2']:
        for (tmp_f, tmp_t) in table.copy().iteritems():
            if texcptn.match(tmp_t):
                table.pop(tmp_f)
    else:
        for (tmp_f, tmp_t) in table.copy().items():
            if texcptn.match(tmp_t):
                table.pop(tmp_f)
    return table


def customRules(path):
    """Read a 'from to' mapping file into a dict; '#' starts a comment."""
    fp = open(path, 'r', encoding='U8')
    ret = dict()
    for line in fp:
        elems = line.split('#')[0].split()
        if len(elems) > 1:
            ret[elems[0]] = elems[1]
    return ret


def dictToSortedList(src_table, pos):
    """Return src_table's items sorted by key (pos=0) or value (pos=1)."""
    return sorted(src_table.items(), key=lambda m: m[pos])


def translate(text, conv_table):
    """Convert text with conv_table using greedy longest-match-first
    substitution starting at each position."""
    i = 0
    while i < len(text):
        for j in range(len(text) - i, 0, -1):
            f = text[i:][:j]
            t = conv_table.get(f)
            if t:
                text = text[:i] + t + text[i:][j:]
                i += len(t) - 1
                break
        i += 1
    return text


def manualWordsTable(path, conv_table, reconv_table):
    """Build a reverse conversion table for manually listed phrases.

    NOTE: the reconv_table parameter is rebound to a fresh dict before
    use (kept for interface compatibility with existing callers); the
    caller's dict is never modified.
    """
    fp = open(path, 'r', encoding='U8')
    reconv_table = {}
    wordlist = [line.split('#')[0].strip() for line in fp]
    wordlist = list(set(wordlist))
    wordlist.sort(key=len, reverse=True)
    while wordlist:
        word = wordlist.pop()
        new_word = translate(word, conv_table)
        # Round trip through the rules accumulated so far.
        rcv_word = translate(word, reconv_table)
        if word != rcv_word:
            reconv_table[word] = word
        reconv_table[new_word] = word
    return reconv_table


def defaultWordsTable(src_wordlist, src_tomany, char_conv_table, char_reconv_table):
    """Build word-level reverse conversion rules for words whose
    character-level round trip is lossy or hits a one-to-many variant."""
    wordlist = list(src_wordlist)
    wordlist.sort(key=len, reverse=True)
    word_conv_table = {}
    word_reconv_table = {}
    conv_table = char_conv_table.copy()
    reconv_table = char_reconv_table.copy()
    tomanyptn = re.compile('(?:%s)' % '|'.join(src_tomany))
    while wordlist:
        # Fold the rules discovered so far back in before processing the
        # next batch of equal-length words.
        conv_table.update(word_conv_table)
        reconv_table.update(word_reconv_table)
        word = wordlist.pop()
        new_word_len = word_len = len(word)
        while new_word_len == word_len:
            test_word = translate(word, reconv_table)
            new_word = translate(word, conv_table)
            if not reconv_table.get(new_word) \
                    and (test_word != word
                         or (tomanyptn.search(word)
                             and word != translate(new_word, reconv_table))):
                word_conv_table[word] = new_word
                word_reconv_table[new_word] = word
            try:
                word = wordlist.pop()
            except IndexError:
                break
            new_word_len = len(word)
    return word_reconv_table


def PHPArray(table):
    """Render (from, to) pairs as the body lines of a PHP array literal,
    skipping pairs with an empty side."""
    lines = ['\'%s\' => \'%s\',' % (f, t) for (f, t) in table if f and t]
    return '\n'.join(lines)


def main():
    """Fetch all sources, build the conversion tables and write
    includes/ZhConversion.php, then clean up temporary files."""
    # Get Unihan.zip:
    url = 'http://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER
    han_dest = 'Unihan.zip'
    download(url, han_dest)

    # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % (SF_MIRROR, SCIM_TABLES_VER)
    tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    download(url, tbe_dest)

    # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % (SF_MIRROR, SCIM_PINYIN_VER)
    pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    download(url, pyn_dest)

    # Get libtabe-$(LIBTABE_VER).tgz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % (SF_MIRROR, LIBTABE_VER)
    lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
    download(url, lbt_dest)

    # Unihan.txt
    (t2s_1tomany, s2t_1tomany) = unihanParser(han_dest)

    t2s_1tomany.update(charManualTable('trad2simp.manual'))
    s2t_1tomany.update(charManualTable('simp2trad.manual'))

    # One-to-one tables: keep only the first variant of each character.
    if pyversion[:1] in ['2']:
        t2s_1to1 = dict([(f, t[0]) for (f, t) in t2s_1tomany.iteritems()])
        s2t_1to1 = dict([(f, t[0]) for (f, t) in s2t_1tomany.iteritems()])
    else:
        t2s_1to1 = dict([(f, t[0]) for (f, t) in t2s_1tomany.items()])
        s2t_1to1 = dict([(f, t[0]) for (f, t) in s2t_1tomany.items()])

    s_tomany = toManyRules(t2s_1tomany)
    t_tomany = toManyRules(s2t_1tomany)

    # noconvert rules
    t2s_1to1 = removeRules('trad2simp_noconvert.manual', t2s_1to1)
    s2t_1to1 = removeRules('simp2trad_noconvert.manual', s2t_1to1)

    # the superset for word to word conversion
    t2s_1to1_supp = t2s_1to1.copy()
    s2t_1to1_supp = s2t_1to1.copy()
    t2s_1to1_supp.update(customRules('trad2simp_supp_set.manual'))
    s2t_1to1_supp.update(customRules('simp2trad_supp_set.manual'))

    # word to word manual rules
    t2s_word2word_manual = manualWordsTable('simpphrases.manual', s2t_1to1_supp, t2s_1to1_supp)
    t2s_word2word_manual.update(customRules('toSimp.manual'))
    s2t_word2word_manual = manualWordsTable('tradphrases.manual', t2s_1to1_supp, s2t_1to1_supp)
    s2t_word2word_manual.update(customRules('toTrad.manual'))

    # word to word rules from input methods
    t_wordlist = set()
    s_wordlist = set()
    t_wordlist.update(ezbigParser(tbe_dest),
                      tsiParser(lbt_dest))
    s_wordlist.update(wubiParser(tbe_dest),
                      zrmParser(tbe_dest),
                      phraseParser(pyn_dest))

    # exclude
    s_wordlist = applyExcludes(s_wordlist, 'simpphrases_exclude.manual')
    t_wordlist = applyExcludes(t_wordlist, 'tradphrases_exclude.manual')

    s2t_supp = s2t_1to1_supp.copy()
    s2t_supp.update(s2t_word2word_manual)
    t2s_supp = t2s_1to1_supp.copy()
    t2s_supp.update(t2s_word2word_manual)

    # parse list to dict
    t2s_word2word = defaultWordsTable(s_wordlist, s_tomany, s2t_1to1_supp, t2s_supp)
    t2s_word2word.update(t2s_word2word_manual)
    s2t_word2word = defaultWordsTable(t_wordlist, t_tomany, t2s_1to1_supp, s2t_supp)
    s2t_word2word.update(s2t_word2word_manual)

    # Final tables
    # sorted list toHans
    if pyversion[:1] in ['2']:
        t2s_1to1 = dict([(f, t) for (f, t) in t2s_1to1.iteritems() if f != t])
    else:
        t2s_1to1 = dict([(f, t) for (f, t) in t2s_1to1.items() if f != t])
    toHans = dictToSortedList(t2s_1to1, 0) + dictToSortedList(t2s_word2word, 1)
    # sorted list toHant
    if pyversion[:1] in ['2']:
        s2t_1to1 = dict([(f, t) for (f, t) in s2t_1to1.iteritems() if f != t])
    else:
        s2t_1to1 = dict([(f, t) for (f, t) in s2t_1to1.items() if f != t])
    toHant = dictToSortedList(s2t_1to1, 0) + dictToSortedList(s2t_word2word, 1)
    # sorted list toCN
    toCN = dictToSortedList(customRules('toCN.manual'), 1)
    # sorted list toHK
    toHK = dictToSortedList(customRules('toHK.manual'), 1)
    # sorted list toSG
    toSG = dictToSortedList(customRules('toSG.manual'), 1)
    # sorted list toTW
    toTW = dictToSortedList(customRules('toTW.manual'), 1)

    # Get PHP Array
    php = '''<?php
/**
 * Simplified / Traditional Chinese conversion tables
 *
 * Automatically generated using code and data in maintenance/language/zhtable/
 * Do not modify directly!
 *
 * @file
 */

$zh2Hant = array(\n'''
    php += PHPArray(toHant) \
        + '\n);\n\n$zh2Hans = array(\n' \
        + PHPArray(toHans) \
        + '\n);\n\n$zh2TW = array(\n' \
        + PHPArray(toTW) \
        + '\n);\n\n$zh2HK = array(\n' \
        + PHPArray(toHK) \
        + '\n);\n\n$zh2CN = array(\n' \
        + PHPArray(toCN) \
        + '\n);\n\n$zh2SG = array(\n' \
        + PHPArray(toSG) \
        + '\n);\n'

    if pyversion[:1] in ['2']:
        f = open(os.path.join('..', '..', '..', 'includes', 'ZhConversion.php'), 'wb', encoding='utf8')
    else:
        f = open(os.path.join('..', '..', '..', 'includes', 'ZhConversion.php'), 'w', buffering=4096, encoding='utf8')
    print('Writing ZhConversion.php ... ')
    f.write(php)
    f.close()

    # Remove temporary files
    print('Deleting temporary files ... ')
    os.remove('EZ-Big.txt.in')
    os.remove('phrase_lib.txt')
    os.remove('tsi.src')
    os.remove('Unihan_Variants.txt')
    os.remove('Wubi.txt.in')
    os.remove('Ziranma.txt.in')


if __name__ == '__main__':
    main()