MediaWiki
REL1_19
|
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @author Philip
"""Build ZhConversion.php, the Simplified/Traditional Chinese tables.

The script downloads Unihan, scim-tables, scim-pinyin and libtabe, combines
them with the ``*.manual`` rule files found in the current directory and
writes the resulting PHP arrays to ``ZhConversion.php``.

It is written to run on Python 2.6/2.7 as well as Python 3.
"""
import io
import itertools
import os
import platform
import re
import shutil
import subprocess
import sys
import tarfile as tf
import zipfile as zf

pyversion = platform.python_version()
islinux = platform.system().lower() == 'linux'

if sys.version_info[0] >= 3:
    import urllib.request as urllib_request
    unichr = chr
    _text_type = str
else:
    import urllib as urllib_request
    _text_type = unicode  # noqa: F821 -- only evaluated on Python 2
    _unichr = unichr
    if sys.maxunicode < 0x10000:
        def unichr(i):
            """unichr() that builds a surrogate pair on narrow builds."""
            if i < 0x10000:
                return _unichr(i)
            # 0xD7C0 == 0xD800 - (0x10000 >> 10)
            return _unichr(0xD7C0 + (i >> 10)) + _unichr(0xDC00 + (i & 0x3FF))


def _open_text(path, encoding='utf-8', errors='strict'):
    """Open *path* for reading decoded text; works on Python 2 and 3."""
    return io.open(path, 'r', encoding=encoding, errors=errors)


def unichr2(*args):
    """Convert Unihan fields such as ``U+4E00`` or ``U+4E00<kSrc`` to chars.

    The two leading characters are dropped and everything from the first
    ``<`` onwards is ignored; the rest is parsed as a hexadecimal code point.
    """
    return [unichr(int(i.split('<')[0][2:], 16)) for i in args]


def unichr3(*args):
    """Convert manual-table fields to chars using characters 2..6 as hex.

    Fields whose ``[2:7]`` slice is empty (for example a trailing newline)
    are skipped.
    """
    return [unichr(int(i[2:7], 16)) for i in args if i[2:7]]


# DEFINE
UNIHAN_VER = '5.2.0'
SF_MIRROR = 'cdnetworks-kr-2'
SCIM_TABLES_VER = '0.5.10'
SCIM_PINYIN_VER = '0.5.91'
LIBTABE_VER = '0.2.3'
# END OF DEFINE


def download(url, dest):
    """Fetch *url* into the file *dest* unless *dest* already exists.

    On Linux ``wget`` is tried first because it shows download progress.
    If ``wget`` is not installed the function falls back to ``urlretrieve``.

    Raises:
        IOError: if ``wget`` ran but exited with a non-zero status.  The
            partial file is removed so a later run does not mistake it for
            a finished download.
    """
    if os.path.isfile(dest):
        print('File %s up to date.' % dest)
        return
    if islinux:
        try:
            # Argument list instead of a shell string: no quoting problems.
            status = subprocess.call(['wget', url, '-O', dest])
        except OSError:
            status = None  # wget is not available
        if status == 0:
            return
        if os.path.isfile(dest):
            os.remove(dest)
        if status is not None:
            raise IOError('wget failed (exit status %s) for %s' % (status, url))
    print('Downloading from [%s] ...' % url)
    urllib_request.urlretrieve(url, dest)
    print('Download complete.\n')


def uncompress(fp, member, encoding='U8'):
    """Extract *member* from the open archive *fp* into the current directory.

    The extracted file is moved to its base name, the top-level directory
    created by the extraction is removed, and the file is returned opened
    for reading as text in *encoding* (undecodable bytes are ignored).
    The caller owns both *fp* and the returned file object.
    """
    name = member.rsplit('/', 1)[-1]
    print('Extracting %s ...' % name)
    fp.extract(member)
    if member != name:
        shutil.move(member, name)
    if '/' in member:
        shutil.rmtree(member.split('/', 1)[0])
    return _open_text(name, encoding, 'ignore')


def unzip(path, member, encoding='U8'):
    """Extract *member* from the zip file *path*; see uncompress()."""
    archive = zf.ZipFile(path)
    try:
        return uncompress(archive, member, encoding)
    finally:
        archive.close()


def untargz(path, member, encoding='U8'):
    """Extract *member* from the gzipped tarball *path*; see uncompress()."""
    archive = tf.open(path, 'r:gz')
    try:
        return uncompress(archive, member, encoding)
    finally:
        archive.close()


def parserCore(fp, pos, beginmark=None, endmark=None):
    """Collect multi-character words from column *pos* of a table file.

    Args:
        fp: iterable of text lines.
        pos: index of the whitespace-separated column to collect.
        beginmark, endmark: when both are given, only lines after the one
            starting with *beginmark* are read; a line starting with
            *endmark* always stops the scan.

    Returns:
        A set of the collected words.  Lines starting with ``#``, lines with
        fewer than two columns, and rows whose first column or *pos* column
        is a single character are skipped.
    """
    start = not (beginmark and endmark)
    mlist = set()
    for line in fp:
        if beginmark and line.startswith(beginmark):
            start = True
            continue
        if endmark and line.startswith(endmark):
            break
        if not start or line.startswith('#'):
            continue
        elems = line.split()
        if len(elems) < 2:
            continue
        if len(elems[0]) > 1 and len(elems[pos]) > 1:  # words only
            mlist.add(elems[pos])
    return mlist


def tablesParser(path, name):
    """Read the table *name* from the scim-tables tarball and parse it."""
    src = 'scim-tables-%s/tables/zh/%s' % (SCIM_TABLES_VER, name)
    with untargz(path, src, 'U8') as fp:
        return parserCore(fp, 1, 'BEGIN_TABLE', 'END_TABLE')


def ezbigParser(path):
    """Parse EZ-Big.txt.in from the scim-tables tarball *path*."""
    return tablesParser(path, 'EZ-Big.txt.in')


def wubiParser(path):
    """Parse Wubi.txt.in from the scim-tables tarball *path*."""
    return tablesParser(path, 'Wubi.txt.in')


def zrmParser(path):
    """Parse Ziranma.txt.in from the scim-tables tarball *path*."""
    return tablesParser(path, 'Ziranma.txt.in')


def phraseParser(path):
    """Read phrase_lib.txt from the scim-pinyin tarball and parse it."""
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    with untargz(path, src, 'U8') as fp:
        return parserCore(fp, 0)


def tsiParser(path):
    """Read tsi.src (big5hkscs encoded) from the libtabe tarball."""
    src = 'libtabe/tsi-src/tsi.src'
    with untargz(path, src, 'big5hkscs') as fp:
        return parserCore(fp, 0)


def unihanParser(path):
    """Read Unihan_Variants.txt from the zip *path* and parse it.

    Returns:
        ``(t2s, s2t)``: two dicts mapping a character to the list of its
        variants, filled from ``kSimplifiedVariant`` and
        ``kTraditionalVariant`` records respectively.
    """
    t2s = dict()
    s2t = dict()
    with unzip(path, 'Unihan_Variants.txt', 'U8') as fp:
        for line in fp:
            if line.startswith('#'):
                continue
            elems = line.split()
            if len(elems) < 3:
                continue
            variant_type = elems.pop(1)
            chars = unichr2(*elems)
            if variant_type == 'kTraditionalVariant':
                s2t[chars[0]] = chars[1:]
            elif variant_type == 'kSimplifiedVariant':
                t2s[chars[0]] = chars[1:]
    return (t2s, s2t)


def applyExcludes(mlist, path):
    """Remove from the set *mlist* every word matching a rule in *path*.

    Each whitespace-separated token of the file is used as a regular
    expression; everything after ``#`` on a line is a comment.  A file
    without rules leaves *mlist* untouched (an empty alternative would
    otherwise match, and therefore delete, every word).

    Returns:
        *mlist* itself, modified in place.
    """
    excludes = []
    with _open_text(path) as fp:
        for line in fp:
            excludes.extend(line.split('#')[0].split())
    if not excludes:
        return mlist
    excptn = re.compile('(?:%s)' % '|'.join(excludes))
    diff = [mword for mword in mlist if excptn.search(mword)]
    mlist.difference_update(diff)
    return mlist


def charManualTable(path):
    """Read a ``U+XXXXX..|U+XXXXX..|`` manual file into {char: [targets]}.

    Text after ``#`` is ignored; lines yielding fewer than two characters
    are skipped.
    """
    ret = {}
    with _open_text(path) as fp:
        for line in fp:
            elems = unichr3(*line.split('#')[0].split('|'))
            if len(elems) > 1:
                ret[elems[0]] = elems[1:]
    return ret


def toManyRules(src_table):
    """Return the set of all non-first targets found in *src_table* values."""
    tomany = set()
    for targets in src_table.values():
        tomany.update(targets[1:])
    return tomany


def removeRules(path, table):
    """Apply the no-convert rules in *path* to the dict *table* in place.

    Each line is ``from`` or ``from => to`` (optionally quoted).  Entries
    keyed by ``from`` are removed, and afterwards every entry whose value
    fully matches one of the ``to`` strings (used as regular expressions)
    is removed as well.

    Returns:
        *table* itself.
    """
    texc = list()
    with _open_text(path) as fp:
        for line in fp:
            elems = line.split('=>')
            f = t = elems[0].strip()
            if len(elems) == 2:
                t = elems[1].strip()
            f = f.strip('"').strip("'")
            t = t.strip('"').strip("'")
            if f:
                table.pop(f, None)
            if t:
                texc.append(t)
    if texc:
        texcptn = re.compile('^(?:%s)$' % '|'.join(texc))
        for (tmp_f, tmp_t) in list(table.items()):
            if texcptn.match(tmp_t):
                table.pop(tmp_f)
    return table


def customRules(path):
    """Read ``from to`` pairs (``#`` starts a comment) into a dict."""
    ret = dict()
    with _open_text(path) as fp:
        for line in fp:
            elems = line.split('#')[0].split()
            if len(elems) > 1:
                ret[elems[0]] = elems[1]
    return ret


def dictToSortedList(src_table, pos):
    """Return the items of *src_table* sorted by key (0) or value (1)."""
    return sorted(src_table.items(), key=lambda m: m[pos])


def translate(text, conv_table):
    """Convert *text* with *conv_table* using greedy longest match.

    At every position the longest substring that has a non-empty entry in
    *conv_table* is replaced; scanning resumes after the replacement, so
    replaced text is never converted a second time.
    """
    i = 0
    while i < len(text):
        for j in range(len(text) - i, 0, -1):
            f = text[i:][:j]
            t = conv_table.get(f)
            if t:
                text = text[:i] + t + text[i:][j:]
                i += len(t) - 1
                break
        i += 1
    return text


def manualWordsTable(path, conv_table, reconv_table):
    """Build a reverse word table from the manual phrase list in *path*.

    Words are processed shortest first (ties broken alphabetically so the
    result is reproducible).  Every word is registered under its
    *conv_table* translation; a word that the table built so far would
    alter is additionally mapped to itself.

    NOTE(review): *reconv_table* is accepted for interface compatibility
    but never read -- the original code rebound it to an empty dict before
    use.  Confirm whether seeding with the argument was intended.
    """
    with _open_text(path) as fp:
        words = set(line.split('#')[0].strip() for line in fp)
    words.discard('')  # blank and comment-only lines carry no rule
    table = {}
    for word in sorted(words, key=lambda w: (len(w), w)):
        new_word = translate(word, conv_table)
        rcv_word = translate(word, table)
        if word != rcv_word:
            table[word] = word
        table[new_word] = word
    return table


def defaultWordsTable(src_wordlist, src_tomany, char_conv_table,
                      char_reconv_table):
    """Derive word-level reverse rules from an input-method word list.

    Words are handled in groups of equal length, shortest first.  Rules
    found in one group become visible to translate() only when the next
    group starts.  A rule ``translated word -> word`` is recorded when the
    translated form has no reverse rule yet and either the reverse table
    would alter the word, or the word contains a character from
    *src_tomany* and does not survive a round trip.

    Every word is examined; grouping by length replaces an earlier loop
    that silently skipped the first word of each new length.

    Returns:
        dict mapping translated word to the original word.
    """
    word_conv_table = {}
    word_reconv_table = {}
    conv_table = char_conv_table.copy()
    reconv_table = char_reconv_table.copy()
    tomanyptn = re.compile('(?:%s)' % '|'.join(src_tomany))
    # Sort by (length, word) so the result does not depend on set order.
    ordered = sorted(src_wordlist, key=lambda w: (len(w), w))
    for _, same_length in itertools.groupby(ordered, key=len):
        conv_table.update(word_conv_table)
        reconv_table.update(word_reconv_table)
        for word in same_length:
            new_word = translate(word, conv_table)
            if reconv_table.get(new_word):
                continue
            altered = translate(word, reconv_table) != word
            if altered or (tomanyptn.search(word)
                           and word != translate(new_word, reconv_table)):
                word_conv_table[word] = new_word
                word_reconv_table[new_word] = word
    return word_reconv_table


def PHPArray(table):
    """Format (from, to) pairs as the body lines of a PHP array literal.

    Pairs with an empty member are omitted.
    """
    lines = ['\'%s\' => \'%s\',' % (f, t) for (f, t) in table if f and t]
    return '\n'.join(lines)


def main():
    """Download the sources, build all tables and write ZhConversion.php."""
    # Get Unihan.zip:
    url = 'http://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER
    han_dest = 'Unihan.zip'
    download(url, han_dest)

    # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % (SF_MIRROR, SCIM_TABLES_VER)
    tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    download(url, tbe_dest)

    # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % (SF_MIRROR, SCIM_PINYIN_VER)
    pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    download(url, pyn_dest)

    # Get libtabe-$(LIBTABE_VER).tgz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % (SF_MIRROR, LIBTABE_VER)
    lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
    download(url, lbt_dest)

    # Unihan.txt
    (t2s_1tomany, s2t_1tomany) = unihanParser(han_dest)

    t2s_1tomany.update(charManualTable('trad2simp.manual'))
    s2t_1tomany.update(charManualTable('simp2trad.manual'))

    # The first listed variant is the default 1-to-1 conversion.
    t2s_1to1 = dict((f, t[0]) for (f, t) in t2s_1tomany.items())
    s2t_1to1 = dict((f, t[0]) for (f, t) in s2t_1tomany.items())

    s_tomany = toManyRules(t2s_1tomany)
    t_tomany = toManyRules(s2t_1tomany)

    # noconvert rules
    t2s_1to1 = removeRules('trad2simp_noconvert.manual', t2s_1to1)
    s2t_1to1 = removeRules('simp2trad_noconvert.manual', s2t_1to1)

    # the super set for word to word conversion
    t2s_1to1_supp = t2s_1to1.copy()
    s2t_1to1_supp = s2t_1to1.copy()
    t2s_1to1_supp.update(customRules('trad2simp_supp_set.manual'))
    s2t_1to1_supp.update(customRules('simp2trad_supp_set.manual'))

    # word to word manual rules
    t2s_word2word_manual = manualWordsTable('simpphrases.manual', s2t_1to1_supp, t2s_1to1_supp)
    t2s_word2word_manual.update(customRules('toSimp.manual'))
    s2t_word2word_manual = manualWordsTable('tradphrases.manual', t2s_1to1_supp, s2t_1to1_supp)
    s2t_word2word_manual.update(customRules('toTrad.manual'))

    # word to word rules from input methods
    t_wordlist = set()
    s_wordlist = set()
    t_wordlist.update(ezbigParser(tbe_dest),
                      tsiParser(lbt_dest))
    s_wordlist.update(wubiParser(tbe_dest),
                      zrmParser(tbe_dest),
                      phraseParser(pyn_dest))

    # exclude
    s_wordlist = applyExcludes(s_wordlist, 'simpphrases_exclude.manual')
    t_wordlist = applyExcludes(t_wordlist, 'tradphrases_exclude.manual')

    s2t_supp = s2t_1to1_supp.copy()
    s2t_supp.update(s2t_word2word_manual)
    t2s_supp = t2s_1to1_supp.copy()
    t2s_supp.update(t2s_word2word_manual)

    # parse list to dict
    t2s_word2word = defaultWordsTable(s_wordlist, s_tomany, s2t_1to1_supp, t2s_supp)
    t2s_word2word.update(t2s_word2word_manual)
    s2t_word2word = defaultWordsTable(t_wordlist, t_tomany, t2s_1to1_supp, s2t_supp)
    s2t_word2word.update(s2t_word2word_manual)

    # Final tables
    # sorted list toHans
    t2s_1to1 = dict((f, t) for (f, t) in t2s_1to1.items() if f != t)
    toHans = dictToSortedList(t2s_1to1, 0) + dictToSortedList(t2s_word2word, 1)
    # sorted list toHant
    s2t_1to1 = dict((f, t) for (f, t) in s2t_1to1.items() if f != t)
    toHant = dictToSortedList(s2t_1to1, 0) + dictToSortedList(s2t_word2word, 1)
    # sorted list toCN
    toCN = dictToSortedList(customRules('toCN.manual'), 1)
    # sorted list toHK
    toHK = dictToSortedList(customRules('toHK.manual'), 1)
    # sorted list toSG
    toSG = dictToSortedList(customRules('toSG.manual'), 1)
    # sorted list toTW
    toTW = dictToSortedList(customRules('toTW.manual'), 1)

    # Get PHP Array
    php = '''<?php
/**
 * Simplified / Traditional Chinese conversion tables
 *
 * Automatically generated using code and data in includes/zhtable/
 * Do not modify directly!
 *
 * @file
 */

$zh2Hant = array(\n'''
    php += PHPArray(toHant) \
        + '\n);\n\n$zh2Hans = array(\n' \
        + PHPArray(toHans) \
        + '\n);\n\n$zh2TW = array(\n' \
        + PHPArray(toTW) \
        + '\n);\n\n$zh2HK = array(\n' \
        + PHPArray(toHK) \
        + '\n);\n\n$zh2CN = array(\n' \
        + PHPArray(toCN) \
        + '\n);\n\n$zh2SG = array(\n' \
        + PHPArray(toSG) \
        + '\n);'

    print('Writing ZhConversion.php ... ')
    # newline='\n' keeps the output byte-identical on every platform.
    with io.open('ZhConversion.php', 'w', encoding='utf-8', newline='\n') as f:
        f.write(_text_type(php))

    # Remove temp files
    print('Deleting temp files ... ')
    os.remove('EZ-Big.txt.in')
    os.remove('phrase_lib.txt')
    os.remove('tsi.src')
    os.remove('Unihan_Variants.txt')
    os.remove('Wubi.txt.in')
    os.remove('Ziranma.txt.in')


if __name__ == '__main__':
    main()