MediaWiki
REL1_24
|
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Generate MediaWiki's includes/ZhConversion.php Simplified/Traditional
# Chinese conversion tables from the Unicode Unihan database and word lists
# shipped with several open-source input methods (scim-tables, scim-pinyin,
# libtabe), combined with local manual rule files.
#
# @author Philip
import tarfile as tf
import zipfile as zf
import os, re, shutil, sys, platform

pyversion = platform.python_version()
islinux = platform.system().lower() == 'linux'

if pyversion[:3] in ['2.6', '2.7']:
    import urllib as urllib_request
    import codecs
    open = codecs.open
    _unichr = unichr
    if sys.maxunicode < 0x10000:
        # Narrow Python 2 build: emulate a wide unichr by emitting a
        # UTF-16 surrogate pair for code points beyond the BMP.
        def unichr(i):
            if i < 0x10000:
                return _unichr(i)
            else:
                return _unichr(0xD7C0 + (i >> 10)) + _unichr(0xDC00 + (i & 0x3FF))
elif pyversion[:2] == '3.':
    import urllib.request as urllib_request
    unichr = chr


def unichr2(*args):
    """Convert 'U+XXXX'-style code point strings to characters.

    Anything after a '<' (e.g. Unihan source tags such as 'U+4E00<kLau')
    is ignored.
    """
    return [unichr(int(i.split('<')[0][2:], 16)) for i in args]


def unichr3(*args):
    """Convert 'U+XXXXX'-style strings (up to 5 hex digits) to characters,
    skipping entries too short to contain a code point."""
    return [unichr(int(i[2:7], 16)) for i in args if i[2:7]]

# DEFINE
UNIHAN_VER = '6.3.0'
SF_MIRROR = 'dfn'
SCIM_TABLES_VER = '0.5.13'
SCIM_PINYIN_VER = '0.5.92'
LIBTABE_VER = '0.2.3'
# END OF DEFINE


def download(url, dest):
    """Download url into file dest, skipping the fetch if dest exists."""
    if os.path.isfile(dest):
        print('File %s is up to date.' % dest)
        return
    if islinux:
        # we use wget instead of urlretrieve under Linux,
        # because wget displays details like download progress
        os.system('wget %s -O %s' % (url, dest))
    else:
        print('Downloading from [%s] ...' % url)
        urllib_request.urlretrieve(url, dest)
    print('Download complete.\n')
    return


def uncompress(fp, member, encoding='U8'):
    """Extract member from an open archive into the current directory and
    return it opened for reading with the given text encoding.

    The extracted file is moved to its bare filename and any directory
    tree created by the extraction is removed.
    """
    name = member.rsplit('/', 1)[-1]
    print('Extracting %s ...' % name)
    fp.extract(member)
    shutil.move(member, name)
    if '/' in member:
        shutil.rmtree(member.split('/', 1)[0])
    if pyversion[:1] in ['2']:
        fc = open(name, 'rb', encoding, 'ignore')
    else:
        fc = open(name, 'r', encoding=encoding, errors='ignore')
    return fc

unzip = lambda path, member, encoding = 'U8': \
        uncompress( zf.ZipFile( path ), member, encoding )

untargz = lambda path, member, encoding = 'U8': \
        uncompress( tf.open( path, 'r:gz' ), member, encoding )


def parserCore(fp, pos, beginmark=None, endmark=None):
    """Collect column pos of each data line of fp into a set.

    Only lines between beginmark and endmark (when both are given) are
    considered; '#' comment lines, short lines, and single-character
    entries are skipped, so the result contains words only.
    """
    if beginmark and endmark:
        start = False
    else:
        start = True
    mlist = set()
    for line in fp:
        if beginmark and line.startswith(beginmark):
            start = True
            continue
        elif endmark and line.startswith(endmark):
            break
        if start and not line.startswith('#'):
            elems = line.split()
            if len(elems) < 2:
                continue
            elif len(elems[0]) > 1 and \
                    len(elems[pos]) > 1:  # words only
                mlist.add(elems[pos])
    return mlist


def tablesParser(path, name):
    """Read file from scim-tables and parse it."""
    src = 'scim-tables-%s/tables/zh/%s' % (SCIM_TABLES_VER, name)
    fp = untargz(path, src, 'U8')
    return parserCore(fp, 1, 'BEGIN_TABLE', 'END_TABLE')

ezbigParser = lambda path: tablesParser( path, 'EZ-Big.txt.in' )
wubiParser = lambda path: tablesParser( path, 'Wubi.txt.in' )
zrmParser = lambda path: tablesParser( path, 'Ziranma.txt.in' )


def phraseParser(path):
    """Read phrase_lib.txt from the scim-pinyin tarball and parse it."""
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    fp = untargz(path, src, 'U8')
    return parserCore(fp, 0)


def tsiParser(path):
    """Read tsi.src from the libtabe tarball and parse it."""
    src = 'libtabe/tsi-src/tsi.src'
    fp = untargz(path, src, 'big5hkscs')
    return parserCore(fp, 0)


def unihanParser(path):
    """Read Unihan_Variants.txt and build the character variant maps.

    Returns (t2s, s2t): one-to-many traditional->simplified and
    simplified->traditional character tables.
    """
    fp = unzip(path, 'Unihan_Variants.txt', 'U8')
    t2s = dict()
    s2t = dict()
    for line in fp:
        if line.startswith('#'):
            continue
        elems = line.split()
        if len(elems) < 3:
            continue
        # Field layout: <code point> <variant kind> <variant(s)...>
        # ('vtype' rather than 'type' to avoid shadowing the builtin.)
        vtype = elems.pop(1)
        elems = unichr2(*elems)
        if vtype == 'kTraditionalVariant':
            s2t[elems[0]] = elems[1:]
        elif vtype == 'kSimplifiedVariant':
            t2s[elems[0]] = elems[1:]
    fp.close()
    return (t2s, s2t)


def applyExcludes(mlist, path):
    """Apply exclude rules from path to mlist.

    Every word matching any exclude pattern is removed from the set,
    which is mutated in place and also returned.
    """
    if pyversion[:1] in ['2']:
        excludes = open(path, 'rb', 'U8').read().split()
    else:
        excludes = open(path, 'r', encoding='U8').read().split()
    excludes = [word.split('#')[0].strip() for word in excludes]
    excludes = '|'.join(excludes)
    excptn = re.compile('.*(?:%s).*' % excludes)
    diff = [mword for mword in mlist if excptn.search(mword)]
    mlist.difference_update(diff)
    return mlist


def charManualTable(path):
    """Read a manual character table ('U+XXXXX|U+XXXXX|...' per line,
    '#' starts a comment) into a one-to-many dict."""
    fp = open(path, 'r', encoding='U8')
    ret = {}
    for line in fp:
        elems = line.split('#')[0].split('|')
        elems = unichr3(*elems)
        if len(elems) > 1:
            ret[elems[0]] = elems[1:]
    return ret


def toManyRules(src_table):
    """Collect every character that appears as a non-first variant,
    i.e. the targets of one-to-many conversions."""
    tomany = set()
    if pyversion[:1] in ['2']:
        for (f, t) in src_table.iteritems():
            for i in range(1, len(t)):
                tomany.add(t[i])
    else:
        for (f, t) in src_table.items():
            for i in range(1, len(t)):
                tomany.add(t[i])
    return tomany


def removeRules(path, table):
    """Drop rules listed in path from table (mutated in place).

    Each line is '"f"' (remove the mapping for f) or '"f" => "t"'
    (additionally remove every mapping whose target equals t).
    """
    fp = open(path, 'r', encoding='U8')
    texc = list()
    for line in fp:
        elems = line.split('=>')
        f = t = elems[0].strip()
        if len(elems) == 2:
            t = elems[1].strip()
        f = f.strip('"').strip("'")
        t = t.strip('"').strip("'")
        if f:
            # f may legitimately be absent from the table already.
            try:
                table.pop(f)
            except KeyError:
                pass
        if t:
            texc.append(t)
    texcptn = re.compile('^(?:%s)$' % '|'.join(texc))
    if pyversion[:1] in ['2']:
        for (tmp_f, tmp_t) in table.copy().iteritems():
            if texcptn.match(tmp_t):
                table.pop(tmp_f)
    else:
        for (tmp_f, tmp_t) in table.copy().items():
            if texcptn.match(tmp_t):
                table.pop(tmp_f)
    return table


def customRules(path):
    """Read a 'from to' mapping file into a dict; '#' starts a comment."""
    fp = open(path, 'r', encoding='U8')
    ret = dict()
    for line in fp:
        elems = line.split('#')[0].split()
        if len(elems) > 1:
            ret[elems[0]] = elems[1]
    return ret


def dictToSortedList(src_table, pos):
    """Return src_table's items sorted by key (pos=0) or value (pos=1)."""
    return sorted(src_table.items(), key=lambda m: m[pos])


def translate(text, conv_table):
    """Convert text with conv_table using greedy longest-match-first
    substitution starting at each position."""
    i = 0
    while i < len(text):
        for j in range(len(text) - i, 0, -1):
            f = text[i:][:j]
            t = conv_table.get(f)
            if t:
                text = text[:i] + t + text[i:][j:]
                i += len(t) - 1
                break
        i += 1
    return text


def manualWordsTable(path, conv_table, reconv_table):
    """Build a reverse conversion table for manually listed phrases.

    NOTE: the reconv_table parameter is rebound to a fresh dict before
    use (kept for interface compatibility with existing callers); the
    caller's dict is never modified.
    """
    fp = open(path, 'r', encoding='U8')
    reconv_table = {}
    wordlist = [line.split('#')[0].strip() for line in fp]
    wordlist = list(set(wordlist))
    wordlist.sort(key=len, reverse=True)
    while wordlist:
        word = wordlist.pop()
        new_word = translate(word, conv_table)
        # Round trip through the rules accumulated so far.
        rcv_word = translate(word, reconv_table)
        if word != rcv_word:
            reconv_table[word] = word
        reconv_table[new_word] = word
    return reconv_table


def defaultWordsTable(src_wordlist, src_tomany, char_conv_table, char_reconv_table):
    """Build word-level reverse conversion rules for words whose
    character-level round trip is lossy or hits a one-to-many variant."""
    wordlist = list(src_wordlist)
    wordlist.sort(key=len, reverse=True)
    word_conv_table = {}
    word_reconv_table = {}
    conv_table = char_conv_table.copy()
    reconv_table = char_reconv_table.copy()
    tomanyptn = re.compile('(?:%s)' % '|'.join(src_tomany))
    while wordlist:
        # Fold the rules discovered so far back in before processing the
        # next batch of equal-length words.
        conv_table.update(word_conv_table)
        reconv_table.update(word_reconv_table)
        word = wordlist.pop()
        new_word_len = word_len = len(word)
        while new_word_len == word_len:
            test_word = translate(word, reconv_table)
            new_word = translate(word, conv_table)
            if not reconv_table.get(new_word) \
                    and (test_word != word
                         or (tomanyptn.search(word)
                             and word != translate(new_word, reconv_table))):
                word_conv_table[word] = new_word
                word_reconv_table[new_word] = word
            try:
                word = wordlist.pop()
            except IndexError:
                break
            new_word_len = len(word)
    return word_reconv_table


def PHPArray(table):
    """Render (from, to) pairs as the body lines of a PHP array literal,
    skipping pairs with an empty side."""
    lines = ['\'%s\' => \'%s\',' % (f, t) for (f, t) in table if f and t]
    return '\n'.join(lines)


def main():
    """Fetch all sources, build the conversion tables and write
    includes/ZhConversion.php, then clean up temporary files."""
    # Get Unihan.zip:
    url = 'http://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER
    han_dest = 'Unihan.zip'
    download(url, han_dest)

    # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % (SF_MIRROR, SCIM_TABLES_VER)
    tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    download(url, tbe_dest)

    # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % (SF_MIRROR, SCIM_PINYIN_VER)
    pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    download(url, pyn_dest)

    # Get libtabe-$(LIBTABE_VER).tgz:
    url = 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % (SF_MIRROR, LIBTABE_VER)
    lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
    download(url, lbt_dest)

    # Unihan.txt
    (t2s_1tomany, s2t_1tomany) = unihanParser(han_dest)

    t2s_1tomany.update(charManualTable('trad2simp.manual'))
    s2t_1tomany.update(charManualTable('simp2trad.manual'))

    # One-to-one tables: keep only the first variant of each character.
    if pyversion[:1] in ['2']:
        t2s_1to1 = dict([(f, t[0]) for (f, t) in t2s_1tomany.iteritems()])
        s2t_1to1 = dict([(f, t[0]) for (f, t) in s2t_1tomany.iteritems()])
    else:
        t2s_1to1 = dict([(f, t[0]) for (f, t) in t2s_1tomany.items()])
        s2t_1to1 = dict([(f, t[0]) for (f, t) in s2t_1tomany.items()])

    s_tomany = toManyRules(t2s_1tomany)
    t_tomany = toManyRules(s2t_1tomany)

    # noconvert rules
    t2s_1to1 = removeRules('trad2simp_noconvert.manual', t2s_1to1)
    s2t_1to1 = removeRules('simp2trad_noconvert.manual', s2t_1to1)

    # the superset for word to word conversion
    t2s_1to1_supp = t2s_1to1.copy()
    s2t_1to1_supp = s2t_1to1.copy()
    t2s_1to1_supp.update(customRules('trad2simp_supp_set.manual'))
    s2t_1to1_supp.update(customRules('simp2trad_supp_set.manual'))

    # word to word manual rules
    t2s_word2word_manual = manualWordsTable('simpphrases.manual', s2t_1to1_supp, t2s_1to1_supp)
    t2s_word2word_manual.update(customRules('toSimp.manual'))
    s2t_word2word_manual = manualWordsTable('tradphrases.manual', t2s_1to1_supp, s2t_1to1_supp)
    s2t_word2word_manual.update(customRules('toTrad.manual'))

    # word to word rules from input methods
    t_wordlist = set()
    s_wordlist = set()
    t_wordlist.update(ezbigParser(tbe_dest),
                      tsiParser(lbt_dest))
    s_wordlist.update(wubiParser(tbe_dest),
                      zrmParser(tbe_dest),
                      phraseParser(pyn_dest))

    # exclude
    s_wordlist = applyExcludes(s_wordlist, 'simpphrases_exclude.manual')
    t_wordlist = applyExcludes(t_wordlist, 'tradphrases_exclude.manual')

    s2t_supp = s2t_1to1_supp.copy()
    s2t_supp.update(s2t_word2word_manual)
    t2s_supp = t2s_1to1_supp.copy()
    t2s_supp.update(t2s_word2word_manual)

    # parse list to dict
    t2s_word2word = defaultWordsTable(s_wordlist, s_tomany, s2t_1to1_supp, t2s_supp)
    t2s_word2word.update(t2s_word2word_manual)
    s2t_word2word = defaultWordsTable(t_wordlist, t_tomany, t2s_1to1_supp, s2t_supp)
    s2t_word2word.update(s2t_word2word_manual)

    # Final tables
    # sorted list toHans
    if pyversion[:1] in ['2']:
        t2s_1to1 = dict([(f, t) for (f, t) in t2s_1to1.iteritems() if f != t])
    else:
        t2s_1to1 = dict([(f, t) for (f, t) in t2s_1to1.items() if f != t])
    toHans = dictToSortedList(t2s_1to1, 0) + dictToSortedList(t2s_word2word, 1)
    # sorted list toHant
    if pyversion[:1] in ['2']:
        s2t_1to1 = dict([(f, t) for (f, t) in s2t_1to1.iteritems() if f != t])
    else:
        s2t_1to1 = dict([(f, t) for (f, t) in s2t_1to1.items() if f != t])
    toHant = dictToSortedList(s2t_1to1, 0) + dictToSortedList(s2t_word2word, 1)
    # sorted list toCN
    toCN = dictToSortedList(customRules('toCN.manual'), 1)
    # sorted list toHK
    toHK = dictToSortedList(customRules('toHK.manual'), 1)
    # sorted list toSG
    toSG = dictToSortedList(customRules('toSG.manual'), 1)
    # sorted list toTW
    toTW = dictToSortedList(customRules('toTW.manual'), 1)

    # Get PHP Array
    php = '''<?php
/**
 * Simplified / Traditional Chinese conversion tables
 *
 * Automatically generated using code and data in maintenance/language/zhtable/
 * Do not modify directly!
 *
 * @file
 */

$zh2Hant = array(\n'''
    php += PHPArray(toHant) \
        + '\n);\n\n$zh2Hans = array(\n' \
        + PHPArray(toHans) \
        + '\n);\n\n$zh2TW = array(\n' \
        + PHPArray(toTW) \
        + '\n);\n\n$zh2HK = array(\n' \
        + PHPArray(toHK) \
        + '\n);\n\n$zh2CN = array(\n' \
        + PHPArray(toCN) \
        + '\n);\n\n$zh2SG = array(\n' \
        + PHPArray(toSG) \
        + '\n);\n'

    if pyversion[:1] in ['2']:
        f = open(os.path.join('..', '..', '..', 'includes', 'ZhConversion.php'), 'wb', encoding='utf8')
    else:
        f = open(os.path.join('..', '..', '..', 'includes', 'ZhConversion.php'), 'w', buffering=4096, encoding='utf8')
    print('Writing ZhConversion.php ... ')
    f.write(php)
    f.close()

    # Remove temporary files
    print('Deleting temporary files ... ')
    os.remove('EZ-Big.txt.in')
    os.remove('phrase_lib.txt')
    os.remove('tsi.src')
    os.remove('Unihan_Variants.txt')
    os.remove('Wubi.txt.in')
    os.remove('Ziranma.txt.in')


if __name__ == '__main__':
    main()