[ Index ]

PHP Cross Reference of MediaWiki-1.24.0

title

Body

[close]

/maintenance/language/zhtable/ -> Makefile.py (source)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @author Philip
#
# Python 2/3 compatibility shims for the zhtable build script: pick the
# right urllib module, make open() encoding-aware on Python 2, and provide
# a unichr() that can produce non-BMP characters on narrow builds.
import tarfile as tf
import zipfile as zf
import os, re, shutil, sys, platform

# e.g. '2.7.5' or '3.4.0'; the version checks below key off this string
pyversion = platform.python_version()
islinux = platform.system().lower() == 'linux'

if pyversion[:3] in ['2.6', '2.7']:
    import urllib as urllib_request
    import codecs
    # let open() accept an encoding argument, like the Python 3 builtin
    open = codecs.open
    _unichr = unichr
    if sys.maxunicode < 0x10000:
        # narrow build: emulate a wide unichr() via a UTF-16 surrogate pair
        def unichr(i):
            if i < 0x10000:
                return _unichr(i)
            else:
                # 0xD7C0 + (i >> 10) == 0xD800 + ((i - 0x10000) >> 10)
                return _unichr( 0xD7C0 + ( i>>10 ) ) + _unichr( 0xDC00 + ( i & 0x3FF ) )
elif pyversion[:2] == '3.':
    import urllib.request as urllib_request
    unichr = chr
  26  def unichr2( *args ):
  27      return [unichr( int( i.split('<')[0][2:], 16 ) ) for i in args]
  28  
  29  def unichr3( *args ):
  30      return [unichr( int( i[2:7], 16 ) ) for i in args if i[2:7]]
  31  
# DEFINE
UNIHAN_VER = '6.3.0'        # Unicode Unihan database release to download
SF_MIRROR = 'dfn'           # SourceForge mirror prefix used in download URLs
SCIM_TABLES_VER = '0.5.13'  # scim-tables release (EZ-Big / Wubi / Ziranma tables)
SCIM_PINYIN_VER = '0.5.92'  # scim-pinyin release (phrase_lib.txt)
LIBTABE_VER = '0.2.3'       # libtabe release (tsi.src)
# END OF DEFINE
  39  
  40  def download( url, dest ):
  41      if os.path.isfile( dest ):
  42          print( 'File %s is up to date.' % dest )
  43          return
  44      global islinux
  45      if islinux:
  46          # we use wget instead urlretrieve under Linux, 
  47          # because wget could display details like download progress
  48          os.system( 'wget %s -O %s' % ( url, dest ) )
  49      else:
  50          print( 'Downloading from [%s] ...' % url )
  51          urllib_request.urlretrieve( url, dest )
  52          print( 'Download complete.\n' )
  53      return
  54  
  55  def uncompress( fp, member, encoding = 'U8' ):
  56      name = member.rsplit( '/', 1 )[-1]
  57      print( 'Extracting %s ...' % name )
  58      fp.extract( member )
  59      shutil.move( member, name )
  60      if '/' in member:
  61          shutil.rmtree( member.split( '/', 1 )[0] )
  62      if pyversion[:1] in ['2']:
  63          fc = open( name, 'rb', encoding, 'ignore' )
  64      else:
  65          fc = open( name, 'r', encoding = encoding, errors = 'ignore' )
  66      return fc
  67  
  68  unzip = lambda path, member, encoding = 'U8': \
  69          uncompress( zf.ZipFile( path ), member, encoding )
  70  
  71  untargz = lambda path, member, encoding = 'U8': \
  72          uncompress( tf.open( path, 'r:gz' ), member, encoding )
  73  
  74  def parserCore( fp, pos, beginmark = None, endmark = None ):
  75      if beginmark and endmark:
  76          start = False
  77      else: start = True
  78      mlist = set()
  79      for line in fp:
  80          if beginmark and line.startswith( beginmark ):
  81              start = True
  82              continue
  83          elif endmark and line.startswith( endmark ):
  84              break
  85          if start and not line.startswith( '#' ):
  86              elems = line.split()
  87              if len( elems ) < 2:
  88                  continue
  89              elif len( elems[0] ) > 1 and \
  90                  len( elems[pos] ) > 1: # words only
  91                  mlist.add( elems[pos] )
  92      return mlist
  93  
  94  def tablesParser( path, name ):
  95      """ Read file from scim-tables and parse it. """
  96      global SCIM_TABLES_VER
  97      src = 'scim-tables-%s/tables/zh/%s' % ( SCIM_TABLES_VER, name )
  98      fp = untargz( path, src, 'U8' )
  99      return parserCore( fp, 1, 'BEGIN_TABLE', 'END_TABLE' )
 100  
 101  ezbigParser = lambda path: tablesParser( path, 'EZ-Big.txt.in' )
 102  wubiParser = lambda path: tablesParser( path, 'Wubi.txt.in' )
 103  zrmParser = lambda path: tablesParser( path, 'Ziranma.txt.in' )
 104  
 105  def phraseParser( path ):
 106      """ Read phrase_lib.txt and parse it. """
 107      global SCIM_PINYIN_VER
 108      src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
 109      dst = 'phrase_lib.txt'
 110      fp = untargz( path, src, 'U8' )
 111      return parserCore( fp, 0 )
 112  
 113  def tsiParser( path ):
 114      """ Read tsi.src and parse it. """
 115      src = 'libtabe/tsi-src/tsi.src'
 116      dst = 'tsi.src'
 117      fp = untargz( path, src, 'big5hkscs' )
 118      return parserCore( fp, 0 )
 119  
 120  def unihanParser( path ):
 121      """ Read Unihan_Variants.txt and parse it. """
 122      fp = unzip( path, 'Unihan_Variants.txt', 'U8' )
 123      t2s = dict()
 124      s2t = dict()
 125      for line in fp:
 126          if line.startswith( '#' ):
 127              continue
 128          else:
 129              elems = line.split()
 130              if len( elems ) < 3:
 131                  continue
 132              type = elems.pop( 1 )
 133              elems = unichr2( *elems )
 134              if type == 'kTraditionalVariant':
 135                  s2t[elems[0]] = elems[1:]
 136              elif type == 'kSimplifiedVariant':
 137                  t2s[elems[0]] = elems[1:]
 138      fp.close()
 139      return ( t2s, s2t )
 140  
 141  def applyExcludes( mlist, path ):
 142      """ Apply exclude rules from path to mlist. """
 143      if pyversion[:1] in ['2']:
 144          excludes = open( path, 'rb', 'U8' ).read().split()
 145      else:
 146          excludes = open( path, 'r', encoding = 'U8' ).read().split()
 147      excludes = [word.split( '#' )[0].strip() for word in excludes]
 148      excludes = '|'.join( excludes )
 149      excptn = re.compile( '.*(?:%s).*' % excludes )
 150      diff = [mword for mword in mlist if excptn.search( mword )]
 151      mlist.difference_update( diff )
 152      return mlist
 153  
 154  def charManualTable( path ):
 155      fp = open( path, 'r', encoding = 'U8' )
 156      ret = {}
 157      for line in fp:
 158          elems = line.split( '#' )[0].split( '|' )
 159          elems = unichr3( *elems )
 160          if len( elems ) > 1:
 161              ret[elems[0]] = elems[1:]
 162      return ret
 163          
 164  def toManyRules( src_table ):
 165      tomany = set()
 166      if pyversion[:1] in ['2']:
 167          for ( f, t ) in src_table.iteritems():
 168              for i in range( 1, len( t ) ):
 169                  tomany.add( t[i] )
 170      else:
 171          for ( f, t ) in src_table.items():
 172              for i in range( 1, len( t ) ):
 173                  tomany.add( t[i] )
 174      return tomany
 175  
 176  def removeRules( path, table ):
 177      fp = open( path, 'r', encoding = 'U8' )
 178      texc = list()
 179      for line in fp:
 180          elems = line.split( '=>' )
 181          f = t = elems[0].strip()
 182          if len( elems ) == 2:
 183              t = elems[1].strip()
 184          f = f.strip('"').strip("'")
 185          t = t.strip('"').strip("'")
 186          if f:
 187              try:
 188                  table.pop( f )
 189              except:
 190                  pass
 191          if t:
 192              texc.append( t )
 193      texcptn = re.compile( '^(?:%s)$' % '|'.join( texc ) )
 194      if pyversion[:1] in ['2']:
 195          for (tmp_f, tmp_t) in table.copy().iteritems():
 196              if texcptn.match( tmp_t ):
 197                  table.pop( tmp_f )
 198      else:
 199          for (tmp_f, tmp_t) in table.copy().items():
 200              if texcptn.match( tmp_t ):
 201                  table.pop( tmp_f )
 202      return table
 203  
 204  def customRules( path ):
 205      fp = open( path, 'r', encoding = 'U8' )
 206      ret = dict()
 207      for line in fp:
 208          elems = line.split( '#' )[0].split()
 209          if len( elems ) > 1:
 210              ret[elems[0]] = elems[1]
 211      return ret
 212  
 213  def dictToSortedList( src_table, pos ):
 214      return sorted( src_table.items(), key = lambda m: m[pos] )
 215  
 216  def translate( text, conv_table ):
 217      i = 0
 218      while i < len( text ):
 219          for j in range( len( text ) - i, 0, -1 ):
 220              f = text[i:][:j]
 221              t = conv_table.get( f )
 222              if t:
 223                  text = text[:i] + t + text[i:][j:]
 224                  i += len(t) - 1
 225                  break
 226          i += 1
 227      return text
 228  
 229  def manualWordsTable( path, conv_table, reconv_table ):
 230      fp = open( path, 'r', encoding = 'U8' )
 231      reconv_table = {}
 232      wordlist = [line.split( '#' )[0].strip() for line in fp]
 233      wordlist = list( set( wordlist ) )
 234      wordlist.sort( key = len, reverse = True )
 235      while wordlist:
 236          word = wordlist.pop()
 237          new_word = translate( word, conv_table )
 238          rcv_word = translate( word, reconv_table )
 239          if word != rcv_word:
 240              reconv_table[word] = word
 241          reconv_table[new_word] = word
 242      return reconv_table
 243  
def defaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv_table ):
    """Derive a word-to-word reverse-conversion table from a word list.

    src_wordlist: words in the source script; src_tomany: characters that
    have several target variants; char_conv_table / char_reconv_table:
    one-to-one char tables for the forward / reverse direction.  Returns a
    dict mapping each converted word back to its original form.
    """
    wordlist = list( src_wordlist )
    # longest first, so .pop() hands back the shortest words first
    wordlist.sort( key = len, reverse = True )
    word_conv_table = {}
    word_reconv_table = {}
    conv_table = char_conv_table.copy()
    reconv_table = char_reconv_table.copy()
    # words containing a one-to-many char are candidates even when they
    # round-trip unchanged at the character level
    tomanyptn = re.compile( '(?:%s)' % '|'.join( src_tomany ) )
    while wordlist:
        # fold the rules discovered so far into the working tables; the
        # inner loop runs through one group of equal-length words between
        # refreshes
        conv_table.update( word_conv_table )
        reconv_table.update( word_reconv_table )
        word = wordlist.pop()
        new_word_len = word_len = len( word )
        while new_word_len == word_len:
            add = False  # NOTE(review): unused leftover, kept verbatim
            test_word = translate( word, reconv_table )
            new_word = translate( word, conv_table )
            # record the pair when converting is not already reversible:
            # either the reverse table changes the word, or it contains a
            # one-to-many char and does not round-trip through new_word
            if not reconv_table.get( new_word ) \
               and ( test_word != word \
               or ( tomanyptn.search( word ) \
               and word != translate( new_word, reconv_table ) ) ):
                word_conv_table[word] = new_word
                word_reconv_table[new_word] = word
            try:
                word = wordlist.pop()
            except IndexError:
                break
            new_word_len = len(word)
    return word_reconv_table
 273  
 274  def PHPArray( table ):
 275      lines = ['\'%s\' => \'%s\',' % (f, t) for (f, t) in table if f and t]
 276      return '\n'.join(lines)
 277  
def main():
    """Build includes/ZhConversion.php: download the upstream data files,
    parse them into conversion tables, render the tables as PHP arrays,
    and delete the extracted temporaries."""
    # Get Unihan.zip:
    url = 'http://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER
    han_dest = 'Unihan.zip'
    download( url, han_dest )
    
    # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
    url  = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR, SCIM_TABLES_VER )
    tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    download( url, tbe_dest )
    
    # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
    url  = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR, SCIM_PINYIN_VER )
    pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    download( url, pyn_dest )
    
    # Get libtabe-$(LIBTABE_VER).tgz:
    url  = 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR, LIBTABE_VER )
    lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
    download( url, lbt_dest )
    
    # Unihan.txt: character-level one-to-many variant tables
    ( t2s_1tomany, s2t_1tomany ) = unihanParser( han_dest )

    t2s_1tomany.update( charManualTable( 'trad2simp.manual' ) )
    s2t_1tomany.update( charManualTable( 'simp2trad.manual' ) )
    
    # keep only the first variant for the one-to-one char tables
    if pyversion[:1] in ['2']:
      t2s_1to1 = dict( [( f, t[0] ) for ( f, t ) in t2s_1tomany.iteritems()] )
      s2t_1to1 = dict( [( f, t[0] ) for ( f, t ) in s2t_1tomany.iteritems()] )
    else:
      t2s_1to1 = dict( [( f, t[0] ) for ( f, t ) in t2s_1tomany.items()] )
      s2t_1to1 = dict( [( f, t[0] ) for ( f, t ) in s2t_1tomany.items()] )
    
    # characters with multiple variants in the target script
    s_tomany = toManyRules( t2s_1tomany )
    t_tomany = toManyRules( s2t_1tomany )

    # noconvert rules
    t2s_1to1 = removeRules( 'trad2simp_noconvert.manual', t2s_1to1 )
    s2t_1to1 = removeRules( 'simp2trad_noconvert.manual', s2t_1to1 )
    
    # the superset for word to word conversion
    t2s_1to1_supp = t2s_1to1.copy()
    s2t_1to1_supp = s2t_1to1.copy()
    t2s_1to1_supp.update( customRules( 'trad2simp_supp_set.manual' ) )
    s2t_1to1_supp.update( customRules( 'simp2trad_supp_set.manual' ) )
    
    # word to word manual rules
    t2s_word2word_manual = manualWordsTable( 'simpphrases.manual', s2t_1to1_supp, t2s_1to1_supp )
    t2s_word2word_manual.update( customRules( 'toSimp.manual' ) )
    s2t_word2word_manual = manualWordsTable( 'tradphrases.manual', t2s_1to1_supp, s2t_1to1_supp )
    s2t_word2word_manual.update( customRules( 'toTrad.manual' ) )

    # word to word rules from input methods
    t_wordlist = set()
    s_wordlist = set()
    t_wordlist.update( ezbigParser( tbe_dest ),
                       tsiParser( lbt_dest ) )
    s_wordlist.update( wubiParser( tbe_dest ),
                       zrmParser( tbe_dest ),
                       phraseParser( pyn_dest ) )

    # exclude
    s_wordlist = applyExcludes( s_wordlist, 'simpphrases_exclude.manual' )
    t_wordlist = applyExcludes( t_wordlist, 'tradphrases_exclude.manual' )

    s2t_supp = s2t_1to1_supp.copy()
    s2t_supp.update( s2t_word2word_manual )
    t2s_supp = t2s_1to1_supp.copy()
    t2s_supp.update( t2s_word2word_manual )

    # parse list to dict
    t2s_word2word = defaultWordsTable( s_wordlist, s_tomany, s2t_1to1_supp, t2s_supp )
    t2s_word2word.update( t2s_word2word_manual )
    s2t_word2word = defaultWordsTable( t_wordlist, t_tomany, t2s_1to1_supp, s2t_supp )
    s2t_word2word.update( s2t_word2word_manual )
    
    # Final tables
    # sorted list toHans (drop identity char pairs first)
    if pyversion[:1] in ['2']:
        t2s_1to1 = dict( [( f, t ) for ( f, t ) in t2s_1to1.iteritems() if f != t] )
    else:
        t2s_1to1 = dict( [( f, t ) for ( f, t ) in t2s_1to1.items() if f != t] )
    toHans = dictToSortedList( t2s_1to1, 0 ) + dictToSortedList( t2s_word2word, 1 )
    # sorted list toHant
    if pyversion[:1] in ['2']:
        s2t_1to1 = dict( [( f, t ) for ( f, t ) in s2t_1to1.iteritems() if f != t] )
    else:
        s2t_1to1 = dict( [( f, t ) for ( f, t ) in s2t_1to1.items() if f != t] )
    toHant = dictToSortedList( s2t_1to1, 0 ) + dictToSortedList( s2t_word2word, 1 )
    # sorted list toCN
    toCN = dictToSortedList( customRules( 'toCN.manual' ), 1 )
    # sorted list toHK
    toHK = dictToSortedList( customRules( 'toHK.manual' ), 1 )
    # sorted list toSG
    toSG = dictToSortedList( customRules( 'toSG.manual' ), 1 )
    # sorted list toTW
    toTW = dictToSortedList( customRules( 'toTW.manual' ), 1 )
    
    # Get PHP Array
    php = '''<?php
/**
 * Simplified / Traditional Chinese conversion tables
 *
 * Automatically generated using code and data in maintenance/language/zhtable/
 * Do not modify directly!
 *
 * @file
 */

$zh2Hant = array(\n'''
    php += PHPArray( toHant ) \
        +  '\n);\n\n$zh2Hans = array(\n' \
        +  PHPArray( toHans ) \
        +  '\n);\n\n$zh2TW = array(\n' \
        +  PHPArray( toTW ) \
        +  '\n);\n\n$zh2HK = array(\n' \
        +  PHPArray( toHK ) \
        +  '\n);\n\n$zh2CN = array(\n' \
        +  PHPArray( toCN ) \
        +  '\n);\n\n$zh2SG = array(\n' \
        +  PHPArray( toSG ) \
        +  '\n);\n'
    
    if pyversion[:1] in ['2']:
        f = open( os.path.join( '..', '..', '..', 'includes', 'ZhConversion.php' ), 'wb', encoding = 'utf8' )
    else:
        f = open( os.path.join( '..', '..', '..', 'includes', 'ZhConversion.php' ), 'w', buffering = 4096, encoding = 'utf8' )
    print ('Writing ZhConversion.php ... ')
    f.write( php )
    f.close()
    
    # Remove temporary files
    print ('Deleting temporary files ... ')
    os.remove('EZ-Big.txt.in')
    os.remove('phrase_lib.txt')
    os.remove('tsi.src')
    os.remove('Unihan_Variants.txt')
    os.remove('Wubi.txt.in')
    os.remove('Ziranma.txt.in')
    
 419  
# Run the build only when executed as a script, not when imported.
if __name__ == '__main__':
    main()


Generated: Fri Nov 28 14:03:12 2014 Cross-referenced by PHPXref 0.7.1