MediaWiki  master
Makefile.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # @author Philip
4 import os
5 import platform
6 import re
7 import shutil
8 import sys
9 import tarfile
10 import zipfile
11 
# Interpreter / platform detection used by the rest of the script.
pyversion = platform.python_version()
islinux = 'linux' == platform.system().lower()

if pyversion.startswith(('2.6', '2.7')):
    # Python 2: route file access through codecs.open (so callers can pass
    # an encoding) and expose urllib under the shared name urllib_request.
    import codecs
    import urllib as urllib_request
    open = codecs.open
    _unichr = unichr
    if sys.maxunicode < 0x10000:
        # The builtin cannot return code points >= 0x10000 here, so build
        # the two-unit surrogate pair by hand.
        def unichr(i):
            if i >= 0x10000:
                return _unichr(0xD7C0 + (i >> 10)) + _unichr(0xDC00 + (i & 0x3FF))
            return _unichr(i)
elif pyversion.startswith('3.'):
    # Python 3: chr() already handles every code point.
    import urllib.request as urllib_request
    unichr = chr
29 
30 
def unichr2(*args):
    """Convert code-point tokens to characters.

    For each argument, the text before the first '<' is taken, its first
    two characters are dropped, and the remainder is parsed as a
    hexadecimal code point (presumably tokens look like 'U+4E00' or
    'U+4E00<source' -- confirm against the Unihan data).

    Returns a list with one character per argument.
    """
    chars = []
    for token in args:
        hex_digits = token.split('<')[0][2:]
        chars.append(unichr(int(hex_digits, 16)))
    return chars
33 
34 
def unichr3(*args):
    """Convert fixed-position code-point tokens to characters.

    Characters 2..6 of each argument (at most five hex digits after a
    two-character prefix) are parsed as a hexadecimal code point.
    Arguments that are too short to contain any digits are skipped.

    Returns the list of resulting characters.
    """
    chars = []
    for token in args:
        hex_digits = token[2:7]
        if hex_digits:
            chars.append(unichr(int(hex_digits, 16)))
    return chars
37 
# DEFINE
# Versions (and the SourceForge mirror) of the upstream data sets.  main()
# interpolates these into the download URLs and local archive names, and
# the parsers use them to locate members inside the archives.
UNIHAN_VER = '6.3.0'        # Unicode release used for Unihan.zip
SF_MIRROR = 'dfn'           # host prefix of <mirror>.dl.sourceforge.net
SCIM_TABLES_VER = '0.5.13'  # scim-tables tarball
SCIM_PINYIN_VER = '0.5.92'  # scim-pinyin tarball
LIBTABE_VER = '0.2.3'       # libtabe tarball
# END OF DEFINE
45 
46 
def download(url, dest):
    """Fetch *url* into the local file *dest* unless *dest* already exists.

    On Linux the transfer is delegated to ``wget`` (it shows progress);
    elsewhere ``urllib_request.urlretrieve`` is used.

    Args:
        url: Address of the resource to fetch.
        dest: Path of the local file to create.

    Raises:
        IOError: ``wget`` exited with a non-zero status.  Any partial file
            is removed first so the next run does not report it as
            "up to date".
        OSError: ``wget`` could not be started.
    """
    if os.path.isfile(dest):
        print('File %s is up to date.' % dest)
        return
    if islinux:
        import subprocess
        # we use wget instead urlretrieve under Linux,
        # because wget could display details like download progress.
        # An argument list (no shell) keeps URLs containing '&', spaces or
        # quotes from being re-interpreted by the shell.
        status = subprocess.call(['wget', url, '-O', dest])
        if status != 0:
            # "wget -O" creates the output file even when the transfer
            # fails; drop it so it is not mistaken for a finished download.
            if os.path.isfile(dest):
                os.remove(dest)
            raise IOError('wget exited with status %d while fetching %s'
                          % (status, url))
    else:
        print('Downloading from [%s] ...' % url)
        urllib_request.urlretrieve(url, dest)
        print('Download complete.\n')
    return
61 
62 
def uncompress(fp, member, encoding='U8'):
    """Extract *member* from the open archive *fp* and open it as text.

    The member is extracted relative to the current working directory,
    moved so that only its base name remains there, and the top-level
    directory created by the extraction is removed.

    Args:
        fp: An open archive object providing ``extract(member)``
            (``zipfile.ZipFile`` or ``tarfile.TarFile``).  It is not closed
            here.
        member: Archive member path, using '/' as separator.
        encoding: Text encoding used to decode the extracted file.

    Returns:
        A text-mode file object for the extracted file; undecodable bytes
        are ignored.  The caller is responsible for closing it.
    """
    # io.open behaves like the Python 3 builtin on both Python 2.6+ and 3,
    # so no interpreter-specific branch is needed.
    import io

    name = member.rsplit('/', 1)[-1]
    print('Extracting %s ...' % name)
    fp.extract(member)
    if member != name:
        # Only nested members need relocating; a flat member is already in
        # place and must not be "moved" onto itself.
        shutil.move(member, name)
        shutil.rmtree(member.split('/', 1)[0])
    return io.open(name, 'r', encoding=encoding, errors='ignore')


def unzip(path, member, encoding='U8'):
    """Extract *member* from the zip archive at *path*; see uncompress()."""
    archive = zipfile.ZipFile(path)
    try:
        return uncompress(archive, member, encoding)
    finally:
        # The returned file is independent of the archive handle.
        archive.close()


def untargz(path, member, encoding='U8'):
    """Extract *member* from the gzipped tarball at *path*; see uncompress()."""
    archive = tarfile.open(path, 'r:gz')
    try:
        return uncompress(archive, member, encoding)
    finally:
        archive.close()
81 
82 
def parserCore(fp, pos, beginmark=None, endmark=None):
    """Collect multi-character words from a whitespace-separated table.

    Args:
        fp: Iterable of text lines.
        pos: Index of the field to collect from each line.
        beginmark: Line prefix that switches collection on.
        endmark: Line prefix that stops the scan.

    Collection is on from the first line unless both marks are given, in
    which case it starts after the first line beginning with *beginmark*.
    Lines starting with '#' and lines with fewer than two fields are
    skipped.  A field is kept only when both the first field and the field
    at *pos* are longer than one character.

    Returns:
        The set of collected words.
    """
    collecting = not (beginmark and endmark)
    words = set()
    for raw in fp:
        if beginmark and raw.startswith(beginmark):
            collecting = True
            continue
        if endmark and raw.startswith(endmark):
            break
        if not collecting or raw.startswith('#'):
            continue
        fields = raw.split()
        # words only: single-character entries are ignored
        if len(fields) >= 2 and len(fields[0]) > 1 and len(fields[pos]) > 1:
            words.add(fields[pos])
    return words
102 
103 
def tablesParser(path, name):
    """ Read file from scim-tables and parse it.

    Extracts ``scim-tables-<SCIM_TABLES_VER>/tables/zh/<name>`` from the
    tarball at *path* and returns the set of words found in field 1 of
    the lines between the BEGIN_TABLE and END_TABLE markers.
    """
    src = 'scim-tables-%s/tables/zh/%s' % (SCIM_TABLES_VER, name)
    fp = untargz(path, src, 'U8')
    try:
        return parserCore(fp, 1, 'BEGIN_TABLE', 'END_TABLE')
    finally:
        # Release the extracted file once parsing is done (or fails).
        fp.close()
110 
def ezbigParser(path):
    """Parse EZ-Big.txt.in from the scim-tables tarball at *path*."""
    return tablesParser(path, 'EZ-Big.txt.in')


def wubiParser(path):
    """Parse Wubi.txt.in from the scim-tables tarball at *path*."""
    return tablesParser(path, 'Wubi.txt.in')


def zrmParser(path):
    """Parse Ziranma.txt.in from the scim-tables tarball at *path*."""
    return tablesParser(path, 'Ziranma.txt.in')
114 
115 
def phraseParser(path):
    """ Read phrase_lib.txt and parse it.

    Extracts ``scim-pinyin-<SCIM_PINYIN_VER>/data/phrase_lib.txt`` from
    the tarball at *path* and returns the set of multi-character words
    found in the first field of its lines.
    """
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    fp = untargz(path, src, 'U8')
    try:
        return parserCore(fp, 0)
    finally:
        # Release the extracted file once parsing is done (or fails).
        fp.close()
122 
123 
def tsiParser(path):
    """ Read tsi.src and parse it.

    Extracts ``libtabe/tsi-src/tsi.src`` from the tarball at *path*,
    decodes it as big5hkscs and returns the set of multi-character words
    found in the first field of its lines.
    """
    src = 'libtabe/tsi-src/tsi.src'
    fp = untargz(path, src, 'big5hkscs')
    try:
        return parserCore(fp, 0)
    finally:
        # Release the extracted file once parsing is done (or fails).
        fp.close()
129 
130 
def unihanParser(path):
    """ Read Unihan_Variants.txt and parse it.

    Args:
        path: Path of the Unihan zip archive.

    Returns:
        A ``(t2s, s2t)`` tuple of dicts.  Each maps a character to the
        list of characters on the same line: ``t2s`` is filled from
        kSimplifiedVariant lines, ``s2t`` from kTraditionalVariant lines.
    """
    t2s = dict()
    s2t = dict()
    # Which table each variant kind feeds; every other kind is ignored.
    tables = {
        'kTraditionalVariant': s2t,
        'kSimplifiedVariant': t2s,
    }
    fp = unzip(path, 'Unihan_Variants.txt', 'U8')
    try:
        for line in fp:
            if line.startswith('#'):
                continue
            elems = line.split()
            if len(elems) < 3:
                continue
            table = tables.get(elems.pop(1))
            if table is None:
                # Not a variant kind we use; skip decoding its code points.
                continue
            chars = unichr2(*elems)
            table[chars[0]] = chars[1:]
    finally:
        # Close the file even when a malformed line makes parsing raise.
        fp.close()
    return (t2s, s2t)
151 
152 
def applyExcludes(mlist, path):
    """ Apply exclude rules from path to mlist.

    The file at *path* holds whitespace-separated patterns; anything from
    '#' to the end of a line is a comment.  Every word in *mlist* that
    contains a match for any pattern is removed.

    Args:
        mlist: Set of words; modified in place.
        path: UTF-8 file with the exclude patterns (used as regular
            expression alternatives, not escaped).

    Returns:
        The same *mlist* object.
    """
    # io.open decodes identically on Python 2.6+ and 3.
    import io

    excludes = []
    with io.open(path, 'r', encoding='U8') as fp:
        for line in fp:
            # Strip the comment before tokenizing so comment words never
            # become patterns and no empty alternative is produced.
            excludes.extend(line.split('#')[0].split())
    if not excludes:
        # An empty alternation would match (and remove) every word.
        return mlist
    excptn = re.compile('.*(?:%s).*' % '|'.join(excludes))
    diff = [mword for mword in mlist if excptn.search(mword)]
    mlist.difference_update(diff)
    return mlist
165 
166 
def charManualTable(path):
    """Yield character mappings from a manual table file.

    Each line is cut at the first '#', split on '|', and every piece is
    converted to a character with unichr3().  Lines that yield at least
    two characters produce ``(first, [rest...])``.

    Args:
        path: UTF-8 encoded table file.

    Yields:
        ``(char, list_of_chars)`` tuples.
    """
    # The file is closed when the generator is exhausted or discarded.
    with open(path, 'r', encoding='U8') as fp:
        for line in fp:
            elems = unichr3(*line.split('#')[0].split('|'))
            if len(elems) > 1:
                yield elems[0], elems[1:]
174 
175 
def toManyRules(src_table):
    """Collect every non-primary target of a one-to-many table.

    Args:
        src_table: Dict mapping a key to a sequence of targets.

    Returns:
        The set of all targets that appear at index 1 or later in any of
        the sequences (the first target of each entry is excluded).
    """
    tomany = set()
    # values() exists on both Python 2 and 3, so no version branch is needed.
    for targets in src_table.values():
        tomany.update(targets[1:])
    return tomany
187 
188 
def removeRules(path, table):
    """Remove the mappings listed in a "noconvert" file from *table*.

    Each line has the form ``from => to`` or just ``from``; surrounding
    double and single quotes are stripped.  Without ``=>`` the single
    value is used as both ``from`` and ``to``.  The key ``from`` is
    removed from *table*, and afterwards every entry whose value matches
    one of the collected ``to`` values exactly is removed as well.

    Args:
        path: UTF-8 encoded rules file.
        table: Dict of one-to-one mappings; modified in place.

    Returns:
        The same *table* object.
    """
    texc = list()
    with open(path, 'r', encoding='U8') as fp:
        for line in fp:
            elems = line.split('=>')
            f = t = elems[0].strip()
            if len(elems) == 2:
                t = elems[1].strip()
            f = f.strip('"').strip("'")
            t = t.strip('"').strip("'")
            if f:
                # A missing key is fine; nothing else is suppressed.
                table.pop(f, None)
            if t:
                texc.append(t)
    texcptn = re.compile('^(?:%s)$' % '|'.join(texc))
    # Iterate over a snapshot because entries are removed while scanning.
    for (tmp_f, tmp_t) in list(table.items()):
        if texcptn.match(tmp_t):
            table.pop(tmp_f)
    return table
216 
217 
def customRules(path):
    """Read a tab-separated rules file into a dict.

    Line endings are stripped, anything from the first '#' onwards is
    dropped (together with trailing whitespace), and the rest is split on
    tabs.  Lines with at least two fields map field 0 to field 1; further
    fields are ignored.

    Args:
        path: UTF-8 encoded rules file.

    Returns:
        Dict built from the file; later lines override earlier ones.
    """
    ret = dict()
    with open(path, 'r', encoding='U8') as fp:
        for line in fp:
            line = line.rstrip('\r\n')
            if '#' in line:
                line = line.split('#')[0].rstrip()
            elems = line.split('\t')
            if len(elems) > 1:
                ret[elems[0]] = elems[1]
    return ret
229 
230 
def dictToSortedList(src_table, pos):
    """Return the items of *src_table* as a sorted list of pairs.

    Args:
        src_table: Dict to flatten.
        pos: 0 to order by key then value, 1 to order by value then key.

    Returns:
        List of ``(key, value)`` tuples in that order.
    """
    other = 1 - pos

    def sort_key(pair):
        return (pair[pos], pair[other])

    pairs = list(src_table.items())
    pairs.sort(key=sort_key)
    return pairs
233 
234 
def translate(text, conv_table):
    """Rewrite *text* using greedy longest-match lookups in *conv_table*.

    At each position the longest substring starting there that has a
    truthy entry in *conv_table* is replaced by that entry, and scanning
    resumes after the inserted replacement.  Positions without a match
    are left unchanged.

    Args:
        text: String to convert.
        conv_table: Dict mapping substrings to replacements.

    Returns:
        The converted string.
    """
    cursor = 0
    while cursor < len(text):
        tail = text[cursor:]
        # Try the longest candidate first, shrinking one character a time.
        for span in range(len(tail), 0, -1):
            replacement = conv_table.get(tail[:span])
            if replacement:
                text = text[:cursor] + replacement + tail[span:]
                # Land on the last character of the replacement; the
                # increment below then steps past it.
                cursor += len(replacement) - 1
                break
        cursor += 1
    return text
247 
248 
def manualWordsTable(path, conv_table, reconv_table):
    """Build a word-level conversion table from a manual phrase list.

    Every line of the file contributes one word (text before the first
    '#', stripped).  Words are processed shortest first.  For each word,
    its translation through *conv_table* is mapped back to the word; if
    translating the word through the accumulated reverse table would
    alter it, the word is additionally mapped to itself.

    Args:
        path: UTF-8 encoded phrase file.
        conv_table: Table used to derive the converted form of each word.
        reconv_table: Reverse table; it is copied, not modified.

    Returns:
        Dict of the mappings created here.
    """
    # NOTE(review): block nesting was reconstructed from the statement
    # order (the final assignment is taken to run for every word, outside
    # the "if") -- confirm against the upstream file.
    with open(path, 'r', encoding='U8') as fp:
        wordlist = list(set(line.split('#')[0].strip() for line in fp))
    # Sorted longest first so that pop() hands out the shortest word first.
    wordlist.sort(key=lambda w: (len(w), w), reverse=True)
    reconv_table = reconv_table.copy()
    out_table = {}
    while wordlist:
        word = wordlist.pop()
        new_word = translate(word, conv_table)
        rcv_word = translate(word, reconv_table)
        if word != rcv_word:
            reconv_table[word] = out_table[word] = word
        reconv_table[new_word] = out_table[new_word] = word
    return out_table
264 
265 
# defaultWordsTable: derive word-level reverse-conversion rules from a word list.
#   src_wordlist      -- iterable of words to examine
#   src_tomany        -- strings joined with '|' into a search pattern
#   char_conv_table   -- dict used to convert each word (copied, then
#                        extended with the word rules found so far)
#   char_reconv_table -- dict used to convert back (copied, extended likewise)
# Returns word_reconv_table, which maps each converted word to its source word.
# NOTE(review): the listing has lost its indentation; the comments below
# assume the two while loops are nested and the try/except sits inside the
# inner loop -- confirm against the upstream file.
266 def defaultWordsTable(src_wordlist, src_tomany, char_conv_table,
267  char_reconv_table):
268  wordlist = list(src_wordlist)
# Sorted descending by (length, word), so pop() yields the shortest word first.
269  wordlist.sort(key=lambda w: (len(w), w), reverse=True)
270  word_conv_table = {}
271  word_reconv_table = {}
272  conv_table = char_conv_table.copy()
273  reconv_table = char_reconv_table.copy()
# NOTE(review): with an empty src_tomany this pattern is '(?:)', which
# matches every word.
274  tomanyptn = re.compile('(?:%s)' % '|'.join(src_tomany))
275  while wordlist:
# Word rules found so far are merged in only here, i.e. once per run of
# equal-length words rather than after every word.
276  conv_table.update(word_conv_table)
277  reconv_table.update(word_reconv_table)
278  word = wordlist.pop()
279  new_word_len = word_len = len(word)
# Inner loop: handle consecutive words of the same length.
280  while new_word_len == word_len:
281  test_word = translate(word, reconv_table)
282  new_word = translate(word, conv_table)
# Record a rule when new_word has no reverse entry yet and either the
# reverse table would alter word, or word matches tomanyptn and does not
# survive the convert-then-reconvert round trip.
283  if not reconv_table.get(new_word) and \
284  (test_word != word or
285  (tomanyptn.search(word) and
286  word != translate(new_word, reconv_table))):
287  word_conv_table[word] = new_word
288  word_reconv_table[new_word] = word
289  try:
290  word = wordlist.pop()
291  except IndexError:
292  break
# NOTE(review): when this length differs, the inner loop ends and the outer
# loop pops again, so the word popped just above appears never to be
# examined -- confirm whether that is intended.
293  new_word_len = len(word)
294  return word_reconv_table
295 
296 
def PHPArray(table):
    """Render ``(from, to)`` pairs as the body of a PHP array literal.

    Args:
        table: Iterable of ``(from, to)`` string pairs.  Pairs in which
            either side is empty are skipped.

    Returns:
        One ``'from' => 'to',`` line per pair, joined with newlines (an
        empty string when nothing is emitted).
    """
    def quote(text):
        # Inside PHP single quotes only backslash and the quote itself are
        # special; escape the backslash first so the escapes added for
        # quotes are not doubled.
        return text.replace('\\', '\\\\').replace('\'', '\\\'')

    lines = ['\'%s\' => \'%s\',' % (quote(f), quote(t))
             for (f, t) in table if f and t]
    return '\n'.join(lines)
300 
301 
def main():
    """Download the source data, build the tables and write ZhConversion.php.

    Steps, in order: download the Unihan, scim-tables, scim-pinyin and
    libtabe archives (skipped for files already present); build the
    character tables from Unihan plus the ``*.manual`` files in the
    current directory; build the word tables from the manual phrase lists
    and the input-method word lists; write the five PHP arrays to
    ``../../../languages/data/ZhConversion.php``; finally delete the files
    extracted into the current directory.
    """
    # Get Unihan.zip:
    url = 'http://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER
    han_dest = 'Unihan-%s.zip' % UNIHAN_VER
    download(url, han_dest)

    sfurlbase = 'http://%s.dl.sourceforge.net/sourceforge/' % SF_MIRROR

    # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
    url = sfurlbase + 'scim/scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    download(url, tbe_dest)

    # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
    url = sfurlbase + 'scim/scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    download(url, pyn_dest)

    # Get libtabe-$(LIBTABE_VER).tgz:
    url = sfurlbase + 'libtabe/libtabe-%s.tgz' % LIBTABE_VER
    lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
    download(url, lbt_dest)

    # Unihan.txt
    (t2s_1tomany, s2t_1tomany) = unihanParser(han_dest)

    t2s_1tomany.update(charManualTable('symme_supp.manual'))
    t2s_1tomany.update(charManualTable('trad2simp.manual'))
    s2t_1tomany.update((t[0], [f]) for (f, t) in charManualTable('symme_supp.manual'))
    s2t_1tomany.update(charManualTable('simp2trad.manual'))

    # items() works on both Python 2 and 3, so one code path is enough.
    # The first target of every one-to-many entry becomes the 1:1 rule.
    t2s_1to1 = dict((f, t[0]) for (f, t) in t2s_1tomany.items())
    s2t_1to1 = dict((f, t[0]) for (f, t) in s2t_1tomany.items())

    s_tomany = toManyRules(t2s_1tomany)
    t_tomany = toManyRules(s2t_1tomany)

    # noconvert rules
    t2s_1to1 = removeRules('trad2simp_noconvert.manual', t2s_1to1)
    s2t_1to1 = removeRules('simp2trad_noconvert.manual', s2t_1to1)

    # the supper set for word to word conversion
    t2s_1to1_supp = t2s_1to1.copy()
    s2t_1to1_supp = s2t_1to1.copy()
    t2s_1to1_supp.update(customRules('trad2simp_supp_set.manual'))
    s2t_1to1_supp.update(customRules('simp2trad_supp_set.manual'))

    # word to word manual rules
    t2s_word2word_manual = manualWordsTable('simpphrases.manual',
                                            s2t_1to1_supp, t2s_1to1_supp)
    t2s_word2word_manual.update(customRules('toSimp.manual'))
    s2t_word2word_manual = manualWordsTable('tradphrases.manual',
                                            t2s_1to1_supp, s2t_1to1_supp)
    s2t_word2word_manual.update(customRules('toTrad.manual'))

    # word to word rules from input methods
    t_wordlist = set()
    s_wordlist = set()
    t_wordlist.update(ezbigParser(tbe_dest),
                      tsiParser(lbt_dest))
    s_wordlist.update(wubiParser(tbe_dest),
                      zrmParser(tbe_dest),
                      phraseParser(pyn_dest))

    # exclude
    s_wordlist = applyExcludes(s_wordlist, 'simpphrases_exclude.manual')
    t_wordlist = applyExcludes(t_wordlist, 'tradphrases_exclude.manual')

    s2t_supp = s2t_1to1_supp.copy()
    s2t_supp.update(s2t_word2word_manual)
    t2s_supp = t2s_1to1_supp.copy()
    t2s_supp.update(t2s_word2word_manual)

    # parse list to dict
    t2s_word2word = defaultWordsTable(s_wordlist, s_tomany,
                                      s2t_1to1_supp, t2s_supp)
    t2s_word2word.update(t2s_word2word_manual)
    s2t_word2word = defaultWordsTable(t_wordlist, t_tomany,
                                      t2s_1to1_supp, s2t_supp)
    s2t_word2word.update(s2t_word2word_manual)

    # Final tables
    # sorted list toHans (identity mappings are dropped)
    t2s_1to1 = dict((f, t) for (f, t) in t2s_1to1.items() if f != t)
    toHans = dictToSortedList(t2s_1to1, 0) + dictToSortedList(t2s_word2word, 1)
    # sorted list toHant (identity mappings are dropped)
    s2t_1to1 = dict((f, t) for (f, t) in s2t_1to1.items() if f != t)
    toHant = dictToSortedList(s2t_1to1, 0) + dictToSortedList(s2t_word2word, 1)
    # sorted list toCN
    toCN = dictToSortedList(customRules('toCN.manual'), 1)
    # sorted list toHK
    toHK = dictToSortedList(customRules('toHK.manual'), 1)
    # sorted list toTW
    toTW = dictToSortedList(customRules('toTW.manual'), 1)

    # Get PHP Array
    # The namespace backslashes are doubled: "\L" and "\D" are not valid
    # Python escape sequences and trigger a SyntaxWarning on Python 3.12+.
    # The text written to the PHP file is unchanged.
    php = '''<?php
/**
 * Simplified / Traditional Chinese conversion tables
 *
 * Automatically generated using code and data in maintenance/language/zhtable/
 * Do not modify directly!
 *
 * @file
 */

namespace MediaWiki\\Languages\\Data;

class ZhConversion {
public static $zh2Hant = [\n'''
    php += PHPArray(toHant) \
        + '\n];\n\npublic static $zh2Hans = [\n' \
        + PHPArray(toHans) \
        + '\n];\n\npublic static $zh2TW = [\n' \
        + PHPArray(toTW) \
        + '\n];\n\npublic static $zh2HK = [\n' \
        + PHPArray(toHK) \
        + '\n];\n\npublic static $zh2CN = [\n' \
        + PHPArray(toCN) \
        + '\n];\n}\n'

    out_path = os.path.join('..', '..', '..', 'languages', 'data', 'ZhConversion.php')
    if pyversion[:1] in ['2']:
        f = open(out_path, 'wb', encoding='utf8')
    else:
        f = open(out_path, 'w', buffering=4096, encoding='utf8')
    print ('Writing ZhConversion.php ... ')
    try:
        f.write(php)
    finally:
        # Close the output even if the write fails.
        f.close()

    # Remove temporary files
    print ('Deleting temporary files ... ')
    for leftover in ('EZ-Big.txt.in', 'phrase_lib.txt', 'tsi.src',
                     'Unihan_Variants.txt', 'Wubi.txt.in', 'Ziranma.txt.in'):
        os.remove(leftover)


if __name__ == '__main__':
    main()
deferred.txt: A few of the database updates required by various functions here can be deferred until after the result page is displayed to the user. For example, updating the view counts, updating the linked-to tables after a save, etc. PHP does not yet have any way to tell the server to actually return and disconnect while still running these updates, but it might have such a feature in the future. We handle these by creating a deferred-update object and putting those objects on a global list.
Definition: deferred.txt:11
def parserCore
Definition: Makefile.py:83
def download
Definition: Makefile.py:47
tuple wubiParser
Definition: Makefile.py:112
def defaultWordsTable
Definition: Makefile.py:267
def uncompress
Definition: Makefile.py:63
def charManualTable
Definition: Makefile.py:167
def dictToSortedList
Definition: Makefile.py:231
def phraseParser
Definition: Makefile.py:116
def customRules
Definition: Makefile.py:218
def PHPArray
Definition: Makefile.py:297
string untargz
Definition: Makefile.py:79
it s the revision text itself In either if gzip is set
Definition: hooks.txt:2588
tuple zrmParser
Definition: Makefile.py:113
def tsiParser
Definition: Makefile.py:124
def unichr2
Definition: Makefile.py:31
def tablesParser
Definition: Makefile.py:104
def applyExcludes
Definition: Makefile.py:153
tuple ezbigParser
Definition: Makefile.py:111
def manualWordsTable
Definition: Makefile.py:249
def removeRules
Definition: Makefile.py:189
def main
Definition: Makefile.py:302
def translate
Definition: Makefile.py:235
def toManyRules
Definition: Makefile.py:176
def unichr3
Definition: Makefile.py:35
string unzip
Definition: Makefile.py:76
def unihanParser
Definition: Makefile.py:131