1
2
3
4
5
6
7
"""
Sinica Treebank Corpus Sample

http://rocling.iis.sinica.edu.tw/CKIP/engversion/treebank.htm

10,000 parsed sentences, drawn from the Academia Sinica Balanced
Corpus of Modern Chinese.  Parse tree notation is based on
Information-based Case Grammar.  Tagset documentation is available
at http://www.sinica.edu.tw/SinicaCorpus/modern_e_wordtype.html

Language and Knowledge Processing Group, Institute of Information
Science, Academia Sinica

It is distributed with the Natural Language Toolkit under the terms of
the Creative Commons Attribution-NonCommercial-ShareAlike License
[http://creativecommons.org/licenses/by-nc-sa/2.5/].

References:

Feng-Yi Chen, Pi-Fang Tsai, Keh-Jiann Chen, and Chu-Ren Huang (1999)
The Construction of Sinica Treebank.  Computational Linguistics and
Chinese Language Processing, 4, pp. 87-104.

Huang Chu-Ren, Feng-Yi Chen, Keh-Jiann Chen, Zhao-Ming Gao, and
Kuang-Yu Chen. 2000.  Sinica Treebank: Design Criteria, Annotation
Guidelines, and On-line Interface.  Proceedings of 2nd Chinese
Language Processing Workshop, Association for Computational
Linguistics.

Chen Keh-Jiann and Yu-Ming Hsieh (2004) Chinese Treebanks and Grammar
Extraction, Proceedings of IJCNLP-04, pp. 560-565.
"""
40
41 from util import *
42 from api import *
43 from nltk import tokenize, tree
44 import os, re
45 from nltk.internals import deprecated
46
# Patterns used to pick apart one line of the treebank file.  The regex
# text is unchanged; only the viewer line-number residue that made each
# statement unparseable has been removed.

# A '#'-prefixed token at the very start of the line, together with the
# single whitespace character that follows it.
IDENTIFIER = re.compile(r'^#\S+\s')

# A '#' and everything after it up to end of line, matched only when the
# '#' comes immediately after a closing parenthesis (look-behind).
APPENDIX = re.compile(r'(?<=\))#.*$')

# Two consecutive ':'-introduced fields, neither containing ':', '(', ')'
# or '|'.  Both fields are captured, so findall() yields 2-tuples.
TAGWORD = re.compile(r':([^:()|]+):([^:()|]+)')

# Same shape as TAGWORD, but only the second field is captured, so
# findall() yields plain strings.
WORD = re.compile(r':[^:()|]+:([^:()|]+)')
51
"""
Reader for the Sinica Treebank.
"""
61
64
def _tag(self, sent, simplify_tags=None):
    """
    Return the ``(word, tag)`` pairs found in one sentence string.

    The sentence is scanned with the module-level ``TAGWORD`` pattern,
    whose matches arrive as ``(tag, word)``; each match is swapped so
    the word comes first.

    NOTE(review): this takes ``self`` and reads
    ``self._tag_mapping_function``, so it is presumably a method of the
    reader class; that class header is not visible in this part of the
    file -- confirm the indentation level when splicing it back in.

    sent -- the sentence text to scan.
    simplify_tags -- when truthy, each tag is replaced by the result of
        ``self._tag_mapping_function(tag)``; otherwise tags are returned
        exactly as matched.

    Returns a list of ``(word, tag)`` tuples in match order.
    """
    pairs = [(word, tag) for (tag, word) in TAGWORD.findall(sent)]
    if not simplify_tags:
        return pairs
    # Look the mapper up per item, inside the comprehension, so an empty
    # sentence never touches the attribute at all.
    return [(word, self._tag_mapping_function(tag))
            for (word, tag) in pairs]
71
74