Package nltk :: Package corpus :: Package reader :: Module sinica_treebank
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.sinica_treebank

 1  # Natural Language Toolkit: Sinica Treebank Reader 
 2  # 
 3  # Copyright (C) 2001-2008 NLTK Project 
 4  # Author: Steven Bird <[email protected]> 
 5  # URL: <http://nltk.org> 
 6  # For license information, see LICENSE.TXT 
 7   
 8  """ 
 9  Sinica Treebank Corpus Sample 
10   
11  http://rocling.iis.sinica.edu.tw/CKIP/engversion/treebank.htm 
12   
13  10,000 parsed sentences, drawn from the Academia Sinica Balanced 
14  Corpus of Modern Chinese.  Parse tree notation is based on 
15  Information-based Case Grammar.  Tagset documentation is available 
16  at http://www.sinica.edu.tw/SinicaCorpus/modern_e_wordtype.html 
17   
18  Language and Knowledge Processing Group, Institute of Information 
19  Science, Academia Sinica 
20   
21  It is distributed with the Natural Language Toolkit under the terms of 
22  the Creative Commons Attribution-NonCommercial-ShareAlike License 
23  [http://creativecommons.org/licenses/by-nc-sa/2.5/]. 
24   
25  References: 
26   
27  Feng-Yi Chen, Pi-Fang Tsai, Keh-Jiann Chen, and Chu-Ren Huang (1999) 
28  The Construction of Sinica Treebank. Computational Linguistics and 
29  Chinese Language Processing, 4, pp 87-104. 
30   
 31  Huang Chu-Ren, Feng-Yi Chen, Keh-Jiann Chen, Zhao-Ming 
32  Gao, and Kuang-Yu Chen. 2000. Sinica Treebank: Design Criteria, 
33  Annotation Guidelines, and On-line Interface. Proceedings of 2nd 
34  Chinese Language Processing Workshop, Association for Computational 
35  Linguistics. 
36   
37  Chen Keh-Jiann and Yu-Ming Hsieh (2004) Chinese Treebanks and Grammar 
38  Extraction, Proceedings of IJCNLP-04, pp560-565. 
39  """ 
40   
41  from util import * 
42  from api import * 
43  from nltk import tokenize, tree 
44  import os, re 
45  from nltk.internals import deprecated 
46   
 47  IDENTIFIER = re.compile(r'^#\S+\s')  # at string start: '#', a run of non-whitespace, then one whitespace char
 48  APPENDIX = re.compile(r'(?<=\))#.*$')  # a '#...' tail that directly follows a ')'
 49  TAGWORD = re.compile(r':([^:()|]+):([^:()|]+)')  # captures two ':'-prefixed fields; _tag reads them as (tag, word)
 50  WORD = re.compile(r':[^:()|]+:([^:()|]+)')  # same shape as TAGWORD, but captures only the second field
51   
class SinicaTreebankCorpusReader(SyntaxCorpusReader):
    """
    Corpus reader for the Sinica Treebank sample.

    Each block is a single line of the underlying stream.  The private
    hooks below turn such a line into a parse tree, a list of
    (word, tag) pairs, or a plain list of words.
    """

    def _read_block(self, stream):
        """
        Read one line from C{stream} and return it as a one-element list.

        Text matched by the module-level C{IDENTIFIER} pattern is removed
        first, then text matched by C{APPENDIX}.
        """
        line = stream.readline()
        return [APPENDIX.sub('', IDENTIFIER.sub('', line))]

    def _parse(self, sent):
        """
        Return the result of C{tree.sinica_parse} applied to C{sent}.
        """
        return tree.sinica_parse(sent)

    def _tag(self, sent, simplify_tags=None):
        """
        Return the (word, tag) pairs found in C{sent}.

        C{TAGWORD} captures each pair in (tag, word) order, so the two
        fields are swapped on the way out.  When C{simplify_tags} is
        true, every tag is passed through C{self._tag_mapping_function}.
        """
        pairs = TAGWORD.findall(sent)
        if not simplify_tags:
            return [(word, tag) for (tag, word) in pairs]
        return [(word, self._tag_mapping_function(tag))
                for (tag, word) in pairs]

    def _word(self, sent):
        """
        Return the list of words in C{sent}, i.e. the field captured by
        each C{WORD} match.
        """
        return WORD.findall(sent)
74