nltk.corpus.reader.sinica_treebank

Source Code for Module nltk.corpus.reader.sinica_treebank

# Natural Language Toolkit: Sinica Treebank Reader
#
# Copyright (C) 2001-2008 NLTK Project
# Author: Steven Bird <[email protected]>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT

"""
Sinica Treebank Corpus Sample

http://rocling.iis.sinica.edu.tw/CKIP/engversion/treebank.htm

10,000 parsed sentences, drawn from the Academia Sinica Balanced
Corpus of Modern Chinese.  Parse tree notation is based on
Information-based Case Grammar.  Tagset documentation is available
at http://www.sinica.edu.tw/SinicaCorpus/modern_e_wordtype.html

Language and Knowledge Processing Group, Institute of Information
Science, Academia Sinica

It is distributed with the Natural Language Toolkit under the terms of
the Creative Commons Attribution-NonCommercial-ShareAlike License
[http://creativecommons.org/licenses/by-nc-sa/2.5/].

References:

Feng-Yi Chen, Pi-Fang Tsai, Keh-Jiann Chen, and Chu-Ren Huang (1999)
The Construction of Sinica Treebank. Computational Linguistics and
Chinese Language Processing, 4, pp 87-104.

Huang Chu-Ren, Feng-Yi Chen, Keh-Jiann Chen, Zhao-Ming Gao, and
Kuang-Yu Chen. 2000. Sinica Treebank: Design Criteria,
Annotation Guidelines, and On-line Interface. Proceedings of 2nd
Chinese Language Processing Workshop, Association for Computational
Linguistics.

Chen Keh-Jiann and Yu-Ming Hsieh (2004) Chinese Treebanks and Grammar
Extraction, Proceedings of IJCNLP-04, pp560-565.
"""

from util import *
from api import *
from nltk import tokenize, tree
import os
import re
from nltk.internals import deprecated

# A leading '#', a run of non-whitespace characters, and the single
# whitespace character that follows it.
IDENTIFIER = re.compile(r'^#\S+\s')

# A '#' that comes directly after a ')' together with everything up to
# the end of the line.
APPENDIX = re.compile(r'(?<=\))#.*$')

# ':<field>:<field>' where neither field contains ':', '(', ')' or '|'.
# Both fields are captured; the reader treats them as (tag, word).
TAGWORD = re.compile(r':([^:()|]+):([^:()|]+)')

# Same shape as TAGWORD, but only the second field (the word) is captured.
WORD = re.compile(r':[^:()|]+:([^:()|]+)')

class SinicaTreebankCorpusReader(SyntaxCorpusReader):
    """
    Corpus reader for the Sinica Treebank sample.

    A block is one line of the corpus file.  The methods below clean
    that line and turn it into a parse, a list of (word, tag) pairs,
    or a list of words.  They are presumably the hooks that
    SyntaxCorpusReader calls -- confirm against the base class.
    """

    def _read_block(self, stream):
        """
        Read a single line from ``stream`` and return it, cleaned, as
        a one-element list.

        The leading '#...' marker matched by IDENTIFIER is removed
        first, then the '#...' tail matched by APPENDIX.
        """
        line = stream.readline()
        return [APPENDIX.sub('', IDENTIFIER.sub('', line))]

    def _parse(self, sent):
        """
        Pass the sentence string to ``tree.sinica_parse`` and return
        its result unchanged.
        """
        parsed = tree.sinica_parse(sent)
        return parsed

    def _tag(self, sent, simplify_tags=None):
        """
        Return the list of (word, tag) pairs found in ``sent``.

        TAGWORD captures each match as (tag, word), so the two fields
        are swapped here.  When ``simplify_tags`` is true, every tag is
        replaced by ``self._tag_mapping_function(tag)``.
        """
        matches = TAGWORD.findall(sent)
        if not simplify_tags:
            return [(word, tag) for (tag, word) in matches]
        # NOTE(review): _tag_mapping_function is not defined in this
        # class; presumably it is provided by the base class -- confirm.
        return [(word, self._tag_mapping_function(tag))
                for (tag, word) in matches]

    def _word(self, sent):
        """
        Return the list of words in ``sent``, i.e. the single group
        captured by WORD for every match.
        """
        words = WORD.findall(sent)
        return words