nltk.corpus.reader.bnc

1 # Natural Language Toolkit: Plaintext Corpus Reader 2 # 3 # Copyright (C) 2008 NLTK Project 4 # Author: Edward Loper <[email protected]> 5 # URL: <http://nltk.org> 6 # For license information, see LICENSE.TXT 7 8 """ 9 Corpus reader for the XML version of the British National Corpus. 10 """ 11 __docformat__ = 'epytext en' 12 13 from nltk.corpus.reader.xmldocs import XMLCorpusReader 14 import nltk.etree.ElementTree as ET 15 from nltk.corpus.reader.api import * 16 from nltk.corpus.reader.util import * 17 from nltk.corpus.reader.xmldocs import * 18 import re 19

class BNCCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the XML version of the British National Corpus.
    For access to the complete XML data structure, use the L{xml()}
    method.  For access to simple word lists and tagged word lists, use
    L{words()}, L{sents()}, L{tagged_words()}, and L{tagged_sents()}.

    If the reader was created with C{lazy=True} (the default), those
    four methods return a concatenation of L{BNCWordView} objects, one
    per file.  Otherwise every file is parsed completely by
    L{_words()} and the per-file lists are concatenated.
    """

    def __init__(self, root, files, lazy=True):
        """
        @param root: Forwarded unchanged to C{XMLCorpusReader.__init__}.
        @param files: Forwarded unchanged to C{XMLCorpusReader.__init__}.
        @param lazy: If true, the view methods build L{BNCWordView}
            objects; if false, they parse each file eagerly with
            L{_words()}.
        """
        XMLCorpusReader.__init__(self, root, files)
        self._lazy = lazy

    def words(self, files=None, strip_space=True, stem=False):
        """
        @return: the given file or files as a list of words
            and punctuation symbols.
        @rtype: C{list} of C{str}

        @param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        @param stem: If true, then use word stems instead of word strings.
        """
        return self._views(files, False, None, strip_space, stem)

    def tagged_words(self, files=None, c5=False, strip_space=True, stem=False):
        """
        @return: the given file or files as a list of tagged
            words and punctuation symbols, encoded as tuples
            C{(word,tag)}.
        @rtype: C{list} of C{(str,str)}

        @param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        @param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        @param stem: If true, then use word stems instead of word strings.
        """
        if c5:
            tag = 'c5'
        else:
            tag = 'pos'
        return self._views(files, False, tag, strip_space, stem)

    def sents(self, files=None, strip_space=True, stem=False):
        """
        @return: the given file or files as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        @rtype: C{list} of (C{list} of C{str})

        @param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        @param stem: If true, then use word stems instead of word strings.
        """
        return self._views(files, True, None, strip_space, stem)

    def tagged_sents(self, files=None, c5=False, strip_space=True,
                     stem=False):
        """
        @return: the given file or files as a list of
            sentences, each encoded as a list of C{(word,tag)} tuples.
        @rtype: C{list} of (C{list} of C{(str,str)})

        @param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        @param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        @param stem: If true, then use word stems instead of word strings.
        """
        if c5:
            tag = 'c5'
        else:
            tag = 'pos'
        return self._views(files, True, tag, strip_space, stem)

    def _views(self, files, bracket_sent, tag, strip_space, stem):
        """
        Shared implementation of the four public view methods: build one
        view (lazy) or one list (eager) per file and concatenate them.

        @param files: The file or files to read, as accepted by
            C{self.abspaths()}.
        @param bracket_sent: If true, include sentence bracketing.
        @param tag: The name of the tagset to use, or None for no tags.
        @param strip_space: If true, strip spaces from word tokens.
        @param stem: If true, then substitute stems for words.
        """
        # Both callables take the same five positional arguments, so the
        # lazy/eager choice only has to be made once.
        if self._lazy:
            build = BNCWordView
        else:
            build = self._words
        return concat([build(filename, bracket_sent, tag, strip_space, stem)
                       for filename in self.abspaths(files)])

    def _words(self, filename, bracket_sent, tag, strip_space, stem):
        """
        Helper used to implement the view methods -- returns a list of
        words or a list of sentences, optionally tagged.

        @param filename: The name of the underlying file.
        @param bracket_sent: If true, include sentence bracketing.
        @param tag: The name of the tagset to use, or None for no tags.
        @param strip_space: If true, strip spaces from word tokens.
        @param stem: If true, then substitute stems for words.
        @raise KeyError: If a sentence element has no C{n} attribute and
            C{bracket_sent} is true.
        """
        result = []

        # Use the alias this module imports explicitly rather than a
        # bare ``ElementTree`` name that only a star import could supply.
        xmldoc = ET.parse(filename).getroot()
        for xmlsent in xmldoc.findall('.//s'):
            sent = []
            for xmlword in _all_xmlwords_in(xmlsent):
                # An empty <w/> or <c/> element has text None; treat it
                # as an empty token so that strip() cannot fail and no
                # None ever reaches the caller.
                word = xmlword.text or ''
                if strip_space or stem:
                    word = word.strip()
                if stem:
                    word = xmlword.get('hw', word)
                if tag == 'c5':
                    word = (word, xmlword.get('c5'))
                elif tag == 'pos':
                    # Fall back to the c5 tag when no simplified tag
                    # is present on the element.
                    word = (word, xmlword.get('pos', xmlword.get('c5')))
                sent.append(word)
            if bracket_sent:
                result.append(BNCSentence(xmlsent.attrib['n'], sent))
            else:
                result.extend(sent)

        # Every token is now a string or a tuple by construction, so the
        # former "assert None not in result" scan is no longer needed.
        return result

144

145 -def _all_xmlwords_in(elt, result=None):

146 if result is None: result = [] 147 for child in elt: 148 if child.tag in ('c', 'w'): result.append(child) 149 else: _all_xmlwords_in(child, result) 150 return result

151

class BNCSentence(list):
    """
    A sentence represented as a plain list of its words, carrying one
    extra attribute, C{num}, which stores the sentence identifier (the
    C{n} attribute of the sentence element in the XML).
    """

    def __init__(self, num, items):
        """
        @param num: The sentence identifier to store as C{self.num}.
        @param items: The words used to populate the list.
        """
        self.num = num
        super(BNCSentence, self).__init__(items)

160

class BNCWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the BNC corpus.

    On construction the view reads the file's C{teiHeader} element once
    and stores what it finds in the C{title}, C{author}, C{editor} and
    C{resps} attributes; each stays None when the header has no such
    element.
    """

    title = None   #: Title of the document.
    author = None  #: Author of the document.
    editor = None  #: Editor
    resps = None   #: Statement of responsibility

    def __init__(self, filename, sent, tag, strip_space, stem):
        """
        @param filename: The name of the underlying file.
        @param sent: If true, include sentence bracketing.
        @param tag: The name of the tagset to use, or None for no tags.
        @param strip_space: If true, strip spaces from word tokens.
        @param stem: If true, then substitute stems for words.
        """
        if sent:
            tagspec = '.*/s'
        else:
            tagspec = '.*/s/(.*/)?(c|w)'
        self._sent = sent
        self._tag = tag
        self._strip_space = strip_space
        self._stem = stem

        XMLCorpusView.__init__(self, filename, tagspec)

        # Read in a tasty header.  The stream is closed even if parsing
        # the header fails, so a bad file does not leak an open handle.
        self._open()
        try:
            self.read_block(self._stream, '.*/teiHeader$', self.handle_header)
        finally:
            self.close()

        # Reset tag context.
        self._tag_context = {0: ()}

    def handle_header(self, elt, context):
        """
        Record document metadata from a C{teiHeader} element.

        @param elt: The header element.
        @param context: Unused; accepted because this method is passed
            to C{read_block()} as the element handler.
        """
        def stripped(node):
            # Elements with no text have text None; treat it as ''.
            return (node.text or '').strip()

        # Set up some metadata!
        titles = elt.findall('titleStmt/title')
        if titles:
            self.title = '\n'.join([stripped(t) for t in titles])

        authors = elt.findall('titleStmt/author')
        if authors:
            self.author = '\n'.join([stripped(a) for a in authors])

        editors = elt.findall('titleStmt/editor')
        if editors:
            self.editor = '\n'.join([stripped(e) for e in editors])

        # Each respStmt contributes one paragraph made of the text of
        # its child elements, one per line.
        resps = elt.findall('titleStmt/respStmt')
        if resps:
            self.resps = '\n\n'.join(
                ['\n'.join([stripped(part) for part in resp])
                 for resp in resps])

    def handle_elt(self, elt, context):
        """
        Convert one matched element into a sentence or a word token,
        depending on whether sentence bracketing was requested.

        @param elt: The matched element.
        @param context: Unused.
        """
        if self._sent:
            return self.handle_sent(elt)
        else:
            return self.handle_word(elt)

    def handle_word(self, elt):
        """
        Convert a C{<w>} or C{<c>} element into a token.

        @param elt: The word or punctuation element.
        @return: The token string, or a C{(token, tag)} tuple when a
            tagset was requested.
        """
        # An empty element has text None; use '' so strip() cannot fail
        # and callers never receive None as a token.
        word = elt.text or ''
        if self._strip_space or self._stem:
            word = word.strip()
        if self._stem:
            word = elt.get('hw', word)
        if self._tag == 'c5':
            word = (word, elt.get('c5'))
        elif self._tag == 'pos':
            # Fall back to the c5 tag when no simplified tag is present.
            word = (word, elt.get('pos', elt.get('c5')))
        return word

    def handle_sent(self, elt):
        """
        Convert a sentence element into a L{BNCSentence}.

        Tokens are gathered with C{_all_xmlwords_in()}, the same helper
        the eager reader uses, so C{<w>}/C{<c>} elements nested inside
        other markup are included and elements that hold no tokens are
        skipped.  Lazy and eager reads therefore give the same tokens;
        no C{ValueError} is raised for unfamiliar child elements.

        @param elt: The sentence element.
        @raise KeyError: If the element has no C{n} attribute.
        """
        sent = [self.handle_word(xmlword)
                for xmlword in _all_xmlwords_in(elt)]
        return BNCSentence(elt.attrib['n'], sent)

241

Source Code for Module nltk.corpus.reader.bnc