1
2
3
4
5
6
7
8 """
9 Corpus reader for the XML version of the British National Corpus.
10 """
11 __docformat__ = 'epytext en'
12
13 from nltk.corpus.reader.xmldocs import XMLCorpusReader
14 import nltk.etree.ElementTree as ET
15 from nltk.corpus.reader.api import *
16 from nltk.corpus.reader.util import *
17 from nltk.corpus.reader.xmldocs import *
18 import re
19
21 """
22 Corpus reader for the XML version of the British National Corpus.
23 For access to the complete XML data structure, use the L{xml()}
24 method. For access to simple word lists and tagged word lists, use
25 L{words()}, L{sents()}, L{tagged_words()}, and L{tagged_sents()}.
26 """
27 - def __init__(self, root, files, lazy=True):
30
def words(self, files=None, strip_space=True, stem=False):
    """
    Return the selected file or files as one flat list of word and
    punctuation tokens (no tags, no sentence bracketing).

    When the reader is lazy, each file is wrapped in a L{BNCWordView};
    otherwise each file is read eagerly through L{_words()}.  The
    per-file results are joined with C{concat}.

    @return: the given file or files as a list of words
        and punctuation symbols.
    @rtype: C{list} of C{str}

    @param files: Passed unchanged to C{self.abspaths()} to select
        the files to read.
    @param strip_space: If true, then strip trailing spaces from
        word tokens.  Otherwise, leave the spaces on the tokens.
    @param stem: If true, then use word stems instead of word strings.
    """
    # Pick the per-file builder once; both accept the same arguments.
    build = BNCWordView if self._lazy else self._words
    return concat([build(path, False, None, strip_space, stem)
                   for path in self.abspaths(files)])
49
def tagged_words(self, files=None, c5=False, strip_space=True, stem=False):
    """
    Return the selected file or files as one flat list of
    C{(word, tag)} tuples (no sentence bracketing).

    When the reader is lazy, each file is wrapped in a L{BNCWordView};
    otherwise each file is read eagerly through L{_words()}.  The
    per-file results are joined with C{concat}.

    @return: the given file or files as a list of tagged
        words and punctuation symbols, encoded as tuples
        C{(word,tag)}.
    @rtype: C{list} of C{(str,str)}

    @param files: Passed unchanged to C{self.abspaths()} to select
        the files to read.
    @param c5: If true, then the tags used will be the more detailed
        c5 tags.  Otherwise, the simplified tags will be used.
    @param strip_space: If true, then strip trailing spaces from
        word tokens.  Otherwise, leave the spaces on the tokens.
    @param stem: If true, then use word stems instead of word strings.
    """
    # Name of the tagset handed to the per-file builder.
    tagset = 'c5' if c5 else 'pos'
    build = BNCWordView if self._lazy else self._words
    return concat([build(path, False, tagset, strip_space, stem)
                   for path in self.abspaths(files)])
71
def sents(self, files=None, strip_space=True, stem=False):
    """
    Return the selected file or files as a list of sentences, each
    sentence being a list of untagged word strings.

    When the reader is lazy, each file is wrapped in a L{BNCWordView};
    otherwise each file is read eagerly through L{_words()}.  The
    per-file results are joined with C{concat}.

    @return: the given file or files as a list of
        sentences or utterances, each encoded as a list of word
        strings.
    @rtype: C{list} of (C{list} of C{str})

    @param files: Passed unchanged to C{self.abspaths()} to select
        the files to read.
    @param strip_space: If true, then strip trailing spaces from
        word tokens.  Otherwise, leave the spaces on the tokens.
    @param stem: If true, then use word stems instead of word strings.
    """
    # Second positional argument True requests sentence bracketing.
    build = BNCWordView if self._lazy else self._words
    return concat([build(path, True, None, strip_space, stem)
                   for path in self.abspaths(files)])
89
def tagged_sents(self, files=None, c5=False, strip_space=True,
                 stem=False):
    """
    Return the selected file or files as a list of sentences, each
    sentence being a list of C{(word, tag)} tuples.

    When the reader is lazy, each file is wrapped in a L{BNCWordView};
    otherwise each file is read eagerly through L{_words()}.  The
    per-file results are joined with C{concat}.

    @return: the given file or files as a list of
        sentences, each encoded as a list of C{(word,tag)} tuples.
    @rtype: C{list} of (C{list} of C{(str,str)})

    @param files: Passed unchanged to C{self.abspaths()} to select
        the files to read.
    @param c5: If true, then the tags used will be the more detailed
        c5 tags.  Otherwise, the simplified tags will be used.
    @param strip_space: If true, then strip trailing spaces from
        word tokens.  Otherwise, leave the spaces on the tokens.
    @param stem: If true, then use word stems instead of word strings.
    """
    # Name of the tagset handed to the per-file builder.
    tagset = 'c5' if c5 else 'pos'
    build = BNCWordView if self._lazy else self._words
    return concat([build(path, True, tagset, strip_space, stem)
                   for path in self.abspaths(files)])
111
def _words(self, filename, bracket_sent, tag, strip_space, stem):
    """
    Helper used to implement the view methods -- parses the whole
    file and returns a list of words or a list of sentences,
    optionally tagged.

    Every C{s} element found anywhere under the document root is
    treated as one sentence; its word elements are supplied by
    C{_all_xmlwords_in()}.

    @param filename: The name of the underlying file.
    @param bracket_sent: If true, include sentence bracketing: each
        sentence becomes a L{BNCSentence} built from the C{s}
        element's C{n} attribute and its tokens.  Otherwise the
        tokens of all sentences are returned in one flat list.
    @param tag: The name of the tagset to use (C{'c5'} or C{'pos'}),
        or None for no tags.  With C{'pos'}, the C{c5} attribute is
        used as a fallback when an element has no C{pos} attribute.
    @param strip_space: If true, strip spaces from word tokens.
    @param stem: If true, then substitute stems (the C{hw}
        attribute, when present) for words.
    @return: A list of tokens, or a list of L{BNCSentence} objects
        when C{bracket_sent} is true.
    @raise KeyError: If C{bracket_sent} is true and an C{s} element
        has no C{n} attribute.
    """
    result = []

    xmldoc = ElementTree.parse(filename).getroot()
    for xmlsent in xmldoc.findall('.//s'):
        sent = []
        for xmlword in _all_xmlwords_in(xmlsent):
            # An element with no text content has text None; use the
            # empty string so strip() cannot fail and None never
            # ends up in the result.
            word = xmlword.text or ''
            if strip_space or stem:
                word = word.strip()
            if stem:
                word = xmlword.get('hw', word)
            if tag == 'c5':
                word = (word, xmlword.get('c5'))
            elif tag == 'pos':
                word = (word, xmlword.get('pos', xmlword.get('c5')))
            sent.append(word)
        if bracket_sent:
            result.append(BNCSentence(xmlsent.attrib['n'], sent))
        else:
            result.extend(sent)

    return result
144
151
153 """
154 A list of words, augmented by an attribute C{num} used to record
155 the sentence identifier (the C{n} attribute from the XML).
156 """
160
162 """
163 A stream backed corpus view specialized for use with the BNC corpus.
164 """
def __init__(self, filename, sent, tag, strip_space, stem):
    """
    Set up the view and make one pass over the file's C{teiHeader}
    element, handing it to C{self.handle_header}.

    @param filename: The name of the underlying file.
    @param sent: If true, include sentence bracketing.
    @param tag: The name of the tagset to use, or None for no tags.
    @param strip_space: If true, strip spaces from word tokens.
    @param stem: If true, then substitute stems for words.
    """
    # With sentence bracketing the view is built over C{s} elements;
    # otherwise over the C{c}/C{w} elements nested inside them.
    if sent:
        tagspec = '.*/s'
    else:
        tagspec = '.*/s/(.*/)?(c|w)'
    self._sent = sent
    self._tag = tag
    self._strip_space = strip_space
    self._stem = stem

    XMLCorpusView.__init__(self, filename, tagspec)

    # Read the header block.  The stream is closed even if reading
    # or the header handler raises, so the file is not left open.
    self._open()
    try:
        self.read_block(self._stream, '.*/teiHeader$', self.handle_header)
    finally:
        self.close()

    # Restore the tag context to its starting value after the header
    # pass.  NOTE(review): presumably this is the base view's
    # bookkeeping of tag context per stream position -- confirm
    # against XMLCorpusView.
    self._tag_context = {0: ()}
189
190
# Document header fields, all defaulting to None.
# NOTE(review): presumably filled in by handle_header, which __init__
# registers as the handler for the teiHeader block -- confirm.
title = author = editor = resps = None
195
214
218
230
241