1
2
3
4
5
6
7
8
9
10 """
11 Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
12 English Prose (YCOE), a 1.5 million word syntactically-annotated
13 corpus of Old English prose texts. The corpus is distributed by the
14 Oxford Text Archive: http://www.ota.ahds.ac.uk/ It is not included
15 with NLTK.
16
17 The YCOE corpus is divided into 100 files, each representing
an Old English prose text. Tags used within each text comply
with the YCOE standard: http://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm
20 """
21
22 from nltk.corpus.reader.util import *
23 from nltk.corpus.reader.api import *
24 from nltk.tokenize import RegexpTokenizer
25 from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
26 from nltk.corpus.reader.tagged import TaggedCorpusReader
27 from string import split
28 import os, re
29 from nltk.internals import deprecated
30
32 """
33 Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
34 English Prose (YCOE), a 1.5 million word syntactically-annotated
35 corpus of Old English prose texts.
36 """
def __init__(self, root, encoding=None):
    """
    Build a reader over the YCOE corpus stored under ``root``.

    ``root`` is expected to hold a ``psd`` and a ``pos`` subdirectory.
    One sub-reader is created for each of them, and ``encoding`` is
    forwarded to both sub-readers and to the base reader.

    :raise ValueError: if the two subdirectories do not contain the
        same set of documents (file names with their last four
        characters removed).
    """
    psd_root = os.path.join(root, 'psd')
    self._psd_reader = YCOEParseCorpusReader(
        psd_root, '.*', '.psd', encoding=encoding)
    pos_root = os.path.join(root, 'pos')
    self._pos_reader = YCOETaggedCorpusReader(
        pos_root, '.*', '.pos', encoding=encoding)

    # Both subcorpora must describe exactly the same documents.
    psd_documents = set(name[:-4] for name in self._psd_reader.files())
    pos_documents = set(name[:-4] for name in self._pos_reader.files())
    if pos_documents != psd_documents:
        raise ValueError('Items in "psd" and "pos" '
                         'subdirectories do not match.')

    # Every document contributes one .psd and one .pos file.
    corpus_files = sorted('%s.%s' % (doc, ext)
                          for doc in psd_documents
                          for ext in ('psd', 'pos'))
    CorpusReader.__init__(self, root, corpus_files, encoding)
    self._documents = tuple(sorted(psd_documents))
53
def documents(self, files=None):
    """
    Return a list of document identifiers for all documents in
    this corpus, or for the documents with the given file(s) if
    specified.

    :param files: ``None`` for every document, a single file
        identifier, or an iterable of file identifiers.
    :return: ``self._documents`` when ``files`` is ``None``; otherwise
        a sorted list of the given identifiers with their last four
        characters (the ``.pos`` / ``.psd`` extension) removed and
        duplicates dropped.
    :raise KeyError: if a given file identifier is not in
        ``self._files``.
    """
    if files is None:
        return self._documents
    if isinstance(files, basestring):
        files = [files]
    else:
        # Materialise once: the identifiers are walked twice below, and
        # a one-shot iterable would be empty on the second pass.
        files = list(files)
    for f in files:
        if f not in self._files:
            # Name the offending identifier, not the whole argument.
            raise KeyError('File id %s not found' % (f,))
    # Strip the extension; .pos and .psd collapse to one document id.
    return sorted(set(f[:-4] for f in files))
69
def files(self, documents=None):
    """
    Return a list of file identifiers for the files that make up
    this corpus, or that store the given document(s) if specified.

    :param documents: ``None`` for every file, a single document
        identifier, or an iterable of document identifiers.
    :return: ``self._files`` when ``documents`` is ``None``; otherwise
        a sorted, duplicate-free list holding the ``.pos`` and the
        ``.psd`` file name of each given document. The identifiers
        are not checked against the corpus.
    """
    if documents is None:
        return self._files
    if isinstance(documents, basestring):
        documents = [documents]
    # Single pass, so that a one-shot iterable (e.g. a generator)
    # yields both the .pos and the .psd name of every document.
    names = set()
    for doc in documents:
        names.add('%s.pos' % doc)
        names.add('%s.psd' % doc)
    return sorted(names)
81
83 """
Helper that selects the appropriate files for a given set of
documents from a given subcorpus (pos or psd).
86 """
87 if documents is None:
88 documents = self._documents
89 else:
90 if isinstance(documents, basestring):
91 documents = [documents]
92 for document in documents:
93 if document not in self._documents:
94 if document[-4:] in ('.pos', '.psd'):
95 raise ValueError(
96 'Expected a document identifier, not a file '
97 'identifier. (Use corpus.documents() to get '
98 'a list of document identifiers.')
99 else:
100 raise ValueError('Document identifier %s not found'
101 % document)
102 return ['%s.%s' % (d, subcorpus) for d in documents]
103
104
105 - def words(self, documents=None):
107 - def sents(self, documents=None):
109 - def paras(self, documents=None):
119
120
121 @deprecated("Use .raw() or .words() or .tagged_words() or "
122 ".parsed_sents() instead.")
123 - def read(self, items=None, format='parsed'):
130 @deprecated("Use .parsed_sents() instead.")
131 - def parsed(self, items=None):
133 @deprecated("Use .words() instead.")
136 @deprecated("Use .tagged_words() instead.")
137 - def tagged(self, items=None):
139 @deprecated("Operation no longer supported.")
141 raise ValueError('format "chunked" no longer supported')
142
143
145 """Specialized version of the standard bracket parse corpus reader
146 that strips out (CODE ...) and (ID ...) nodes."""
151
153 - def __init__(self, root, items, encoding=None):
159
160
# Maps each YCOE document identifier to the title of the text it holds.
documents = {
    'coadrian.o34': 'Adrian and Ritheus',
    'coaelhom.o3': 'Ælfric, Supplemental Homilies',
    'coaelive.o3': 'Ælfric\'s Lives of Saints',
    'coalcuin': 'Alcuin De virtutibus et vitiis',
    'coalex.o23': 'Alexander\'s Letter to Aristotle',
    'coapollo.o3': 'Apollonius of Tyre',
    'coaugust': 'Augustine',
    'cobede.o2': 'Bede\'s History of the English Church',
    'cobenrul.o3': 'Benedictine Rule',
    'coblick.o23': 'Blickling Homilies',
    'coboeth.o2': 'Boethius\' Consolation of Philosophy',
    'cobyrhtf.o3': 'Byrhtferth\'s Manual',
    'cocanedgD': 'Canons of Edgar (D)',
    'cocanedgX': 'Canons of Edgar (X)',
    'cocathom1.o3': 'Ælfric\'s Catholic Homilies I',
    'cocathom2.o3': 'Ælfric\'s Catholic Homilies II',
    'cochad.o24': 'Saint Chad',
    'cochdrul': 'Chrodegang of Metz, Rule',
    'cochristoph': 'Saint Christopher',
    'cochronA.o23': 'Anglo-Saxon Chronicle A',
    'cochronC': 'Anglo-Saxon Chronicle C',
    'cochronD': 'Anglo-Saxon Chronicle D',
    'cochronE.o34': 'Anglo-Saxon Chronicle E',
    'cocura.o2': 'Cura Pastoralis',
    'cocuraC': 'Cura Pastoralis (Cotton)',
    'codicts.o34': 'Dicts of Cato',
    'codocu1.o1': 'Documents 1 (O1)',
    'codocu2.o12': 'Documents 2 (O1/O2)',
    'codocu2.o2': 'Documents 2 (O2)',
    'codocu3.o23': 'Documents 3 (O2/O3)',
    'codocu3.o3': 'Documents 3 (O3)',
    'codocu4.o24': 'Documents 4 (O2/O4)',
    'coeluc1': 'Honorius of Autun, Elucidarium 1',
    # Second Elucidarium fragment; was mislabelled as "Elucidarium 1".
    'coeluc2': 'Honorius of Autun, Elucidarium 2',
    'coepigen.o3': 'Ælfric\'s Epilogue to Genesis',
    'coeuphr': 'Saint Euphrosyne',
    'coeust': 'Saint Eustace and his companions',
    'coexodusP': 'Exodus (P)',
    'cogenesiC': 'Genesis (C)',
    'cogregdC.o24': 'Gregory\'s Dialogues (C)',
    'cogregdH.o23': 'Gregory\'s Dialogues (H)',
    'coherbar': 'Pseudo-Apuleius, Herbarium',
    'coinspolD.o34': 'Wulfstan\'s Institute of Polity (D)',
    'coinspolX': 'Wulfstan\'s Institute of Polity (X)',
    'cojames': 'Saint James',
    'colacnu.o23': 'Lacnunga',
    'colaece.o2': 'Leechdoms',
    'colaw1cn.o3': 'Laws, Cnut I',
    'colaw2cn.o3': 'Laws, Cnut II',
    'colaw5atr.o3': 'Laws, Æthelred V',
    'colaw6atr.o3': 'Laws, Æthelred VI',
    'colawaf.o2': 'Laws, Alfred',
    'colawafint.o2': 'Alfred\'s Introduction to Laws',
    'colawger.o34': 'Laws, Gerefa',
    'colawine.ox2': 'Laws, Ine',
    'colawnorthu.o3': 'Northumbra Preosta Lagu',
    'colawwllad.o4': 'Laws, William I, Lad',
    'coleofri.o4': 'Leofric',
    'colsigef.o3': 'Ælfric\'s Letter to Sigefyrth',
    'colsigewB': 'Ælfric\'s Letter to Sigeweard (B)',
    'colsigewZ.o34': 'Ælfric\'s Letter to Sigeweard (Z)',
    'colwgeat': 'Ælfric\'s Letter to Wulfgeat',
    'colwsigeT': 'Ælfric\'s Letter to Wulfsige (T)',
    'colwsigeXa.o34': 'Ælfric\'s Letter to Wulfsige (Xa)',
    'colwstan1.o3': 'Ælfric\'s Letter to Wulfstan I',
    'colwstan2.o3': 'Ælfric\'s Letter to Wulfstan II',
    'comargaC.o34': 'Saint Margaret (C)',
    'comargaT': 'Saint Margaret (T)',
    'comart1': 'Martyrology, I',
    'comart2': 'Martyrology, II',
    'comart3.o23': 'Martyrology, III',
    'comarvel.o23': 'Marvels of the East',
    'comary': 'Mary of Egypt',
    'coneot': 'Saint Neot',
    'conicodA': 'Gospel of Nicodemus (A)',
    'conicodC': 'Gospel of Nicodemus (C)',
    'conicodD': 'Gospel of Nicodemus (D)',
    'conicodE': 'Gospel of Nicodemus (E)',
    'coorosiu.o2': 'Orosius',
    'cootest.o3': 'Heptateuch',
    'coprefcath1.o3': 'Ælfric\'s Preface to Catholic Homilies I',
    'coprefcath2.o3': 'Ælfric\'s Preface to Catholic Homilies II',
    'coprefcura.o2': 'Preface to the Cura Pastoralis',
    'coprefgen.o3': 'Ælfric\'s Preface to Genesis',
    'copreflives.o3': 'Ælfric\'s Preface to Lives of Saints',
    'coprefsolilo': 'Preface to Augustine\'s Soliloquies',
    'coquadru.o23': 'Pseudo-Apuleius, Medicina de quadrupedibus',
    'corood': 'History of the Holy Rood-Tree',
    'cosevensl': 'Seven Sleepers',
    'cosolilo': 'St. Augustine\'s Soliloquies',
    'cosolsat1.o4': 'Solomon and Saturn I',
    'cosolsat2': 'Solomon and Saturn II',
    'cotempo.o3': 'Ælfric\'s De Temporibus Anni',
    'coverhom': 'Vercelli Homilies',
    'coverhomE': 'Vercelli Homilies (E)',
    'coverhomL': 'Vercelli Homilies (L)',
    'covinceB': 'Saint Vincent (Bodley 343)',
    'covinsal': 'Vindicta Salvatoris',
    'cowsgosp.o3': 'West-Saxon Gospels',
    'cowulf.o34': 'Wulfstan\'s Homilies'
    }
263