nltk.corpus.reader.ycoe

Source Code for Module nltk.corpus.reader.ycoe

1 # -*- coding: iso-8859-1 -*- 2 3 # Natural Language Toolkit: York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE) 4 # 5 # Copyright (C) 2001-2008 NLTK Project 6 # Author: Selina Dennis <[email protected]> 7 # URL: <http://nltk.org> 8 # For license information, see LICENSE.TXT 9 10 """ 11 Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old 12 English Prose (YCOE), a 1.5 million word syntactically-annotated 13 corpus of Old English prose texts. The corpus is distributed by the 14 Oxford Text Archive: http://www.ota.ahds.ac.uk/ It is not included 15 with NLTK. 16 17 The YCOE corpus is divided into 100 files, each representing 18 an Old English prose text. Tags used within each text comply 19 with the YCOE standard: http://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm 20 """ 21 22 from nltk.corpus.reader.util import * 23 from nltk.corpus.reader.api import * 24 from nltk.tokenize import RegexpTokenizer 25 from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader 26 from nltk.corpus.reader.tagged import TaggedCorpusReader 27 from string import split 28 import os, re 29 from nltk.internals import deprecated 30

class YCOECorpusReader(CorpusReader):
    """
    Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
    English Prose (YCOE), a 1.5 million word syntactically-annotated
    corpus of Old English prose texts.

    The reader wraps two sub-readers: one over the C{psd} subdirectory
    of the corpus root (parsed text) and one over the C{pos}
    subdirectory (part-of-speech tagged text).  A I{document
    identifier} is a file identifier with its four-character
    C{'.psd'} / C{'.pos'} extension removed.
    """

    def __init__(self, root, encoding=None):
        """
        Create the two sub-readers and verify that they cover the same
        set of documents.

        @param root: Corpus root directory; must contain C{psd} and
            C{pos} subdirectories.
        @param encoding: Passed through unchanged to both sub-readers
            and to C{CorpusReader.__init__}.
        @raise ValueError: If the documents found under C{psd} and
            C{pos} are not the same.
        """
        # NOTE(review): the third positional argument ('.psd') is
        # interpreted by BracketParseCorpusReader.__init__, whose
        # signature is not visible here -- confirm it is accepted.
        self._psd_reader = YCOEParseCorpusReader(
            os.path.join(root, 'psd'), '.*', '.psd', encoding=encoding)
        # YCOETaggedCorpusReader.__init__ is (root, items, encoding=None):
        # passing '.pos' as a third positional argument would bind it to
        # ``encoding`` and clash with the keyword, raising TypeError.
        self._pos_reader = YCOETaggedCorpusReader(
            os.path.join(root, 'pos'), '.*', encoding=encoding)

        # Make sure we have a consistent set of items:
        documents = set(f[:-4] for f in self._psd_reader.files())
        if set(f[:-4] for f in self._pos_reader.files()) != documents:
            raise ValueError('Items in "psd" and "pos" '
                             'subdirectories do not match.')

        files = sorted(['%s.psd' % doc for doc in documents] +
                       ['%s.pos' % doc for doc in documents])
        CorpusReader.__init__(self, root, files, encoding)
        self._documents = tuple(sorted(documents))

    def documents(self, files=None):
        """
        Return a list of document identifiers for all documents in
        this corpus, or for the documents with the given file(s) if
        specified.

        @param files: A single file identifier, an iterable of file
            identifiers, or C{None} for every document.
        @return: The stored tuple of all document identifiers when
            C{files} is C{None}; otherwise a sorted list of the
            distinct identifiers for the given files.
        @raise KeyError: If any given file identifier is not one of
            this corpus's files.
        """
        if files is None:
            return self._documents
        if isinstance(files, basestring):
            files = [files]
        for f in files:
            if f not in self._files:
                # Report the offending identifier only; formatting the
                # whole collection is misleading and fails outright for
                # a multi-element tuple.
                raise KeyError('File id %s not found' % f)
        # Strip off the '.pos' and '.psd' extensions.
        return sorted(set(f[:-4] for f in files))

    def files(self, documents=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus, or that store the given document(s) if specified.

        @param documents: A single document identifier, an iterable of
            document identifiers, or C{None} for every file.
        @return: C{self._files} when C{documents} is C{None}; otherwise
            a sorted list holding the C{.pos} and C{.psd} file
            identifier for each given document.  The given identifiers
            are not validated.
        """
        if documents is None:
            return self._files
        elif isinstance(documents, basestring):
            documents = [documents]
        return sorted(set(['%s.pos' % doc for doc in documents] +
                          ['%s.psd' % doc for doc in documents]))

    def _getfiles(self, documents, subcorpus):
        """
        Helper that selects the appropriate files for a given set of
        documents from a given subcorpus (pos or psd).

        @param documents: A single document identifier, an iterable of
            document identifiers, or C{None} for every document.
        @param subcorpus: Extension (without the dot) appended to each
            document identifier, i.e. C{'pos'} or C{'psd'}.
        @return: A list of file identifiers, one per document.
        @raise ValueError: If a given identifier is not a known
            document; the message differs when the caller appears to
            have passed a file identifier instead.
        """
        if documents is None:
            documents = self._documents
        else:
            if isinstance(documents, basestring):
                documents = [documents]
            for document in documents:
                if document not in self._documents:
                    if document[-4:] in ('.pos', '.psd'):
                        raise ValueError(
                            'Expected a document identifier, not a file '
                            'identifier.  (Use corpus.documents() to get '
                            'a list of document identifiers.)')
                    else:
                        raise ValueError('Document identifier %s not found'
                                         % document)
        return ['%s.%s' % (d, subcorpus) for d in documents]

    # Delegate to one of our two sub-readers:
    def words(self, documents=None):
        """Delegate to the C{pos} sub-reader's C{words()}."""
        return self._pos_reader.words(self._getfiles(documents, 'pos'))

    def sents(self, documents=None):
        """Delegate to the C{pos} sub-reader's C{sents()}."""
        return self._pos_reader.sents(self._getfiles(documents, 'pos'))

    def paras(self, documents=None):
        """Delegate to the C{pos} sub-reader's C{paras()}."""
        return self._pos_reader.paras(self._getfiles(documents, 'pos'))

    def tagged_words(self, documents=None):
        """Delegate to the C{pos} sub-reader's C{tagged_words()}."""
        return self._pos_reader.tagged_words(self._getfiles(documents, 'pos'))

    def tagged_sents(self, documents=None):
        """Delegate to the C{pos} sub-reader's C{tagged_sents()}."""
        return self._pos_reader.tagged_sents(self._getfiles(documents, 'pos'))

    def tagged_paras(self, documents=None):
        """Delegate to the C{pos} sub-reader's C{tagged_paras()}."""
        return self._pos_reader.tagged_paras(self._getfiles(documents, 'pos'))

    def parsed_sents(self, documents=None):
        """Delegate to the C{psd} sub-reader's C{parsed_sents()}."""
        return self._psd_reader.parsed_sents(self._getfiles(documents, 'psd'))

    #{ Deprecated since 0.8
    @deprecated("Use .raw() or .words() or .tagged_words() or "
                ".parsed_sents() instead.")
    def read(self, items=None, format='parsed'):
        """
        Deprecated dispatcher that forwards to the accessor named by
        C{format}.

        @raise ValueError: For C{'chunked'} (no longer supported) and
            for any unrecognised format.
        """
        if format == 'parsed':
            return self.parsed_sents(items)
        if format == 'raw':
            # NOTE(review): raw() is not defined in this class;
            # presumably inherited from CorpusReader -- confirm.
            return self.raw(items)
        if format == 'tokenized':
            return self.words(items)
        if format == 'tagged':
            return self.tagged_words(items)
        if format == 'chunked':
            raise ValueError('no longer supported')
        raise ValueError('bad format %r' % format)

    @deprecated("Use .parsed_sents() instead.")
    def parsed(self, items=None):
        return self.parsed_sents(items)

    @deprecated("Use .words() instead.")
    def tokenized(self, items=None):
        return self.words(items)

    @deprecated("Use .tagged_words() instead.")
    def tagged(self, items=None):
        return self.tagged_words(items)

    @deprecated("Operation no longer supported.")
    def chunked(self, items=None):
        raise ValueError('format "chunked" no longer supported')
    #}

class YCOEParseCorpusReader(BracketParseCorpusReader):
    """Specialized version of the standard bracket parse corpus reader
    that strips out (CODE ...) and (ID ...) nodes."""

    def _parse(self, t):
        """
        Parse one bracketed tree string after removing (CODE ...) and
        (ID ...) nodes.

        @param t: The bracketed text of a single tree.
        @return: C{None} if only an empty pair of parentheses is left
            once the nodes are removed; otherwise whatever
            C{BracketParseCorpusReader._parse} returns for the cleaned
            text.
        """
        # The parentheses must be escaped so they match literally: an
        # opening "(CODE" or "(ID", any run of non-")" characters, and
        # the closing ")".
        t = re.sub(r'(?u)\((CODE|ID)[^\)]*\)', '', t)
        # Nothing but "( )" remains: the tree held only stripped nodes.
        if re.match(r'\s*\(\s*\)\s*$', t):
            return None
        return BracketParseCorpusReader._parse(self, t)

151

class YCOETaggedCorpusReader(TaggedCorpusReader):
    """Tagged corpus reader configured for the YCOE C{pos} files: it
    uses C{'_'} as the word/tag separator and a gap-based sentence
    tokenizer that also treats C{..._CODE} and C{..._ID} tokens as
    gaps."""

    def __init__(self, root, items, encoding=None):
        """
        @param root: Passed through to C{TaggedCorpusReader.__init__}.
        @param items: Passed through to C{TaggedCorpusReader.__init__}.
        @param encoding: Passed through to
            C{TaggedCorpusReader.__init__}.
        """
        # Gaps between sentences are: whitespace preceded by "/.", or
        # any token ending in _CODE or _ID (with surrounding
        # whitespace).  The first alternative is a lookbehind group, so
        # its opening parenthesis must not be escaped; an escaped "\("
        # leaves an unbalanced ")" and the pattern fails to compile.
        gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
        sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
        TaggedCorpusReader.__init__(self, root, items, sep='_',
                                    sent_tokenizer=sent_tokenizer,
                                    encoding=encoding)

#: A list of all documents and their titles in ycoe.
#: Maps each document identifier (file name without the '.pos'/'.psd'
#: extension) to a human-readable title.
documents = {
    'coadrian.o34': 'Adrian and Ritheus',
    'coaelhom.o3': 'Ælfric, Supplemental Homilies',
    'coaelive.o3': 'Ælfric\'s Lives of Saints',
    'coalcuin': 'Alcuin De virtutibus et vitiis',
    'coalex.o23': 'Alexander\'s Letter to Aristotle',
    'coapollo.o3': 'Apollonius of Tyre',
    'coaugust': 'Augustine',
    'cobede.o2': 'Bede\'s History of the English Church',
    'cobenrul.o3': 'Benedictine Rule',
    'coblick.o23': 'Blickling Homilies',
    'coboeth.o2': 'Boethius\' Consolation of Philosophy',
    'cobyrhtf.o3': 'Byrhtferth\'s Manual',
    'cocanedgD': 'Canons of Edgar (D)',
    'cocanedgX': 'Canons of Edgar (X)',
    'cocathom1.o3': 'Ælfric\'s Catholic Homilies I',
    'cocathom2.o3': 'Ælfric\'s Catholic Homilies II',
    'cochad.o24': 'Saint Chad',
    'cochdrul': 'Chrodegang of Metz, Rule',
    'cochristoph': 'Saint Christopher',
    'cochronA.o23': 'Anglo-Saxon Chronicle A',
    'cochronC': 'Anglo-Saxon Chronicle C',
    'cochronD': 'Anglo-Saxon Chronicle D',
    'cochronE.o34': 'Anglo-Saxon Chronicle E',
    'cocura.o2': 'Cura Pastoralis',
    'cocuraC': 'Cura Pastoralis (Cotton)',
    'codicts.o34': 'Dicts of Cato',
    'codocu1.o1': 'Documents 1 (O1)',
    'codocu2.o12': 'Documents 2 (O1/O2)',
    'codocu2.o2': 'Documents 2 (O2)',
    'codocu3.o23': 'Documents 3 (O2/O3)',
    'codocu3.o3': 'Documents 3 (O3)',
    'codocu4.o24': 'Documents 4 (O2/O4)',
    'coeluc1': 'Honorius of Autun, Elucidarium 1',
    # Second part of the Elucidarium; previously duplicated the
    # title of 'coeluc1'.
    'coeluc2': 'Honorius of Autun, Elucidarium 2',
    'coepigen.o3': 'Ælfric\'s Epilogue to Genesis',
    'coeuphr': 'Saint Euphrosyne',
    'coeust': 'Saint Eustace and his companions',
    'coexodusP': 'Exodus (P)',
    'cogenesiC': 'Genesis (C)',
    'cogregdC.o24': 'Gregory\'s Dialogues (C)',
    'cogregdH.o23': 'Gregory\'s Dialogues (H)',
    'coherbar': 'Pseudo-Apuleius, Herbarium',
    'coinspolD.o34': 'Wulfstan\'s Institute of Polity (D)',
    'coinspolX': 'Wulfstan\'s Institute of Polity (X)',
    'cojames': 'Saint James',
    'colacnu.o23': 'Lacnunga',
    'colaece.o2': 'Leechdoms',
    'colaw1cn.o3': 'Laws, Cnut I',
    'colaw2cn.o3': 'Laws, Cnut II',
    'colaw5atr.o3': 'Laws, Æthelred V',
    'colaw6atr.o3': 'Laws, Æthelred VI',
    'colawaf.o2': 'Laws, Alfred',
    'colawafint.o2': 'Alfred\'s Introduction to Laws',
    'colawger.o34': 'Laws, Gerefa',
    'colawine.ox2': 'Laws, Ine',
    'colawnorthu.o3': 'Northumbra Preosta Lagu',
    'colawwllad.o4': 'Laws, William I, Lad',
    'coleofri.o4': 'Leofric',
    'colsigef.o3': 'Ælfric\'s Letter to Sigefyrth',
    'colsigewB': 'Ælfric\'s Letter to Sigeweard (B)',
    'colsigewZ.o34': 'Ælfric\'s Letter to Sigeweard (Z)',
    'colwgeat': 'Ælfric\'s Letter to Wulfgeat',
    'colwsigeT': 'Ælfric\'s Letter to Wulfsige (T)',
    'colwsigeXa.o34': 'Ælfric\'s Letter to Wulfsige (Xa)',
    'colwstan1.o3': 'Ælfric\'s Letter to Wulfstan I',
    'colwstan2.o3': 'Ælfric\'s Letter to Wulfstan II',
    'comargaC.o34': 'Saint Margaret (C)',
    'comargaT': 'Saint Margaret (T)',
    'comart1': 'Martyrology, I',
    'comart2': 'Martyrology, II',
    'comart3.o23': 'Martyrology, III',
    'comarvel.o23': 'Marvels of the East',
    'comary': 'Mary of Egypt',
    'coneot': 'Saint Neot',
    'conicodA': 'Gospel of Nicodemus (A)',
    'conicodC': 'Gospel of Nicodemus (C)',
    'conicodD': 'Gospel of Nicodemus (D)',
    'conicodE': 'Gospel of Nicodemus (E)',
    'coorosiu.o2': 'Orosius',
    'cootest.o3': 'Heptateuch',
    'coprefcath1.o3': 'Ælfric\'s Preface to Catholic Homilies I',
    'coprefcath2.o3': 'Ælfric\'s Preface to Catholic Homilies II',
    'coprefcura.o2': 'Preface to the Cura Pastoralis',
    'coprefgen.o3': 'Ælfric\'s Preface to Genesis',
    'copreflives.o3': 'Ælfric\'s Preface to Lives of Saints',
    'coprefsolilo': 'Preface to Augustine\'s Soliloquies',
    'coquadru.o23': 'Pseudo-Apuleius, Medicina de quadrupedibus',
    'corood': 'History of the Holy Rood-Tree',
    'cosevensl': 'Seven Sleepers',
    'cosolilo': 'St. Augustine\'s Soliloquies',
    'cosolsat1.o4': 'Solomon and Saturn I',
    'cosolsat2': 'Solomon and Saturn II',
    'cotempo.o3': 'Ælfric\'s De Temporibus Anni',
    'coverhom': 'Vercelli Homilies',
    'coverhomE': 'Vercelli Homilies (E)',
    'coverhomL': 'Vercelli Homilies (L)',
    'covinceB': 'Saint Vincent (Bodley 343)',
    'covinsal': 'Vindicta Salvatoris',
    'cowsgosp.o3': 'West-Saxon Gospels',
    'cowulf.o34': 'Wulfstan\'s Homilies',
}