Package nltk :: Package corpus
[hide private]
[frames | no frames]

Source Code for Package nltk.corpus

  1  # Natural Language Toolkit: Corpus Readers 
  2  # 
  3  # Copyright (C) 2001-2008 NLTK Project 
  4  # Author: Edward Loper <[email protected]> 
  5  # URL: <http://nltk.org> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  # [xx] this docstring isn't up-to-date! 
  9  """ 
 10  NLTK corpus readers.  The modules in this package provide functions 
 11  that can be used to read corpus files in a variety of formats.  These 
 12  functions can be used to read both the corpus files that are 
 13  distributed in the NLTK corpus package, and corpus files that are part 
 14  of external corpora. 
 15   
 16  Corpus Reader Functions 
 17  ======================= 
 18  Each corpus module defines one or more X{corpus reader functions}, 
 19  which can be used to read documents from that corpus.  These functions 
 20  take an argument, C{item}, which is used to indicate which document 
 21  should be read from the corpus: 
 22   
 23    - If C{item} is one of the unique identifiers listed in the corpus 
 24      module's C{items} variable, then the corresponding document will 
 25      be loaded from the NLTK corpus package. 
 26   
 27    - If C{item} is a filename, then that file will be read. 
 28   
 29  Additionally, corpus reader functions can be given lists of item 
 30  names; in which case, they will return a concatenation of the 
 31  corresponding documents. 
 32   
 33  Corpus reader functions are named based on the type of information 
 34  they return.  Some common examples, and their return types, are: 
 35   
 36    - I{corpus}.words(): list of str 
 37    - I{corpus}.sents(): list of (list of str) 
 38    - I{corpus}.paras(): list of (list of (list of str)) 
 39    - I{corpus}.tagged_words(): list of (str,str) tuple 
 40    - I{corpus}.tagged_sents(): list of (list of (str,str)) 
 41    - I{corpus}.tagged_paras(): list of (list of (list of (str,str))) 
 42    - I{corpus}.chunked_sents(): list of (Tree w/ (str,str) leaves) 
 43    - I{corpus}.parsed_sents(): list of (Tree with str leaves) 
 44    - I{corpus}.parsed_paras(): list of (list of (Tree with str leaves)) 
 45    - I{corpus}.xml(): A single xml ElementTree 
 46    - I{corpus}.raw(): unprocessed corpus contents 
 47   
 48  For example, to read a list of the words in the Brown Corpus, use 
 49  C{nltk.corpus.brown.words()}: 
 50   
 51      >>> from nltk.corpus import brown 
 52      >>> print brown.words() 
 53      ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...] 
 54   
 55  Corpus Metadata 
 56  =============== 
 57  Metadata about the NLTK corpora, and their individual documents, is 
 58  stored using U{Open Language Archives Community (OLAC) 
 59  <http://www.language-archives.org/>} metadata records.  These records 
 60  can be accessed using C{nltk.corpus.I{corpus}.olac()}. 
 61  """ 
 62   
 63  import re 
 64   
 65  from nltk.tokenize import RegexpTokenizer 
 66  from nltk.tag import simplify_brown_tag, simplify_wsj_tag,\ 
 67                       simplify_alpino_tag, simplify_indian_tag,\ 
 68                       simplify_tag 
 69   
 70  from util import LazyCorpusLoader 
 71  from reader import * 
 72  import chat80 
 73   
 74  abc = LazyCorpusLoader( 
 75      'abc', PlaintextCorpusReader, r'(?!\.).*\.txt') 
 76  alpino = LazyCorpusLoader( 
 77      'alpino', AlpinoCorpusReader, tag_mapping_function=simplify_alpino_tag) 
 78  brown = LazyCorpusLoader( 
 79      'brown', CategorizedTaggedCorpusReader, r'c[a-z]\d\d', 
 80      cat_pattern=r'c([a-z])\d\d', tag_mapping_function=simplify_brown_tag) 
 81  cess_cat = LazyCorpusLoader( 
 82      'cess_cat', BracketParseCorpusReader, r'(?!\.).*\.tbf', 
 83      tag_mapping_function=simplify_tag) 
 84  cess_esp = LazyCorpusLoader( 
 85      'cess_esp', BracketParseCorpusReader, r'(?!\.).*\.tbf', 
 86      tag_mapping_function=simplify_tag) 
 87  cmudict = LazyCorpusLoader( 
 88      'cmudict', CMUDictCorpusReader, ['cmudict']) 
 89  conll2000 = LazyCorpusLoader( 
 90      'conll2000', ConllChunkCorpusReader, 
 91      ['train.txt', 'test.txt'], ('NP','VP','PP')) 
 92  conll2002 = LazyCorpusLoader( 
 93      'conll2002', ConllChunkCorpusReader, '.*\.(test|train).*',  
 94      ('LOC', 'PER', 'ORG', 'MISC')) 
 95  floresta = LazyCorpusLoader( 
 96      'floresta', BracketParseCorpusReader, r'(?!\.).*\.ptb', '#', 
 97      tag_mapping_function=simplify_tag) 
 98  genesis = LazyCorpusLoader( 
 99      'genesis', PlaintextCorpusReader, r'(?!\.).*\.txt') 
100  gutenberg = LazyCorpusLoader( 
101      'gutenberg', PlaintextCorpusReader, r'(?!\.).*\.txt') 
102  ieer = LazyCorpusLoader( 
103      'ieer', IEERCorpusReader, r'(?!README|\.).*') 
104  inaugural = LazyCorpusLoader( 
105      'inaugural', PlaintextCorpusReader, r'(?!\.).*\.txt') 
106  # [XX] This should probably just use TaggedCorpusReader: 
107  indian = LazyCorpusLoader( 
108      'indian', IndianCorpusReader, r'(?!\.).*\.pos', 
109      tag_mapping_function=simplify_indian_tag) 
110  mac_morpho = LazyCorpusLoader( 
111      'mac_morpho', MacMorphoCorpusReader, r'(?!\.).*\.txt', 
112      tag_mapping_function=simplify_tag) 
113  movie_reviews = LazyCorpusLoader( 
114      'movie_reviews', CategorizedPlaintextCorpusReader, 
115      r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*') 
116  names = LazyCorpusLoader( 
117      'names', WordListCorpusReader, r'(?!\.).*\.txt') 
118  nps_chat = LazyCorpusLoader( 
119      'nps_chat', NPSChatCorpusReader, r'(?!README|\.).*', 
120      tag_mapping_function=simplify_wsj_tag) 
121  ppattach = LazyCorpusLoader( 
122      'ppattach', PPAttachmentCorpusReader, ['training', 'test', 'devset']) 
123  qc = LazyCorpusLoader( 
124      'qc', StringCategoryCorpusReader, ['train.txt', 'test.txt']) 
125  reuters = LazyCorpusLoader( 
126      'reuters', CategorizedPlaintextCorpusReader, '(training|test).*', 
127      cat_file='cats.txt') 
128  rte = LazyCorpusLoader( 
129      'rte', RTECorpusReader, r'(?!\.).*\.xml') 
130  senseval = LazyCorpusLoader( 
131      'senseval', SensevalCorpusReader, r'(?!\.).*\.pos') 
132  shakespeare = LazyCorpusLoader( 
133      'shakespeare', XMLCorpusReader, r'(?!\.).*\.xml') 
134  sinica_treebank = LazyCorpusLoader( 
135      'sinica_treebank', SinicaTreebankCorpusReader, ['parsed'], 
136      tag_mapping_function=simplify_tag) 
137  state_union = LazyCorpusLoader( 
138      'state_union', PlaintextCorpusReader, r'(?!\.).*\.txt') 
139  stopwords = LazyCorpusLoader( 
140      'stopwords', WordListCorpusReader, r'(?!README|\.).*') 
141  timit = LazyCorpusLoader( 
142      'timit', TimitCorpusReader) 
143  toolbox = LazyCorpusLoader( 
144      'toolbox', ToolboxCorpusReader, r'(?!.*(README|\.)).*\.(dic|txt)') 
145  treebank = LazyCorpusLoader( 
146      'treebank/combined', BracketParseCorpusReader, r'wsj_.*\.mrg', 
147      tag_mapping_function=simplify_wsj_tag) 
148  hebrew_treebank = LazyCorpusLoader( 
149      'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt') 
150  propbank = LazyCorpusLoader( 
151      'propbank', PropbankCorpusReader, 
152      'prop.txt', 'frames/.*\.xml', 'verbs.txt', 
153      lambda filename: re.sub(r'^wsj/\d\d/', '', filename), 
154      treebank) # Must be defined *after* treebank corpus. 
155  treebank_chunk = LazyCorpusLoader( 
156      'treebank/tagged', ChunkedCorpusReader, r'wsj_.*\.pos', 
157      sent_tokenizer=RegexpTokenizer(r'(?<=/\.)\s*(?![^\[]*\])', gaps=True), 
158      para_block_reader=tagged_treebank_para_block_reader) 
159  treebank_raw = LazyCorpusLoader( 
160      'treebank/raw', PlaintextCorpusReader, r'wsj_.*') 
161  udhr = LazyCorpusLoader( 
162      'udhr', PlaintextCorpusReader, r'(?!README|\.).*', 
163      # Encodings specified in filenames but not mapped to anything: 
164      # DallakHelv, VIQR, Cyrillic+Abkh, WinResearcher, font, 
165      # Afenegus6..60375, VG2Main, VPS, Turkish, TCVN, Az.Times.Lat0117, 
166      # EUC, Baltic, err, Az.Times.Cyr.Normal0117, T61, Amahuaca, Agra 
167      encoding=[('.*-UTF8$', 'utf-8'), ('.*-Latin1$', 'latin-1'), 
168                ('.*-Hebrew$', 'hebrew'), ('.*-Arabic$', 'arabic'), 
169                ('.*-Cyrillic$', 'cyrillic'), ('.*-SJIS$', 'SJIS'), 
170                ('.*-GB2312$', 'GB2312'), ('.*-Latin2$', 'ISO-8859-2'), 
171                ('.*-Greek$', 'greek'), ('.*-UFT8$', 'utf-8'), 
172                ('Hungarian_Magyar-Unicode', 'utf-16-le')] 
173      ) 
174  verbnet = LazyCorpusLoader( 
175      'verbnet', VerbnetCorpusReader, r'(?!\.).*\.xml') 
176  webtext = LazyCorpusLoader( 
177      'webtext', PlaintextCorpusReader, r'(?!README|\.).*') 
178  words = LazyCorpusLoader( 
179      'words', WordListCorpusReader, r'(?!README|\.).*') 
180  ycoe = LazyCorpusLoader( 
181      'ycoe', YCOECorpusReader) 
182   
183   
184  def demo(): 
185      # This is out-of-date: 
186      abc.demo() 
187      brown.demo() 
188      # chat80.demo() 
189      cmudict.demo() 
190      conll2000.demo() 
191      conll2002.demo() 
192      genesis.demo() 
193      gutenberg.demo() 
194      ieer.demo() 
195      inaugural.demo() 
196      indian.demo() 
197      names.demo() 
198      ppattach.demo() 
199      senseval.demo() 
200      shakespeare.demo() 
201      sinica_treebank.demo() 
202      state_union.demo() 
203      stopwords.demo() 
204      timit.demo() 
205      toolbox.demo() 
206      treebank.demo() 
207      udhr.demo() 
208      webtext.demo() 
209      words.demo() 
210      # ycoe.demo() 
211   
212  if __name__ == '__main__': 
213      #demo() 
214      pass 
215   