Package nltk :: Package corpus :: Package reader :: Module cmudict
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.cmudict

  1  # Natural Language Toolkit: Carnegie Mellon Pronouncing Dictionary Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2008 NLTK Project 
  4  # Author: Steven Bird <[email protected]> 
  5  # URL: <http://nltk.org> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  """ 
  9  The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6] 
 10  ftp://ftp.cs.cmu.edu/project/speech/dict/ 
 11  Copyright 1998 Carnegie Mellon University 
 12   
 13  File Format: Each line consists of an uppercased word, a counter 
 14  (for alternative pronunciations), and a transcription.  Vowels are 
 15  marked for stress (1=primary, 2=secondary, 0=no stress).  E.g.: 
 16  NATURAL 1 N AE1 CH ER0 AH0 L 
 17   
 18  The dictionary contains 127069 entries.  Of these, 119400 words are assigned 
 19  a unique pronunciation, 6830 words have two pronunciations, and 839 words have 
 20  three or more pronunciations.  Many of these are fast-speech variants. 
 21   
 22  Phonemes: There are 39 phonemes, as shown below: 
 23       
 24  Phoneme Example Translation    Phoneme Example Translation 
 25  ------- ------- -----------    ------- ------- ----------- 
 26  AA      odd     AA D           AE      at      AE T 
 27  AH      hut     HH AH T        AO      ought   AO T 
 28  AW      cow     K AW           AY      hide    HH AY D 
 29  B       be      B IY           CH      cheese  CH IY Z 
 30  D       dee     D IY           DH      thee    DH IY 
 31  EH      Ed      EH D           ER      hurt    HH ER T 
 32  EY      ate     EY T           F       fee     F IY 
 33  G       green   G R IY N       HH      he      HH IY 
 34  IH      it      IH T           IY      eat     IY T 
 35  JH      gee     JH IY          K       key     K IY 
 36  L       lee     L IY           M       me      M IY 
 37  N       knee    N IY           NG      ping    P IH NG 
 38  OW      oat     OW T           OY      toy     T OY 
 39  P       pee     P IY           R       read    R IY D 
 40  S       sea     S IY           SH      she     SH IY 
 41  T       tea     T IY           TH      theta   TH EY T AH 
 42  UH      hood    HH UH D        UW      two     T UW 
 43  V       vee     V IY           W       we      W IY 
 44  Y       yield   Y IY L D       Z       zee     Z IY 
 45  ZH      seizure S IY ZH ER 
 46  """ 
 47   
 48  from nltk.corpus.reader.util import * 
 49  from nltk.corpus.reader.api import * 
 50  import codecs 
 51  from nltk.internals import deprecated 
 52   
class CMUDictCorpusReader(CorpusReader):
    """
    Corpus reader for the Carnegie Mellon Pronouncing Dictionary.

    Entries are C{(word, identifier, transcription)} tuples, where
    C{identifier} is the integer counter distinguishing alternative
    pronunciations of the same word and C{transcription} is a tuple
    of phoneme strings (see the module docstring for the file format).
    """

    def entries(self):
        """
        @return: the cmudict lexicon as a list of entries
        containing (word, identifier, transcription) tuples.
        """
        return concat([StreamBackedCorpusView(filename, read_cmudict_block,
                                              encoding=enc)
                       for filename, enc in self.abspaths(None, True)])

    def raw(self):
        """
        @return: the cmudict lexicon as a raw string.
        """
        # This method takes no file argument, so always read the
        # reader's own file list.  (The previous code tested an
        # unassigned local named ``files`` and raised UnboundLocalError
        # on every call.)
        files = self._files
        # Kept from the original: tolerate a single file name given as
        # a bare string rather than a list.
        if isinstance(files, basestring):
            files = [files]
        return concat([self.open(f).read() for f in files])

    def words(self):
        """
        @return: a list of all words defined in the cmudict lexicon.
        A word appears once per pronunciation entry, in entry order.
        """
        return [word for (word, num, transcription) in self.entries()]

    def transcriptions(self):
        """
        @return: the cmudict lexicon as a dictionary, whose keys are
        upper case words and whose values are tuples of pronunciation
        entries.
        """
        d = {}
        for word, num, transcription in self.entries():
            if num == 1:
                # Counter 1 starts a fresh pronunciation list for the word.
                d[word] = (transcription,)
            else:
                # Append the variant.  Fall back to an empty tuple so
                # that a variant seen without a preceding counter-1
                # entry no longer raises KeyError.
                d[word] = d.get(word, ()) + (transcription,)
        return d

    #{ Deprecated since 0.8
    # The ``items`` argument of the methods below is accepted only for
    # backward compatibility and is ignored: entries() and
    # transcriptions() take no arguments, so forwarding it (as the
    # previous code did) raised TypeError on every call.
    @deprecated("Use .entries() or .transcriptions() instead.")
    def read(self, items='cmudict', format='listed'):
        """
        @param items: ignored; kept for backward compatibility.
        @param format: C{'listed'} to get the result of L{entries()},
            or C{'dictionary'} to get the result of L{transcriptions()}.
        @raise ValueError: if C{format} is any other value.
        """
        if format == 'listed': return self.entries()
        if format == 'dictionary': return self.transcriptions()
        raise ValueError('bad format %r' % format)

    @deprecated("Use .transcriptions() instead.")
    def dictionary(self, items='cmudict'):
        """
        @param items: ignored; kept for backward compatibility.
        @return: the result of L{transcriptions()}.
        """
        return self.transcriptions()

    @deprecated("Use .entries() instead.")
    def listed(self, items='cmudict'):
        """
        @param items: ignored; kept for backward compatibility.
        @return: the result of L{entries()}.
        """
        return self.entries()
    #}

def read_cmudict_block(stream):
    """
    Read the next block of cmudict entries from C{stream}.

    At most 100 entries are read per call.  Each non-blank line is
    split on whitespace into a word, an integer pronunciation counter,
    and the remaining fields, which form the transcription.  Blank or
    whitespace-only lines are skipped.

    @param stream: an object with a C{readline()} method that returns
        the empty string at end of file.
    @return: a list of C{(word, identifier, transcription)} tuples,
        where C{identifier} is an int and C{transcription} is a tuple
        of strings.  The list is shorter than 100 (possibly empty)
        when end of file is reached.
    @raise IndexError: if a non-blank line has fewer than two fields.
    @raise ValueError: if the second field of a line is not an integer.
    """
    entries = []
    while len(entries) < 100:  # Read 100 at a time.
        line = stream.readline()
        if not line:
            return entries  # end of file.
        pieces = line.split()
        if not pieces:
            # Blank line (e.g. a stray newline at the end of the file):
            # there is nothing to parse, and indexing pieces[0] would
            # raise IndexError.
            continue
        entries.append((pieces[0], int(pieces[1]), tuple(pieces[2:])))
    return entries
112