nltk.corpus.reader.cmudict

Source Code for Module nltk.corpus.reader.cmudict

1 # Natural Language Toolkit: Carnegie Mellon Pronouncing Dictionary Corpus Reader 2 # 3 # Copyright (C) 2001-2008 NLTK Project 4 # Author: Steven Bird <[email protected]> 5 # URL: <http://nltk.org> 6 # For license information, see LICENSE.TXT 7 8 """ 9 The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6] 10 ftp://ftp.cs.cmu.edu/project/speech/dict/ 11 Copyright 1998 Carnegie Mellon University 12 13 File Format: Each line consists of an uppercased word, a counter 14 (for alternative pronunciations), and a transcription. Vowels are 15 marked for stress (1=primary, 2=secondary, 0=no stress). E.g.: 16 NATURAL 1 N AE1 CH ER0 AH0 L 17 18 The dictionary contains 127069 entries. Of these, 119400 words are assigned 19 a unique pronunciation, 6830 words have two pronunciations, and 839 words have 20 three or more pronunciations. Many of these are fast-speech variants. 21 22 Phonemes: There are 39 phonemes, as shown below: 23 24 Phoneme Example Translation Phoneme Example Translation 25 ------- ------- ----------- ------- ------- ----------- 26 AA odd AA D AE at AE T 27 AH hut HH AH T AO ought AO T 28 AW cow K AW AY hide HH AY D 29 B be B IY CH cheese CH IY Z 30 D dee D IY DH thee DH IY 31 EH Ed EH D ER hurt HH ER T 32 EY ate EY T F fee F IY 33 G green G R IY N HH he HH IY 34 IH it IH T IY eat IY T 35 JH gee JH IY K key K IY 36 L lee L IY M me M IY 37 N knee N IY NG ping P IH NG 38 OW oat OW T OY toy T OY 39 P pee P IY R read R IY D 40 S sea S IY SH she SH IY 41 T tea T IY TH theta TH EY T AH 42 UH hood HH UH D UW two T UW 43 V vee V IY W we W IY 44 Y yield Y IY L D Z zee Z IY 45 ZH seizure S IY ZH ER 46 """ 47 48 from nltk.corpus.reader.util import * 49 from nltk.corpus.reader.api import * 50 import codecs 51 from nltk.internals import deprecated 52

class CMUDictCorpusReader(CorpusReader):
    """
    Corpus reader for the Carnegie Mellon Pronouncing Dictionary.

    Each dictionary entry is exposed as a
    C{(word, identifier, transcription)} tuple, as produced by
    L{read_cmudict_block}.
    """

    def entries(self):
        """
        @return: the cmudict lexicon as a list of entries
        containing (word, identifier, transcription) tuples.
        """
        # One lazily-read view per corpus file, combined with concat().
        return concat([StreamBackedCorpusView(filename, read_cmudict_block,
                                              encoding=enc)
                       for filename, enc in self.abspaths(None, True)])

    def raw(self):
        """
        @return: the cmudict lexicon as a raw string.
        """
        # This method takes no ``files`` argument, so always read every
        # file of the corpus.  (The previous code tested an unassigned
        # local ``files`` and raised UnboundLocalError on every call.)
        files = self._files
        if isinstance(files, basestring):
            files = [files]
        return concat([self.open(f).read() for f in files])

    def words(self):
        """
        @return: a list of all words defined in the cmudict lexicon.
        """
        return [word for (word, num, transcription) in self.entries()]

    def transcriptions(self):
        """
        @return: the cmudict lexicon as a dictionary, whose keys are
        upper case words and whose values are tuples of pronunciation
        entries.
        """
        d = {}
        for word, num, transcription in self.entries():
            if num == 1:
                # Counter 1 starts a fresh tuple of pronunciations.
                d[word] = (transcription,)
            else:
                # Append an alternative pronunciation.  Use get() so that
                # a word whose first listed entry is not numbered 1 does
                # not raise KeyError.
                d[word] = d.get(word, ()) + (transcription,)
        return d

    #{ Deprecated since 0.8
    # The ``items`` arguments below are accepted only for backward
    # compatibility and are ignored: entries() and transcriptions() take
    # no arguments, so forwarding ``items`` raised TypeError.
    @deprecated("Use .entries() or .transcriptions() instead.")
    def read(self, items='cmudict', format='listed'):
        """
        @param items: ignored; kept for backward compatibility.
        @param format: C{'listed'} to get the result of L{entries},
            C{'dictionary'} to get the result of L{transcriptions}.
        @raise ValueError: if C{format} is neither of those values.
        """
        if format == 'listed': return self.entries()
        if format == 'dictionary': return self.transcriptions()
        raise ValueError('bad format %r' % format)

    @deprecated("Use .transcriptions() instead.")
    def dictionary(self, items='cmudict'):
        """
        @param items: ignored; kept for backward compatibility.
        @return: the result of L{transcriptions}.
        """
        return self.transcriptions()

    @deprecated("Use .entries() instead.")
    def listed(self, items='cmudict'):
        """
        @param items: ignored; kept for backward compatibility.
        @return: the result of L{entries}.
        """
        return self.entries()
    #}


def read_cmudict_block(stream):
    """
    Read one block of at most 100 cmudict entries from C{stream}.

    Each non-blank line is split on whitespace into a word, an integer
    counter and the remaining pieces, which form the transcription.
    Blank and whitespace-only lines are skipped.

    @param stream: an object with a C{readline()} method that returns
        C{''} at end of file.
    @return: a list of C{(word, counter, transcription)} tuples, where
        C{transcription} is a tuple of strings.  The list is shorter
        than 100 (possibly empty) when end of file is reached.
    @raise IndexError: if a non-blank line has no counter field.
    @raise ValueError: if the counter field is not an integer.
    """
    entries = []
    while len(entries) < 100: # Read 100 at a time.
        line = stream.readline()
        if line == '': return entries # end of file.
        pieces = line.split()
        # A blank line yields no pieces; skip it rather than failing
        # with IndexError on pieces[0].
        if not pieces: continue
        entries.append( (pieces[0], int(pieces[1]), tuple(pieces[2:])) )
    return entries


112