1
2
3
4
5
6
7
8 """
9 The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6]
10 ftp://ftp.cs.cmu.edu/project/speech/dict/
11 Copyright 1998 Carnegie Mellon University
12
13 File Format: Each line consists of an uppercased word, a counter
14 (for alternative pronunciations), and a transcription. Vowels are
15 marked for stress (1=primary, 2=secondary, 0=no stress). E.g.:
16 NATURAL 1 N AE1 CH ER0 AH0 L
17
18 The dictionary contains 127069 entries. Of these, 119400 words are assigned
19 a unique pronunciation, 6830 words have two pronunciations, and 839 words have
20 three or more pronunciations. Many of these are fast-speech variants.
21
22 Phonemes: There are 39 phonemes, as shown below:
23
24 Phoneme Example Translation    Phoneme Example Translation
25 ------- ------- -----------    ------- ------- -----------
26 AA      odd     AA D           AE      at      AE T
27 AH      hut     HH AH T        AO      ought   AO T
28 AW      cow     K AW           AY      hide    HH AY D
29 B       be      B IY           CH      cheese  CH IY Z
30 D       dee     D IY           DH      thee    DH IY
31 EH      Ed      EH D           ER      hurt    HH ER T
32 EY      ate     EY T           F       fee     F IY
33 G       green   G R IY N       HH      he      HH IY
34 IH      it      IH T           IY      eat     IY T
35 JH      gee     JH IY          K       key     K IY
36 L       lee     L IY           M       me      M IY
37 N       knee    N IY           NG      ping    P IH NG
38 OW      oat     OW T           OY      toy     T OY
39 P       pee     P IY           R       read    R IY D
40 S       sea     S IY           SH      she     SH IY
41 T       tea     T IY           TH      theta   TH EY T AH
42 UH      hood    HH UH D        UW      two     T UW
43 V       vee     V IY           W       we      W IY
44 Y       yield   Y IY L D       Z       zee     Z IY
45 ZH      seizure S IY ZH ER
46 """
47
48 from nltk.corpus.reader.util import *
49 from nltk.corpus.reader.api import *
50 import codecs
51 from nltk.internals import deprecated
52
62
70
72 """
73 @return: a list of all words defined in the cmudict lexicon.
74 """
75 return [word for (word, num, transcription) in self.entries()]
76
78 """
79 @return: the cmudict lexicon as a dictionary, whose keys are
80 upper case words and whose values are tuples of pronunciation
81 entries.
82 """
83 lexicon = self.entries()
84 d = {}
85 for word, num, transcription in lexicon:
86 if num == 1:
87 d[word] = (transcription,)
88 else:
89 d[word] += (transcription,)
90 return d
91
92
93 @deprecated("Use .entries() or .transcriptions() instead.")
94 - def read(self, items='cmudict', format='listed'):
98 @deprecated("Use .transcriptions() instead.")
100 @deprecated("Use .entries() instead.")
102
103
112