10 """
11 Read tokens, phonemes and audio data from the NLTK TIMIT Corpus.
12
13 This corpus contains selected portion of the TIMIT corpus.
14
15 - 16 speakers from 8 dialect regions
16 - 1 male and 1 female from each dialect region
17 - total 130 sentences (10 sentences per speaker. Note that some
18 sentences are shared among other speakers, especially sa1 and sa2
19 are spoken by all speakers.)
20 - total 160 recording of sentences (10 recordings per speaker)
21 - audio format: NIST Sphere, single channel, 16kHz sampling,
22 16 bit sample, PCM encoding
23

Module contents
===============

The timit corpus reader provides 4 functions and 4 data items.

 - utterances

   List of utterances in the corpus.  There are a total of 160 utterances,
   each of which corresponds to a unique utterance of a speaker.
   Here's an example of an utterance identifier in the list::

       dr1-fvmh0/sx206
         - _----  _---
         | |  |   | |
         | |  |   | |
         | |  |   | `--- sentence number
         | |  |   `----- sentence type (a:all, i:shared, x:exclusive)
         | |  `--------- speaker ID
         | `------------ sex (m:male, f:female)
         `-------------- dialect region (1..8)
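
   The fields can be picked out of an identifier by position; for example
   (the same positions are used by the reader's utterance filters later in
   this module)::

       >>> itemid = 'dr1-fvmh0/sx206'
       >>> itemid[2], itemid[4], itemid[:9], itemid[11], itemid[10:]
       ('1', 'f', 'dr1-fvmh0', 'x', 'sx206')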

 - speakers

   List of speaker IDs.  An example of a speaker ID::

       dr1-fvmh0

   Note that if you split an item ID on the slash ('/') and take the first
   element of the result, you will get a speaker ID.

       >>> itemid = 'dr1-fvmh0/sx206'
       >>> spkrid, sentid = itemid.split('/')
       >>> spkrid
       'dr1-fvmh0'

   The second element of the result is a sentence ID.
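
   For example, continuing the session above::

       >>> sentid
       'sx206'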

 - dictionary()

   Phonetic dictionary of words contained in this corpus.  This is a Python
   dictionary from words to phoneme lists.
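
   For example, looking up a word returns its phoneme list (a sketch; the
   word 'she' occurs in the corpus, and the value is whatever timitdic.txt
   lists for it)::

       phonemes = dictionary()['she']   # list of phoneme strings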

 - spkrinfo()

   Speaker information table.  It's a Python dictionary from speaker IDs to
   records of 10 fields.  Speaker IDs are the same as the ones in timit.speakers.
   Each record is a dictionary from field names to values, and the fields are
   as follows::

     id         speaker ID as defined in the original TIMIT speaker info table
     sex        speaker gender (M:male, F:female)
     dr         speaker dialect region (1:new england, 2:northern,
                3:north midland, 4:south midland, 5:southern, 6:new york city,
                7:western, 8:army brat (moved around))
     use        corpus type (TRN:training, TST:test)
                in this sample corpus only TRN is available
     recdate    recording date
     birthdate  speaker birth date
     ht         speaker height
     race       speaker race (WHT:white, BLK:black, AMR:american indian,
                SPN:spanish-american, ORN:oriental, ???:unknown)
     edu        speaker education level (HS:high school, AS:associate degree,
                BS:bachelor's degree (BS or BA), MS:master's degree (MS or MA),
                PHD:doctorate degree (PhD,JD,MD), ??:unknown)
     comments   comments by the recorder
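
   For example, a speaker's gender and dialect region could be read as
   follows (a sketch that follows the description above; the class-based
   reader defined later in this module exposes the same fields as attributes
   of a SpeakerInfo record)::

       rec = spkrinfo()['dr1-fvmh0']
       rec['sex'], rec['dr']    # 'F' and '1' for this speaker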

The 4 functions are as follows.

 - tokenized(sentences=items, offset=False)

   Given a list of items, returns an iterator of a list of word lists,
   each of which corresponds to an item (sentence).  If offset is set to True,
   each element of the word list is a tuple of word (string), start offset and
   end offset, where offset is represented as a number of 16kHz samples.

 - phonetic(sentences=items, offset=False)

   Given a list of items, returns an iterator of a list of phoneme lists,
   each of which corresponds to an item (sentence).  If offset is set to True,
   each element of the phoneme list is a tuple of phoneme (string), start offset
   and end offset, where offset is represented as a number of 16kHz samples.
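
   For example, word-level offsets could be converted to durations in seconds
   (a sketch; items is a list of utterance identifiers as above)::

       for words in tokenized(sentences=items, offset=True):
           for word, start, end in words:
               duration = (end - start) / 16000.0   # offsets are 16kHz samples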

 - audiodata(item, start=0, end=None)

   Given an item, returns a chunk of audio samples formatted into a string.
   When the function is called, if start and end are omitted, the entire
   samples of the recording will be returned.  If only end is omitted,
   samples from the start offset to the end of the recording will be returned.

 - play(data)

   Play the given audio samples.  The audio samples can be obtained from the
   timit.audiodata function.
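
   For example, the first second of an utterance could be played like this
   (a sketch of the interface described above; 16000 samples at 16kHz is one
   second of audio)::

       item = utterances[0]
       data = audiodata(item, start=0, end=16000)
       play(data)
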
119 """

from nltk.corpus.reader.util import *
from nltk.corpus.reader.api import *
from nltk.tree import Tree
import sys, os, re, tempfile, time
from nltk.internals import deprecated, import_from_stdlib
126
128 """
129 Reader for the TIMIT corpus (or any other corpus with the same
130 file layout and use of file formats). The corpus root directory
131 should contain the following files:
132
133 - timitdic.txt: dictionary of standard transcriptions
134 - spkrinfo.txt: table of speaker information
135
136 In addition, the root directory should contain one subdirectory
137 for each speaker, containing three files for each utterance:
138
139 - <utterance-id>.txt: text content of utterances
140 - <utterance-id>.wrd: tokenized text content of utterances
141 - <utterance-id>.phn: phonetic transcription of utterances
142 - <utterance-id>.wav: utterance sound file
143 """

    _FILE_RE = (r'(\w+-\w+/\w+\.(phn|txt|wav|wrd))|' +
                r'timitdic\.txt|spkrinfo\.txt')
    """A regexp matching files that are used by this corpus reader."""
    _UTTERANCE_RE = r'\w+-\w+/\w+\.txt'

    def __init__(self, root, encoding=None):

    def files(self, filetype=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus.

        @param filetype: If specified, then C{filetype} indicates that
            only the files that have the given type should be
            returned.  Accepted values are: C{txt}, C{wrd}, C{phn},
            C{wav}, or C{metadata}.
        """
        if filetype is None:
            return CorpusReader.files(self)
        elif filetype in ('txt', 'wrd', 'phn', 'wav'):
            return ['%s.%s' % (u, filetype) for u in self._utterances]
        elif filetype == 'metadata':
            return ['timitdic.txt', 'spkrinfo.txt']
        else:
            raise ValueError('Bad value for filetype: %r' % filetype)
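
    # Example (sketch): per-utterance files are named after the utterance id
    # plus the filetype extension, so a call such as the following returns
    # identifiers like 'dr1-fvmh0/sa1.phn' (exact contents depend on the
    # installed corpus):
    #
    #     reader.files('phn')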

    def utterances(self, dialect=None, sex=None, spkrid=None,
                   sent_type=None, sentid=None):
        """
        @return: A list of the utterance identifiers for all
            utterances in this corpus, or for the given speaker, dialect
            region, gender, sentence type, or sentence number, if
            specified.
        """
        if isinstance(dialect, basestring): dialect = [dialect]
        if isinstance(sex, basestring): sex = [sex]
        if isinstance(spkrid, basestring): spkrid = [spkrid]
        if isinstance(sent_type, basestring): sent_type = [sent_type]
        if isinstance(sentid, basestring): sentid = [sentid]

        utterances = list(self._utterances)
        if dialect is not None:
            utterances = [u for u in utterances if u[2] in dialect]
        if sex is not None:
            utterances = [u for u in utterances if u[4] in sex]
        if spkrid is not None:
            utterances = [u for u in utterances if u[:9] in spkrid]
        if sent_type is not None:
            utterances = [u for u in utterances if u[11] in sent_type]
        if sentid is not None:
            utterances = [u for u in utterances if u[10:] in sentid]
        return tuple(utterances)
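
    # Example (sketch): the filters above test fixed positions of the
    # utterance identifier ('dr1-fvmh0/sx206'), so all 'sx' sentences spoken
    # by female speakers from dialect region 1 would be selected with:
    #
    #     reader.utterances(dialect='1', sex='f', sent_type='x')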

    def transcription_dict(self):
        """
        @return: A dictionary giving the 'standard' transcription for
            each word.
        """
        _transcriptions = {}
        for line in self.open('timitdic.txt'):
            if not line.strip() or line[0] == ';': continue
            m = re.match(r'\s*(\S+)\s+/(.*)/\s*$', line)
            if not m: raise ValueError('Bad line: %r' % line)
            _transcriptions[m.group(1)] = m.group(2).split()
        return _transcriptions
242 """
243 @return: A list of all utterances associated with a given
244 speaker.
245 """
246 return [utterance for utterance in self._utterances
247 if utterance.startswith(speaker+'/')]

    def spkrinfo(self, speaker):
        """
        @return: A SpeakerInfo record for the given speaker.
        """
        if speaker in self._utterances:
            speaker = self.spkrid(speaker)

        if self._speakerinfo is None:
            self._speakerinfo = {}
            for line in self.open('spkrinfo.txt'):
                if not line.strip() or line[0] == ';': continue
                rec = line.strip().split(None, 9)
                key = "dr%s-%s%s" % (rec[2], rec[1].lower(), rec[0].lower())
                self._speakerinfo[key] = SpeakerInfo(*rec)

        return self._speakerinfo[speaker]

    def phones(self, utterances=None):

    def words(self, utterances=None):

    def sents(self, utterances=None):

    def wav(self, utterance, start=0, end=None):
        # nltk.chunk conflicts with the stdlib module 'chunk' (which the
        # wave module needs), so wave is loaded explicitly from the stdlib.
        wave = import_from_stdlib('wave')

        w = wave.open(self.open(utterance+'.wav'), 'rb')

        # If no end offset is given, read to the end of the recording.
        if end is None:
            end = w.getnframes()

        # Skip past the frames before `start`, then read the frames from
        # `start` up to `end`.
        w.readframes(start)
        frames = w.readframes(end-start)

        # The wave module requires an actual file, so write the selected
        # frames out to a temporary wav file.
        tf = tempfile.TemporaryFile()
        out = wave.open(tf, 'w')
        out.setparams(w.getparams())
        out.writeframes(frames)
        out.close()

        # Read the data back in and return it; the temporary file is
        # deleted automatically when it is closed.
        tf.seek(0)
        return tf.read()
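
    # Example (sketch): extract the first second of a recording as
    # wav-formatted data (offsets are 16kHz sample counts):
    #
    #     data = reader.wav('dr1-fvmh0/sa1', start=0, end=16000)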

    def audiodata(self, utterance, start=0, end=None):

    def play(self, utterance, start=0, end=None):
        """
        Play the given audio sample.

        @param utterance: The utterance id of the sample to play
        """
        # Method 1: play the raw sample data through ossaudiodev, if the
        # module is available.
        try:
            import ossaudiodev
            try:
                dsp = ossaudiodev.open('w')
                dsp.setfmt(ossaudiodev.AFMT_S16_LE)
                dsp.channels(1)
                dsp.speed(16000)
                dsp.write(self.audiodata(utterance, start, end))
                dsp.close()
            except IOError, e:
                print >>sys.stderr, ("can't acquire the audio device; please "
                                     "activate your audio device.")
                print >>sys.stderr, "system error message:", str(e)
            return
        except ImportError:
            pass

        # Method 2: fall back on pygame, feeding it the wav-formatted data.
        try:
            import pygame.mixer, StringIO
            pygame.mixer.init(16000)
            f = StringIO.StringIO(self.wav(utterance, start, end))
            pygame.mixer.Sound(f).play()
            while pygame.mixer.get_busy():
                time.sleep(0.01)
            return
        except ImportError:
            pass

        # Neither audio module is available.
        print >>sys.stderr, ("you must install pygame or ossaudiodev "
                             "for audio playback.")

    @deprecated("Use utterances(spkrid=...) instead.")

    @deprecated("Use .sents() or .sent_times() instead.")
    def tokenized(self, utterances=None, offset=True):

    @deprecated("Use .phones() or .phone_times() instead.")
    def phonetic(self, utterances=None, offset=True):


class SpeakerInfo:
    def __init__(self, id, sex, dr, use, recdate, birthdate,
                 ht, race, edu, comments=None):
        self.id = id
        self.sex = sex
        self.dr = dr
        self.use = use
        self.recdate = recdate
        self.birthdate = birthdate
        self.ht = ht
        self.race = race
        self.edu = edu
        self.comments = comments

    def __repr__(self):
        attribs = 'id sex dr use recdate birthdate ht race edu comments'
        args = ['%s=%r' % (attr, getattr(self, attr))
                for attr in attribs.split()]
        return 'SpeakerInfo(%s)' % (', '.join(args))