
Source Code for Module nltk.corpus.reader.timit

  1  # Natural Language Toolkit: TIMIT Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2007 NLTK Project 
  4  # Author: Haejoong Lee <[email protected]> 
  5  #         Steven Bird <[email protected]> 
  6  # URL: <http://nltk.org> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  # [xx] this docstring is out-of-date: 
 10  """ 
 11  Read tokens, phonemes and audio data from the NLTK TIMIT Corpus. 
 12   
 13  This corpus contains a selected portion of the TIMIT corpus. 
 14   
 15   - 16 speakers from 8 dialect regions 
 16   - 1 male and 1 female from each dialect region 
 17   - total 130 sentences (10 sentences per speaker; note that some 
 18     sentences are shared between speakers -- in particular, sa1 and sa2 
 19     are spoken by all speakers) 
 20   - total 160 recordings of sentences (10 recordings per speaker) 
 21   - audio format: NIST Sphere, single channel, 16kHz sampling, 
 22     16-bit samples, PCM encoding 
 23   
 24   
 25  Module contents 
 26  =============== 
 27   
 28  The timit corpus reader provides 4 functions and 4 data items. 
 29   
 30   - utterances 
 31    
 32     List of utterances in the corpus.  There are 160 utterances in total, 
 33     each of which corresponds to a unique utterance of a speaker. 
 34     Here's an example of an utterance identifier in the list:: 
 35    
 36         dr1-fvmh0/sx206 
 37           - _----  _--- 
 38           | |  |   | | 
 39           | |  |   | | 
 40           | |  |   | `--- sentence number 
 41           | |  |   `----- sentence type (a:all, i:shared, x:exclusive) 
 42           | |  `--------- speaker ID 
 43           | `------------ sex (m:male, f:female) 
 44           `-------------- dialect region (1..8) 
 45    
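The fields in the diagram can be recovered with ordinary string operations; a minimal sketch (the helper name `parse_utterance_id` is illustrative, not part of this module):

```python
def parse_utterance_id(utterance_id):
    # Split 'dr1-fvmh0/sx206' into its speaker and sentence parts.
    spkrid, sentid = utterance_id.split('/')
    dialect, rest = spkrid.split('-')
    return {
        'dialect_region': dialect[2],   # '1'..'8'
        'sex': rest[0],                 # 'm' or 'f'
        'speaker': rest[1:],            # e.g. 'vmh0'
        'sent_type': sentid[1],         # 'a', 'i', or 'x'
        'sent_number': sentid[2:],      # e.g. '206'
    }

fields = parse_utterance_id('dr1-fvmh0/sx206')
```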
 46   - speakers 
 47    
 48     List of speaker IDs.  An example of speaker ID:: 
 49    
 50         dr1-fvmh0 
 51    
 52     Note that if you split an item ID on '/' and take the first element of 
 53     the result, you will get a speaker ID. 
 54    
 55         >>> itemid = 'dr1-fvmh0/sx206' 
 56         >>> spkrid, sentid = itemid.split('/') 
 57         >>> spkrid 
 58         'dr1-fvmh0' 
 59          
 60     The second element of the result is the sentence ID. 
 61      
 62   - dictionary() 
 63    
 64     Phonetic dictionary of words contained in this corpus.  This is a Python 
 65     dictionary from words to phoneme lists. 
 66      
 67   - spkrinfo() 
 68    
 69     Speaker information table.  It's a Python dictionary from speaker IDs to 
 70     records of 10 fields.  Speaker IDs are the same as the ones in timit.speakers. 
 71     Each record is a dictionary from field names to values, and the fields are 
 72     as follows:: 
 73    
 74       id         speaker ID as defined in the original TIMIT speaker info table 
 75       sex        speaker gender (M:male, F:female) 
 76       dr         speaker dialect region (1:new england, 2:northern, 
 77                  3:north midland, 4:south midland, 5:southern, 6:new york city, 
 78                  7:western, 8:army brat (moved around)) 
 79       use        corpus type (TRN:training, TST:test) 
 80                  in this sample corpus only TRN is available 
 81       recdate    recording date 
 82       birthdate  speaker birth date 
 83       ht         speaker height 
 84       race       speaker race (WHT:white, BLK:black, AMR:american indian, 
 85                  SPN:spanish-american, ORN:oriental, ???:unknown) 
 86       edu        speaker education level (HS:high school, AS:associate degree, 
 87                  BS:bachelor's degree (BS or BA), MS:master's degree (MS or MA), 
 88                  PHD:doctorate degree (PhD,JD,MD), ??:unknown) 
 89       comments   comments by the recorder 
 90      
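One record from this table can be pictured as a plain mapping from the field names above to string values; all values below are illustrative, not drawn from the actual speaker table:

```python
# A hypothetical speaker record, keyed by the 10 field names listed above.
record = {
    'id': 'VMH0', 'sex': 'F', 'dr': '1', 'use': 'TRN',
    'recdate': '03/11/86', 'birthdate': '01/08/60', 'ht': "5'05\"",
    'race': 'WHT', 'edu': 'BS', 'comments': '',
}

# Field access is ordinary dictionary lookup.
is_training = record['use'] == 'TRN'
```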
 91  The 4 functions are as follows. 
 92    
 93   - tokenized(sentences=items, offset=False) 
 94    
 95     Given a list of items, returns an iterator over lists of words, 
 96     one list per item (sentence).  If offset is set to True, 
 97     each element of a word list is a tuple of the word (a string), its start 
 98     offset and its end offset, where offsets are measured in 16kHz samples. 
 99        
100   - phonetic(sentences=items, offset=False) 
101    
102     Given a list of items, returns an iterator over lists of phonemes, 
103     one list per item (sentence).  If offset is set to True, 
104     each element of a phoneme list is a tuple of the phoneme (a string), 
105     its start offset and its end offset, measured in 16kHz samples. 
106    
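Since all offsets are counts of 16kHz samples, converting an offset span to seconds is one division; a small sketch:

```python
SAMPLE_RATE = 16000  # TIMIT audio is sampled at 16kHz

def samples_to_seconds(offset):
    # One second of audio spans 16000 samples.
    return offset / float(SAMPLE_RATE)

# A phone spanning samples 3050..4559 lasts just under a tenth of a second.
duration = samples_to_seconds(4559 - 3050)
```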
107   - audiodata(item, start=0, end=None) 
108    
109     Given an item, returns a chunk of audio samples formatted into a string. 
110     When the function is called, if start and end are omitted, the entire 
111     recording will be returned.  If only end is omitted, samples from the 
112     start offset to the end of the recording will be returned. 
113    
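Each sample is stored as 2 bytes of 16-bit PCM, so a sample range maps onto a byte range just past the file header; a sketch assuming the fixed 44-byte header size that this reader's audiodata implementation also assumes:

```python
HEADER_SIZE = 44       # byte offset where sample data begins (an assumption)
BYTES_PER_SAMPLE = 2   # 16-bit PCM, single channel

def sample_byte_range(start, end):
    # Convert a [start, end) sample range into a byte slice of the file.
    first = HEADER_SIZE + start * BYTES_PER_SAMPLE
    last = HEADER_SIZE + end * BYTES_PER_SAMPLE
    return first, last

rng = sample_byte_range(0, 16000)  # the first second of audio
```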
114   - play(data) 
115    
116     Play the given audio samples. The audio samples can be obtained from the 
117     timit.audiodata function. 
118    
119  """        
120   
121  from nltk.corpus.reader.util import * 
122  from nltk.corpus.reader.api import * 
123  from nltk.tree import Tree 
124  import sys, os, re, tempfile, time 
125  from nltk.internals import deprecated, import_from_stdlib 
126   
127  class TimitCorpusReader(CorpusReader): 
128      """ 
129      Reader for the TIMIT corpus (or any other corpus with the same 
130      file layout and use of file formats).  The corpus root directory 
131      should contain the following files: 
132   
133        - timitdic.txt: dictionary of standard transcriptions 
134        - spkrinfo.txt: table of speaker information 
135   
136      In addition, the root directory should contain one subdirectory 
137      for each speaker, containing four files for each utterance: 
138   
139        - <utterance-id>.txt: text content of utterances 
140        - <utterance-id>.wrd: tokenized text content of utterances 
141        - <utterance-id>.phn: phonetic transcription of utterances 
142        - <utterance-id>.wav: utterance sound file 
143      """ 
144   
145      _FILE_RE = (r'(\w+-\w+/\w+\.(phn|txt|wav|wrd))|' + 
146                  r'timitdic\.txt|spkrinfo\.txt') 
147      """A regexp matching files that are used by this corpus reader.""" 
148      _UTTERANCE_RE = r'\w+-\w+/\w+\.txt' 
149   
150      def __init__(self, root, encoding=None): 
151          """ 
152          Construct a new TIMIT corpus reader in the given directory. 
153          @param root: The root directory for this corpus. 
154          """ 
155          # Ensure that wave files don't get treated as unicode data: 
156          if isinstance(encoding, basestring): 
157              encoding = [('.*\.wav', None), ('.*', encoding)] 
158   
159          CorpusReader.__init__(self, root, 
160                                find_corpus_files(root, self._FILE_RE), 
161                                encoding=encoding) 
162   
163          self._utterances = [name[:-4] for name in 
164                              find_corpus_files(root, self._UTTERANCE_RE)] 
165          """A list of the utterance identifiers for all utterances in 
166          this corpus.""" 
167   
168          self._speakerinfo = None 
169          self._root = root 
170          self.speakers = tuple(sorted(set(u.split('/')[0] 
171                                           for u in self._utterances))) 
172
173      def files(self, filetype=None): 
174          """ 
175          Return a list of file identifiers for the files that make up 
176          this corpus. 
177   
178          @param filetype: If specified, then C{filetype} indicates that 
179              only the files that have the given type should be 
180              returned.  Accepted values are: C{txt}, C{wrd}, C{phn}, 
181              C{wav}, or C{metadata}. 
182          """ 
183          if filetype is None: 
184              return CorpusReader.files(self) 
185          elif filetype in ('txt', 'wrd', 'phn', 'wav'): 
186              return ['%s.%s' % (u, filetype) for u in self._utterances] 
187          elif filetype == 'metadata': 
188              return ['timitdic.txt', 'spkrinfo.txt'] 
189          else: 
190              raise ValueError('Bad value for filetype: %r' % filetype) 
191
192      def utterances(self, dialect=None, sex=None, spkrid=None, 
193                     sent_type=None, sentid=None): 
194          """ 
195          @return: A list of the utterance identifiers for all 
196              utterances in this corpus, or for the given speaker, dialect 
197              region, gender, sentence type, or sentence number, if 
198              specified. 
199          """ 
200          if isinstance(dialect, basestring): dialect = [dialect] 
201          if isinstance(sex, basestring): sex = [sex] 
202          if isinstance(spkrid, basestring): spkrid = [spkrid] 
203          if isinstance(sent_type, basestring): sent_type = [sent_type] 
204          if isinstance(sentid, basestring): sentid = [sentid] 
205   
206          utterances = list(self._utterances) 
207          if dialect is not None: 
208              utterances = [u for u in utterances if u[2] in dialect] 
209          if sex is not None: 
210              utterances = [u for u in utterances if u[4] in sex] 
211          if spkrid is not None: 
212              utterances = [u for u in utterances if u[:9] in spkrid] 
213          if sent_type is not None: 
214              utterances = [u for u in utterances if u[11] in sent_type] 
215          if sentid is not None: 
216              utterances = [u for u in utterances if u[10:] in sentid] 
217          return tuple(utterances) 
218
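The filters in utterances() select on fixed character positions within an utterance ID; checking those slice positions against the example ID from the module docstring:

```python
u = 'dr1-fvmh0/sx206'

# The positions used for filtering; these assume the fixed-width ID layout
# used throughout this corpus (the speaker prefix is always 9 characters).
assert u[2] == '1'            # dialect region
assert u[4] == 'f'            # sex
assert u[:9] == 'dr1-fvmh0'   # speaker ID
assert u[11] == 'x'           # sentence type
assert u[10:] == 'sx206'      # sentence ID
```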
219      def transcription_dict(self): 
220          """ 
221          @return: A dictionary giving the 'standard' transcription for 
222              each word. 
223          """ 
224          _transcriptions = {} 
225          for line in self.open('timitdic.txt'): 
226              if not line.strip() or line[0] == ';': continue 
227              m = re.match(r'\s*(\S+)\s+/(.*)/\s*$', line) 
228              if not m: raise ValueError('Bad line: %r' % line) 
229              _transcriptions[m.group(1)] = m.group(2).split() 
230          return _transcriptions 
231
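Each non-comment line of timitdic.txt is expected to hold a word followed by its transcription between slashes; a minimal sketch of the regular expression used above, run on an illustrative entry (the phonemes shown are an assumption, not copied from the file):

```python
import re

# An illustrative dictionary line in the expected shape.
line = 'barbecue  /b aa1 r b ix k y uw2/'

# The same pattern used by transcription_dict() above.
m = re.match(r'\s*(\S+)\s+/(.*)/\s*$', line)
word, phonemes = m.group(1), m.group(2).split()
```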
232      def spkrid(self, utterance): 
233          return utterance.split('/')[0] 
234   
235      def sentid(self, utterance): 
236          return utterance.split('/')[1] 
237   
238      def utterance(self, spkrid, sentid): 
239          return '%s/%s' % (spkrid, sentid) 
240   
241      def spkrutterances(self, speaker): 
242          """ 
243          @return: A list of all utterances associated with a given 
244              speaker. 
245          """ 
246          return [utterance for utterance in self._utterances 
247                  if utterance.startswith(speaker+'/')] 
248
249      def spkrinfo(self, speaker): 
250          """ 
251          @return: A C{SpeakerInfo} record for the given speaker or utterance ID. 
252          """ 
253          if speaker in self._utterances: 
254              speaker = self.spkrid(speaker) 
255   
256          if self._speakerinfo is None: 
257              self._speakerinfo = {} 
258              for line in self.open('spkrinfo.txt'): 
259                  if not line.strip() or line[0] == ';': continue 
260                  rec = line.strip().split(None, 9) 
261                  key = "dr%s-%s%s" % (rec[2],rec[1].lower(),rec[0].lower()) 
262                  self._speakerinfo[key] = SpeakerInfo(*rec) 
263   
264          return self._speakerinfo[speaker] 
265
266      def phones(self, utterances=None): 
267          return [line.split()[-1] 
268                  for fileid in self._utterance_files(utterances, '.phn') 
269                  for line in self.open(fileid) if line.strip()] 
270   
271      def phone_times(self, utterances=None): 
272          """ 
273          offset is represented as a number of 16kHz samples! 
274          """ 
275          return [(line.split()[2], int(line.split()[0]), int(line.split()[1])) 
276                  for fileid in self._utterance_files(utterances, '.phn') 
277                  for line in self.open(fileid) if line.strip()] 
278   
279      def words(self, utterances=None): 
280          return [line.split()[-1] 
281                  for fileid in self._utterance_files(utterances, '.wrd') 
282                  for line in self.open(fileid) if line.strip()] 
283   
284      def word_times(self, utterances=None): 
285          return [(line.split()[2], int(line.split()[0]), int(line.split()[1])) 
286                  for fileid in self._utterance_files(utterances, '.wrd') 
287                  for line in self.open(fileid) if line.strip()] 
288   
289      def sents(self, utterances=None): 
290          return [[line.split()[-1] 
291                   for line in self.open(fileid) if line.strip()] 
292                  for fileid in self._utterance_files(utterances, '.wrd')] 
293   
294      def sent_times(self, utterances=None): 
295          return [(line.split(None,2)[-1].strip(), 
296                   int(line.split()[0]), int(line.split()[1])) 
297                  for fileid in self._utterance_files(utterances, '.txt') 
298                  for line in self.open(fileid) if line.strip()] 
299
300      def phone_trees(self, utterances=None): 
301          if utterances is None: utterances = self._utterances 
302          if isinstance(utterances, basestring): utterances = [utterances] 
303   
304          trees = [] 
305          for utterance in utterances: 
306              word_times = self.word_times(utterance) 
307              phone_times = self.phone_times(utterance) 
308              sent_times = self.sent_times(utterance) 
309   
310              while sent_times: 
311                  (sent, sent_start, sent_end) = sent_times.pop(0) 
312                  trees.append(Tree('S', [])) 
313                  while (word_times and phone_times and 
314                         phone_times[0][2] <= word_times[0][1]): 
315                      trees[-1].append(phone_times.pop(0)[0]) 
316                  while word_times and word_times[0][2] <= sent_end: 
317                      (word, word_start, word_end) = word_times.pop(0) 
318                      trees[-1].append(Tree(word, [])) 
319                      while phone_times and phone_times[0][2] <= word_end: 
320                          trees[-1][-1].append(phone_times.pop(0)[0]) 
321                  while phone_times and phone_times[0][2] <= sent_end: 
322                      trees[-1].append(phone_times.pop(0)[0]) 
323          return trees 
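phone_trees() nests each phone under the word whose time span contains it, by comparing end offsets; the core grouping step can be sketched with plain lists and made-up times (leading/trailing silence handling from the method above is omitted, and nltk.Tree is replaced by nested lists for brevity):

```python
def group_phones(word_times, phone_times):
    # word_times: [(word, start, end)], phone_times: [(phone, start, end)],
    # both sorted by start offset.  A phone whose end offset falls within a
    # word's span is attached to that word.
    groups = []
    phones = list(phone_times)
    for word, w_start, w_end in word_times:
        group = [word]
        while phones and phones[0][2] <= w_end:
            group.append(phones.pop(0)[0])
        groups.append(group)
    return groups

grouped = group_phones(
    [('she', 0, 30), ('had', 30, 60)],                       # synthetic times
    [('sh', 0, 15), ('iy', 15, 30),
     ('hh', 30, 40), ('ae', 40, 50), ('d', 50, 60)])
```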
324   
325      # [xx] NOTE: This is currently broken -- we're assuming that the 
326      # files are WAV files (aka RIFF), but they're actually NIST SPHERE 
327      # files. 
328      def wav(self, utterance, start=0, end=None): 
329          # nltk.chunk conflicts with the stdlib module 'chunk' 
330          wave = import_from_stdlib('wave') 
331   
332          w = wave.open(self.open(utterance+'.wav'), 'rb') 
333   
334          # If they want the whole thing, return the raw file contents. 
335          if start==0 and end is None: 
336              return self.open(utterance+'.wav').read() 
337   
338          # Select the piece we want using the 'wave' module. 
339          else: 
340              # Skip past frames before start. 
341              w.readframes(start) 
342              # Read the frames we want. 
343              frames = w.readframes(end-start) 
344              # Open a new temporary file -- the wave module requires 
345              # an actual file, and won't work w/ stringio. :( 
346              tf = tempfile.TemporaryFile() 
347              out = wave.open(tf, 'w') 
348              # Write the parameters & data to the new file. 
349              out.setparams(w.getparams()) 
350              out.writeframes(frames) 
351              out.close() 
352              # Read the data back from the file, and return it.  The 
353              # file will automatically be deleted when we return. 
354              tf.seek(0) 
355              return tf.read() 
356
357      def audiodata(self, utterance, start=0, end=None): 
358          assert(end is None or end > start) 
359          headersize = 44 
360          if end is None: 
361              data = self.open(utterance+'.wav').read() 
362          else: 
363              data = self.open(utterance+'.wav').read(headersize+end*2) 
364          return data[headersize+start*2:] 
365   
366      def _utterance_files(self, utterances, extension): 
367          if utterances is None: utterances = self._utterances 
368          if isinstance(utterances, basestring): utterances = [utterances] 
369          return ['%s%s' % (u, extension) for u in utterances] 
370
371      def play(self, utterance, start=0, end=None): 
372          """ 
373          Play the given audio sample. 
374   
375          @param utterance: The utterance id of the sample to play 
376          """ 
377          # Method 1: os audio dev. 
378          try: 
379              import ossaudiodev 
380              try: 
381                  dsp = ossaudiodev.open('w') 
382                  dsp.setfmt(ossaudiodev.AFMT_S16_LE) 
383                  dsp.channels(1) 
384                  dsp.speed(16000) 
385                  dsp.write(self.audiodata(utterance, start, end)) 
386                  dsp.close() 
387              except IOError, e: 
388                  print >>sys.stderr, ("can't acquire the audio device; please " 
389                                       "activate your audio device.") 
390                  print >>sys.stderr, "system error message:", str(e) 
391              return 
392          except ImportError: 
393              pass 
394   
395          # Method 2: pygame 
396          try: 
397              import pygame.mixer, StringIO 
398              pygame.mixer.init(16000) 
399              f = StringIO.StringIO(self.wav(utterance, start, end)) 
400              pygame.mixer.Sound(f).play() 
401              while pygame.mixer.get_busy(): 
402                  time.sleep(0.01) 
403              return 
404          except ImportError: 
405              pass 
406   
407          # Method 3: complain. :) 
408          print >>sys.stderr, ("you must install pygame or ossaudiodev " 
409                               "for audio playback.") 
410   
411      #{ Deprecated since 0.9.1 
412      @deprecated("Use utterances(spkrid=...) instead.") 
413      def spkritems(self, spkrid): 
414          return self.utterances(spkrid=spkrid) 
415      #} 
416   
417      #{ Deprecated since 0.8 
418      @deprecated("Use .sents() or .sent_times() instead.") 
419      def tokenized(self, utterances=None, offset=True): 
420          if offset: return self.sent_times(utterances) 
421          else: return self.sents(utterances) 
422      @deprecated("Use .phones() or .phone_times() instead.") 
423      def phonetic(self, utterances=None, offset=True): 
424          if offset: return self.phone_times(utterances) 
425          else: return self.phones(utterances) 
426      #} 
427   
428  class SpeakerInfo: 
429      def __init__(self, id, sex, dr, use, recdate, birthdate, 
430                   ht, race, edu, comments=None): 
431          self.id = id 
432          self.sex = sex 
433          self.dr = dr 
434          self.use = use 
435          self.recdate = recdate 
436          self.birthdate = birthdate 
437          self.ht = ht 
438          self.race = race 
439          self.edu = edu 
440          self.comments = comments 
441   
442      def __repr__(self): 
443          attribs = 'id sex dr use recdate birthdate ht race edu comments' 
444          args = ['%s=%r' % (attr, getattr(self, attr)) 
445                  for attr in attribs.split()] 
446          return 'SpeakerInfo(%s)' % (', '.join(args)) 
447  