
Source Code for Module nltk.inference.discourse

# Natural Language Toolkit: Discourse Processing
#
# Author: Ewan Klein <[email protected]>
#
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
# $Id: discourse.py 6471 2008-08-22 03:35:08Z DHGarrette $

import os

from nltk.sem import root_semrep, Expression
from nltk import parse
from nltk.data import show_cfg

from nltk.inference import MaceCommand, spacer, get_prover

"""
Module for incrementally developing simple discourses, and checking for semantic ambiguity,
consistency and informativeness.

Many of the ideas are based on the CURT family of programs of Blackburn and Bos
(see U{http://homepages.inf.ed.ac.uk/jbos/comsem/book1.html}).

Consistency checking is carried out by using the L{mace} module to call the Mace4 model builder.
Informativeness checking is carried out with a call to C{get_prover()} from
the L{inference} module.

C{DiscourseTester} is a constructor for discourses.
The basic data structure is a list of sentences, stored as C{self._sentences}. Each sentence in the list
is assigned a I{sentence ID} (C{sid}) of the form C{s}I{i}. For example::

    s0: A boxer walks
    s1: Every boxer chases a girl

Each sentence can be ambiguous between a number of readings, each of which receives a
I{reading ID} (C{rid}) of the form C{s}I{i}-C{r}I{j}. For example::

    s0 readings:
    ------------------------------
    s0-r1: some x.((boxer x) and (walk x))
    s0-r0: some x.((boxerdog x) and (walk x))

A I{thread} is a list of readings, represented
as a list of C{rid}s. Each thread receives a I{thread ID} (C{tid}) of the form C{d}I{i}.
For example::

    d0: ['s0-r0', 's1-r0']

The set of all threads for a discourse is the Cartesian product of the readings of each sentence in the
sequence. (This is not intended to scale beyond very short discourses!) The method L{readings(filter=True)}
will only show those threads which are consistent (taking into account any background assumptions).
"""
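# Illustrative usage sketch (the calls mirror discourse_demo() at the end of
# this file; the output indicated in the comments is schematic, not verbatim):
#
#     dt = DiscourseTester(['A boxer walks', 'Every boxer chases a girl'])
#     dt.sentences()              # s0: A boxer walks / s1: Every boxer chases a girl
#     dt.readings()               # reading IDs s0-r0, s0-r1, ... for each sentence
#     dt.readings(threaded=True)  # thread IDs, e.g. d0: ['s0-r0', 's1-r0']
#     dt.models()                 # Mace4 model (or inconsistency report) per thread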


class DiscourseTester(object):
    """
    Check properties of an ongoing discourse.
    """
    def __init__(self, input, gramfile=None, background=None):
        """
        Initialize a C{DiscourseTester}.

        @parameter input: the discourse sentences
        @type input: C{list} of C{str}
        @parameter gramfile: name of file where grammar can be loaded
        @type gramfile: C{str}
        @parameter background: Formulas which express background assumptions
        @type background: C{list} of L{logic.Expression}.
        """
        self._input = input
        self._sentences = dict([('s%s' % i, sent) for i, sent in enumerate(input)])
        self._models = None
        self._readings = {}
        if gramfile is None:
            self._gramfile = 'grammars/sem4.fcfg'
        else:
            self._gramfile = gramfile
        self._threads = {}
        self._filtered_threads = {}
        self._parser = parse.load_earley(self._gramfile)
        if background is not None:
            for e in background:
                assert isinstance(e, Expression)
            self._background = background
        else:
            self._background = []

    ###############################
    # Sentences
    ###############################

    def sentences(self):
        """
        Display the list of sentences in the current discourse.
        """
        for id in sorted(self._sentences.keys()):
            print "%s: %s" % (id, self._sentences[id])

    def add_sentence(self, sentence, informchk=False, consistchk=False):
        """
        Add a sentence to the current discourse.

        Updates C{self._input} and C{self._sentences}.
        @parameter sentence: An input sentence
        @type sentence: C{str}
        @parameter informchk: if C{True}, check that the result of adding the sentence is thread-informative. Updates C{self._readings}.
        @parameter consistchk: if C{True}, check that the result of adding the sentence is thread-consistent. Updates C{self._readings}.

        """
        # check whether the new sentence is informative (i.e. not entailed by the previous discourse)
        if informchk:
            self.readings(quiet=True)
            for tid in sorted(self._threads.keys()):
                assumptions = [reading for (rid, reading) in self.expand_threads(tid)]
                assumptions += self._background
                for sent_reading in self._get_readings(sentence):
                    tp = get_prover(goal=sent_reading, assumptions=assumptions, prover_name='Prover9')
                    if tp.prove():
                        print "Sentence '%s' under reading '%s':" % (sentence, str(sent_reading))
                        print "Not informative relative to thread '%s'" % tid

        self._input.append(sentence)
        self._sentences = dict([('s%s' % i, sent) for i, sent in enumerate(self._input)])
        # check whether adding the new sentence to the discourse preserves consistency
        # (i.e. a model can be found for the combined set of assumptions)
        if consistchk:
            self.readings(quiet=True)
            self.models(show=False)

    def retract_sentence(self, sentence, quiet=False):
        """
        Remove a sentence from the current discourse.

        Updates C{self._input}, C{self._sentences} and C{self._readings}.
        @parameter sentence: An input sentence
        @type sentence: C{str}
        @parameter quiet: If C{False}, report on the updated list of sentences.
        """
        try:
            self._input.remove(sentence)
        except ValueError:
            print "Retraction failed. The sentence '%s' is not part of the current discourse:" % sentence
            self.sentences()
            return None
        self._sentences = dict([('s%s' % i, sent) for i, sent in enumerate(self._input)])
        self.readings(quiet=True)
        if not quiet:
            print "Current sentences are "
            self.sentences()

    def grammar(self):
        """
        Print out the grammar in use for parsing input sentences
        """
        show_cfg(self._gramfile)

    ###############################
    # Readings and Threads
    ###############################

    def _get_readings(self, sentence):
        """
        Build a list of semantic readings for a sentence.

        @rtype: C{list} of L{logic.Expression}.
        """
        tokens = sentence.split()
        trees = self._parser.nbest_parse(tokens)
        return [root_semrep(tree) for tree in trees]

    def _construct_readings(self):
        """
        Use C{self._sentences} to construct a value for C{self._readings}.
        """
        # re-initialize self._readings in case we have retracted a sentence
        self._readings = {}
        for sid in self._sentences:
            readings = self._get_readings(self._sentences[sid])
            self._readings[sid] = dict([("%s-r%s" % (sid, rid), reading)
                                        for rid, reading in enumerate(readings)])

    def _construct_threads(self):
        """
        Use C{self._readings} to construct a value for C{self._threads}
        and use the model builder to construct a value for C{self._filtered_threads}
        """
        thread_list = [[]]
        for sid in sorted(self._readings.keys()):
            thread_list = self.multiply(thread_list, sorted(self._readings[sid].keys()))
        self._threads = dict([("d%s" % tid, thread) for tid, thread in enumerate(thread_list)])
        # re-initialize the filtered threads
        self._filtered_threads = {}
        # keep the same ids, but only include threads which get models
        for (tid, thread) in self._threads.items():
            if (tid, True) in self._check_consistency(self._threads):
                self._filtered_threads[tid] = thread
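    # For example (cf. the module docstring above): if s0 has readings s0-r0
    # and s0-r1 while s1 has the single reading s1-r0, the threads constructed
    # here would be
    #     d0: ['s0-r0', 's1-r0']
    #     d1: ['s0-r1', 's1-r0']
    # and only the threads for which Mace4 finds a model end up in
    # self._filtered_threads.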

    def _show_readings(self, sentence=None):
        """
        Print out the readings for the discourse (or a single sentence).
        """
        if sentence is not None:
            print "The sentence '%s' has these readings:" % sentence
            for r in [str(reading) for reading in (self._get_readings(sentence))]:
                print " %s" % r
        else:
            for sid in sorted(self._readings.keys()):
                print
                print '%s readings:' % sid
                print '-' * 30
                for rid in sorted(self._readings[sid]):
                    lf = self._readings[sid][rid]
                    #TODO lf = lf.normalize('[xyz]\d*', 'z%d')
                    print "%s: %s" % (rid, lf)

    def _show_threads(self, filter=False):
        """
        Print out the value of C{self._threads} or C{self._filtered_threads}
        """
        if filter:
            threads = self._filtered_threads
        else:
            threads = self._threads
        for tid in sorted(threads.keys()):
            print "%s:" % tid, threads[tid]

    def readings(self, sentence=None, threaded=False, quiet=False, filter=False):
        """
        Construct and show the readings of the discourse (or of a single sentence).

        @parameter sentence: test just this sentence
        @type sentence: C{str}
        @parameter threaded: if C{True}, print out each thread ID and the corresponding thread.
        @parameter filter: if C{True}, only print out consistent thread IDs and threads.
        """
        self._construct_readings()
        self._construct_threads()

        # if we are filtering, just show threads
        if filter: threaded = True
        if not quiet:
            if not threaded:
                self._show_readings(sentence=sentence)
            else:
                self._show_threads(filter=filter)

    def expand_threads(self, thread_id, threads=None):
        """
        Given a thread ID, find the list of L{logic.Expression}s corresponding to the reading IDs in that thread.

        @parameter thread_id: thread ID
        @type thread_id: C{str}
        @parameter threads: a mapping from thread IDs to lists of reading IDs
        @type threads: C{dict}
        @return: A list of pairs (C{rid}, I{reading}) where I{reading} is the L{logic.Expression} associated with a reading ID
        @rtype: C{list} of C{tuple}
        """
        if threads is None:
            threads = self._threads
        return [(rid, self._readings[sid][rid]) for rid in threads[thread_id] for sid in rid.split('-')[:1]]
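    # For example (schematic): if thread d0 is ['s0-r0', 's1-r0'], then
    # expand_threads('d0') returns a list of pairs of the form
    #     [('s0-r0', <Expression for s0-r0>), ('s1-r0', <Expression for s1-r0>)]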


    ###############################
    # Models and Background
    ###############################

    def _check_consistency(self, threads, show=False, quiet=True):
        results = []
        for tid in sorted(threads.keys()):
            assumptions = [reading for (rid, reading) in self.expand_threads(tid, threads=threads)]
            assumptions += self._background
            # if Mace4 finds a model, it always seems to find it quickly
            mb = MaceCommand(None, assumptions, timeout=2)
            modelfound = mb.build_model()
            results.append((tid, modelfound))
            if show:
                spacer(80)
                print "Model for Discourse Thread %s" % tid
                spacer(80)
                if not quiet:
                    for a in assumptions:
                        print a
                    spacer(80)
                if modelfound:
                    mb.show_model(format='cooked')
                else:
                    print "No model found!\n"
        return results

    def models(self, thread_id=None, show=True, quiet=True):
        """
        Call Mace4 to build a model for each current discourse thread.

        @parameter thread_id: thread ID
        @type thread_id: C{str}
        @parameter show: If C{True}, display the model that has been found.
        """
        self._construct_readings()
        self._construct_threads()
        if thread_id is None:
            threads = self._threads
        else:
            threads = {thread_id: self._threads[thread_id]}

        for (tid, modelfound) in self._check_consistency(threads, show=show, quiet=quiet):
            idlist = [rid for rid in threads[tid]]

            if not modelfound:
                print "Inconsistent discourse %s %s:" % (tid, idlist)
                for rid, reading in [(rid, str(reading)) for (rid, reading) in self.expand_threads(tid)]:
                    print " %s: %s" % (rid, reading)
                print
            else:
                print "Consistent discourse: %s %s:" % (tid, idlist)
                for rid, reading in [(rid, str(reading)) for (rid, reading) in self.expand_threads(tid)]:
                    print " %s: %s" % (rid, reading)
                print

    def add_background(self, background, quiet=True):
        """
        Add a list of background assumptions for reasoning about the discourse.

        When called, this method also updates the discourse model's set of readings and threads.
        @parameter background: Formulas which contain background information
        @type background: C{list} of L{logic.Expression}.
        """
        for (count, e) in enumerate(background):
            assert isinstance(e, Expression)
            if not quiet:
                print "Adding assumption %s to background" % count
            self._background.append(e)

        # update the state
        self._construct_readings()
        self._construct_threads()
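    # Illustrative usage, mirroring discourse_demo() below:
    #     world = nltk.data.load('/grammars/world.fol')
    #     dt.add_background(world, quiet=True)
    #     dt.background()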

    def background(self):
        """
        Show the current background assumptions.
        """
        for e in self._background:
            print str(e)

    ###############################
    # Misc
    ###############################

    @staticmethod
    def multiply(discourse, readings):
        """
        Multiply every thread in C{discourse} by every reading in C{readings}.

        Given discourse = [['A'], ['B']] and readings = ['a', 'b', 'c'], this returns
        [['A', 'a'], ['A', 'b'], ['A', 'c'], ['B', 'a'], ['B', 'b'], ['B', 'c']]

        @parameter discourse: the current list of readings
        @type discourse: C{list} of C{list}s
        @parameter readings: an additional list of readings
        @type readings: C{list} of C{logic.Expression}s
        @rtype: A C{list} of C{list}s
        """
        result = []
        for sublist in discourse:
            for r in readings:
                new = []
                new += sublist
                new.append(r)
                result.append(new)
        return result

#multiply = DiscourseTester.multiply
#L1 = [['A'], ['B']]
#L2 = ['a', 'b', 'c']
#print multiply(L1,L2)

def parse_fol(s):
    """
    Temporarily duplicated from L{nltk.sem.util}.
    Convert a file of First Order Formulas into a list of C{Expression}s.

    @parameter s: the contents of the file
    @type s: C{str}
    @return: a list of parsed formulas.
    @rtype: C{list} of L{Expression}
    """
    from nltk.sem import LogicParser
    statements = []
    lp = LogicParser()
    for linenum, line in enumerate(s.splitlines()):
        line = line.strip()
        if line.startswith('#') or line == '': continue
        try:
            statements.append(lp.parse(line))
        except Exception:
            raise ValueError, 'Unable to parse line %s: %s' % (linenum, line)
    return statements
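# Illustrative usage sketch; 'background.fol' stands for a hypothetical file of
# first-order formulas, one per line, with '#' comments and blank lines skipped
# by the loop above:
#     text = open('background.fol').read()
#     formulas = parse_fol(text)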

###############################
# Demo
###############################

def discourse_demo():
    """
    Illustrate the various methods of C{DiscourseTester}
    """
    dt = DiscourseTester(['A boxer walks', 'Every boxer chases a girl'])
    dt.models()
    print
    #dt.grammar()
    print
    dt.sentences()
    print
    dt.readings()
    print
    dt.readings(threaded=True)
    print
    dt.models('d1')
    dt.add_sentence('John is a boxer')
    print
    dt.sentences()
    print
    dt.readings(threaded=True)
    print
    dt = DiscourseTester(['A student dances', 'Every student is a person'])
    print
    dt.add_sentence('No person dances', consistchk=True)
    print
    dt.readings()
    print
    dt.retract_sentence('No person dances', quiet=False)
    print
    dt.models()
    print
    dt.readings('A person dances')
    print
    dt.add_sentence('A person dances', informchk=True)
    dt = DiscourseTester(['Vincent is a boxer', 'Fido is a boxer', 'Vincent is married', 'Fido barks'])
    dt.readings(filter=True)
    import nltk.data
    world = nltk.data.load('/grammars/world.fol')
    print
    dt.add_background(world, quiet=True)
    dt.background()
    print
    dt.readings(filter=True)
    print
    dt.models()


if __name__ == '__main__':
    discourse_demo()