nltk.corpus.reader.rte

1 # Natural Language Toolkit: RTE Corpus Reader 2 # 3 # Copyright (C) 2001-2008 NLTK Project 4 # Author: Ewan Klein <[email protected]> 5 # URL: <http://nltk.org> 6 # For license information, see LICENSE.TXT 7 8 """ 9 Corpus reader for the Recognizing Textual Entailment (RTE) Challenge Corpora. 10 11 The files were taken from the RTE1, RTE2 and RTE3 datasets and the filenames 12 were regularized. 13 14 Filenames are of the form rte*_dev.xml and rte*_test.xml. The latter are the gold standard annotated files. 15 16 Each entailment corpus is a list of 'text'/'hypothesis' pairs. The following example is taken from RTE3:: 17 18 <pair id="1" entailment="YES" task="IE" length="short" > 19 <t>The sale was made to pay Yukos' US$ 27.5 billion tax bill, Yuganskneftegaz was originally sold for US$ 9.4 billion to a little known company Baikalfinansgroup which was later bought by the Russian state-owned oil company Rosneft .</t> 20 <h>Baikalfinansgroup was sold to Rosneft.</h> 21 </pair> 22 23 In order to provide globally unique IDs for each pair, a new attribute C{challenge} has been added to the root element C{entailment-corpus} of each file, taking values 1, 2 or 3. The GID is formatted 'm-n', where 'm' is the challenge number and 'n' is the pair ID. 24 """ 25 26 from util import * 27 from api import * 28 from xmldocs import XMLCorpusReader 29

def norm(value_string):
    """
    Convert the label found in an RTE pair's C{value} or C{entailment}
    attribute into an integer flag.

    Matching is case-insensitive: C{"TRUE"} and C{"YES"} give 1, while
    C{"FALSE"} and C{"NO"} give 0.

    @param value_string: the label used to classify a text/hypothesis pair
    @type value_string: C{str}
    @rtype: C{int}
    @raise KeyError: if the upper-cased label is none of the four
        recognized values
    """
    label = value_string.upper()
    if label in ("TRUE", "YES"):
        return 1
    if label in ("FALSE", "NO"):
        return 0
    # Unknown labels fail exactly like a missed dictionary lookup would.
    raise KeyError(label)

45

class RTEPair:
    """
    A single text/hypothesis pair taken from an RTE corpus.

    RTE1 marks the entailment judgement with a C{value} attribute, whereas
    RTE2 and RTE3 use C{entailment}.  Whichever one is present on the pair
    element is normalized with C{norm} and stored in the C{value} attribute
    of this class.
    """

    def __init__(self, pair, challenge=None, id=None, text=None, hyp=None,
                 value=None, task=None, length=None):
        """
        @param pair: the XML element for the pair; it must carry an C{id}
            attribute, and its first and second children supply the text
            and the hypothesis respectively
        @param challenge: version of the RTE challenge (i.e., RTE1, RTE2 or RTE3)
        @param id: identifier for the pair (accepted but not consulted; the
            C{id} attribute of C{pair} is always used)
        @param text: the text component of the pair (accepted but not
            consulted; taken from the first child of C{pair})
        @param hyp: the hypothesis component of the pair (accepted but not
            consulted; taken from the second child of C{pair})
        @param value: classification label used only when C{pair} has neither
            a C{value} nor an C{entailment} attribute
        @param task: NLP task the data was drawn from, used only when C{pair}
            has no C{task} attribute
        @param length: length of the text of the pair, used only when C{pair}
            has no C{length} attribute
        """
        attrs = pair.attrib

        self.challenge = challenge
        self.id = attrs["id"]
        # Globally unique ID of the form 'm-n' (challenge number, pair ID).
        self.gid = "%s-%s" % (self.challenge, self.id)
        self.text = pair[0].text
        self.hyp = pair[1].text

        # The RTE1 attribute name is tried first, then the RTE2/RTE3 one;
        # the keyword argument is only a last resort.
        for label_attr in ("value", "entailment"):
            if label_attr in attrs:
                self.value = norm(attrs[label_attr])
                break
        else:
            self.value = value

        # Attributes on the element take precedence over keyword arguments.
        self.task = attrs["task"] if "task" in attrs else task
        self.length = attrs["length"] if "length" in attrs else length

    def __repr__(self):
        # Without a challenge number only the bare pair ID is meaningful.
        if not self.challenge:
            return '<RTEPair: id=%s>' % self.id
        return '<RTEPair: gid=%s-%s>' % (self.challenge, self.id)

91 92 93 # [xx] This could use more documentation!

class RTECorpusReader(XMLCorpusReader):
    """
    Corpus reader for corpora in RTE challenges.

    Each C{pair} element found in a corpus document is wrapped in an
    C{RTEPair}.
    """

    def _read_etree(self, doc):
        """
        Build a list of RTE text/hypothesis pairs from the XML input.

        @param doc: a parsed XML document
        @return: a list of C{RTEPair}s
        """
        # The 'challenge' attribute is optional; pairs get challenge=None
        # when the element does not carry it.
        challenge = doc.attrib.get('challenge')

        # Element.getiterator() was deprecated in ElementTree 1.3 and removed
        # in Python 3.9.  Prefer iter(), but fall back to getiterator() so
        # that older ElementTree versions without iter() keep working.
        try:
            iter_elements = doc.iter
        except AttributeError:
            iter_elements = doc.getiterator

        return [RTEPair(pair, challenge=challenge)
                for pair in iter_elements("pair")]

    def pairs(self, files=None):
        """
        Build a list of RTE pairs from a RTE corpus.

        @param files: passed through unchanged to C{self.xml()}
        @return: a list of C{RTEPair}s; when the root element returned by
            C{self.xml()} is tagged C{documents}, the pairs of each of its
            children are concatenated
        """
        doc = self.xml(files)
        if doc.tag == 'documents':
            # Iterate over the element directly: Element.getchildren() is
            # deprecated and was removed in Python 3.9.
            return concat([self._read_etree(corpus) for corpus in doc])
        return self._read_etree(doc)

123

Source Code for Module nltk.corpus.reader.rte