nltk.corpus.reader.toolbox

1 #!/usr/bin/env python 2 3 # Natural Language Toolkit: Toolbox Reader 4 # 5 # Copyright (C) 2001-2008 NLTK Project 6 # Author: Greg Aumann <[email protected]> 7 # Stuart Robinson <[email protected]> 8 # Steven Bird <[email protected]> 9 # URL: <http://nltk.org> 10 # For license information, see LICENSE.TXT 11 12 """ 13 Module for reading, writing and manipulating 14 Toolbox databases and settings files. 15 """ 16 17 import os, re, codecs 18 from nltk.corpus.reader.util import * 19 from nltk.corpus.reader.api import * 20 from StringIO import StringIO 21 from nltk.etree.ElementTree import TreeBuilder, Element 22 from nltk.internals import deprecated 23

class ToolboxCorpusReader(CorpusReader):
    """Corpus reader for Toolbox (standard format marker) data files.

    Every file is handed to L{ToolboxData}, which does the actual parsing;
    the per-file results are combined with C{concat()}.
    """

    def xml(self, files, key=None):
        """Parse the given files with L{ToolboxData.parse()}.

        @param files: the files to read; passed to C{self.abspaths()}
        @param key: name of the marker that starts each record; passed
            on to L{ToolboxData.parse()}
        @return: the parse results of all files, combined with C{concat()}
        """
        return concat([ToolboxData(path, enc).parse(key)
                       for (path, enc) in self.abspaths(files, True)])

    def fields(self, files, strip=True, unwrap=True, encoding=None,
               errors='strict', unicode_fields=None):
        """Read the C{(marker, value)} fields of the given files.

        All keyword parameters are passed unchanged to
        L{StandardFormat.fields()}; see that method for their meaning.

        @param files: the files to read; passed to C{self.abspaths()}
        @return: the fields of all files, combined with C{concat()}
        """
        # The loop unpacks (path, encoding) pairs, so abspaths() has to be
        # asked for the encodings as well -- exactly as xml() does above.
        return concat([list(ToolboxData(path, enc).fields(
                            strip, unwrap, encoding, errors, unicode_fields))
                       for (path, enc) in self.abspaths(files, True)])

    def raw(self, files):
        """Return the unparsed contents of the given files.

        @param files: a single file name, a list of file names, or None
            to read every file in C{self._files}
        @return: the file contents, combined with C{concat()}
        """
        if files is None:
            files = self._files
        elif isinstance(files, basestring):
            files = [files]
        return concat([self.open(f).read() for f in files])

    #{ Deprecated since 0.8
    @deprecated("Use .xml() instead.")
    def dictionary(self, files=None):
        """No longer supported; always raises C{ValueError}."""
        raise ValueError("no longer supported -- use .xml() instead")

    @deprecated("Use .xml() instead.")
    def parse_corpus(self, files=None, key=None):
        """Deprecated alias for L{xml()}."""
        # The original referred to an undefined name ``items`` here.
        return self.xml(files, key)
    #}

class StandardFormat(object):
    """
    Class for reading and processing standard format marker files and strings.
    """

    def __init__(self, filename=None, encoding=None):
        """Remember the encoding and, if a filename is given, open it.

        @param filename: standard format marker file to open, or None to
            open nothing yet (use L{open()} or L{open_string()} later)
        @param encoding: encoding handed to the file opener in L{open()}
        """
        self._encoding = encoding
        if filename is not None:
            self.open(filename)

    def open(self, sfm_file):
        """Open a standard format marker file for sequential reading.

        @param sfm_file: name of the standard format marker input file
        @type sfm_file: string
        """
        if isinstance(sfm_file, PathPointer):
            # [xx] We don't use 'rU' mode here -- do we need to?
            # (PathPointer.open doesn't take a mode option)
            self._file = sfm_file.open(self._encoding)
        else:
            self._file = codecs.open(sfm_file, 'rU', self._encoding)

    def open_string(self, s):
        """Open a standard format marker string for sequential reading.

        @param s: string to parse as a standard format marker input file
        @type s: string
        """
        self._file = StringIO(s)

    def raw_fields(self):
        """Return an iterator for the fields in the standard format marker
        file.

        While iterating, C{self.line_num} is updated with the number of
        lines processed so far.  Empty input yields no fields at all and
        leaves C{self.line_num} untouched.

        @return: an iterator that returns the next field in a (marker, value)
            tuple. Linebreaks and trailing white space are preserved except
            for the final newline in each field.
        @rtype: iterator over C{(marker, value)} tuples
        """
        join_string = '\n'
        line_regexp = r'^%s(?:\\(\S+)\s*)?(.*)$'
        # Only the very first line may start with a byte order mark.
        first_line_pat = re.compile(line_regexp % u'\ufeff?')
        line_pat = re.compile(line_regexp % '')

        pattern = first_line_pat
        started = False
        mkr = None
        value_lines = []
        for line in self._file:
            line_mkr, line_value = pattern.match(line).groups()
            if not started:
                # The first line always opens the first field, even when it
                # carries no marker, so that a multi-line first field is
                # collected correctly.
                started = True
                pattern = line_pat
                mkr = line_mkr
                value_lines = [line_value]
                self.line_num = 0
                continue
            self.line_num += 1
            if line_mkr:
                # A new marker closes the field collected so far.
                yield (mkr, join_string.join(value_lines))
                mkr = line_mkr
                value_lines = [line_value]
            else:
                # Continuation line of the current field.
                value_lines.append(line_value)
        if not started:
            # Empty input: nothing to yield.
            return
        self.line_num += 1
        yield (mkr, join_string.join(value_lines))

    def fields(self, strip=True, unwrap=True, encoding=None, errors='strict',
               unicode_fields=None):
        """Return an iterator for the fields in the standard format marker file.

        @param strip: strip trailing whitespace from the last line of each field
        @type strip: boolean
        @param unwrap: Convert newlines in a field to spaces.
        @type unwrap: boolean
        @param encoding: Name of an encoding to use. If it is specified then
            the C{fields} method returns unicode strings rather than non
            unicode strings.
        @type encoding: string or None
        @param errors: Error handling scheme for codec. Same as the C{decode}
            inbuilt string method.
        @type errors: string
        @param unicode_fields: Set of marker names whose values are UTF-8 encoded.
            Must be None if encoding is None. If the whole file is UTF-8
            encoded set C{encoding='utf8'} and leave C{unicode_fields} with
            its default value of None.
        @type unicode_fields: set or dictionary (actually any sequence that
            supports the 'in' operator).
        @return: an iterator that returns the next field in a C{(marker, value)}
            tuple. C{marker} and C{value} are unicode strings if an C{encoding}
            was specified in the C{fields} method. Otherwise they are
            nonunicode strings.
        @rtype: iterator over C{(marker, value)} tuples
        @raise ValueError: if C{unicode_fields} is given without C{encoding}
            (raised when iteration starts)
        """
        if encoding is None and unicode_fields is not None:
            raise ValueError('unicode_fields is set but not encoding.')
        unwrap_pat = re.compile(r'\n+')
        for mkr, val in self.raw_fields():
            if encoding:
                # Values of the listed markers are always UTF-8; everything
                # else, including the marker itself, uses ``encoding``.
                if unicode_fields is not None and mkr in unicode_fields:
                    val = val.decode('utf8', errors)
                else:
                    val = val.decode(encoding, errors)
                mkr = mkr.decode(encoding, errors)
            if unwrap:
                val = unwrap_pat.sub(' ', val)
            if strip:
                val = val.rstrip()
            yield (mkr, val)

    def close(self):
        """Close a previously opened standard format marker file or string."""
        self._file.close()
        # line_num only exists once raw_fields() has processed some input.
        try:
            del self.line_num
        except AttributeError:
            pass

161

class ToolboxData(StandardFormat):
    """Standard format reader that turns Toolbox data into an element tree."""

    def parse(self, *args, **kwargs):
        """Parse the opened data; all arguments go to L{_record_parse()}."""
        return self._record_parse(*args, **kwargs)

    def _record_parse(self, key=None, **kwargs):
        r"""
        Build a flat element tree from the opened toolbox data: one
        C{header} element followed by C{record} elements, each holding its
        fields as direct children.

        For example, this Toolbox database::
            \_sh v3.0  400  Rotokas Dictionary
            \_DateStampHasFourDigitYear

            \lx kaa
            \ps V.A
            \ge gag
            \gp nek i pas

            \lx kaa
            \ps V.B
            \ge strangle
            \gp pasim nek

        produces the same structure (extra whitespace aside) as ElementTree
        would build from this XML fragment::
            <toolbox_data>
                <header>
                    <_sh>v3.0  400  Rotokas Dictionary</_sh>
                    <_DateStampHasFourDigitYear/>
                </header>

                <record>
                    <lx>kaa</lx>
                    <ps>V.A</ps>
                    <ge>gag</ge>
                    <gp>nek i pas</gp>
                </record>

                <record>
                    <lx>kaa</lx>
                    <ps>V.B</ps>
                    <ge>strangle</ge>
                    <gp>pasim nek</gp>
                </record>
            </toolbox_data>

        @param key: Name of the marker that starts each record. When None
            (the default), the first marker not beginning with an
            underscore is taken as the key.
        @type key: string
        @param kwargs: Keyword arguments passed to L{StandardFormat.fields()}
        @type kwargs: keyword arguments dictionary
        @rtype: ElementTree._ElementInterface
        @return: contents of toolbox data divided into header and records
        """
        tree_builder = TreeBuilder()
        tree_builder.start('toolbox_data', {})
        tree_builder.start('header', {})
        # Name of the container element currently open: the header until
        # the first key marker is seen, a record from then on.
        open_section = 'header'
        for marker, text in self.fields(**kwargs):
            if key is None and open_section == 'header' and marker[0] != '_':
                key = marker
            if marker == key:
                # The key marker closes whatever is open and starts a record.
                tree_builder.end(open_section)
                open_section = 'record'
                tree_builder.start('record', {})
            tree_builder.start(marker, {})
            tree_builder.data(text)
            tree_builder.end(marker)
        tree_builder.end(open_section)
        tree_builder.end('toolbox_data')
        return tree_builder.close()

# Matches any value that contains at least one non-whitespace character.
_is_value = re.compile(r"\S")


def to_sfm_string(tree, encoding=None, errors='strict', unicode_fields=None):
    """Return a string with a standard format representation of the toolbox
    data in tree (tree can be a toolbox database or a single record).

    Each field becomes one backslash-marker line; the children of the root
    (header, records) are separated by a blank line.  A field without text
    (C{field.text} is None) is written as a bare marker.

    @param tree: flat representation of toolbox data (whole database or
        single record)
    @type tree: ElementTree._ElementInterface
    @param encoding: Name of an encoding to use. When given, every field
        line is encoded before being added to the result.
    @type encoding: string
    @param errors: Error handling scheme for codec. Same as the C{encode}
        inbuilt string method.
    @type errors: string
    @param unicode_fields: Marker names whose lines are encoded as UTF-8
        instead of C{encoding}. Must be None if C{encoding} is None.
    @type unicode_fields: set or dictionary (anything supporting 'in')
    @rtype: string
    @return: string using standard format markup
    @raise ValueError: if C{tree} is neither a C{record} nor a
        C{toolbox_data} element, or if C{unicode_fields} is given without
        C{encoding}
    """
    if tree.tag == 'record':
        # Wrap a lone record so it can be processed like a database.
        root = Element('toolbox_data')
        root.append(tree)
        tree = root

    if tree.tag != 'toolbox_data':
        raise ValueError("not a toolbox_data element structure")
    if encoding is None and unicode_fields is not None:
        raise ValueError(
            "if encoding is not specified then neither should unicode_fields")
    l = []
    for rec in tree:
        l.append('\n')
        for field in rec:
            mkr = field.tag
            value = field.text
            if value is None:
                # Elements without text carry None, which neither the
                # regular expression nor the formatting below can handle.
                value = ''
            # Marker and value are separated by a space only when there is
            # a real (non-whitespace) value to write.
            if _is_value.search(value):
                sep = ' '
            else:
                sep = ''
            if encoding is None:
                l.append("\\%s%s%s\n" % (mkr, sep, value))
            else:
                if unicode_fields is not None and mkr in unicode_fields:
                    cur_encoding = 'utf8'
                else:
                    cur_encoding = encoding
                l.append((u"\\%s%s%s\n" % (mkr, sep, value)).encode(
                    cur_encoding, errors))
    # Drop the separator that was added in front of the first child.
    return ''.join(l[1:])

291

class ToolboxSettings(StandardFormat):
    """This class is the base class for settings files."""

    def __init__(self):
        """Create a settings reader with no file opened yet."""
        super(ToolboxSettings, self).__init__()

    def parse(self, encoding=None, errors='strict', **kwargs):
        """Parses a settings file using ElementTree.

        A marker starting with C{+} opens a nested block, a marker starting
        with C{-} closes one, and any other marker becomes a complete
        element of its own.

        @param encoding: encoding used by settings file
        @type encoding: string
        @param errors: Error handling scheme for codec. Same as C{.decode}
            inbuilt method.
        @type errors: string
        @param kwargs: Keyword arguments passed to L{StandardFormat.fields()}
        @type kwargs: keyword arguments dictionary
        @rtype: ElementTree._ElementInterface
        @return: contents of toolbox settings file with a nested structure
        """
        tree_builder = TreeBuilder()
        all_fields = self.fields(encoding=encoding, errors=errors, **kwargs)
        for marker, text in all_fields:
            # The first character of the marker tells whether a block
            # starts (+) or ends (-); the rest is the element name.
            prefix = marker[0]
            name = marker[1:]
            if prefix == "+":
                tree_builder.start(name, {})
                tree_builder.data(text)
            elif prefix == "-":
                tree_builder.end(name)
            else:
                tree_builder.start(marker, {})
                tree_builder.data(text)
                tree_builder.end(marker)
        return tree_builder.close()

330

def to_settings_string(tree, encoding=None, errors='strict', unicode_fields=None):
    """Return the settings markup for a whole element tree as one string.

    @param tree: the settings tree; its root is obtained with C{getroot()}
    @param encoding: forwarded to the recursive helper
    @param errors: forwarded to the recursive helper
    @param unicode_fields: forwarded to the recursive helper
    @return: the concatenated marker lines
    """
    pieces = []
    options = dict(encoding=encoding, errors=errors,
                   unicode_fields=unicode_fields)
    _to_settings_string(tree.getroot(), pieces, **options)
    return ''.join(pieces)


def _to_settings_string(node, l, **kwargs):
    """Append the marker lines for C{node} and its descendants to C{l}.

    An element without children becomes a single marker line.  An element
    with children becomes a C{+} line, the lines of its children, and a
    closing C{-} line.

    @param node: element to write
    @param l: list that receives the generated lines
    @param kwargs: handed on unchanged to the recursive calls
    """
    name = node.tag
    content = node.text
    if len(node) == 0:
        # Leaf element: one line, with the text only if there is any.
        if content:
            line = '\\%s %s\n' % (name, content)
        else:
            line = '\\%s\n' % name
        l.append(line)
        return
    if content:
        opening = '\\+%s %s\n' % (name, content)
    else:
        opening = '\\+%s\n' % name
    l.append(opening)
    for child in node:
        _to_settings_string(child, l, **kwargs)
    l.append('\\-%s\n' % name)

355

def demo():
    """Print a few samples from the toolbox corpus and a settings file.

    Needs the NLTK ``toolbox`` corpus and the C{NLTK_DATA} environment
    variable, which is used to locate the settings file.
    """
    from nltk.corpus import toolbox
    from itertools import islice

    lexicon = toolbox.xml('rotokas.dic')
    print('first field in fourth record:')
    print(lexicon[3][0].tag)
    print(lexicon[3][0].text)

    print('\nfields in sequential order:')
    for field in islice(lexicon.find('record'), 10):
        print('%s %s' % (field.tag, field.text))

    print('\nlx fields:')
    for field in islice(lexicon.findall('record/lx'), 10):
        print(field.text)

    from nltk.etree.ElementTree import ElementTree

    settings = ToolboxSettings()
    # need a more general solution for the following line
    settings.open(os.path.join(os.environ['NLTK_DATA'], 'corpora', 'toolbox',
                               'MDF', 'MDF_AltH.typ'))
    try:
        tree = settings.parse(unwrap=False, encoding='cp1252')
    finally:
        # The file is fully read by parse(); release it either way.
        settings.close()
    print(tree.find('expset/expMDF/rtfPageSetup/paperSize').text)
    settings_tree = ElementTree(tree)
    print(to_settings_string(settings_tree).encode('utf8'))


if __name__ == '__main__':
    demo()

Source Code for Module nltk.corpus.reader.toolbox