1
2
3
4
5
6
7
8
9 import subprocess
10 import os
11 import os.path
12 import re
13 import warnings
14 import textwrap
15 import types
16 import sys
17
18 from nltk import __file__
19
20
21 try: from xml.etree import cElementTree as ElementTree
22 except ImportError: from nltk.etree import ElementTree
23
24
25
26
27
def convert_regexp_to_nongrouping(pattern):
    """
    Convert all grouping parentheses in the given regexp pattern to
    non-grouping parentheses, and return the result.  E.g.:

        >>> convert_regexp_to_nongrouping('ab(c(x+)(z*))?d')
        'ab(?:c(?:x+)(?:z*))?d'

    Named groups (C{(?P<name>...)}) are converted as well; other
    extension groups (anything else starting with C{(?}) and
    backslash-escaped characters are left untouched.

    @type pattern: C{str}
    @rtype: C{str}
    @raise ValueError: If C{pattern} contains a back-reference (a
        backslash followed by a digit, or C{(?P=name)}).
    """
    # Back-references point at groups by number or name; once the groups
    # are made non-grouping they would dangle, so reject them up front.
    for s in re.findall(r'\\.|\(\?P=', pattern):
        if s[1] in '0123456789' or s == '(?P=':
            raise ValueError('Regular expressions with back-references '
                             'are not supported: %r' % pattern)

    def subfunc(m):
        # Only a bare "(" or a complete "(?P<name>" token is rewritten;
        # "(?" and backslashed characters fail this match and are
        # returned unchanged.
        return re.sub(r'^\((\?P<[^>]*>)?$', '(?:', m.group())

    # NOTE: parentheses inside character classes (e.g. "[(]") are not
    # recognised specially by the tokenising pattern below.
    return re.sub(r'''(?x)
        \\.           |  # Backslashed character
        \(\?P<[^>]*>  |  # Named group
        \(\?          |  # Extension group
        \(               # Grouping parenthesis''', subfunc, pattern)
60
61
62
63
64
65
# Module-level java configuration, filled in by config_java().
_java_bin = None        # path to the java binary, or None if unconfigured
_java_options = []      # extra command-line options passed to java

# NOTE(review): the "def" line was missing from the extracted source; the
# defaults below are inferred from the zero-argument call in java() and
# the "is not None" test in the body -- confirm against upstream.
def config_java(bin=None, options=None):
    """
    Configure nltk's java interface, by letting nltk know where it can
    find the C{java} binary, and what extra options (if any) should be
    passed to java when it is run.

    @param bin: The full path to the C{java} binary.  If not specified,
        then nltk will search the system for a C{java} binary; and if
        one is not found, it will raise a C{LookupError} exception.
    @type bin: C{string}
    @param options: A list of options that should be passed to the
        C{java} binary when it is called.  A common value is
        C{['-Xmx512m']}, which tells the C{java} binary to increase
        the maximum heap size to 512 megabytes.  A single string is
        split on whitespace.  If no options are specified, then do not
        modify the options list.
    @type options: C{list} of C{string}
    """
    global _java_bin, _java_options
    _java_bin = find_binary('java', bin, env_vars=['JAVAHOME', 'JAVA_HOME'])

    if options is not None:
        # (str, type(u'')) is (str, unicode) on Python 2, i.e. the same
        # test as isinstance(options, basestring).
        if isinstance(options, (str, type(u''))):
            options = options.split()
        _java_options = list(options)
93
def java(cmd, classpath=None, stdin=None, stdout=None, stderr=None,
         blocking=True):
    """
    Execute the given java command, by opening a subprocess that calls
    C{java}.  If java has not yet been configured, it will be configured
    by calling L{config_java()} with no arguments.

    @param cmd: The java command that should be called, formatted as
        a list of strings.  Typically, the first string will be the name
        of the java class; and the remaining strings will be arguments
        for that java class.
    @type cmd: C{list} of C{string}

    @param classpath: A list of directories, JAR archives, and ZIP
        archives to search for class files, separated by the platform
        path separator (C{os.pathsep}: C{':'} on POSIX, C{';'} on
        Windows).  The nltk jar file is always appended.
    @type classpath: C{string}

    @param stdin, stdout, stderr: Specify the executed programs'
        standard input, standard output and standard error file
        handles, respectively.  Valid values are C{subprocess.PIPE},
        an existing file descriptor (a positive integer), an existing
        file object, and C{None}.  C{subprocess.PIPE} indicates that a
        new pipe to the child should be created.  With C{None}, no
        redirection will occur; the child's file handles will be
        inherited from the parent.  Additionally, stderr can be
        C{subprocess.STDOUT}, which indicates that the stderr data
        from the applications should be captured into the same file
        handle as for stdout.  The string C{'pipe'} is accepted as a
        synonym for C{subprocess.PIPE}.

    @param blocking: If C{false}, then return immediately after
        spawning the subprocess.  In this case, the return value is
        the C{Popen} object, and not a C{(stdout, stderr)} tuple.

    @return: If C{blocking=True}, then return a tuple C{(stdout,
        stderr)}, containing the stdout and stderr outputs generated
        by the java command if the C{stdout} and C{stderr} parameters
        were set to C{subprocess.PIPE}; or C{None} otherwise.  If
        C{blocking=False}, then return a C{subprocess.Popen} object.

    @raise TypeError: If C{cmd} is a single string rather than a list.
    @raise OSError: If the java command returns a nonzero return code.
    """
    if stdin == 'pipe': stdin = subprocess.PIPE
    if stdout == 'pipe': stdout = subprocess.PIPE
    if stderr == 'pipe': stderr = subprocess.PIPE
    # (str, type(u'')) is (str, unicode) on Python 2, i.e. basestring.
    if isinstance(cmd, (str, type(u''))):
        raise TypeError('cmd should be a list of strings')

    # Make sure we know where a java binary is.
    if _java_bin is None:
        config_java()

    # Always make the nltk jar visible to the java process.  The
    # separator must be the platform's: a literal ':' collides with
    # drive letters on Windows.
    if classpath is None:
        classpath = NLTK_JAR
    else:
        classpath += os.pathsep + NLTK_JAR

    # Construct the full command line: binary, options, classpath, cmd.
    cmd = [_java_bin] + _java_options + ['-cp', classpath] + list(cmd)

    # Call java via a subprocess.
    p = subprocess.Popen(cmd, stdin=stdin, stdout=stdout, stderr=stderr)
    if not blocking: return p
    (stdout, stderr) = p.communicate()

    # Check the return code; echo whatever stderr was captured first.
    if p.returncode != 0:
        print(stderr)
        raise OSError('Java command failed!')

    return (stdout, stderr)
167
168
169
170
# Absolute path of the "nltk.jar" file located in the same directory as
# the file named by __file__.
NLTK_JAR = os.path.abspath(os.path.join(os.path.split(__file__)[0],
                                        'nltk.jar'))

if 0:
    # Disabled manual example (never executed): run a Weka classifier
    # through java() using a developer-local jar path.
    (a,b) = java(['weka.classifiers.bayes.NaiveBayes',
                  '-l', '/tmp/names.model', '-T', '/tmp/test.arff',
                  '-p', '0'],
                 classpath='/Users/edloper/Desktop/weka/weka.jar')
185
186
187
188
189
190
class ParseError(ValueError):
    """
    Exception raised by parse_* functions when they fail.
    @param position: The index in the input string where an error occurred.
    @param expected: What was expected when an error occurred.
    """
    def __init__(self, expected, position):
        # Keep both values in C{args} as well as in named attributes.
        ValueError.__init__(self, expected, position)
        self.expected = expected
        self.position = position

    def __str__(self):
        return 'Expected %s at %s' % (self.expected, self.position)
203
# Optional u/U and r/R prefixes followed by an opening quote mark.
_STRING_START_RE = re.compile(r"[uU]?[rR]?(\"\"\"|\'\'\'|\"|\')")

# NOTE(review): the "def" line was missing from the extracted source; the
# name follows upstream NLTK and the parameters are the names used in the
# body -- confirm.
def parse_str(s, start_position):
    """
    If a Python string literal begins at the specified position in the
    given string, then return a tuple C{(val, end_position)}
    containing the value of the string literal and the position where
    it ends.  Otherwise, raise a L{ParseError}.

    @raise ParseError: If there is no opening quote at
        C{start_position}, no matching closing quote, or the quoted
        text is not a valid string literal.
    """
    # Read the open quote, and any modifiers.
    m = _STRING_START_RE.match(s, start_position)
    if not m: raise ParseError('open quote', start_position)
    quotemark = m.group(1)

    # Find the close quote, stepping over backslash-escaped characters.
    end_re = re.compile(r'\\|%s' % quotemark)
    position = m.end()
    while True:
        match = end_re.search(s, position)
        if not match: raise ParseError('close quote', position)
        if match.group(0) == '\\': position = match.end()+1
        else: break

    # Process it, using eval.  NOTE(security): eval() is applied to a
    # slice of the caller's input; the scan above limits that slice to
    # the text between one opening and its first closing quote mark.
    try:
        return eval(s[start_position:match.end()]), match.end()
    except (ValueError, SyntaxError) as e:
        # Report at the start of the literal (the original referenced an
        # undefined name here, turning every failure into a NameError).
        raise ParseError('valid string (%s)' % e, start_position)
232
# An optionally negated run of decimal digits.
_PARSE_INT_RE = re.compile(r'-?\d+')

# NOTE(review): the "def" line was missing from the extracted source; the
# name follows upstream NLTK and the parameters are the names used in the
# body -- confirm.
def parse_int(s, start_position):
    """
    If an integer begins at the specified position in the given
    string, then return a tuple C{(val, end_position)} containing the
    value of the integer and the position where it ends.  Otherwise,
    raise a L{ParseError}.
    """
    m = _PARSE_INT_RE.match(s, start_position)
    if not m:
        raise ParseError('integer', start_position)
    return int(m.group()), m.end()
244
# Optional sign, optional integer digits, optional fractional part.
_PARSE_NUMBER_VALUE = re.compile(r'-?(\d*)([.]?\d*)?')

# NOTE(review): the "def" line was missing from the extracted source; the
# name follows upstream NLTK and the parameters are the names used in the
# body -- confirm.
def parse_number(s, start_position):
    """
    If an integer or float begins at the specified position in the
    given string, then return a tuple C{(val, end_position)}
    containing the value of the number and the position where it ends.
    Otherwise, raise a L{ParseError}.
    """
    m = _PARSE_NUMBER_VALUE.match(s, start_position)
    # The pattern can match text with no digits at all ('', '-', '.',
    # '-.'); none of those is a number, so require at least one digit
    # rather than letting float('.') raise a bare ValueError.
    if not m or not any(ch.isdigit() for ch in m.group()):
        raise ParseError('number', start_position)
    if m.group(2): return float(m.group()), m.end()
    else: return int(m.group()), m.end()
258
259
260
261
262
263
264
def overridden(method):
    """
    @return: True if C{method} overrides some method with the same
    name in a base class.  This is typically used when defining
    abstract base classes or interfaces, to allow subclasses to define
    either of two related methods:

        >>> class EaterI:
        ...     '''Subclass must define eat() or batch_eat().'''
        ...     def eat(self, food):
        ...         if overridden(self.batch_eat):
        ...             return self.batch_eat([food])[0]
        ...         else:
        ...             raise NotImplementedError()
        ...     def batch_eat(self, foods):
        ...         return [self.eat(food) for food in foods]

    @type method: instance method
    @raise TypeError: If C{method} is not an instance method.
    """
    # The check is short-circuited so that im_class (a Python 2 method
    # attribute) is only read from genuine method objects.
    if not (isinstance(method, types.MethodType) and
            method.im_class is not None):
        raise TypeError('Expected an instance method.')

    # The method is an override when more than one class along the
    # resolution order defines the name in its own __dict__.
    name = method.__name__
    definers = [cls for cls in _mro(method.im_class)
                if name in cls.__dict__]
    return len(definers) > 1
293
def _mro(cls):
    """
    Return the I{method resolution order} for C{cls} -- i.e., a
    sequence containing C{cls} and all its base classes, in the order
    in which they would be checked by C{getattr}.  For new-style
    classes, this is just cls.__mro__ (a tuple).  For classic classes,
    this is a list obtained by a depth-first left-to-right traversal of
    C{__bases__}.
    """
    if isinstance(cls, type):
        return cls.__mro__

    # Classic class: walk the bases depth-first, left to right.
    order = [cls]
    for base in cls.__bases__:
        order.extend(_mro(base))
    return order
308
309
310
311
312
313
def _add_epytext_field(obj, field, message):
    """
    Add an epytext @field to a given object's docstring.

    @param obj: The object whose C{__doc__} is extended in place.
    @param field: The epytext field name (without the leading C{@}).
    @param message: The text of the field; it is wrapped with
        C{textwrap.fill}.
    """
    indent = ''

    if obj.__doc__:
        # Separate the new field from the existing text by a blank line
        # and line it up with the docstring's shallowest indented line.
        obj.__doc__ = obj.__doc__.rstrip()+'\n\n'
        indents = re.findall(r'(?<=\n)[ ]+(?!\s)', obj.__doc__.expandtabs())
        if indents: indent = min(indents, key=len)
    else:
        # No docstring yet: start from an empty one.
        obj.__doc__ = ''

    obj.__doc__ += textwrap.fill('@%s: %s' % (field, message),
                                 initial_indent=indent,
                                 subsequent_indent=indent+' ')
330
def deprecated(message):
    """
    A decorator used to mark functions as deprecated.  This will cause
    a warning to be printed when the function is used.  Usage:

        >>> @deprecated('Use foo() instead')
        ... def bar(x):
        ...     print x/10

    @param message: Text appended to the warning and recorded in the
        wrapped function's docstring as an epytext C{@deprecated} field.
    @return: A decorator that wraps a function so that every call
        issues a C{DeprecationWarning} before delegating to it.
    """
    def decorator(func):
        msg = ("Function %s() has been deprecated. %s"
               % (func.__name__, message))
        msg = '\n' + textwrap.fill(msg, initial_indent=' ',
                                   subsequent_indent=' ')
        def newFunc(*args, **kwargs):
            # stacklevel=2 attributes the warning to the caller.
            warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
            return func(*args, **kwargs)

        # Copy the old function's name, docstring, & dict.
        newFunc.__dict__.update(func.__dict__)
        newFunc.__name__ = func.__name__
        newFunc.__doc__ = func.__doc__
        newFunc.__deprecated__ = True
        # Add a @deprecated field to the docstring.
        _add_epytext_field(newFunc, 'deprecated', message)
        return newFunc
    return decorator
358
class Deprecated(object):
    """
    A base class used to mark deprecated classes.  A typical usage is to
    alert users that the name of a class has changed:

        >>> class OldClassName(Deprecated, NewClassName):
        ...     "Use NewClassName instead."

    The docstring of the deprecated class will be used in the
    deprecation warning message.
    """
    def __new__(cls, *args, **kwargs):
        # Figure out which class is the deprecated one: the first class
        # in the resolution order that lists Deprecated as a direct base.
        dep_cls = None
        for base in _mro(cls):
            if Deprecated in base.__bases__:
                dep_cls = base; break
        assert dep_cls, 'Unable to determine which base is deprecated.'

        # Construct an appropriate warning.  The parentheses matter:
        # without them .strip() binds to the '' literal only.
        doc = (dep_cls.__doc__ or '').strip()
        # If there's a @deprecated field, strip off the field marker.
        doc = re.sub(r'\A\s*@deprecated:', r'', doc)
        # Strip off any indentation.
        doc = re.sub(r'(?m)^\s*', '', doc)
        # Construct a 'name' string.
        name = 'Class %s' % dep_cls.__name__
        if cls != dep_cls:
            name += ' (base class for %s)' % cls.__name__
        # Put it all together.
        msg = '%s has been deprecated. %s' % (name, doc)
        # Wrap it.
        msg = '\n' + textwrap.fill(msg, initial_indent=' ',
                                   subsequent_indent=' ')
        warnings.warn(msg, category=DeprecationWarning, stacklevel=2)

        # Do the actual work of __new__.  object.__new__ accepts no
        # extra arguments once __new__ is overridden (TypeError on
        # Python 3); the constructor arguments still reach __init__.
        return object.__new__(cls)
396
397
398
399
400
# NOTE(review): the "class" line and both method "def" lines were missing
# from the extracted source; the names (Counter, get) and the default
# initial value of 0 follow upstream NLTK -- confirm.
class Counter:
    """
    A counter that auto-increments each time its value is read.
    """
    def __init__(self, initial_value=0):
        self._value = initial_value

    def get(self):
        # Increment first, so the first read returns initial_value + 1.
        self._value += 1
        return self._value
410
411
412
413
414
def find_binary(name, path_to_bin=None, env_vars=(),
                searchpath=(), binary_names=None, url=None,
                verbose=True):
    """
    Search for the binary for a program that is used by nltk.

    The locations are tried in this order: the user-supplied
    C{path_to_bin}; each environment variable in C{env_vars}; each
    directory in C{searchpath}; and finally, on POSIX systems, the
    C{which} command.

    @param name: The name of the program
    @param path_to_bin: The user-supplied binary location, or None.
        It may name the binary itself, or a directory that contains the
        binary directly or in a C{bin} subdirectory.
    @param env_vars: A list of environment variable names to check
        (a single whitespace-separated string is also accepted).
    @param searchpath: List of directories to search.
    @param binary_names: A list of alternative binary names to check.
        Defaults to C{[name]}.
    @param url: An optional URL included in the error message.
    @param verbose: If true, print a message when the binary is found
        via an environment variable or C{which}.
    @return: The path of the binary.
    @raise ValueError: If C{path_to_bin} is given but no binary is
        found there.
    @raise LookupError: If the binary cannot be found anywhere.
    """
    # (str, type(u'')) is (str, unicode) on Python 2, i.e. basestring.
    string_types = (str, type(u''))
    if binary_names is None: binary_names = [name]
    assert isinstance(name, string_types)
    assert not isinstance(binary_names, string_types)
    assert not isinstance(searchpath, string_types)
    if isinstance(env_vars, string_types):
        env_vars = env_vars.split()

    # If an explicit bin was given, then check it, and fail if it's not
    # found.
    if path_to_bin is not None:
        if os.path.isfile(path_to_bin):
            return path_to_bin
        for bin_name in binary_names:
            for candidate in (os.path.join(path_to_bin, bin_name),
                              os.path.join(path_to_bin, 'bin', bin_name)):
                if os.path.isfile(candidate):
                    return candidate
        raise ValueError('Could not find %s binary at %s' %
                         (name, path_to_bin))

    # Check environment variables.  A variable may name the binary
    # itself, or a directory holding it directly or under "bin".
    for env_var in env_vars:
        if env_var not in os.environ:
            continue
        env_value = os.environ[env_var]
        if os.path.isfile(env_value):
            candidates = [env_value]
        else:
            candidates = []
            for bin_name in binary_names:
                candidates.append(os.path.join(env_value, bin_name))
                candidates.append(os.path.join(env_value, 'bin', bin_name))
        for candidate in candidates:
            if os.path.isfile(candidate):
                if verbose: print('[Found %s: %s]' % (name, candidate))
                return candidate

    # Check the path list.
    for directory in searchpath:
        for bin_name in binary_names:
            candidate = os.path.join(directory, bin_name)
            if os.path.isfile(candidate):
                return candidate

    # If we're on a POSIX system, then try using the 'which' command
    # to find the binary.  This is best-effort: failure to launch
    # 'which' is ignored, while KeyboardInterrupt and SystemExit
    # propagate untouched.
    if os.name == 'posix':
        for bin_name in binary_names:
            try:
                p = subprocess.Popen(['which', bin_name],
                                     stdout=subprocess.PIPE,
                                     universal_newlines=True)
                stdout, stderr = p.communicate()
            except OSError:
                continue
            path = stdout.strip()
            if path.endswith(bin_name) and os.path.exists(path):
                if verbose: print('[Found %s: %s]' % (name, path))
                return path

    # Nothing worked: build a descriptive error message.
    msg = ("NLTK was unable to find the %s executable! Use "
           "config_%s()" % (name, name))
    if env_vars: msg += ' or set the %s environment variable' % env_vars[0]
    msg = textwrap.fill(msg+'.', initial_indent=' ',
                        subsequent_indent=' ')
    msg += "\n\n >>> config_%s('/path/to/%s')" % (name, name)
    if searchpath:
        msg += '\n\n Searched in:'
        msg += ''.join('\n - %s' % d for d in searchpath)
    if url: msg += ('\n\n For more information, on %s, see:\n <%s>' %
                    (name, url))
    div = '='*75
    raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div))
503
504
505
506
507
def import_from_stdlib(module):
    """
    When python is run from within the nltk/ directory tree, the
    current directory is included at the beginning of the search path.
    Unfortunately, that means that modules within nltk can sometimes
    shadow standard library modules.  As an example, the stdlib
    'inspect' module will attempt to import the stdlib 'tokenize'
    module, but will instead end up importing NLTK's 'tokenize' module
    instead (causing the import to fail).

    This function imports C{module} with the C{''} and C{'.'} entries
    temporarily removed from C{sys.path}.

    @param module: The name of the module to import.
    @return: The result of C{__import__(module)}.
    """
    old_path = sys.path
    sys.path = [d for d in sys.path if d not in ('', '.')]
    try:
        return __import__(module)
    finally:
        # Restore the caller's path even when the import fails.
        sys.path = old_path
523
524
525
526
527
# NOTE(review): the "def" line was missing from the extracted source; the
# name follows upstream NLTK and the parameter is the name used in the
# body -- confirm.
def abstract(func):
    """
    A decorator used to mark methods as abstract.  I.e., methods that
    are marked by this decorator must be overridden by subclasses.  If
    an abstract method is called (either in the base class or in a
    subclass that does not override the base class method), it will
    raise C{NotImplementedError}.

    @param func: The function to mark as abstract.
    @return: A new function with the same name, signature, defaults
        and docstring as C{func}, whose body raises
        C{NotImplementedError}.
    """
    # Avoid circular imports.
    inspect = import_from_stdlib('inspect')

    # Read the function's signature.
    args, varargs, varkw = inspect.getargspec(func)[:3]

    # Create a new function with the same signature (minus defaults)
    # that raises NotImplementedError.  The source is executed in an
    # explicit namespace instead of the enclosing function's locals.
    msg = '%s is an abstract method.' % func.__name__
    signature = inspect.formatargspec(args, varargs, varkw, ())
    namespace = {}
    exec('def newfunc%s: raise NotImplementedError(%r)' % (signature, msg),
         namespace)
    newfunc = namespace['newfunc']

    # Substitute in the defaults after-the-fact, since eval(repr(val))
    # may not work for some default values.
    newfunc.__defaults__ = func.__defaults__

    # Copy the name and docstring, and tag the function as abstract.
    newfunc.__name__ = func.__name__
    newfunc.__doc__ = func.__doc__
    newfunc.__abstract__ = True
    _add_epytext_field(newfunc, "note", "This method is abstract.")

    # Return the function.
    return newfunc
560
561
562
563
564
566 """
567 A wrapper around ElementTree Element objects whose main purpose is
568 to provide nicer __repr__ and __str__ methods. In addition, any
569 of the wrapped Element's methods that return other Element objects
570 are overridden to wrap those values before returning them.
571
572 This makes Elements more convenient to work with in
573 interactive sessions and doctests, at the expense of some
574 efficiency.
575 """
576
577
579 """
580 Create and return a wrapper around a given Element object.
581 If C{etree} is an C{ElementWrapper}, then C{etree} is
582 returned as-is.
583 """
584 if isinstance(etree, ElementWrapper):
585 return etree
586 else:
587 return object.__new__(ElementWrapper, etree)
588
590 """
591 Initialize a new Element wrapper for C{etree}. If
592 C{etree} is a string, then it will be converted to an
593 Element object using C{ElementTree.fromstring()} first.
594 """
595 if isinstance(etree, basestring):
596 etree = ElementTree.fromstring(etree)
597 self.__dict__['_etree'] = etree
598
600 """
601 Return the Element object wrapped by this wrapper.
602 """
603 return self._etree
604
605
606
607
608
610 s = ElementTree.tostring(self._etree)
611 if len(s) > 60:
612 e = s.rfind('<')
613 if (len(s)-e) > 30: e = -20
614 s = '%s...%s' % (s[:30], s[e:])
615 return '<Element %r>' % s
616
618 """
619 @return: the result of applying C{ElementTree.tostring()} to
620 the wrapped Element object.
621 """
622 return ElementTree.tostring(self._etree)
623
624
625
626
627
629 return getattr(self._etree, attrib)
630
632 return setattr(self._etree, attr, value)
633
635 return delattr(self._etree, attr)
636
638 self._etree[index] = element
639
641 del self._etree[index]
642
645
648
650 return len(self._etree)
651
652
653
654
655
658
661
664
668
671
672 - def find(self, path):
676
679
680
681
682
683
# NOTE(review): the "def" line was missing from the extracted source; the
# name follows upstream NLTK and the parameters are the names used in the
# body -- confirm name and parameter order.
def slice_bounds(sequence, slice_obj):
    """
    Given a slice, return the corresponding (start, stop) bounds,
    taking into account None indices and negative indices.  The
    following guarantees are made for the returned start and stop values:

      - 0 <= start <= len(sequence)
      - 0 <= stop <= len(sequence)
      - start <= stop

    @raise ValueError: If C{slice_obj.step} is not C{None}.
    """
    if slice_obj.step is not None:
        raise ValueError('slices with steps are not supported by %s' %
                         sequence.__class__.__name__)
    start, stop = slice_obj.start, slice_obj.stop

    # Supply default offsets.
    if start is None: start = 0
    if stop is None: stop = len(sequence)

    # Handle negative indices.
    if start < 0: start = max(0, len(sequence)+start)
    if stop < 0: stop = max(0, len(sequence)+stop)

    # Make sure stop doesn't go past the end of the sequence.  The
    # element just before stop is probed first, and len(sequence) is
    # only computed when that probe fails.
    if stop > 0:
        try: sequence[stop-1]
        except IndexError: stop = len(sequence)

    # Make sure start isn't past stop.
    start = min(start, stop)

    # That's all folks!
    return start, stop
721