nltk.internals

1 # Natural Language Toolkit: Internal utility functions 2 # 3 # Copyright (C) 2001-2008 NLTK Project 4 # Author: Steven Bird <[email protected]> 5 # Edward Loper <[email protected]> 6 # URL: <http://nltk.org> 7 # For license information, see LICENSE.TXT 8 9 import subprocess 10 import os 11 import os.path 12 import re 13 import warnings 14 import textwrap 15 import types 16 import sys 17 18 from nltk import __file__ 19 20 # Use the c version of ElementTree, which is faster, if possible: 21 try: from xml.etree import cElementTree as ElementTree 22 except ImportError: from nltk.etree import ElementTree 23 24 ###################################################################### 25 # Regular Expression Processing 26 ###################################################################### 27

28 -def convert_regexp_to_nongrouping(pattern):

29 """ 30 Convert all grouping parenthases in the given regexp pattern to 31 non-grouping parenthases, and return the result. E.g.: 32 33 >>> convert_regexp_to_nongrouping('ab(c(x+)(z*))?d') 34 'ab(?:c(?:x+)(?:z*))?d' 35 36 @type pattern: C{str} 37 @rtype: C{str} 38 """ 39 # Sanity check: back-references are not allowed! 40 for s in re.findall(r'\\.|\(\?P=', pattern): 41 if s[1] in '0123456789' or s == '(?P=': 42 raise ValueError('Regular expressions with back-references ' 43 'are not supported: %r' % pattern) 44 45 # This regexp substitution function replaces the string '(' 46 # with the string '(?:', but otherwise makes no changes. 47 def subfunc(m): 48 return re.sub('^\((\?P<[^>]*>)?$', '(?:', m.group())

49 50 # Scan through the regular expression. If we see any backslashed 51 # characters, ignore them. If we see a named group, then 52 # replace it with "(?:". If we see any open parens that are part 53 # of an extension group, ignore those too. But if we see 54 # any other open paren, replace it with "(?:") 55 return re.sub(r'''(?x) 56 \\. | # Backslashed character 57 \(\?P<[^>]*> | # Named group 58 \(\? | # Extension group 59 \( # Grouping parenthasis''', subfunc, pattern) 60 61 62 ########################################################################## 63 # Java Via Command-Line 64 ########################################################################## 65 66 _java_bin = None 67 _java_options = [] 68 # [xx] add classpath option to config_java?

69 -def config_java(bin=None, options=None):

70 """ 71 Configure nltk's java interface, by letting nltk know where it can 72 find the C{java} binary, and what extra options (if any) should be 73 passed to java when it is run. 74 75 @param bin: The full path to the C{java} binary. If not specified, 76 then nltk will search the system for a C{java} binary; and if 77 one is not found, it will raise a C{LookupError} exception. 78 @type bin: C{string} 79 @param options: A list of options that should be passed to the 80 C{java} binary when it is called. A common value is 81 C{['-Xmx512m']}, which tells the C{java} binary to increase 82 the maximum heap size to 512 megabytes. If no options are 83 specified, then do not modify the options list. 84 @type options: C{list} of C{string} 85 """ 86 global _java_bin, _java_options 87 _java_bin = find_binary('java', bin, env_vars=['JAVAHOME', 'JAVA_HOME']) 88 89 if options is not None: 90 if isinstance(options, basestring): 91 options = options.split() 92 _java_options = list(options)

93

94 -def java(cmd, classpath=None, stdin=None, stdout=None, stderr=None, 95 blocking=True):

96 """ 97 Execute the given java command, by opening a subprocess that calls 98 C{java}. If java has not yet been configured, it will be configured 99 by calling L{config_java()} with no arguments. 100 101 @param cmd: The java command that should be called, formatted as 102 a list of strings. Typically, the first string will be the name 103 of the java class; and the remaining strings will be arguments 104 for that java class. 105 @type cmd: C{list} of C{string} 106 107 @param classpath: A C{':'} separated list of directories, JAR 108 archives, and ZIP archives to search for class files. 109 @type classpath: C{string} 110 111 @param stdin, stdout, stderr: Specify the executed programs' 112 standard input, standard output and standard error file 113 handles, respectively. Valid values are C{subprocess.PIPE}, 114 an existing file descriptor (a positive integer), an existing 115 file object, and C{None}. C{subprocess.PIPE} indicates that a 116 new pipe to the child should be created. With C{None}, no 117 redirection will occur; the child's file handles will be 118 inherited from the parent. Additionally, stderr can be 119 C{subprocess.STDOUT}, which indicates that the stderr data 120 from the applications should be captured into the same file 121 handle as for stdout. 122 123 @param blocking: If C{false}, then return immediately after 124 spawning the subprocess. In this case, the return value is 125 the C{Popen} object, and not a C{(stdout, stderr)} tuple. 126 127 @return: If C{blocking=True}, then return a tuple C{(stdout, 128 stderr)}, containing the stdout and stderr outputs generated 129 by the java command if the C{stdout} and C{stderr} parameters 130 were set to C{subprocess.PIPE}; or C{None} otherwise. If 131 C{blocking=False}, then return a C{subprocess.Popen} object. 132 133 @raise OSError: If the java command returns a nonzero return code. 134 """ 135 if stdin == 'pipe': stdin = subprocess.PIPE 136 if stdout == 'pipe': stdout = subprocess.PIPE 137 if stderr == 'pipe': stderr = subprocess.PIPE 138 if isinstance(cmd, basestring): 139 raise TypeError('cmd should be a list of strings') 140 141 # Make sure we know where a java binary is. 142 if _java_bin is None: 143 config_java() 144 145 # Set up the classpath. 146 if classpath is None: 147 classpath = NLTK_JAR 148 else: 149 classpath += ':' + NLTK_JAR 150 151 # Construct the full command string. 152 cmd = list(cmd) 153 cmd = ['-cp', classpath] + cmd 154 cmd = [_java_bin] + _java_options + cmd 155 156 # Call java via a subprocess 157 p = subprocess.Popen(cmd, stdin=stdin, stdout=stdout, stderr=stderr) 158 if not blocking: return p 159 (stdout, stderr) = p.communicate() 160 161 # Check the return code. 162 if p.returncode != 0: 163 print stderr 164 raise OSError('Java command failed!') 165 166 return (stdout, stderr)

167 168 #: The location of the NLTK jar file, which is used to communicate 169 #: with external Java packages (such as Mallet) that do not have 170 #: a sufficiently powerful native command-line interface. 171 NLTK_JAR = os.path.abspath(os.path.join(os.path.split(__file__)[0], 172 'nltk.jar')) 173 174 if 0: 175 #config_java(options='-Xmx512m') 176 # Write: 177 #java('weka.classifiers.bayes.NaiveBayes', 178 # ['-d', '/tmp/names.model', '-t', '/tmp/train.arff'], 179 # classpath='/Users/edloper/Desktop/weka/weka.jar') 180 # Read: 181 (a,b) = java(['weka.classifiers.bayes.NaiveBayes', 182 '-l', '/tmp/names.model', '-T', '/tmp/test.arff', 183 '-p', '0'],#, '-distribution'], 184 classpath='/Users/edloper/Desktop/weka/weka.jar') 185 186 187 ###################################################################### 188 # Parsing 189 ###################################################################### 190

191 -class ParseError(ValueError):

192 """ 193 Exception raised by parse_* functions when they fail. 194 @param position: The index in the input string where an error occured. 195 @param expected: What was expected when an error occured. 196 """

197 - def __init__(self, expected, position):

198 ValueError.__init__(self, expected, position) 199 self.expected = expected 200 self.position = position

201 - def __str__(self):

202 return 'Expected %s at %s' % (self.expected, self.position)

203 204 _STRING_START_RE = re.compile(r"[uU]?[rR]?(\"\"\"|\'\'\'|\"|\')")

205 -def parse_str(s, start_position):

206 """ 207 If a Python string literal begins at the specified position in the 208 given string, then return a tuple C{(val, end_position)} 209 containing the value of the string literal and the position where 210 it ends. Otherwise, raise a L{ParseError}. 211 """ 212 # Read the open quote, and any modifiers. 213 m = _STRING_START_RE.match(s, start_position) 214 if not m: raise ParseError('open quote', start_position) 215 quotemark = m.group(1) 216 217 # Find the close quote. 218 _STRING_END_RE = re.compile(r'\\|%s' % quotemark) 219 position = m.end() 220 while True: 221 match = _STRING_END_RE.search(s, position) 222 if not match: raise ParseError('close quote', position) 223 if match.group(0) == '\\': position = match.end()+1 224 else: break 225 226 # Parse it, using eval. Strings with invalid escape sequences 227 # might raise ValueEerror. 228 try: 229 return eval(s[start_position:match.end()]), match.end() 230 except ValueError, e: 231 raise ParseError('valid string (%s)' % e, start)

232 233 _PARSE_INT_RE = re.compile(r'-?\d+')

234 -def parse_int(s, start_position):

235 """ 236 If an integer begins at the specified position in the given 237 string, then return a tuple C{(val, end_position)} containing the 238 value of the integer and the position where it ends. Otherwise, 239 raise a L{ParseError}. 240 """ 241 m = _PARSE_INT_RE.match(s, start_position) 242 if not m: raise ParseError('integer', start_position) 243 return int(m.group()), m.end()

244 245 _PARSE_NUMBER_VALUE = re.compile(r'-?(\d*)([.]?\d*)?')

246 -def parse_number(s, start_position):

247 """ 248 If an integer or float begins at the specified position in the 249 given string, then return a tuple C{(val, end_position)} 250 containing the value of the number and the position where it ends. 251 Otherwise, raise a L{ParseError}. 252 """ 253 m = _PARSE_NUMBER_VALUE.match(s, start_position) 254 if not m or not (m.group(1) or m.group(2)): 255 raise ParseError('number', start_position) 256 if m.group(2): return float(m.group()), m.end() 257 else: return int(m.group()), m.end()

258 259 260 261 ###################################################################### 262 # Check if a method has been overridden 263 ###################################################################### 264

265 -def overridden(method):

266 """ 267 @return: True if C{method} overrides some method with the same 268 name in a base class. This is typically used when defining 269 abstract base classes or interfaces, to allow subclasses to define 270 either of two related methods: 271 272 >>> class EaterI: 273 ... '''Subclass must define eat() or batch_eat().''' 274 ... def eat(self, food): 275 ... if overridden(self.batch_eat): 276 ... return self.batch_eat([food])[0] 277 ... else: 278 ... raise NotImplementedError() 279 ... def batch_eat(self, foods): 280 ... return [self.eat(food) for food in foods] 281 282 @type method: instance method 283 """ 284 # [xx] breaks on classic classes! 285 if isinstance(method, types.MethodType) and method.im_class is not None: 286 name = method.__name__ 287 funcs = [cls.__dict__[name] 288 for cls in _mro(method.im_class) 289 if name in cls.__dict__] 290 return len(funcs) > 1 291 else: 292 raise TypeError('Expected an instance method.')

293

294 -def _mro(cls):

295 """ 296 Return the I{method resolution order} for C{cls} -- i.e., a list 297 containing C{cls} and all its base classes, in the order in which 298 they would be checked by C{getattr}. For new-style classes, this 299 is just cls.__mro__. For classic classes, this can be obtained by 300 a depth-first left-to-right traversal of C{__bases__}. 301 """ 302 if isinstance(cls, type): 303 return cls.__mro__ 304 else: 305 mro = [cls] 306 for base in cls.__bases__: mro.extend(_mro(base)) 307 return mro

308 309 ###################################################################### 310 # Deprecation decorator & base class 311 ###################################################################### 312 # [xx] dedent msg first if it comes from a docstring. 313

314 -def _add_epytext_field(obj, field, message):

315 """Add an epytext @field to a given object's docstring.""" 316 indent = '' 317 # If we already have a docstring, then add a blank line to separate 318 # it from the new field, and check its indentation. 319 if obj.__doc__: 320 obj.__doc__ = obj.__doc__.rstrip()+'\n\n' 321 indents = re.findall(r'(?<=\n)[ ]+(?!\s)', obj.__doc__.expandtabs()) 322 if indents: indent = min(indents) 323 # If we don't have a docstring, add an empty one. 324 else: 325 obj.__doc__ = '' 326 327 obj.__doc__ += textwrap.fill('@%s: %s' % (field, message), 328 initial_indent=indent, 329 subsequent_indent=indent+' ')

330

331 -def deprecated(message):

332 """ 333 A decorator used to mark functions as deprecated. This will cause 334 a warning to be printed the when the function is used. Usage: 335 336 >>> @deprecated('Use foo() instead') 337 >>> def bar(x): 338 ... print x/10 339 """ 340 def decorator(func): 341 msg = ("Function %s() has been deprecated. %s" 342 % (func.__name__, message)) 343 msg = '\n' + textwrap.fill(msg, initial_indent=' ', 344 subsequent_indent=' ') 345 def newFunc(*args, **kwargs): 346 warnings.warn(msg, category=DeprecationWarning, stacklevel=2) 347 return func(*args, **kwargs)

348 349 # Copy the old function's name, docstring, & dict 350 newFunc.__dict__.update(func.__dict__) 351 newFunc.__name__ = func.__name__ 352 newFunc.__doc__ = func.__doc__ 353 newFunc.__deprecated__ = True 354 # Add a @deprecated field to the docstring. 355 _add_epytext_field(newFunc, 'deprecated', message) 356 return newFunc 357 return decorator 358

359 -class Deprecated(object):

360 """ 361 A base class used to mark deprecated classes. A typical usage is to 362 alert users that the name of a class has changed: 363 364 >>> class OldClassName(Deprecated, NewClassName): 365 ... "Use NewClassName instead." 366 367 The docstring of the deprecated class will be used in the 368 deprecation warning message. 369 """

370 - def __new__(cls, *args, **kwargs):

371 # Figure out which class is the deprecated one. 372 dep_cls = None 373 for base in _mro(cls): 374 if Deprecated in base.__bases__: 375 dep_cls = base; break 376 assert dep_cls, 'Unable to determine which base is deprecated.' 377 378 # Construct an appropriate warning. 379 doc = dep_cls.__doc__ or ''.strip() 380 # If there's a @deprecated field, strip off the field marker. 381 doc = re.sub(r'\A\s*@deprecated:', r'', doc) 382 # Strip off any indentation. 383 doc = re.sub(r'(?m)^\s*', '', doc) 384 # Construct a 'name' string. 385 name = 'Class %s' % dep_cls.__name__ 386 if cls != dep_cls: 387 name += ' (base class for %s)' % cls.__name__ 388 # Put it all together. 389 msg = '%s has been deprecated. %s' % (name, doc) 390 # Wrap it. 391 msg = '\n' + textwrap.fill(msg, initial_indent=' ', 392 subsequent_indent=' ') 393 warnings.warn(msg, category=DeprecationWarning, stacklevel=2) 394 # Do the actual work of __new__. 395 return object.__new__(cls, *args, **kwargs)

396 397 ########################################################################## 398 # COUNTER, FOR UNIQUE NAMING 399 ########################################################################## 400

401 -class Counter:

402 """ 403 A counter that auto-increments each time its value is read. 404 """

405 - def __init__(self, initial_value=0):

406 self._value = initial_value

407 - def get(self):

408 self._value += 1 409 return self._value

410 411 ########################################################################## 412 # Search for binaries 413 ########################################################################## 414

415 -def find_binary(name, path_to_bin=None, env_vars=(), 416 searchpath=(), binary_names=None, url=None, 417 verbose=True):

418 """ 419 Search for the binary for a program that is used by nltk. 420 421 @param name: The name of the program 422 @param path_to_bin: The user-supplied binary location, or None. 423 @param env_vars: A list of environment variable names to check 424 @param binary_names: A list of alternative binary names to check. 425 @param searchpath: List of directories to search. 426 """ 427 if binary_names is None: binary_names = [name] 428 assert isinstance(name, basestring) 429 assert not isinstance(binary_names, basestring) 430 assert not isinstance(searchpath, basestring) 431 if isinstance(env_vars, basestring): 432 env_vars = env_vars.split() 433 434 # If an explicit bin was given, then check it, and return it if 435 # it's present; otherwise, complain. 436 if path_to_bin is not None: 437 if os.path.isfile(path_to_bin): 438 return path_to_bin 439 for bin in binary_names: 440 if os.path.isfile(os.path.join(path_to_bin, bin)): 441 return os.path.join(path_to_bin, bin) 442 if os.path.isfile(os.path.join(path_to_bin, 'bin', bin)): 443 return os.path.join(path_to_bin, 'bin', bin) 444 raise ValueError('Could not find %s binary at %s' % 445 (name, path_to_bin)) 446 447 # Check environment variables 448 for env_var in env_vars: 449 if env_var in os.environ: 450 path_to_bin = os.environ[env_var] 451 if os.path.isfile(path_to_bin): 452 if verbose: print '[Found %s: %s]' % (name, path_to_bin) 453 return os.environ[env_var] 454 else: 455 for bin_name in binary_names: 456 path_to_bin = os.path.join(os.environ[env_var], bin_name) 457 if os.path.isfile(path_to_bin): 458 if verbose: print '[Found %s: %s]'%(name, path_to_bin) 459 return path_to_bin 460 path_to_bin = os.path.join(os.environ[env_var], 'bin', 461 bin_name) 462 if os.path.isfile(path_to_bin): 463 if verbose: print '[Found %s: %s]'%(name, path_to_bin) 464 return path_to_bin 465 466 # Check the path list. 467 for directory in searchpath: 468 for bin in binary_names: 469 path_to_bin = os.path.join(directory, bin) 470 if os.path.isfile(path_to_bin): 471 return path_to_bin 472 473 474 # If we're on a POSIX system, then try using the 'which' command 475 # to find the binary. 476 if os.name == 'posix': 477 for bin in binary_names: 478 try: 479 p = subprocess.Popen(['which', bin], stdout=subprocess.PIPE) 480 stdout, stderr = p.communicate() 481 path = stdout.strip() 482 if path.endswith(bin) and os.path.exists(path): 483 if verbose: print '[Found %s: %s]' % (name, path) 484 return path 485 except KeyboardInterrupt, SystemExit: 486 raise 487 except: 488 pass 489 490 msg = ("NLTK was unable to find the %s executable! Use " 491 "config_%s()" % (name, name)) 492 if env_vars: msg += ' or set the %s environment variable' % env_vars[0] 493 msg = textwrap.fill(msg+'.', initial_indent=' ', 494 subsequent_indent=' ') 495 msg += "\n\n >>> config_%s('/path/to/%s')" % (name, name) 496 if searchpath: 497 msg += '\n\n Searched in:' 498 msg += ''.join('\n - %s' % d for d in searchpath) 499 if url: msg += ('\n\n For more information, on %s, see:\n <%s>' % 500 (name, url)) 501 div = '='*75 502 raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div))

503 504 ########################################################################## 505 # Import Stdlib Module 506 ########################################################################## 507

508 -def import_from_stdlib(module):

509 """ 510 When python is run from within the nltk/ directory tree, the 511 current directory is included at the beginning of the search path. 512 Unfortunately, that means that modules within nltk can sometimes 513 shadow standard library modules. As an example, the stdlib 514 'inspect' module will attempt to import the stdlib 'tokenzie' 515 module, but will instead end up importing NLTK's 'tokenize' module 516 instead (causing the import to fail). 517 """ 518 old_path = sys.path 519 sys.path = [d for d in sys.path if d not in ('', '.')] 520 m = __import__(module) 521 sys.path = old_path 522 return m

523 524 ########################################################################## 525 # Abstract declaration 526 ########################################################################## 527

528 -def abstract(func):

529 """ 530 A decorator used to mark methods as abstract. I.e., methods that 531 are marked by this decorator must be overridden by subclasses. If 532 an abstract method is called (either in the base class or in a 533 subclass that does not override the base class method), it will 534 raise C{NotImplementedError}. 535 """ 536 # Avoid problems caused by nltk.tokenize shadowing the stdlib tokenize: 537 inspect = import_from_stdlib('inspect') 538 539 # Read the function's signature. 540 args, varargs, varkw, defaults = inspect.getargspec(func) 541 542 # Create a new function with the same signature (minus defaults) 543 # that raises NotImplementedError. 544 msg = '%s is an abstract method.' % func.__name__ 545 signature = inspect.formatargspec(args, varargs, varkw, ()) 546 exec ('def newfunc%s: raise NotImplementedError(%r)' % (signature, msg)) 547 548 # Substitute in the defaults after-the-fact, since eval(repr(val)) 549 # may not work for some default values. 550 newfunc.func_defaults = func.func_defaults 551 552 # Copy the name and docstring 553 newfunc.__name__ = func.__name__ 554 newfunc.__doc__ = func.__doc__ 555 newfunc.__abstract__ = True 556 _add_epytext_field(newfunc, "note", "This method is abstract.") 557 558 # Return the function. 559 return newfunc

560 561 ########################################################################## 562 # Wrapper for ElementTree Elements 563 ########################################################################## 564

565 -class ElementWrapper(object):

566 """ 567 A wrapper around ElementTree Element objects whose main purpose is 568 to provide nicer __repr__ and __str__ methods. In addition, any 569 of the wrapped Element's methods that return other Element objects 570 are overridden to wrap those values before returning them. 571 572 This makes Elements more convenient to work with in 573 interactive sessions and doctests, at the expense of some 574 efficiency. 575 """ 576 577 # Prevent double-wrapping:

578 - def __new__(cls, etree):

579 """ 580 Create and return a wrapper around a given Element object. 581 If C{etree} is an C{ElementWrapper}, then C{etree} is 582 returned as-is. 583 """ 584 if isinstance(etree, ElementWrapper): 585 return etree 586 else: 587 return object.__new__(ElementWrapper, etree)

588

589 - def __init__(self, etree):

590 """ 591 Initialize a new Element wrapper for C{etree}. If 592 C{etree} is a string, then it will be converted to an 593 Element object using C{ElementTree.fromstring()} first. 594 """ 595 if isinstance(etree, basestring): 596 etree = ElementTree.fromstring(etree) 597 self.__dict__['_etree'] = etree

598

599 - def unwrap(self):

600 """ 601 Return the Element object wrapped by this wrapper. 602 """ 603 return self._etree

604 605 ##//////////////////////////////////////////////////////////// 606 #{ String Representation 607 ##//////////////////////////////////////////////////////////// 608

609 - def __repr__(self):

610 s = ElementTree.tostring(self._etree) 611 if len(s) > 60: 612 e = s.rfind('<') 613 if (len(s)-e) > 30: e = -20 614 s = '%s...%s' % (s[:30], s[e:]) 615 return '<Element %r>' % s

616

617 - def __str__(self):

618 """ 619 @return: the result of applying C{ElementTree.tostring()} to 620 the wrapped Element object. 621 """ 622 return ElementTree.tostring(self._etree)

623 624 ##//////////////////////////////////////////////////////////// 625 #{ Element interface Delegation (pass-through) 626 ##//////////////////////////////////////////////////////////// 627

628 - def __getattr__(self, attrib):

629 return getattr(self._etree, attrib)

630

631 - def __setattr__(self, attr, value):

632 return setattr(self._etree, attr, value)

633

634 - def __delattr__(self, attr):

635 return delattr(self._etree, attr)

636

637 - def __setitem__(self, index, element):

638 self._etree[index] = element

639

640 - def __delitem__(self, index):

641 del self._etree[index]

642

643 - def __setslice__(self, start, stop, elements):

644 self._etree[start:stop] = elements

645

646 - def __delslice__(self, start, stop):

647 del self._etree[start:stop]

648

649 - def __len__(self):

650 return len(self._etree)

651 652 ##//////////////////////////////////////////////////////////// 653 #{ Element interface Delegation (wrap result) 654 ##//////////////////////////////////////////////////////////// 655

656 - def __getitem__(self, index):

657 return ElementWrapper(self._etree[index])

658

659 - def __getslice__(self, start, stop):

660 return [ElementWrapper(elt) for elt in self._etree[start:stop]]

661

662 - def getchildren(self):

663 return [ElementWrapper(elt) for elt in self._etree]

664

665 - def getiterator(self, tag=None):

666 return (ElementWrapper(elt) 667 for elt in self._etree.getiterator(tag))

668

669 - def makeelement(self, tag, attrib):

670 return ElementWrapper(self._etree.makeelement(tag, attrib))

671

672 - def find(self, path):

673 elt = self._etree.find(path) 674 if elt is None: return elt 675 else: return ElementWrapper(elt)

676

677 - def findall(self, path):

678 return [ElementWrapper(elt) for elt in self._etree.findall(path)]

679 680 ###################################################################### 681 # Helper for Handling Slicing 682 ###################################################################### 683

684 -def slice_bounds(sequence, slice_obj):

685 """ 686 Given a slice, return the corresponding (start, stop) bounds, 687 taking into account None indices and negative indices. The 688 following guarantees are made for the returned start and stop values: 689 690 - 0 <= start <= len(sequence) 691 - 0 <= stop <= len(sequence) 692 - start <= stop 693 694 @raise ValueError: If C{slice_obj.step} is not C{None}. 695 """ 696 if slice_obj.step is not None: 697 raise ValueError('slices with steps are not supported by %s' % 698 sequence.__class__.__name__) 699 start, stop = slice_obj.start, slice_obj.stop 700 701 # Handle None indices. 702 if start is None: start = 0 703 if stop is None: stop = len(sequence) 704 705 # Handle negative indices. 706 if start < 0: start = max(0, len(sequence)+start) 707 if stop < 0: stop = max(0, len(sequence)+stop) 708 709 # Make sure stop doesn't go past the end of the list. Note that 710 # we avoid calculating len(sequence) if possible, because for lazy 711 # sequences, calculating the length of a sequence can be expensive. 712 if stop > 0: 713 try: sequence[stop-1] 714 except IndexError: stop = len(sequence) 715 716 # Make sure start isn't past stop. 717 start = min(start, stop) 718 719 # That's all folks! 720 return start, stop

721

Source Code for Module nltk.internals