14 """
15 The Punkt sentence tokenizer. The algorithm for this tokenizer is
16 described in Kiss & Strunk (2006)::
17
18 Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence
19 Boundary Detection. Computational Linguistics 32: 485-525.
20 """

import re
import math

from nltk import defaultdict
from nltk.probability import FreqDist

from api import TokenizerI


# Orthographic context constants: these describe the contexts in which a
# word can occur (BEG=beginning, MID=middle, UNK=unknown position;
# UC=upper case, LC=lower case).

_ORTHO_BEG_UC = 1 << 1
"""Orthographic context: beginning of a sentence with upper case."""

_ORTHO_MID_UC = 1 << 2
"""Orthographic context: middle of a sentence with upper case."""

_ORTHO_UNK_UC = 1 << 3
"""Orthographic context: unknown position in a sentence with upper case."""

_ORTHO_BEG_LC = 1 << 4
"""Orthographic context: beginning of a sentence with lower case."""

_ORTHO_MID_LC = 1 << 5
"""Orthographic context: middle of a sentence with lower case."""

_ORTHO_UNK_LC = 1 << 6
"""Orthographic context: unknown position in a sentence with lower case."""

_ORTHO_UC = _ORTHO_BEG_UC + _ORTHO_MID_UC + _ORTHO_UNK_UC
"""Orthographic context: occurs with upper case."""

_ORTHO_LC = _ORTHO_BEG_LC + _ORTHO_MID_LC + _ORTHO_UNK_LC
"""Orthographic context: occurs with lower case."""

_ORTHO_MAP = {
    ('initial', 'upper'): _ORTHO_BEG_UC,
    ('internal', 'upper'): _ORTHO_MID_UC,
    ('unknown', 'upper'): _ORTHO_UNK_UC,
    ('initial', 'lower'): _ORTHO_BEG_LC,
    ('internal', 'lower'): _ORTHO_MID_LC,
    ('unknown', 'lower'): _ORTHO_UNK_LC,
}
"""A map from context position and first-letter case to the
appropriate orthographic context flag."""
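
# For example, a word type seen capitalized at the beginning of one
# sentence and in lower case in the middle of another accumulates the
# context _ORTHO_BEG_UC | _ORTHO_MID_LC; the aggregate masks can then be
# tested against it:
#
#     >>> ctx = _ORTHO_BEG_UC | _ORTHO_MID_LC
#     >>> bool(ctx & _ORTHO_UC), bool(ctx & _ORTHO_MID_UC)
#     (True, False)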

_RE_NON_PUNCT = re.compile(r'[^\W\d]', re.UNICODE)
"""Matches token types that are not merely punctuation. (Types for
numeric tokens are changed to ##number## and hence contain alpha.)"""

_RE_BOUNDARY_REALIGNMENT = re.compile(r'["\')\]}]+?(?: |(?=--)|$)',
                                      re.MULTILINE)
"""Used to realign punctuation that should be included in a sentence
although it follows the period (or ?, !)."""


def punkt_word_tokenize(s):
    """
    Tokenize a string using the rules from the Punkt word tokenizer.
    """
    for (regexp, repl) in _punkt_word_tokenize_regexps:
        s = regexp.sub(repl, s)
    return s.split()

_punkt_word_tokenize_regexps = [
    # Separate punctuation (except period) from words:
    (re.compile(r'(?=[\(\"\`{\[:;&\#\*@])(.)'), r'\1 '),

    (re.compile(r'(.)(?=[?!)\";}\]\*:@\'])'), r'\1 '),
    (re.compile(r'(?=[\)}\]])(.)'), r'\1 '),
    (re.compile(r'(.)(?=[({\[])'), r'\1 '),
    (re.compile(r'((^|\s)\-)(?=[^\-])'), r'\1 '),

    # Treat double-hyphen as one token:
    (re.compile(r'([^-])(\-\-+)([^-])'), r'\1 \2 \3'),
    (re.compile(r'(\s|^)(,)(?=(\S))'), r'\1\2 '),

    # Only separate a comma if a space follows:
    (re.compile(r'(.)(,)(\s|$)'), r'\1 \2\3'),

    # Combine dots separated by whitespace into a single token:
    (re.compile(r'\.\s\.\s\.'), r'...'),

    # Separate words from ellipses:
    (re.compile(r'([^\.]|^)(\.{2,})(.?)'), r'\1 \2 \3'),

    (re.compile(r'(^|\s)(\.{2,})([^\.\s])'), r'\1\2 \3'),
    (re.compile(r'([^\.\s])(\.{2,})($|\s)'), r'\1 \2\3'),
]
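
# A small sketch of the net effect: punctuation other than the period is
# split off, while word-final periods stay attached (they are the
# sentence-boundary candidates Punkt reasons about):
#
#     >>> punkt_word_tokenize('He said, "okay."')
#     ['He', 'said', ',', '"', 'okay.', '"']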


_punkt_period_context_regexp = re.compile(r"""
    \S*                          # some word material
    ([.?!])                      # a potential sentence ending
    (?:
        ([?!)\";}\]\*:@\'({\[])  # either other punctuation
        |
        \s+(\S+)                 # or whitespace and some other token
    )""", re.UNICODE | re.VERBOSE)


class PunktParameters(object):
    """Stores data used to perform sentence boundary detection with punkt."""

    def __init__(self):
        self.abbrev_types = set()
        """A set of word types for known abbreviations."""

        self.collocations = set()
        """A set of word type tuples for known common collocations
        where the first word ends in a period.  E.g., ('S.', 'Bach')
        is a common collocation in a text that discusses 'Johann
        S. Bach'.  These count as negative evidence for sentence
        boundaries."""

        self.sent_starters = set()
        """A set of word types for words that often appear at the
        beginning of sentences."""

        self.ortho_context = defaultdict(int)
        """A dictionary mapping word types to the set of orthographic
        contexts that word type appears in.  Contexts are represented
        by adding orthographic context flags: ..."""

    def clear_abbrevs(self):
        self.abbrev_types = set()

    def clear_collocations(self):
        self.collocations = set()

    def clear_sent_starters(self):
        self.sent_starters = set()

    def clear_ortho_context(self):
        self.ortho_context = defaultdict(int)

    def add_ortho_context(self, typ, flag):
        self.ortho_context[typ] |= flag


class PunktToken(object):
    """Stores a token of text with annotations produced during
    sentence boundary detection."""

    _properties = [
        'parastart', 'linestart',
        'sentbreak', 'abbr', 'ellipsis'
    ]
    __slots__ = ['tok', 'type', 'period_final'] + _properties
206
216
217
218
219
220

    _RE_ELLIPSIS = re.compile(r'\.\.+$')
    _RE_NUMERIC = re.compile(r'^-?[\.,]?\d[\d,\.-]*\.?$')
    _RE_INITIAL = re.compile(r'[^\W\d]\.$', re.UNICODE)
    _RE_ALPHA = re.compile(r'[^\W\d]+$', re.UNICODE)

    def _get_type(self, tok):
        """Returns a case-normalized representation of the token."""
        return self._RE_NUMERIC.sub('##number##', tok.lower())

    @property
    def type_no_period(self):
        """
        The type with its final period removed if it has one.
        """
        if len(self.type) > 1 and self.type[-1] == '.':
            return self.type[:-1]
        return self.type

    @property
    def type_no_sentperiod(self):
        """
        The type with its final period removed if it is marked as a
        sentence break.
        """
        if self.sentbreak:
            return self.type_no_period
        return self.type

    @property
    def first_upper(self):
        """True if the token's first character is uppercase."""
        return self.tok[0].isupper()

    @property
    def first_lower(self):
        """True if the token's first character is lowercase."""
        return self.tok[0].islower()
    @property
    def first_case(self):
        """'lower', 'upper' or 'none', depending on the case of the
        token's first character."""
        if self.first_lower:
            return 'lower'
        elif self.first_upper:
            return 'upper'
        return 'none'

    @property
    def is_ellipsis(self):
        """True if the token text is that of an ellipsis."""
        return self._RE_ELLIPSIS.match(self.tok)

    @property
    def is_number(self):
        """True if the token text is that of a number."""
        return self.type.startswith('##number##')

    @property
    def is_initial(self):
        """True if the token text is that of an initial."""
        return self._RE_INITIAL.match(self.tok)

    @property
    def is_alpha(self):
        """True if the token text is all alphabetic."""
        return self._RE_ALPHA.match(self.tok)

    @property
    def is_non_punct(self):
        """True if the token is either a number or is alphabetic."""
        return _RE_NON_PUNCT.search(self.type)

    def __repr__(self):
        """
        A string representation of the token that can reproduce it
        with eval(), which lists all the token's non-default
        annotations.
        """
        if self.type != self.tok:
            typestr = ' type=%s,' % repr(self.type)
        else:
            typestr = ''

        propvals = ', '.join(
            '%s=%s' % (p, repr(getattr(self, p)))
            for p in self._properties
            if getattr(self, p)
        )

        return '%s(%s,%s %s)' % (self.__class__.__name__,
                                 repr(self.tok), typestr, propvals)

    def __str__(self):
        """
        A string representation akin to that used by Kiss and Strunk.
        """
        res = self.tok
        if self.abbr:
            res += '<A>'
        if self.ellipsis:
            res += '<E>'
        if self.sentbreak:
            res += '<S>'
        return res


class _PunktBaseClass(object):
    """
    Includes common components of PunktTrainer and PunktSentenceTokenizer.
    """

    _Token = PunktToken
    """The token definition that should be used by this class. This allows
    for redefinition of some parameters of the token type."""

    def __init__(self):
        self._params = PunktParameters()
        """The collection of parameters that determines the behavior
        of the punkt tokenizer."""

    @staticmethod
    def pair_iter(it):
        """
        Yields pairs of tokens from the given iterator such that each input
        token will appear as the first element in a yielded tuple. The last
        pair will have None as its second element.
        """
        it = iter(it)
        prev = it.next()
        for el in it:
            yield (prev, el)
            prev = el
        yield (prev, None)
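
    # A quick sketch of the pairing behavior:
    #
    #     >>> list(_PunktBaseClass.pair_iter(iter(['a', 'b', 'c'])))
    #     [('a', 'b'), ('b', 'c'), ('c', None)]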

    def _tokenize_words(self, plaintext):
        """
        Divide the given text into tokens, using the punkt word
        segmentation regular expressions, and generate the resulting
        tokens, augmented with flags indicating whether each one occurs
        at the start of a paragraph or of a new line, respectively.
        """
        parastart = False
        for line in plaintext.split('\n'):
            if line.strip():
                line_toks = iter(punkt_word_tokenize(line))

                yield self._Token(line_toks.next(),
                                  parastart=parastart, linestart=True)
                parastart = False

                for t in line_toks:
                    yield self._Token(t)
            else:
                parastart = True

    def _annotate_first_pass(self, tokens):
        """
        Perform the first pass of annotation, which makes decisions
        based purely on the word type of each word:

          - '?', '!', and '.' are marked as sentence breaks.
          - sequences of two or more periods are marked as ellipsis.
          - any word ending in '.' that's a known abbreviation is
            marked as an abbreviation.
          - any other word ending in '.' is marked as a sentence break.

        Yields each token back with its sentbreak, abbr and ellipsis
        flags set accordingly.
        """
        for aug_tok in tokens:
            self._first_pass_annotation(aug_tok)
            yield aug_tok

    def _first_pass_annotation(self, aug_tok):
        """
        Performs type-based annotation on a single token.
        """
        tok = aug_tok.tok

        if tok in ('?', '!', '.'):
            aug_tok.sentbreak = True
        elif aug_tok.is_ellipsis:
            aug_tok.ellipsis = True
        elif aug_tok.period_final and not tok.endswith('..'):
            if (tok[:-1].lower() in self._params.abbrev_types or
                    tok[:-1].lower().split('-')[-1] in
                    self._params.abbrev_types):
                aug_tok.abbr = True
            else:
                aug_tok.sentbreak = True

        return
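
    # A rough first-pass illustration (hypothetical doctest; assumes the
    # type 'mr' has already been learned as an abbreviation):
    #
    #     >>> base = _PunktBaseClass()
    #     >>> base._params.abbrev_types.add('mr')
    #     >>> toks = base._annotate_first_pass(
    #     ...     base._tokenize_words('Mr. Smith is here.'))
    #     >>> ' '.join(str(t) for t in toks)
    #     'Mr.<A> Smith is here.<S>'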


class PunktTrainer(_PunktBaseClass):
    """Learns parameters used in Punkt sentence boundary detection."""

    def __init__(self, train_text=None, verbose=False):
        _PunktBaseClass.__init__(self)

        self._type_fdist = FreqDist()
        """A frequency distribution giving the frequency of each
        case-normalized token type in the training data."""

        self._num_period_toks = 0
        """The number of words ending in period in the training data."""

        self._collocation_fdist = FreqDist()
        """A frequency distribution giving the frequency of all
        bigrams in the training data where the first word ends in a
        period. Bigrams are encoded as tuples of word types.
        Especially common collocations are extracted from this
        frequency distribution, and stored in
        L{_params}.L{collocations <PunktParameters.collocations>}."""

        self._sent_starter_fdist = FreqDist()
        """A frequency distribution giving the frequency of all words
        that occur in the training data at the beginning of a sentence
        (after the first pass of annotation). Especially common
        sentence starters are extracted from this frequency
        distribution, and stored in L{_params}.L{sent_starters
        <PunktParameters.sent_starters>}.
        """

        self._sentbreak_count = 0
        """The total number of sentence breaks identified in training,
        used for calculating the frequent sentence starter heuristic."""

        self._finalized = True
        """A flag as to whether the training has been finalized by finding
        collocations and sentence starters, or whether finalize_training()
        still needs to be called."""

        if train_text:
            self.train(train_text, verbose, finalize=True)

    def get_params(self):
        """
        Calculates and returns parameters for sentence boundary detection
        as derived from training.
        """
        if not self._finalized:
            self.finalize_training()
        return self._params

    ABBREV = 0.3
    """cut-off value whether a 'token' is an abbreviation"""

    IGNORE_ABBREV_PENALTY = False
    """allows the disabling of the abbreviation penalty heuristic, which
    exponentially disadvantages words that are found at times without a
    final period."""

    ABBREV_BACKOFF = 5
    """upper cut-off for Mikheev's (2002) abbreviation detection
    algorithm"""

    COLLOCATION = 7.88
    """minimal log-likelihood value that two tokens need to be considered
    as a collocation"""

    SENT_STARTER = 30
    """minimal log-likelihood value that a token requires to be considered
    as a frequent sentence starter"""

    INTERNAL_PUNCTUATION = ',:;'
    """sentence internal punctuation, which indicates an abbreviation if
    preceded by a period-final token."""

    INCLUDE_ALL_COLLOCS = False
    """this includes as potential collocations all word pairs where the
    first word ends in a period. It may be useful in corpora where there
    is a lot of variation that makes abbreviations like Mr difficult to
    identify."""

    INCLUDE_ABBREV_COLLOCS = False
    """this includes as potential collocations all word pairs where the
    first word is an abbreviation. Such collocations override the
    orthographic heuristic, but not the sentence starter heuristic. This
    is overridden by INCLUDE_ALL_COLLOCS, and if both are false, only
    collocations with initials and ordinals are considered."""

    MIN_COLLOC_FREQ = 1
    """this sets a minimum bound on the number of times a bigram needs to
    appear before it can be considered a collocation, in addition to log
    likelihood statistics. This is useful when INCLUDE_ALL_COLLOCS is
    True."""
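
    # These are class attributes, so behavior can be tuned by subclassing;
    # the values below are purely illustrative:
    #
    #     >>> class LooseTrainer(PunktTrainer):
    #     ...     ABBREV = 0.2
    #     ...     INCLUDE_ALL_COLLOCS = True
    #     ...     MIN_COLLOC_FREQ = 2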

    def train(self, text, verbose=False, finalize=True):
        """
        Collects training data from a given text. If finalize is True, it
        will determine all the parameters for sentence boundary detection.
        If not, this will be delayed until get_params() or
        finalize_training() is called. If verbose is True, abbreviations
        found will be listed.
        """
        # Break the text into tokens; then train on the token stream.
        self._train_tokens(self._tokenize_words(text), verbose)
        if finalize:
            self.finalize_training(verbose)

    def train_tokens(self, tokens, verbose=False, finalize=True):
        """
        Collects training data from a given list of tokens.
        """
        self._train_tokens((self._Token(t) for t in tokens), verbose)
        if finalize:
            self.finalize_training(verbose)
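
    # A sketch of incremental training (hypothetical text variables);
    # finalization is deferred until the parameters are actually needed:
    #
    #     >>> trainer = PunktTrainer()
    #     >>> trainer.train(text_part1, finalize=False)
    #     >>> trainer.train(text_part2, finalize=False)
    #     >>> params = trainer.get_params()   # runs finalize_training()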

    def finalize_training(self, verbose=False):
        """
        Uses data that has been gathered in training to determine likely
        collocations and sentence starters.
        """
        self._params.clear_sent_starters()
        for typ, ll in self._find_sent_starters():
            self._params.sent_starters.add(typ)
            if verbose:
                print(' Sent Starter: [%6.4f] %r' % (ll, typ))

        self._params.clear_collocations()
        for (typ1, typ2), ll in self._find_collocations():
            self._params.collocations.add((typ1, typ2))
            if verbose:
                print(' Collocation: [%6.4f] %r+%r' % (ll, typ1, typ2))

        self._finalized = True

    def freq_threshold(self, ortho_thresh=2, type_thresh=2, colloc_thres=2,
                       sentstart_thresh=2):
        """
        Allows memory use to be reduced after much training by removing
        data about rare tokens that are unlikely to have a statistical
        effect with further training. Entries occurring above the given
        thresholds will be retained.
        """
        if ortho_thresh > 1:
            old_oc = self._params.ortho_context
            self._params.clear_ortho_context()
            for tok, count in self._type_fdist.iteritems():
                if count >= ortho_thresh:
                    self._params.ortho_context[tok] = old_oc[tok]

        self._type_fdist = self._freq_threshold(self._type_fdist,
                                                type_thresh)
        self._collocation_fdist = self._freq_threshold(
            self._collocation_fdist, colloc_thres)
        self._sent_starter_fdist = self._freq_threshold(
            self._sent_starter_fdist, sentstart_thresh)
676
678 """
679 Returns a FreqDist containing only data with counts below a given
680 threshold, as well as a mapping (None -> count_removed).
681 """
682
683
684 res = FreqDist()
685 num_removed = 0
686 for tok, count in fdist.iteritems():
687 if count < threshold:
688 num_removed += 1
689 else:
690 res.inc(tok, count)
691 res.inc(None, num_removed)
692 return res
693

    def _reclassify_abbrev_types(self, types):
        """
        (Re)classifies each given token if
          - it is period-final and not a known abbreviation; or
          - it is not period-final and is otherwise a known abbreviation
        by checking whether its previous classification still holds
        according to the heuristics of section 3.
        Yields triples (abbr, score, is_add) where abbr is the type in
        question, score is its log-likelihood with penalties applied, and
        is_add specifies whether the present type is a candidate for
        inclusion or exclusion as an abbreviation, such that:
          - (is_add and score >= 0.3) suggests a new abbreviation; and
          - (not is_add and score < 0.3) suggests excluding an
            abbreviation.
        """
        for typ in types:
            # Check some basic conditions, to rule out words that are
            # clearly not abbreviation types.
            if not _RE_NON_PUNCT.search(typ) or typ == '##number##':
                continue

            if typ.endswith('.'):
                if typ in self._params.abbrev_types:
                    continue
                typ = typ[:-1]
                is_add = True
            else:
                if typ not in self._params.abbrev_types:
                    continue
                is_add = False

            # Count how many periods & nonperiods are in the
            # candidate type.
            num_periods = typ.count('.') + 1
            num_nonperiods = len(typ) - num_periods + 1

            # Let <a> be the candidate without the period, and <b>
            # be the period.  Find a log likelihood ratio that
            # indicates whether <ab> occurs as a single unit (high
            # value of ll), or as two independent units <a> and
            # <b> (low value of ll).
            count_with_period = self._type_fdist[typ + '.']
            count_without_period = self._type_fdist[typ]
            ll = self._dunning_log_likelihood(
                count_with_period + count_without_period,
                self._num_period_toks, count_with_period,
                self._type_fdist.N())

            # Apply three scaling factors to 'tweak' the basic log
            # likelihood ratio:
            #   F_length: longer word => less likely to be an abbrev
            #   F_periods: more periods => more likely to be an abbrev
            #   F_penalty: occurrences without a final period => less
            #              likely to be an abbrev
            f_length = math.exp(-num_nonperiods)
            f_periods = num_periods
            f_penalty = (int(self.IGNORE_ABBREV_PENALTY)
                         or math.pow(num_nonperiods, -count_without_period))
            score = ll * f_length * f_periods * f_penalty

            yield typ, score, is_add

    def find_abbrev_types(self):
        """
        Recalculates abbreviations given type frequencies, despite no
        prior determination of abbreviations.
        This fails to include abbreviations otherwise found as "rare".
        """
        self._params.clear_abbrevs()
        tokens = (typ for typ in self._type_fdist
                  if typ and typ.endswith('.'))
        for abbr, score, is_add in self._reclassify_abbrev_types(tokens):
            if score >= self.ABBREV:
                self._params.abbrev_types.add(abbr)

    def _is_rare_abbrev_type(self, cur_tok, next_tok):
        """
        A word type is counted as a rare abbreviation if...
          - it's not already marked as an abbreviation
          - it occurs fewer than ABBREV_BACKOFF times
          - either it is followed by a sentence-internal punctuation
            mark, *or* it is followed by a lower-case word that
            sometimes appears with upper case, but never occurs with
            lower case at the beginning of sentences.
        """
        if cur_tok.abbr or not cur_tok.sentbreak:
            return False

        # Find the case-normalized type of the token.  If it's a
        # sentence-final token, strip off the period.
        typ = cur_tok.type_no_sentperiod

        # Proceed only if the type hasn't been categorized as an
        # abbreviation already, and is sufficiently rare.
        count = self._type_fdist[typ] + self._type_fdist[typ[:-1]]
        if typ in self._params.abbrev_types or count >= self.ABBREV_BACKOFF:
            return False

        # Record this token as an abbreviation if the next token is a
        # sentence-internal punctuation mark.
        if next_tok.tok[:1] in self.INTERNAL_PUNCTUATION:
            return True

        # Record this type as an abbreviation if the next token (i)
        # starts with a lower case letter, (ii) sometimes occurs with
        # upper case, and (iii) never occurs with upper case
        # sentence-internally.
        elif next_tok.first_lower:
            typ2 = next_tok.type_no_sentperiod
            typ2ortho_context = self._params.ortho_context[typ2]
            if ((typ2ortho_context & _ORTHO_BEG_UC) and
                    not (typ2ortho_context & _ORTHO_MID_UC)):
                return True

    @staticmethod
    def _dunning_log_likelihood(count_a, count_b, count_ab, N):
        """
        A function that calculates the modified Dunning log-likelihood
        ratio scores for abbreviation candidates. The details of how
        this works are available in the paper.
        """
        p1 = float(count_b) / N
        p2 = 0.99

        null_hypo = (float(count_ab) * math.log(p1) +
                     (count_a - count_ab) * math.log(1.0 - p1))
        alt_hypo = (float(count_ab) * math.log(p2) +
                    (count_a - count_ab) * math.log(1.0 - p2))

        likelihood = null_hypo - alt_hypo

        return (-2.0 * likelihood)
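
    # Reading the code above: under the null hypothesis a period follows
    # the candidate word with the background probability p1 = count_b / N,
    # while the alternative fixes p2 = 0.99.  The returned score is the
    # usual -2 * (log L(H0) - log L(HA)), computed from binomial
    # log-likelihoods, so a high score favors treating word-plus-period
    # as a single abbreviation unit.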

    @staticmethod
    def _col_log_likelihood(count_a, count_b, count_ab, N):
        """
        A function that computes a log-likelihood estimate; in the
        original paper it is described in algorithms 6 and 7.

        This *should* be the original Dunning log-likelihood, unlike the
        previous function, which uses a modified Dunning log-likelihood.
        """
        p = 1.0 * count_b / N
        p1 = 1.0 * count_ab / count_a
        p2 = 1.0 * (count_b - count_ab) / (N - count_a)

        summand1 = (count_ab * math.log(p) +
                    (count_a - count_ab) * math.log(1.0 - p))

        summand2 = ((count_b - count_ab) * math.log(p) +
                    (N - count_a - count_b + count_ab) * math.log(1.0 - p))

        if count_a == count_ab:
            summand3 = 0
        else:
            summand3 = (count_ab * math.log(p1) +
                        (count_a - count_ab) * math.log(1.0 - p1))

        if count_b == count_ab:
            summand4 = 0
        else:
            summand4 = ((count_b - count_ab) * math.log(p2) +
                        (N - count_a - count_b + count_ab) *
                        math.log(1.0 - p2))

        likelihood = summand1 + summand2 - summand3 - summand4

        return (-2.0 * likelihood)
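
    # Here H0 posits a single probability p = count_b / N that the second
    # word follows any first word, while HA allows separate probabilities
    # p1 (following the first word) and p2 (elsewhere); the four summands
    # are the corresponding binomial log-likelihood terms.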

    def _find_collocations(self):
        """
        Generates likely collocations and their log-likelihood.
        """
        for types, col_count in self._collocation_fdist.iteritems():
            try:
                typ1, typ2 = types
            except TypeError:
                # types may be None after calling freq_threshold()
                continue
            if typ2 in self._params.sent_starters:
                continue

            typ1_count = (self._type_fdist[typ1] +
                          self._type_fdist[typ1 + '.'])
            typ2_count = (self._type_fdist[typ2] +
                          self._type_fdist[typ2 + '.'])
            if (typ1_count > 1 and typ2_count > 1
                    and self.MIN_COLLOC_FREQ <
                    col_count <= min(typ1_count, typ2_count)):

                ll = self._col_log_likelihood(typ1_count, typ2_count,
                                              col_count,
                                              self._type_fdist.N())

                # Filter out the not-so-collocative.
                if (ll >= self.COLLOCATION and
                        (float(self._type_fdist.N()) / typ1_count >
                         float(typ2_count) / col_count)):
                    yield (typ1, typ2), ll

    def _is_potential_sent_starter(self, cur_tok, prev_tok):
        """
        Returns True if, given a token and the token that precedes it, it
        seems clear that the token is beginning a sentence.
        """
        # If a token (i) is preceded by a sentence break that is not a
        # potential ordinal number or initial, and (ii) is alphabetic,
        # then it is a sentence-starter.
        return (prev_tok.sentbreak and
                not (prev_tok.is_number or prev_tok.is_initial) and
                cur_tok.is_alpha)

    def _find_sent_starters(self):
        """
        Uses collocation heuristics for each candidate token to
        determine if it frequently starts sentences.
        """
        for (typ, typ_at_break_count) in self._sent_starter_fdist.iteritems():
            if not typ:
                continue

            typ_count = self._type_fdist[typ] + self._type_fdist[typ + '.']
            if typ_count < typ_at_break_count:
                # needed after freq_threshold
                continue

            ll = self._col_log_likelihood(self._sentbreak_count, typ_count,
                                          typ_at_break_count,
                                          self._type_fdist.N())

            if (ll >= self.SENT_STARTER and
                    float(self._type_fdist.N()) / self._sentbreak_count >
                    float(typ_count) / typ_at_break_count):
                yield typ, ll

    def _get_sentbreak_count(self, tokens):
        """
        Returns the number of sentence breaks marked in a given set of
        augmented tokens.
        """
        return sum(1 for aug_tok in tokens if aug_tok.sentbreak)
1026 """
1027 A sentence tokenizer which uses an unsupervised algorithm to build
1028 a model for abbreviation words, collocations, and words that start
1029 sentences; and then uses that model to find sentence boundaries.
1030 This approach has been shown to work well for many European
1031 languages.
1032 """
1033 - def __init__(self, train_text=None, verbose=False):
1034 """
1035 train_text can either be the sole training text for this sentence
1036 boundary detector, or can be a PunktParameters object.
1037 """
1038 _PunktBaseClass.__init__(self)
1039
1040 if train_text:
1041 self._params = self.train(train_text, verbose)
1042

    def train(self, train_text, verbose=False):
        """
        Derives parameters from a given training text, or uses the
        parameters given. Repeated calls to this method destroy previous
        parameters. For incremental training, instantiate a separate
        PunktTrainer instance.
        """
        if not isinstance(train_text, basestring):
            return train_text
        return PunktTrainer(train_text).get_params()

    def tokenize(self, text, realign_boundaries=False):
        """
        Given a text, returns a list of the sentences in that text.
        """
        return list(self.sentences_from_text(text, realign_boundaries))
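
    # Typical entry point (hypothetical corpus string ``train_text``):
    #
    #     >>> pst = PunktSentenceTokenizer(train_text)
    #     >>> pst.tokenize('(How about this.) It works.',
    #     ...              realign_boundaries=True)
    #     ['(How about this.)', 'It works.']
    #
    # With realign_boundaries=False the closing parenthesis would start
    # the second sentence, as described in _realign_boundaries() below.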

    def sentences_from_text(self, text, realign_boundaries=False):
        """
        Given a text, generates the sentences in that text by only
        testing candidate sentence breaks. If realign_boundaries is
        True, includes in the sentence closing punctuation that
        follows the period.
        """
        sents = self._sentences_from_text(text)
        if realign_boundaries:
            sents = self._realign_boundaries(sents)
        return sents

    def _sentences_from_text(self, text):
        last_break = 0
        for match in _punkt_period_context_regexp.finditer(text):
            if self.text_contains_sentbreak(match.group(0)):
                yield text[last_break:match.end(1)]
                if match.group(3):
                    # next sentence starts after whitespace
                    last_break = match.start(3)
                else:
                    # next sentence starts at following punctuation
                    last_break = match.start(2)
        yield text[last_break:]

    def _realign_boundaries(self, sents):
        """
        Attempts to realign punctuation that falls after the period but
        should otherwise be included in the same sentence.

        For example: "(Sent1.) Sent2." will otherwise be split as::

            ["(Sent1.", ") Sent2."].

        This method will produce::

            ["(Sent1.)", "Sent2."].
        """
        realign = 0
        for s1, s2 in self.pair_iter(sents):
            s1 = s1[realign:]
            if not s2:
                if s1:
                    yield s1
                continue

            m = _RE_BOUNDARY_REALIGNMENT.match(s2)
            if m:
                yield s1 + m.group(0).strip()
                realign = m.end()
            else:
                realign = 0
                if s1:
                    yield s1

    def text_contains_sentbreak(self, text):
        """
        Returns True if the given text includes a sentence break.
        """
        found = False
        for t in self._annotate_tokens(self._tokenize_words(text)):
            if found:
                # A break was identified before the final token.
                return True
            if t.sentbreak:
                found = True
        return False

    def sentences_from_text_legacy(self, text):
        """
        Given a text, generates the sentences in that text. Annotates all
        tokens, rather than just those with possible sentence breaks.
        Should produce the same results as L{sentences_from_text}.
        """
        tokens = self._annotate_tokens(self._tokenize_words(text))
        return self._build_sentence_list(text, tokens)

    def sentences_from_tokens(self, tokens):
        """
        Given a sequence of tokens, generates lists of tokens, each list
        corresponding to a sentence.
        """
        tokens = iter(self._annotate_tokens(self._Token(t) for t in tokens))
        sentence = []
        for aug_tok in tokens:
            sentence.append(aug_tok.tok)
            if aug_tok.sentbreak:
                yield sentence
                sentence = []
        if sentence:
            yield sentence

    def _annotate_tokens(self, tokens):
        """
        Given a set of tokens augmented with markers for line-start and
        paragraph-start, returns an iterator through those tokens with
        full annotation including predicted sentence breaks.
        """
        # Make a preliminary pass through the document, marking likely
        # sentence breaks, abbreviations, and ellipsis tokens.
        tokens = self._annotate_first_pass(tokens)

        # Make a second pass through the document, using token context
        # information to change our preliminary decisions about where
        # sentence breaks, abbreviations, and ellipsis occur.
        tokens = self._annotate_second_pass(tokens)

        return tokens

    def _build_sentence_list(self, text, tokens):
        """
        Given the original text and the list of augmented word tokens,
        construct and return a tokenized list of sentence strings.
        """
        # Most of the work here is making sure that we put the right
        # pieces of whitespace back in all the right places.

        # Our position in the source text, used to keep track of which
        # whitespace to add:
        pos = 0

        # A regular expression that finds pieces of whitespace:
        WS_REGEXP = re.compile(r'\s*')

        sentence = ''
        for aug_tok in tokens:
            tok = aug_tok.tok

            # Find the whitespace before this token, and update pos.
            ws = WS_REGEXP.match(text, pos).group()
            pos += len(ws)

            # Some of the rules used by the punkt word tokenizer
            # strip whitespace out of the text, resulting in tokens
            # that contain whitespace in the source text. If our
            # token doesn't match, see if adding whitespace helps.
            # If so, then use the version with whitespace.
            if text[pos:pos + len(tok)] != tok:
                pat = r'\s*'.join(re.escape(c) for c in tok)
                m = re.compile(pat).match(text, pos)
                if m:
                    tok = m.group()

            # Move our position pointer to the end of the token.
            assert text[pos:pos + len(tok)] == tok
            pos += len(tok)

            # Add this token. If it's not at the beginning of the
            # sentence, then include any whitespace that separated it
            # from the previous token.
            if sentence:
                sentence += ws + tok
            else:
                sentence += tok

            # If we're at a sentence break, then start a new sentence.
            if aug_tok.sentbreak:
                yield sentence
                sentence = ''

        # If the last sentence is empty, discard it.
        if sentence:
            yield sentence

    def dump(self, tokens):
        print('writing to /tmp/punkt.new...')
        out = open('/tmp/punkt.new', 'w')
        for aug_tok in tokens:
            if aug_tok.parastart:
                out.write('\n\n')
            elif aug_tok.linestart:
                out.write('\n')
            else:
                out.write(' ')

            out.write(str(aug_tok))
        out.close()

    PUNCTUATION = tuple(';:,.!?')

    def _annotate_second_pass(self, tokens):
        """
        Performs a token-based classification (section 4) over the given
        tokens, making use of the orthographic heuristic (4.1.1),
        collocation heuristic (4.1.2) and frequent sentence starter
        heuristic (4.1.3).
        """
        for t1, t2 in self.pair_iter(tokens):
            self._second_pass_annotation(t1, t2)
            yield t1

    def _second_pass_annotation(self, aug_tok1, aug_tok2):
        """
        Performs token-based classification over a pair of contiguous
        tokens, updating the first token in place.
        """
        # Is it the last token? We can't do anything then.
        if not aug_tok2:
            return

        tok = aug_tok1.tok
        if not aug_tok1.period_final:
            # We only care about words ending in periods.
            return

        typ = aug_tok1.type_no_period
        next_tok = aug_tok2.tok
        next_typ = aug_tok2.type_no_sentperiod
        tok_is_initial = aug_tok1.is_initial

        # [4.1.2. Collocation Heuristic] If there's a collocation
        # between the word before and after the period, then label tok
        # as an abbreviation and NOT a sentence break. Note that
        # collocations with frequent sentence starters as their second
        # word are excluded in training.
        if (typ, next_typ) in self._params.collocations:
            aug_tok1.sentbreak = False
            aug_tok1.abbr = True
            return

        # [4.2. Token-Based Reclassification of Abbreviations] If the
        # token is an abbreviation or an ellipsis, then decide whether
        # we should *also* classify it as a sentbreak.
        if ((aug_tok1.abbr or aug_tok1.ellipsis) and
                (not tok_is_initial)):
            # [4.1.1. Orthographic Heuristic] Check if there's
            # orthographic evidence about whether the next word starts
            # a sentence or not.
            is_sent_starter = self._ortho_heuristic(aug_tok2)
            if is_sent_starter is True:
                aug_tok1.sentbreak = True
                return

            # [4.1.3. Frequent Sentence Starter Heuristic] If the next
            # word is capitalized, and is a member of the
            # frequent-sentence-starters list, then label tok as a
            # sentence break.
            if (aug_tok2.first_upper and
                    next_typ in self._params.sent_starters):
                aug_tok1.sentbreak = True
                return

        # [4.3. Token-Based Detection of Initials and Ordinals] Check
        # if any initial or ordinal tokens that are marked as
        # sentbreaks should be reclassified as abbreviations.
        if tok_is_initial or typ == '##number##':

            # [4.1.1. Orthographic Heuristic] Check if there's
            # orthographic evidence about whether the next word starts
            # a sentence or not.
            is_sent_starter = self._ortho_heuristic(aug_tok2)

            if is_sent_starter is False:
                aug_tok1.sentbreak = False
                aug_tok1.abbr = True
                return

            # Special heuristic for initials: if the orthographic
            # heuristic is unknown, and the next word is always
            # capitalized, then mark as abbrev (eg: J. Bach).
            if (is_sent_starter == 'unknown' and tok_is_initial and
                    aug_tok2.first_upper and
                    not (self._params.ortho_context[next_typ] & _ORTHO_LC)):
                aug_tok1.sentbreak = False
                aug_tok1.abbr = True
                return

        return
1345 """
1346 Decide whether the given token is the first token in a sentence.
1347 """
1348
1349 if aug_tok.tok in self.PUNCTUATION:
1350 return False
1351
1352 ortho_context = self._params.ortho_context[aug_tok.type_no_sentperiod]
1353
1354
1355
1356
1357 if ( aug_tok.first_upper and
1358 (ortho_context & _ORTHO_LC) and
1359 not (ortho_context & _ORTHO_MID_UC) ):
1360 return True
1361
1362
1363
1364
1365
1366 if ( aug_tok.first_lower and
1367 ((ortho_context & _ORTHO_UC) or
1368 not (ortho_context & _ORTHO_BEG_LC)) ):
1369 return False
1370
1371
1372 return 'unknown'
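
    # Rough sketch of the heuristic (hypothetical training state):
    #
    #     >>> pst = PunktSentenceTokenizer()
    #     >>> pst._params.add_ortho_context('foo', _ORTHO_BEG_UC)
    #     >>> pst._ortho_heuristic(PunktToken('Foo'))
    #     'unknown'
    #     >>> pst._ortho_heuristic(PunktToken('foo'))
    #     False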