14 """
15 The Punkt sentence tokenizer. The algorithm for this tokenizer is
16 described in Kiss & Strunk (2006)::
17
18 Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence
19 Boundary Detection. Computational Linguistics 32: 485-525.
20 """

import re
import math

from nltk import defaultdict
from nltk.probability import FreqDist

from api import TokenizerI


# Orthographic context constants: these describe the contexts in which a
# word can occur (BEG=beginning, MID=middle, UNK=unknown position;
# UC=upper case, LC=lower case).

_ORTHO_BEG_UC = 1 << 1
"""Orthographic context: beginning of a sentence with upper case."""

_ORTHO_MID_UC = 1 << 2
"""Orthographic context: middle of a sentence with upper case."""

_ORTHO_UNK_UC = 1 << 3
"""Orthographic context: unknown position in a sentence with upper case."""

_ORTHO_BEG_LC = 1 << 4
"""Orthographic context: beginning of a sentence with lower case."""

_ORTHO_MID_LC = 1 << 5
"""Orthographic context: middle of a sentence with lower case."""

_ORTHO_UNK_LC = 1 << 6
"""Orthographic context: unknown position in a sentence with lower case."""

_ORTHO_UC = _ORTHO_BEG_UC + _ORTHO_MID_UC + _ORTHO_UNK_UC
"""Orthographic context: occurs with upper case."""

_ORTHO_LC = _ORTHO_BEG_LC + _ORTHO_MID_LC + _ORTHO_UNK_LC
"""Orthographic context: occurs with lower case."""

_ORTHO_MAP = {
    ('initial', 'upper'): _ORTHO_BEG_UC,
    ('internal', 'upper'): _ORTHO_MID_UC,
    ('unknown', 'upper'): _ORTHO_UNK_UC,
    ('initial', 'lower'): _ORTHO_BEG_LC,
    ('internal', 'lower'): _ORTHO_MID_LC,
    ('unknown', 'lower'): _ORTHO_UNK_LC,
}
"""A map from context position and first-letter case to the
appropriate orthographic context flag."""
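
# For example, a word type seen capitalized at the beginning of one
# sentence and in lower case in the middle of another accumulates the
# context _ORTHO_BEG_UC | _ORTHO_MID_LC; the aggregate masks can then be
# tested against it:
#
#     >>> ctx = _ORTHO_BEG_UC | _ORTHO_MID_LC
#     >>> bool(ctx & _ORTHO_UC), bool(ctx & _ORTHO_MID_UC)
#     (True, False)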

_RE_NON_PUNCT = re.compile(r'[^\W\d]', re.UNICODE)
"""Matches token types that are not merely punctuation. (Types for
numeric tokens are changed to ##number## and hence contain alpha.)"""

_RE_BOUNDARY_REALIGNMENT = re.compile(r'["\')\]}]+?(?: |(?=--)|$)',
                                      re.MULTILINE)
"""Used to realign punctuation that should be included in a sentence
although it follows the period (or ?, !)."""


def punkt_word_tokenize(s):
    """
    Tokenize a string using the rules from the Punkt word tokenizer.
    """
    for (regexp, repl) in _punkt_word_tokenize_regexps:
        s = regexp.sub(repl, s)
    return s.split()

_punkt_word_tokenize_regexps = [
    # Separate punctuation (except period) from words:
    (re.compile(r'(?=[\(\"\`{\[:;&\#\*@])(.)'), r'\1 '),

    (re.compile(r'(.)(?=[?!)\";}\]\*:@\'])'), r'\1 '),
    (re.compile(r'(?=[\)}\]])(.)'), r'\1 '),
    (re.compile(r'(.)(?=[({\[])'), r'\1 '),
    (re.compile(r'((^|\s)\-)(?=[^\-])'), r'\1 '),

    # Treat double-hyphen as one token:
    (re.compile(r'([^-])(\-\-+)([^-])'), r'\1 \2 \3'),
    (re.compile(r'(\s|^)(,)(?=(\S))'), r'\1\2 '),

    # Only separate a comma if a space follows:
    (re.compile(r'(.)(,)(\s|$)'), r'\1 \2\3'),

    # Combine dots separated by whitespace into a single token:
    (re.compile(r'\.\s\.\s\.'), r'...'),

    # Separate words from ellipses:
    (re.compile(r'([^\.]|^)(\.{2,})(.?)'), r'\1 \2 \3'),

    (re.compile(r'(^|\s)(\.{2,})([^\.\s])'), r'\1\2 \3'),
    (re.compile(r'([^\.\s])(\.{2,})($|\s)'), r'\1 \2\3'),
]
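
# A small sketch of the net effect: punctuation other than the period is
# split off, while word-final periods stay attached (they are the
# sentence-boundary candidates Punkt reasons about):
#
#     >>> punkt_word_tokenize('He said, "okay."')
#     ['He', 'said', ',', '"', 'okay.', '"']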


_punkt_period_context_regexp = re.compile(r"""
    \S*                          # some word material
    ([.?!])                      # a potential sentence ending
    (?:
        ([?!)\";}\]\*:@\'({\[])  # either other punctuation
        |
        \s+(\S+)                 # or whitespace and some other token
    )""", re.UNICODE | re.VERBOSE)


class PunktParameters(object):
    """Stores data used to perform sentence boundary detection with punkt."""

    def __init__(self):
        self.abbrev_types = set()
        """A set of word types for known abbreviations."""

        self.collocations = set()
        """A set of word type tuples for known common collocations
        where the first word ends in a period.  E.g., ('S.', 'Bach')
        is a common collocation in a text that discusses 'Johann
        S. Bach'.  These count as negative evidence for sentence
        boundaries."""

        self.sent_starters = set()
        """A set of word types for words that often appear at the
        beginning of sentences."""

        self.ortho_context = defaultdict(int)
        """A dictionary mapping word types to the set of orthographic
        contexts that word type appears in.  Contexts are represented
        by adding orthographic context flags: ..."""

    def clear_abbrevs(self):
        self.abbrev_types = set()

    def clear_collocations(self):
        self.collocations = set()

    def clear_sent_starters(self):
        self.sent_starters = set()

    def clear_ortho_context(self):
        self.ortho_context = defaultdict(int)

    def add_ortho_context(self, typ, flag):
        self.ortho_context[typ] |= flag


class PunktToken(object):
    """Stores a token of text with annotations produced during
    sentence boundary detection."""

    _properties = [
        'parastart', 'linestart',
        'sentbreak', 'abbr', 'ellipsis'
    ]
    __slots__ = ['tok', 'type', 'period_final'] + _properties
206
216
217
218
219
220

    _RE_ELLIPSIS = re.compile(r'\.\.+$')
    _RE_NUMERIC = re.compile(r'^-?[\.,]?\d[\d,\.-]*\.?$')
    _RE_INITIAL = re.compile(r'[^\W\d]\.$', re.UNICODE)
    _RE_ALPHA = re.compile(r'[^\W\d]+$', re.UNICODE)

    def _get_type(self, tok):
        """Returns a case-normalized representation of the token."""
        return self._RE_NUMERIC.sub('##number##', tok.lower())

    @property
    def type_no_period(self):
        """
        The type with its final period removed if it has one.
        """
        if len(self.type) > 1 and self.type[-1] == '.':
            return self.type[:-1]
        return self.type

    @property
    def type_no_sentperiod(self):
        """
        The type with its final period removed if it is marked as a
        sentence break.
        """
        if self.sentbreak:
            return self.type_no_period
        return self.type

    @property
    def first_upper(self):
        """True if the token's first character is uppercase."""
        return self.tok[0].isupper()

    @property
    def first_lower(self):
        """True if the token's first character is lowercase."""
        return self.tok[0].islower()
    @property
    def first_case(self):
        """'lower', 'upper' or 'none', depending on the case of the
        token's first character."""
        if self.first_lower:
            return 'lower'
        elif self.first_upper:
            return 'upper'
        return 'none'

    @property
    def is_ellipsis(self):
        """True if the token text is that of an ellipsis."""
        return self._RE_ELLIPSIS.match(self.tok)

    @property
    def is_number(self):
        """True if the token text is that of a number."""
        return self.type.startswith('##number##')

    @property
    def is_initial(self):
        """True if the token text is that of an initial."""
        return self._RE_INITIAL.match(self.tok)

    @property
    def is_alpha(self):
        """True if the token text is all alphabetic."""
        return self._RE_ALPHA.match(self.tok)

    @property
    def is_non_punct(self):
        """True if the token is either a number or is alphabetic."""
        return _RE_NON_PUNCT.search(self.type)

    def __repr__(self):
        """
        A string representation of the token that can reproduce it
        with eval(), which lists all the token's non-default
        annotations.
        """
        if self.type != self.tok:
            typestr = ' type=%s,' % repr(self.type)
        else:
            typestr = ''

        propvals = ', '.join(
            '%s=%s' % (p, repr(getattr(self, p)))
            for p in self._properties
            if getattr(self, p)
        )

        return '%s(%s,%s %s)' % (self.__class__.__name__,
                                 repr(self.tok), typestr, propvals)

    def __str__(self):
        """
        A string representation akin to that used by Kiss and Strunk.
        """
        res = self.tok
        if self.abbr:
            res += '<A>'
        if self.ellipsis:
            res += '<E>'
        if self.sentbreak:
            res += '<S>'
        return res


class _PunktBaseClass(object):
    """
    Includes common components of PunktTrainer and PunktSentenceTokenizer.
    """

    _Token = PunktToken
    """The token definition that should be used by this class. This allows
    for redefinition of some parameters of the token type."""

    def __init__(self):
        self._params = PunktParameters()
        """The collection of parameters that determines the behavior
        of the punkt tokenizer."""

    @staticmethod
    def pair_iter(it):
        """
        Yields pairs of tokens from the given iterator such that each input
        token will appear as the first element in a yielded tuple. The last
        pair will have None as its second element.
        """
        it = iter(it)
        prev = it.next()
        for el in it:
            yield (prev, el)
            prev = el
        yield (prev, None)
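
    # A quick sketch of the pairing behavior:
    #
    #     >>> list(_PunktBaseClass.pair_iter(iter(['a', 'b', 'c'])))
    #     [('a', 'b'), ('b', 'c'), ('c', None)]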

    def _tokenize_words(self, plaintext):
        """
        Divide the given text into tokens, using the punkt word
        segmentation regular expressions, and generate the resulting
        tokens, augmented with flags indicating whether each one occurs
        at the start of a paragraph or of a new line, respectively.
        """
        parastart = False
        for line in plaintext.split('\n'):
            if line.strip():
                line_toks = iter(punkt_word_tokenize(line))

                yield self._Token(line_toks.next(),
                                  parastart=parastart, linestart=True)
                parastart = False

                for t in line_toks:
                    yield self._Token(t)
            else:
                parastart = True

    def _annotate_first_pass(self, tokens):
        """
        Perform the first pass of annotation, which makes decisions
        based purely on the word type of each word:

          - '?', '!', and '.' are marked as sentence breaks.
          - sequences of two or more periods are marked as ellipsis.
          - any word ending in '.' that's a known abbreviation is
            marked as an abbreviation.
          - any other word ending in '.' is marked as a sentence break.

        Yields each token back with its sentbreak, abbr and ellipsis
        flags set accordingly.
        """
        for aug_tok in tokens:
            self._first_pass_annotation(aug_tok)
            yield aug_tok

    def _first_pass_annotation(self, aug_tok):
        """
        Performs type-based annotation on a single token.
        """
        tok = aug_tok.tok

        if tok in ('?', '!', '.'):
            aug_tok.sentbreak = True
        elif aug_tok.is_ellipsis:
            aug_tok.ellipsis = True
        elif aug_tok.period_final and not tok.endswith('..'):
            if (tok[:-1].lower() in self._params.abbrev_types or
                    tok[:-1].lower().split('-')[-1] in
                    self._params.abbrev_types):
                aug_tok.abbr = True
            else:
                aug_tok.sentbreak = True

        return
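
    # A rough first-pass illustration (hypothetical doctest; assumes the
    # type 'mr' has already been learned as an abbreviation):
    #
    #     >>> base = _PunktBaseClass()
    #     >>> base._params.abbrev_types.add('mr')
    #     >>> toks = base._annotate_first_pass(
    #     ...     base._tokenize_words('Mr. Smith is here.'))
    #     >>> ' '.join(str(t) for t in toks)
    #     'Mr.<A> Smith is here.<S>'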


class PunktTrainer(_PunktBaseClass):
    """Learns parameters used in Punkt sentence boundary detection."""

    def __init__(self, train_text=None, verbose=False):
        _PunktBaseClass.__init__(self)

        self._type_fdist = FreqDist()
        """A frequency distribution giving the frequency of each
        case-normalized token type in the training data."""

        self._num_period_toks = 0
        """The number of words ending in period in the training data."""

        self._collocation_fdist = FreqDist()
        """A frequency distribution giving the frequency of all
        bigrams in the training data where the first word ends in a
        period. Bigrams are encoded as tuples of word types.
        Especially common collocations are extracted from this
        frequency distribution, and stored in
        L{_params}.L{collocations <PunktParameters.collocations>}."""

        self._sent_starter_fdist = FreqDist()
        """A frequency distribution giving the frequency of all words
        that occur in the training data at the beginning of a sentence
        (after the first pass of annotation). Especially common
        sentence starters are extracted from this frequency
        distribution, and stored in L{_params}.L{sent_starters
        <PunktParameters.sent_starters>}.
        """

        self._sentbreak_count = 0
        """The total number of sentence breaks identified in training,
        used for calculating the frequent sentence starter heuristic."""

        self._finalized = True
        """A flag as to whether the training has been finalized by finding
        collocations and sentence starters, or whether finalize_training()
        still needs to be called."""

        if train_text:
            self.train(train_text, verbose, finalize=True)

    def get_params(self):
        """
        Calculates and returns parameters for sentence boundary detection
        as derived from training.
        """
        if not self._finalized:
            self.finalize_training()
        return self._params

    ABBREV = 0.3
    """cut-off value whether a 'token' is an abbreviation"""

    IGNORE_ABBREV_PENALTY = False
    """allows the disabling of the abbreviation penalty heuristic, which
    exponentially disadvantages words that are found at times without a
    final period."""

    ABBREV_BACKOFF = 5
    """upper cut-off for Mikheev's (2002) abbreviation detection
    algorithm"""

    COLLOCATION = 7.88
    """minimal log-likelihood value that two tokens need to be considered
    as a collocation"""

    SENT_STARTER = 30
    """minimal log-likelihood value that a token requires to be considered
    as a frequent sentence starter"""

    INTERNAL_PUNCTUATION = ',:;'
    """sentence internal punctuation, which indicates an abbreviation if
    preceded by a period-final token."""

    INCLUDE_ALL_COLLOCS = False
    """this includes as potential collocations all word pairs where the
    first word ends in a period. It may be useful in corpora where there
    is a lot of variation that makes abbreviations like Mr difficult to
    identify."""

    INCLUDE_ABBREV_COLLOCS = False
    """this includes as potential collocations all word pairs where the
    first word is an abbreviation. Such collocations override the
    orthographic heuristic, but not the sentence starter heuristic. This
    is overridden by INCLUDE_ALL_COLLOCS, and if both are false, only
    collocations with initials and ordinals are considered."""

    MIN_COLLOC_FREQ = 1
    """this sets a minimum bound on the number of times a bigram needs to
    appear before it can be considered a collocation, in addition to log
    likelihood statistics. This is useful when INCLUDE_ALL_COLLOCS is
    True."""
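
    # These are class attributes, so behavior can be tuned by subclassing;
    # the values below are purely illustrative:
    #
    #     >>> class LooseTrainer(PunktTrainer):
    #     ...     ABBREV = 0.2
    #     ...     INCLUDE_ALL_COLLOCS = True
    #     ...     MIN_COLLOC_FREQ = 2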

    def train(self, text, verbose=False, finalize=True):
        """
        Collects training data from a given text. If finalize is True, it
        will determine all the parameters for sentence boundary detection.
        If not, this will be delayed until get_params() or
        finalize_training() is called. If verbose is True, abbreviations
        found will be listed.
        """
        # Break the text into tokens; then train on the token stream.
        self._train_tokens(self._tokenize_words(text), verbose)
        if finalize:
            self.finalize_training(verbose)

    def train_tokens(self, tokens, verbose=False, finalize=True):
        """
        Collects training data from a given list of tokens.
        """
        self._train_tokens((self._Token(t) for t in tokens), verbose)
        if finalize:
            self.finalize_training(verbose)
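
    # A sketch of incremental training (hypothetical text variables);
    # finalization is deferred until the parameters are actually needed:
    #
    #     >>> trainer = PunktTrainer()
    #     >>> trainer.train(text_part1, finalize=False)
    #     >>> trainer.train(text_part2, finalize=False)
    #     >>> params = trainer.get_params()   # runs finalize_training()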

    def finalize_training(self, verbose=False):
        """
        Uses data that has been gathered in training to determine likely
        collocations and sentence starters.
        """
        self._params.clear_sent_starters()
        for typ, ll in self._find_sent_starters():
            self._params.sent_starters.add(typ)
            if verbose:
                print(' Sent Starter: [%6.4f] %r' % (ll, typ))

        self._params.clear_collocations()
        for (typ1, typ2), ll in self._find_collocations():
            self._params.collocations.add((typ1, typ2))
            if verbose:
                print(' Collocation: [%6.4f] %r+%r' % (ll, typ1, typ2))

        self._finalized = True

    def freq_threshold(self, ortho_thresh=2, type_thresh=2, colloc_thres=2,
                       sentstart_thresh=2):
        """
        Allows memory use to be reduced after much training by removing
        data about rare tokens that are unlikely to have a statistical
        effect with further training. Entries occurring above the given
        thresholds will be retained.
        """
        if ortho_thresh > 1:
            old_oc = self._params.ortho_context
            self._params.clear_ortho_context()
            for tok, count in self._type_fdist.iteritems():
                if count >= ortho_thresh:
                    self._params.ortho_context[tok] = old_oc[tok]

        self._type_fdist = self._freq_threshold(self._type_fdist,
                                                type_thresh)
        self._collocation_fdist = self._freq_threshold(
            self._collocation_fdist, colloc_thres)
        self._sent_starter_fdist = self._freq_threshold(
            self._sent_starter_fdist, sentstart_thresh)
676
678 """
679 Returns a FreqDist containing only data with counts below a given
680 threshold, as well as a mapping (None -> count_removed).
681 """
682
683
684 res = FreqDist()
685 num_removed = 0
686 for tok, count in fdist.iteritems():
687 if count < threshold:
688 num_removed += 1
689 else:
690 res.inc(tok, count)
691 res.inc(None, num_removed)
692 return res
693

    def _reclassify_abbrev_types(self, types):
        """
        (Re)classifies each given token if
          - it is period-final and not a known abbreviation; or
          - it is not period-final and is otherwise a known abbreviation
        by checking whether its previous classification still holds
        according to the heuristics of section 3.
        Yields triples (abbr, score, is_add) where abbr is the type in
        question, score is its log-likelihood with penalties applied, and
        is_add specifies whether the present type is a candidate for
        inclusion or exclusion as an abbreviation, such that:
          - (is_add and score >= 0.3) suggests a new abbreviation; and
          - (not is_add and score < 0.3) suggests excluding an
            abbreviation.
        """
        for typ in types:
            # Check some basic conditions, to rule out words that are
            # clearly not abbreviation types.
            if not _RE_NON_PUNCT.search(typ) or typ == '##number##':
                continue

            if typ.endswith('.'):
                if typ in self._params.abbrev_types:
                    continue
                typ = typ[:-1]
                is_add = True
            else:
                if typ not in self._params.abbrev_types:
                    continue
                is_add = False

            # Count how many periods & nonperiods are in the
            # candidate type.
            num_periods = typ.count('.') + 1
            num_nonperiods = len(typ) - num_periods + 1

            # Let <a> be the candidate without the period, and <b>
            # be the period.  Find a log likelihood ratio that
            # indicates whether <ab> occurs as a single unit (high
            # value of ll), or as two independent units <a> and
            # <b> (low value of ll).
            count_with_period = self._type_fdist[typ + '.']
            count_without_period = self._type_fdist[typ]
            ll = self._dunning_log_likelihood(
                count_with_period + count_without_period,
                self._num_period_toks, count_with_period,
                self._type_fdist.N())

            # Apply three scaling factors to 'tweak' the basic log
            # likelihood ratio:
            #   F_length: longer word => less likely to be an abbrev
            #   F_periods: more periods => more likely to be an abbrev
            #   F_penalty: occurrences without a final period => less
            #              likely to be an abbrev
            f_length = math.exp(-num_nonperiods)
            f_periods = num_periods
            f_penalty = (int(self.IGNORE_ABBREV_PENALTY)
                         or math.pow(num_nonperiods, -count_without_period))
            score = ll * f_length * f_periods * f_penalty

            yield typ, score, is_add

    def find_abbrev_types(self):
        """
        Recalculates abbreviations given type frequencies, despite no
        prior determination of abbreviations.
        This fails to include abbreviations otherwise found as "rare".
        """
        self._params.clear_abbrevs()
        tokens = (typ for typ in self._type_fdist
                  if typ and typ.endswith('.'))
        for abbr, score, is_add in self._reclassify_abbrev_types(tokens):
            if score >= self.ABBREV:
                self._params.abbrev_types.add(abbr)

    def _is_rare_abbrev_type(self, cur_tok, next_tok):
        """
        A word type is counted as a rare abbreviation if...
          - it's not already marked as an abbreviation
          - it occurs fewer than ABBREV_BACKOFF times
          - either it is followed by a sentence-internal punctuation
            mark, *or* it is followed by a lower-case word that
            sometimes appears with upper case, but never occurs with
            lower case at the beginning of sentences.
        """
        if cur_tok.abbr or not cur_tok.sentbreak:
            return False

        # Find the case-normalized type of the token.  If it's a
        # sentence-final token, strip off the period.
        typ = cur_tok.type_no_sentperiod

        # Proceed only if the type hasn't been categorized as an
        # abbreviation already, and is sufficiently rare.
        count = self._type_fdist[typ] + self._type_fdist[typ[:-1]]
        if typ in self._params.abbrev_types or count >= self.ABBREV_BACKOFF:
            return False

        # Record this token as an abbreviation if the next token is a
        # sentence-internal punctuation mark.
        if next_tok.tok[:1] in self.INTERNAL_PUNCTUATION:
            return True

        # Record this type as an abbreviation if the next token (i)
        # starts with a lower case letter, (ii) sometimes occurs with
        # upper case, and (iii) never occurs with upper case
        # sentence-internally.
        elif next_tok.first_lower:
            typ2 = next_tok.type_no_sentperiod
            typ2ortho_context = self._params.ortho_context[typ2]
            if ((typ2ortho_context & _ORTHO_BEG_UC) and
                    not (typ2ortho_context & _ORTHO_MID_UC)):
                return True

    @staticmethod
    def _dunning_log_likelihood(count_a, count_b, count_ab, N):
        """
        A function that calculates the modified Dunning log-likelihood
        ratio scores for abbreviation candidates. The details of how
        this works are available in the paper.
        """
        p1 = float(count_b) / N
        p2 = 0.99

        null_hypo = (float(count_ab) * math.log(p1) +
                     (count_a - count_ab) * math.log(1.0 - p1))
        alt_hypo = (float(count_ab) * math.log(p2) +
                    (count_a - count_ab) * math.log(1.0 - p2))

        likelihood = null_hypo - alt_hypo

        return (-2.0 * likelihood)
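
    # Reading the code above: under the null hypothesis a period follows
    # the candidate word with the background probability p1 = count_b / N,
    # while the alternative fixes p2 = 0.99.  The returned score is the
    # usual -2 * (log L(H0) - log L(HA)), computed from binomial
    # log-likelihoods, so a high score favors treating word-plus-period
    # as a single abbreviation unit.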

    @staticmethod
    def _col_log_likelihood(count_a, count_b, count_ab, N):
        """
        A function that computes a log-likelihood estimate; in the
        original paper it is described in algorithms 6 and 7.

        This *should* be the original Dunning log-likelihood, unlike the
        previous function, which uses a modified Dunning log-likelihood.
        """
        p = 1.0 * count_b / N
        p1 = 1.0 * count_ab / count_a
        p2 = 1.0 * (count_b - count_ab) / (N - count_a)

        summand1 = (count_ab * math.log(p) +
                    (count_a - count_ab) * math.log(1.0 - p))

        summand2 = ((count_b - count_ab) * math.log(p) +
                    (N - count_a - count_b + count_ab) * math.log(1.0 - p))

        if count_a == count_ab:
            summand3 = 0
        else:
            summand3 = (count_ab * math.log(p1) +
                        (count_a - count_ab) * math.log(1.0 - p1))

        if count_b == count_ab:
            summand4 = 0
        else:
            summand4 = ((count_b - count_ab) * math.log(p2) +
                        (N - count_a - count_b + count_ab) *
                        math.log(1.0 - p2))

        likelihood = summand1 + summand2 - summand3 - summand4

        return (-2.0 * likelihood)
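
    # Here H0 posits a single probability p = count_b / N that the second
    # word follows any first word, while HA allows separate probabilities
    # p1 (following the first word) and p2 (elsewhere); the four summands
    # are the corresponding binomial log-likelihood terms.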

    def _find_collocations(self):
        """
        Generates likely collocations and their log-likelihood.
        """
        for types, col_count in self._collocation_fdist.iteritems():
            try:
                typ1, typ2 = types
            except TypeError:
                # types may be None after calling freq_threshold()
                continue
            if typ2 in self._params.sent_starters:
                continue

            typ1_count = (self._type_fdist[typ1] +
                          self._type_fdist[typ1 + '.'])
            typ2_count = (self._type_fdist[typ2] +
                          self._type_fdist[typ2 + '.'])
            if (typ1_count > 1 and typ2_count > 1
                    and self.MIN_COLLOC_FREQ <
                    col_count <= min(typ1_count, typ2_count)):

                ll = self._col_log_likelihood(typ1_count, typ2_count,
                                              col_count,
                                              self._type_fdist.N())

                # Filter out the not-so-collocative.
                if (ll >= self.COLLOCATION and
                        (float(self._type_fdist.N()) / typ1_count >
                         float(typ2_count) / col_count)):
                    yield (typ1, typ2), ll

    def _is_potential_sent_starter(self, cur_tok, prev_tok):
        """
        Returns True if, given a token and the token that precedes it, it
        seems clear that the token is beginning a sentence.
        """
        # If a token (i) is preceded by a sentence break that is not a
        # potential ordinal number or initial, and (ii) is alphabetic,
        # then it is a sentence-starter.
        return (prev_tok.sentbreak and
                not (prev_tok.is_number or prev_tok.is_initial) and
                cur_tok.is_alpha)

    def _find_sent_starters(self):
        """
        Uses collocation heuristics for each candidate token to
        determine if it frequently starts sentences.
        """
        for (typ, typ_at_break_count) in self._sent_starter_fdist.iteritems():
            if not typ:
                continue

            typ_count = self._type_fdist[typ] + self._type_fdist[typ + '.']
            if typ_count < typ_at_break_count:
                # needed after freq_threshold
                continue

            ll = self._col_log_likelihood(self._sentbreak_count, typ_count,
                                          typ_at_break_count,
                                          self._type_fdist.N())

            if (ll >= self.SENT_STARTER and
                    float(self._type_fdist.N()) / self._sentbreak_count >
                    float(typ_count) / typ_at_break_count):
                yield typ, ll

    def _get_sentbreak_count(self, tokens):
        """
        Returns the number of sentence breaks marked in a given set of
        augmented tokens.
        """
        return sum(1 for aug_tok in tokens if aug_tok.sentbreak)
1026 """
1027 A sentence tokenizer which uses an unsupervised algorithm to build
1028 a model for abbreviation words, collocations, and words that start
1029 sentences; and then uses that model to find sentence boundaries.
1030 This approach has been shown to work well for many European
1031 languages.
1032 """
1033 - def __init__(self, train_text=None, verbose=False):
1034 """
1035 train_text can either be the sole training text for this sentence
1036 boundary detector, or can be a PunktParameters object.
1037 """
1038 _PunktBaseClass.__init__(self)
1039
1040 if train_text:
1041 self._params = self.train(train_text, verbose)
1042

    def train(self, train_text, verbose=False):
        """
        Derives parameters from a given training text, or uses the
        parameters given. Repeated calls to this method destroy previous
        parameters. For incremental training, instantiate a separate
        PunktTrainer instance.
        """
        if not isinstance(train_text, basestring):
            return train_text
        return PunktTrainer(train_text).get_params()

    def tokenize(self, text, realign_boundaries=False):
        """
        Given a text, returns a list of the sentences in that text.
        """
        return list(self.sentences_from_text(text, realign_boundaries))
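
    # Typical entry point (hypothetical corpus string ``train_text``):
    #
    #     >>> pst = PunktSentenceTokenizer(train_text)
    #     >>> pst.tokenize('(How about this.) It works.',
    #     ...              realign_boundaries=True)
    #     ['(How about this.)', 'It works.']
    #
    # With realign_boundaries=False the closing parenthesis would start
    # the second sentence, as described in _realign_boundaries() below.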

    def sentences_from_text(self, text, realign_boundaries=False):
        """
        Given a text, generates the sentences in that text by only
        testing candidate sentence breaks. If realign_boundaries is
        True, includes in the sentence closing punctuation that
        follows the period.
        """
        sents = self._sentences_from_text(text)
        if realign_boundaries:
            sents = self._realign_boundaries(sents)
        return sents

    def _sentences_from_text(self, text):
        last_break = 0
        for match in _punkt_period_context_regexp.finditer(text):
            if self.text_contains_sentbreak(match.group(0)):
                yield text[last_break:match.end(1)]
                if match.group(3):
                    # next sentence starts after whitespace
                    last_break = match.start(3)
                else:
                    # next sentence starts at following punctuation
                    last_break = match.start(2)
        yield text[last_break:]

    def _realign_boundaries(self, sents):
        """
        Attempts to realign punctuation that falls after the period but
        should otherwise be included in the same sentence.

        For example: "(Sent1.) Sent2." will otherwise be split as::

            ["(Sent1.", ") Sent2."].

        This method will produce::

            ["(Sent1.)", "Sent2."].
        """
        realign = 0
        for s1, s2 in self.pair_iter(sents):
            s1 = s1[realign:]
            if not s2:
                if s1:
                    yield s1
                continue

            m = _RE_BOUNDARY_REALIGNMENT.match(s2)
            if m:
                yield s1 + m.group(0).strip()
                realign = m.end()
            else:
                realign = 0
                if s1:
                    yield s1

    def text_contains_sentbreak(self, text):
        """
        Returns True if the given text includes a sentence break.
        """
        found = False
        for t in self._annotate_tokens(self._tokenize_words(text)):
            if found:
                # A break was identified before the final token.
                return True
            if t.sentbreak:
                found = True
        return False

    def sentences_from_text_legacy(self, text):
        """
        Given a text, generates the sentences in that text. Annotates all
        tokens, rather than just those with possible sentence breaks.
        Should produce the same results as L{sentences_from_text}.
        """
        tokens = self._annotate_tokens(self._tokenize_words(text))
        return self._build_sentence_list(text, tokens)

    def sentences_from_tokens(self, tokens):
        """
        Given a sequence of tokens, generates lists of tokens, each list
        corresponding to a sentence.
        """
        tokens = iter(self._annotate_tokens(self._Token(t) for t in tokens))
        sentence = []
        for aug_tok in tokens:
            sentence.append(aug_tok.tok)
            if aug_tok.sentbreak:
                yield sentence
                sentence = []
        if sentence:
            yield sentence

    def _annotate_tokens(self, tokens):
        """
        Given a set of tokens augmented with markers for line-start and
        paragraph-start, returns an iterator through those tokens with
        full annotation including predicted sentence breaks.
        """
        # Make a preliminary pass through the document, marking likely
        # sentence breaks, abbreviations, and ellipsis tokens.
        tokens = self._annotate_first_pass(tokens)

        # Make a second pass through the document, using token context
        # information to change our preliminary decisions about where
        # sentence breaks, abbreviations, and ellipsis occur.
        tokens = self._annotate_second_pass(tokens)

        return tokens

    def _build_sentence_list(self, text, tokens):
        """
        Given the original text and the list of augmented word tokens,
        construct and return a tokenized list of sentence strings.
        """
        # Most of the work here is making sure that we put the right
        # pieces of whitespace back in all the right places.

        # Our position in the source text, used to keep track of which
        # whitespace to add:
        pos = 0

        # A regular expression that finds pieces of whitespace:
        WS_REGEXP = re.compile(r'\s*')

        sentence = ''
        for aug_tok in tokens:
            tok = aug_tok.tok

            # Find the whitespace before this token, and update pos.
            ws = WS_REGEXP.match(text, pos).group()
            pos += len(ws)

            # Some of the rules used by the punkt word tokenizer
            # strip whitespace out of the text, resulting in tokens
            # that contain whitespace in the source text. If our
            # token doesn't match, see if adding whitespace helps.
            # If so, then use the version with whitespace.
            if text[pos:pos + len(tok)] != tok:
                pat = r'\s*'.join(re.escape(c) for c in tok)
                m = re.compile(pat).match(text, pos)
                if m:
                    tok = m.group()

            # Move our position pointer to the end of the token.
            assert text[pos:pos + len(tok)] == tok
            pos += len(tok)

            # Add this token. If it's not at the beginning of the
            # sentence, then include any whitespace that separated it
            # from the previous token.
            if sentence:
                sentence += ws + tok
            else:
                sentence += tok

            # If we're at a sentence break, then start a new sentence.
            if aug_tok.sentbreak:
                yield sentence
                sentence = ''

        # If the last sentence is empty, discard it.
        if sentence:
            yield sentence

    def dump(self, tokens):
        print('writing to /tmp/punkt.new...')
        out = open('/tmp/punkt.new', 'w')
        for aug_tok in tokens:
            if aug_tok.parastart:
                out.write('\n\n')
            elif aug_tok.linestart:
                out.write('\n')
            else:
                out.write(' ')

            out.write(str(aug_tok))
        out.close()

    PUNCTUATION = tuple(';:,.!?')

    def _annotate_second_pass(self, tokens):
        """
        Performs a token-based classification (section 4) over the given
        tokens, making use of the orthographic heuristic (4.1.1),
        collocation heuristic (4.1.2) and frequent sentence starter
        heuristic (4.1.3).
        """
        for t1, t2 in self.pair_iter(tokens):
            self._second_pass_annotation(t1, t2)
            yield t1

    def _second_pass_annotation(self, aug_tok1, aug_tok2):
        """
        Performs token-based classification over a pair of contiguous
        tokens, updating the first token in place.
        """
        # Is it the last token? We can't do anything then.
        if not aug_tok2:
            return

        tok = aug_tok1.tok
        if not aug_tok1.period_final:
            # We only care about words ending in periods.
            return

        typ = aug_tok1.type_no_period
        next_tok = aug_tok2.tok
        next_typ = aug_tok2.type_no_sentperiod
        tok_is_initial = aug_tok1.is_initial

        # [4.1.2. Collocation Heuristic] If there's a collocation
        # between the word before and after the period, then label tok
        # as an abbreviation and NOT a sentence break. Note that
        # collocations with frequent sentence starters as their second
        # word are excluded in training.
        if (typ, next_typ) in self._params.collocations:
            aug_tok1.sentbreak = False
            aug_tok1.abbr = True
            return

        # [4.2. Token-Based Reclassification of Abbreviations] If the
        # token is an abbreviation or an ellipsis, then decide whether
        # we should *also* classify it as a sentbreak.
        if ((aug_tok1.abbr or aug_tok1.ellipsis) and
                (not tok_is_initial)):
            # [4.1.1. Orthographic Heuristic] Check if there's
            # orthographic evidence about whether the next word starts
            # a sentence or not.
            is_sent_starter = self._ortho_heuristic(aug_tok2)
            if is_sent_starter is True:
                aug_tok1.sentbreak = True
                return

            # [4.1.3. Frequent Sentence Starter Heuristic] If the next
            # word is capitalized, and is a member of the
            # frequent-sentence-starters list, then label tok as a
            # sentence break.
            if (aug_tok2.first_upper and
                    next_typ in self._params.sent_starters):
                aug_tok1.sentbreak = True
                return

        # [4.3. Token-Based Detection of Initials and Ordinals] Check
        # if any initial or ordinal tokens that are marked as
        # sentbreaks should be reclassified as abbreviations.
        if tok_is_initial or typ == '##number##':

            # [4.1.1. Orthographic Heuristic] Check if there's
            # orthographic evidence about whether the next word starts
            # a sentence or not.
            is_sent_starter = self._ortho_heuristic(aug_tok2)

            if is_sent_starter is False:
                aug_tok1.sentbreak = False
                aug_tok1.abbr = True
                return

            # Special heuristic for initials: if the orthographic
            # heuristic is unknown, and the next word is always
            # capitalized, then mark as abbrev (eg: J. Bach).
            if (is_sent_starter == 'unknown' and tok_is_initial and
                    aug_tok2.first_upper and
                    not (self._params.ortho_context[next_typ] & _ORTHO_LC)):
                aug_tok1.sentbreak = False
                aug_tok1.abbr = True
                return

        return
1345 """
1346 Decide whether the given token is the first token in a sentence.
1347 """
1348
1349 if aug_tok.tok in self.PUNCTUATION:
1350 return False
1351
1352 ortho_context = self._params.ortho_context[aug_tok.type_no_sentperiod]
1353
1354
1355
1356
1357 if ( aug_tok.first_upper and
1358 (ortho_context & _ORTHO_LC) and
1359 not (ortho_context & _ORTHO_MID_UC) ):
1360 return True
1361
1362
1363
1364
1365
1366 if ( aug_tok.first_lower and
1367 ((ortho_context & _ORTHO_UC) or
1368 not (ortho_context & _ORTHO_BEG_LC)) ):
1369 return False
1370
1371
1372 return 'unknown'
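
    # Rough sketch of the heuristic (hypothetical training state):
    #
    #     >>> pst = PunktSentenceTokenizer()
    #     >>> pst._params.add_ortho_context('foo', _ORTHO_BEG_UC)
    #     >>> pst._ortho_heuristic(PunktToken('Foo'))
    #     'unknown'
    #     >>> pst._ortho_heuristic(PunktToken('foo'))
    #     False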