1
2
3
4
5
6
7
8
9 import re
10
11 from nltk.internals import deprecated
12
# NOTE(review): the ``def`` line is absent from this listing.  The name and
# parameter names are reconstructed from the docstring and from the
# deprecation notice that points at ``nltk.tag.str2tuple(s, sep)``; the
# ``'/'`` default for ``sep`` is an assumption -- confirm against the
# original signature.
def str2tuple(s, sep='/'):
    """
    Given the string representation of a tagged token, return the
    corresponding tuple representation.  The rightmost occurrence of
    C{sep} in C{s} will be used to divide C{s} into a word string and
    a tag string; the tag string is converted to upper case.  If
    C{sep} does not occur in C{s}, return C{(s, None)}.

    @type s: C{str}
    @param s: The string representation of a tagged token.
    @type sep: C{str}
    @param sep: The separator string used to separate word strings
        from tags.  It may be longer than one character.
    @rtype: C{tuple}
    @return: A C{(word, tag)} pair, where C{tag} is C{None} if C{s}
        contains no separator.
    """
    loc = s.rfind(sep)
    if loc < 0:
        return (s, None)
    # Skip the whole separator, not just one character, so that a
    # multi-character C{sep} does not leak into the tag.
    return (s[:loc], s[loc + len(sep):].upper())
32
# NOTE(review): the ``def`` line is absent from this listing.  The name and
# parameter names are reconstructed from the docstring and from the
# deprecation notice that points at ``nltk.tag.tuple2str(w, sep)``; the
# ``'/'`` default for ``sep`` is an assumption -- confirm against the
# original signature.
def tuple2str(tagged_token, sep='/'):
    """
    Given the tuple representation of a tagged token, return the
    corresponding string representation.  This representation is
    formed by concatenating the token's word string, followed by the
    separator, followed by the token's tag.  (If the tag is C{None},
    then just return the bare word string.)

    @type tagged_token: C{(str, str)}
    @param tagged_token: The tuple representation of a tagged token.
    @type sep: C{str}
    @param sep: The separator string used to separate word strings
        from tags.
    @rtype: C{str}
    @return: C{word + sep + tag}, or C{word} alone when the tag is
        C{None}.
    """
    word, tag = tagged_token
    if tag is None:
        return word
    # A tag containing the separator could not be split back apart
    # unambiguously.  The check is kept as an assertion so the
    # exception type seen by callers is unchanged.
    assert sep not in tag, 'tag may not contain sep!'
    return '%s%s%s' % (word, sep, tag)
53
def untag(tagged_sentence):
    """
    Given a tagged sentence, return an untagged version of that
    sentence.  I.e., return a list containing the first element
    of each tuple in C{tagged_sentence}.

        >>> untag([('John', 'NNP'), ('saw', 'VBD'), ('Mary', 'NNP')])
        ['John', 'saw', 'Mary']

    @param tagged_sentence: An iterable of two-element C{(word, tag)}
        tuples.
    @rtype: C{list}
    @return: The words of C{tagged_sentence}, in their original order.
    """
    # Each element is unpacked as a pair; only the word is kept.
    return [w for (w, t) in tagged_sentence]
64
65 from nltk import evaluate
67 """
68 Score the accuracy of the tagger against the gold standard.
69 Strip the tags from the gold standard text, retag it using
70 the tagger, then compute the accuracy score.
71
72 @type tagger: C{TaggerI}
73 @param tagger: The tagger being evaluated.
74 @type gold: C{list} of tagged sentences (each a C{list} of C{(word, tag)} tuples)
75 @param gold: The list of tagged sentences to score the tagger on.
76 @rtype: C{float}
77 """
78 tagged_sents = tagger.batch_tag([untag(sent) for sent in gold])
79 gold_tokens = sum(gold, [])
80 test_tokens = sum(tagged_sents, [])
81 return evaluate.accuracy(gold_tokens, test_tokens)
82
83
84
85
86 @deprecated("Use nltk.tag.str2tuple(s, sep) instead.")
89
90 @deprecated("Use [nltk.tag.str2tuple(t, sep) for t in s.split()] instead.")
93
94 @deprecated("Use ' '.join(nltk.tag.tuple2str(w, sep) for w in t) instead.")
97
98 @deprecated("Use [nltk.tag.str2tuple(t, sep)[0] for t in s.split()] instead.")
101