nltk.tag.util

1 # Natural Language Toolkit: Tagger Utilities 2 # 3 # Copyright (C) 2001-2008 NLTK Project 4 # Author: Edward Loper <[email protected]> 5 # Steven Bird <[email protected]> 6 # URL: <http://nltk.org> 7 # For license information, see LICENSE.TXT 8 9 import re 10 11 from nltk.internals import deprecated 12

def str2tuple(s, sep='/'):
    """
    Given the string representation of a tagged token, return the
    corresponding tuple representation.  The rightmost occurrence of
    C{sep} in C{s} is used to divide C{s} into a word string and a
    tag string; the tag string is converted to upper case.  If
    C{sep} does not occur in C{s}, return C{(s, None)}.

        >>> str2tuple('fly/nn')
        ('fly', 'NN')
        >>> str2tuple('fly')
        ('fly', None)

    @type s: C{str}
    @param s: The string representation of a tagged token.
    @type sep: C{str}
    @param sep: The separator string used to separate word strings
        from tags.  It may be longer than one character.
    @rtype: C{tuple}
    @return: A C{(word, tag)} pair, where C{tag} is C{None} if
        C{sep} was not found in C{s}.
    """
    loc = s.rfind(sep)
    if loc < 0:
        return (s, None)
    # Skip the entire separator (not just one character) so that
    # multi-character separators do not leak into the tag.
    return (s[:loc], s[loc + len(sep):].upper())

32

def tuple2str(tagged_token, sep='/'):
    """
    Build the string representation of a tagged token from its
    tuple representation.  The result is the token's word string,
    then the separator, then the token's tag.  A token whose tag is
    C{None} is rendered as its bare word, with no separator.

    @type tagged_token: C{(str, str)}
    @param tagged_token: The tuple representation of a tagged token.
    @type sep: C{str}
    @param sep: The separator string placed between the word string
        and the tag.
    """
    token_word, token_tag = tagged_token
    if token_tag is not None:
        # A tag containing the separator could not be split back out
        # of the resulting string unambiguously.
        assert sep not in token_tag, 'tag may not contain sep!'
        return '%s%s%s' % (token_word, sep, token_tag)
    return token_word

53

def untag(tagged_sentence):
    """
    Strip the tags from a tagged sentence.  The result is a list
    holding the first element of each C{(word, tag)} tuple in
    C{tagged_sentence}, in the original order.

        >>> untag([('John', 'NNP'), ('saw', 'VBD'), ('Mary', 'NNP')])
        ['John', 'saw', 'Mary']

    @param tagged_sentence: An iterable of C{(word, tag)} tuples.
    @rtype: C{list}
    """
    return [word for (word, _tag) in tagged_sentence]

64 65 from nltk import evaluate

def accuracy(tagger, gold):
    """
    Score the accuracy of the tagger against the gold standard.
    Strip the tags from the gold standard text, retag it using
    the tagger, then compute the accuracy score.

    @type tagger: C{TaggerI}
    @param tagger: The tagger being evaluated.  Its C{batch_tag}
        method is called with the list of untagged sentences.
    @type gold: C{list} of C{list} of C{(word, tag)} tuples
    @param gold: The tagged sentences to score the tagger on.
    @rtype: C{float}
    @return: The value returned by C{evaluate.accuracy} for the
        flattened gold tokens and the flattened tagger output.
    """
    tagged_sents = tagger.batch_tag([untag(sent) for sent in gold])
    # Flatten in a single pass.  sum(list_of_lists, []) builds a new
    # list for every sentence it adds, which is quadratic in the
    # total number of tokens.
    gold_tokens = [token for sent in gold for token in sent]
    test_tokens = [token for sent in tagged_sents for token in sent]
    return evaluate.accuracy(gold_tokens, test_tokens)

######################################################################
#{ Deprecated
######################################################################

@deprecated("Use nltk.tag.str2tuple(s, sep) instead.")
def tag2tuple(s, sep='/'):
    """
    Deprecated alias for L{str2tuple}: forwards C{s} and C{sep}
    unchanged and returns its result.

    @type s: C{str}
    @param s: The string representation of a tagged token.
    @type sep: C{str}
    @param sep: The separator between the word string and the tag.
    """
    return str2tuple(s, sep=sep)

@deprecated("Use [nltk.tag.str2tuple(t, sep) for t in s.split()] instead.")
def string2tags(s, sep='/'):
    """
    Deprecated.  Split C{s} on whitespace and convert each piece to
    a tuple with L{str2tuple}.

    @type s: C{str}
    @param s: A whitespace-separated string of tagged tokens.
    @type sep: C{str}
    @param sep: The separator between each word string and its tag.
    @rtype: C{list}
    """
    token_strings = s.split()
    return [str2tuple(token_string, sep) for token_string in token_strings]

@deprecated("Use ' '.join(nltk.tag.tuple2str(w, sep) for w in t) instead.")
def tags2string(t, sep='/'):
    """
    Deprecated.  Convert each tagged token in C{t} to a string with
    L{tuple2str} and join the results with single spaces.

    @param t: An iterable of tagged-token tuples.
    @type sep: C{str}
    @param sep: The separator between each word string and its tag.
    @rtype: C{str}
    """
    token_strings = [tuple2str(tagged_token, sep) for tagged_token in t]
    return ' '.join(token_strings)

@deprecated("Use [nltk.tag.str2tuple(t, sep)[0] for t in s.split()] instead.")
def string2words(s, sep='/'):
    """
    Deprecated.  Split C{s} on whitespace, convert each piece with
    L{str2tuple}, and keep only the first element of each result
    (the word string).

    @type s: C{str}
    @param s: A whitespace-separated string of tagged tokens.
    @type sep: C{str}
    @param sep: The separator between each word string and its tag.
    @rtype: C{list}
    """
    words = []
    for token_string in s.split():
        tagged_token = str2tuple(token_string, sep)
        words.append(tagged_token[0])
    return words

101

Source Code for Module nltk.tag.util