Package nltk :: Package corpus :: Package reader :: Module nps_chat
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.nps_chat

 1  # Natural Language Toolkit: NPS Chat Corpus Reader 
 2  # 
 3  # Copyright (C) 2001-2008 NLTK Project 
 4  # Author: Edward Loper <[email protected]> 
 5  # URL: <http://nltk.org> 
 6  # For license information, see LICENSE.TXT 
 7   
 8  from nltk.compat import * 
 9  from nltk.corpus.reader.util import * 
10  from nltk.corpus.reader.api import * 
11  from nltk.corpus.reader.xmldocs import * 
12  from nltk.util import LazyConcatenation 
13  from nltk.internals import ElementWrapper 
14  import re, textwrap 
15   
class NPSChatCorpusReader(XMLCorpusReader):
    """
    Corpus reader for NPS Chat Corpus XML files.

    Posts are located with the tag path ``Session/Posts/Post``.  The tokens
    of a post are the ``t`` children of the post's ``terminals`` element;
    each token's text is read from its ``word`` attribute and its tag from
    its ``pos`` attribute.  User names appearing in token text are
    shortened by L{_simplify_username}.
    """

    def __init__(self, root, files, wrap_etree=False, tag_mapping_function=None):
        """
        Create a reader; C{root}, C{files} and C{wrap_etree} are passed
        through unchanged to C{XMLCorpusReader.__init__}.

        @param tag_mapping_function: Optional callable taking a tag and
            returning the simplified tag.  It is only applied when a
            tagged accessor is called with C{simplify_tags=True}.
        """
        XMLCorpusReader.__init__(self, root, files, wrap_etree)
        self._tag_mapping_function = tag_mapping_function

    def xml_posts(self, files=None):
        """
        Return the concatenation of one C{XMLCorpusView} per file, each
        selecting the ``Session/Posts/Post`` elements.

        When C{self._wrap_etree} is true (presumably set by the base class
        from the C{wrap_etree} constructor argument -- not visible here),
        every element is passed through L{_wrap_elt}; otherwise the views
        are built without an element handler.
        """
        # Only supply a handler when wrapping was requested, so that the
        # unwrapped case keeps XMLCorpusView's own default handling.
        if self._wrap_etree:
            handler_args = (self._wrap_elt,)
        else:
            handler_args = ()
        return concat([XMLCorpusView(filename, 'Session/Posts/Post',
                                     *handler_args)
                       for filename in self.abspaths(files)])

    def posts(self, files=None):
        """
        Return the concatenation of one C{XMLCorpusView} per file over the
        ``Session/Posts/Post/terminals`` elements, each converted to a list
        of word strings by L{_elt_to_words}.
        """
        return concat([XMLCorpusView(filename,
                                     'Session/Posts/Post/terminals',
                                     self._elt_to_words)
                       for filename in self.abspaths(files)])

    def tagged_posts(self, files=None, simplify_tags=False):
        """
        Like L{posts}, but each post is a list of C{(word, tag)} tuples
        produced by L{_elt_to_tagged_words}.

        @param simplify_tags: If true, tags are passed through the reader's
            tag mapping function (when one was supplied).
        """
        # XMLCorpusView invokes its handler as handler(elt, context), so
        # the simplify_tags flag is bound here through a closure.
        def reader(elt, handler):
            return self._elt_to_tagged_words(elt, handler, simplify_tags)

        return concat([XMLCorpusView(filename,
                                     'Session/Posts/Post/terminals',
                                     reader)
                       for filename in self.abspaths(files)])

    def words(self, files=None):
        """
        Return the words of all posts as one flat sequence, by wrapping
        L{posts} in a C{LazyConcatenation}.
        """
        return LazyConcatenation(self.posts(files))

    def tagged_words(self, files=None, simplify_tags=False):
        """
        Return the C{(word, tag)} tuples of all posts as one flat sequence,
        by wrapping L{tagged_posts} in a C{LazyConcatenation}.
        """
        return LazyConcatenation(self.tagged_posts(files, simplify_tags))

    def _wrap_elt(self, elt, handler):
        """Element handler: wrap C{elt} in an C{ElementWrapper}."""
        return ElementWrapper(elt)

    def _elt_to_words(self, elt, handler):
        """
        Element handler: return the ``word`` attribute of every ``t`` child
        of C{elt}, with user names simplified.
        """
        return [self._simplify_username(t.attrib['word'])
                for t in elt.findall('t')]

    def _elt_to_tagged_words(self, elt, handler, simplify_tags=False):
        """
        Element handler: return a list of C{(word, tag)} tuples built from
        the ``word`` and ``pos`` attributes of every ``t`` child of C{elt},
        with user names simplified.

        If C{simplify_tags} is true and a tag mapping function was given to
        the constructor, each tag is replaced by the function's result.  If
        no mapping function was given, the original tags are returned.
        """
        tagged_post = [(self._simplify_username(t.attrib['word']),
                        t.attrib['pos'])
                       for t in elt.findall('t')]
        mapper = self._tag_mapping_function
        # The mapping function defaults to None; calling it unguarded would
        # raise TypeError, so fall back to the unmapped tags instead.
        if simplify_tags and mapper is not None:
            tagged_post = [(w, mapper(t)) for (w, t) in tagged_post]
        return tagged_post

    @staticmethod
    def _simplify_username(word):
        """
        Shorten a user name token.

        If C{word} contains ``'User'``, return ``'U'`` followed by the text
        after the first occurrence of ``'User'``; anything before that
        occurrence is dropped (e.g. ``'10-19-20sUser7'`` becomes ``'U7'``).
        Any other word is returned unchanged.
        """
        if 'User' in word:
            word = 'U' + word.split('User', 1)[1]
        return word
69