# Natural Language Toolkit: Texts
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <[email protected]>
#         Edward Loper <[email protected]>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
This module brings together a variety of NLTK functionality for
text analysis, and provides simple, interactive interfaces.
Functionality includes: concordancing, collocation discovery,
regular expression search over tokenized strings, and
distributional similarity.
"""
from __future__ import print_function, division, unicode_literals, absolute_import

from math import log
from collections import defaultdict, Counter, namedtuple
from functools import reduce
import re

from six import text_type

from nltk.probability import FreqDist
from nltk.probability import ConditionalFreqDist as CFD
from nltk.util import tokenwrap, LazyConcatenation
from nltk.metrics import f_measure, BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
from nltk.compat import python_2_unicode_compatible

ConcordanceLine = namedtuple(
    'ConcordanceLine',
    ['left', 'query', 'right', 'offset', 'left_print', 'right_print', 'line'],
)


class ContextIndex(object):
    """
    A bidirectional index between words and their 'contexts' in a text.
    The context of a word is usually defined to be the words that occur
    in a fixed window around the word; but other definitions may also
    be used by providing a custom context function.
    """

    @staticmethod
    def _default_context(tokens, i):
        """One left token and one right token, normalized to lowercase"""
        left = tokens[i - 1].lower() if i != 0 else '*START*'
        right = tokens[i + 1].lower() if i != len(tokens) - 1 else '*END*'
        return (left, right)

    def __init__(self, tokens, context_func=None, filter=None, key=lambda x: x):
        self._key = key
        self._tokens = tokens
        if context_func:
            self._context_func = context_func
        else:
            self._context_func = self._default_context
        if filter:
            tokens = [t for t in tokens if filter(t)]
        self._word_to_contexts = CFD(
            (self._key(w), self._context_func(tokens, i))
            for i, w in enumerate(tokens)
        )
        self._context_to_words = CFD(
            (self._context_func(tokens, i), self._key(w))
            for i, w in enumerate(tokens)
        )

    def tokens(self):
        """
        :rtype: list(str)
        :return: The document that this context index was created from.
        """
        return self._tokens

    def word_similarity_dict(self, word):
        """
        Return a dictionary mapping from words to 'similarity scores',
        indicating how much each word's contexts overlap with the given
        word's contexts (scored as the f-measure of the two context sets).
        """
        word = self._key(word)
        word_contexts = set(self._word_to_contexts[word])

        scores = {}
        for w, w_contexts in self._word_to_contexts.items():
            scores[w] = f_measure(word_contexts, set(w_contexts))

        return scores

    def similar_words(self, word, n=20):
        """
        Return up to ``n`` words that appear in contexts shared with
        ``word``, ranked by the sum, over those shared contexts, of the
        product of the two words' frequencies in each context.
        """
        scores = defaultdict(int)
        for c in self._word_to_contexts[self._key(word)]:
            for w in self._context_to_words[c]:
                if w != word:
                    scores[w] += (
                        self._context_to_words[c][word] * self._context_to_words[c][w]
                    )
        return sorted(scores, key=scores.get, reverse=True)[:n]

    def common_contexts(self, words, fail_on_unknown=False):
        """
        Find contexts where the specified words can all appear; and
        return a frequency distribution mapping each context to the
        number of times that context was used.

        :param words: The words used to seed the similarity search
        :type words: list(str)
        :param fail_on_unknown: If true, then raise a value error if
            any of the given words do not occur at all in the index.
        """
        words = [self._key(w) for w in words]
        contexts = [set(self._word_to_contexts[w]) for w in words]
        empty = [words[i] for i in range(len(words)) if not contexts[i]]
        common = reduce(set.intersection, contexts)
        if empty and fail_on_unknown:
            raise ValueError("The following word(s) were not found:", " ".join(empty))
        elif not common:
            # nothing in common -- just return an empty freqdist.
            return FreqDist()
        else:
            fd = FreqDist(
                c for w in words for c in self._word_to_contexts[w] if c in common
            )
            return fd

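
# --- Illustrative usage sketch, not part of the original NLTK module. ---
# A minimal example of querying a ContextIndex; the helper name is
# hypothetical.  With the default context function, 'cat' and 'dog' below
# share the context ('the', 'sat').
def _context_index_demo():
    tokens = "the cat sat on the mat and the dog sat on the rug".split()
    index = ContextIndex(tokens, key=lambda s: s.lower())
    print(index.similar_words('cat', n=3))        # ['dog']
    print(index.common_contexts(['cat', 'dog']))  # ('the', 'sat') counted twice
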

@python_2_unicode_compatible
class ConcordanceIndex(object):
    """
    An index that can be used to look up the offset locations at which
    a given word occurs in a document.
    """

    def __init__(self, tokens, key=lambda x: x):
        """
        Construct a new concordance index.

        :param tokens: The document (list of tokens) that this
            concordance index was created from.  This list can be used
            to access the context of a given word occurrence.
        :param key: A function that maps each token to a normalized
            version that will be used as a key in the index.  E.g., if
            you use ``key=lambda s:s.lower()``, then the index will be
            case-insensitive.
        """
        self._tokens = tokens
        """The document (list of tokens) that this concordance index
           was created from."""

        self._key = key
        """Function mapping each token to an index key (or None)."""

        self._offsets = defaultdict(list)
        """Dictionary mapping words (or keys) to lists of offset indices."""

        # Initialize the index (self._offsets)
        for index, word in enumerate(tokens):
            word = self._key(word)
            self._offsets[word].append(index)

    def tokens(self):
        """
        :rtype: list(str)
        :return: The document that this concordance index was
            created from.
        """
        return self._tokens

    def offsets(self, word):
        """
        :rtype: list(int)
        :return: A list of the offset positions at which the given
            word occurs.  If a key function was specified for the
            index, then the given word's key will be looked up.
        """
        word = self._key(word)
        return self._offsets[word]

    def __repr__(self):
        return '<ConcordanceIndex for %d tokens (%d types)>' % (
            len(self._tokens),
            len(self._offsets),
        )

    def find_concordance(self, word, width=80):
        """
        Find all concordance lines given the query word.
        """
        half_width = (width - len(word) - 2) // 2
        context = width // 4  # approx number of words of context

        # Find the instances of the word to create the ConcordanceLine
        concordance_list = []
        offsets = self.offsets(word)
        if offsets:
            for i in offsets:
                query_word = self._tokens[i]
                # Find the context of query word.
                left_context = self._tokens[max(0, i - context) : i]
                right_context = self._tokens[i + 1 : i + context]
                # Create the pretty lines with the query_word in the middle.
                left_print = ' '.join(left_context)[-half_width:]
                right_print = ' '.join(right_context)[:half_width]
                # The WYSIWYG line of the concordance.
                line_print = ' '.join([left_print, query_word, right_print])
                # Create the ConcordanceLine
                concordance_line = ConcordanceLine(
                    left_context,
                    query_word,
                    right_context,
                    i,
                    left_print,
                    right_print,
                    line_print,
                )
                concordance_list.append(concordance_line)
        return concordance_list

    def print_concordance(self, word, width=80, lines=25):
        """
        Print concordance lines given the query word.

        :param word: The target word
        :type word: str
        :param lines: The number of lines to display (default=25)
        :type lines: int
        :param width: The width of each line, in characters (default=80)
        :type width: int
        """
        concordance_list = self.find_concordance(word, width=width)

        if not concordance_list:
            print("no matches")
        else:
            lines = min(lines, len(concordance_list))
            print("Displaying {} of {} matches:".format(lines, len(concordance_list)))
            for i, concordance_line in enumerate(concordance_list[:lines]):
                print(concordance_line.line)

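
# --- Illustrative usage sketch, not part of the original NLTK module. ---
# Shows how ConcordanceIndex records token offsets and prints aligned
# concordance lines; the helper name is hypothetical.
def _concordance_index_demo():
    tokens = "the cat sat on the mat and the dog sat on the rug".split()
    index = ConcordanceIndex(tokens, key=lambda s: s.lower())
    print(index.offsets('sat'))              # [2, 9]
    index.print_concordance('sat', width=40)
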

class TokenSearcher(object):
    """
    A class that makes it easier to use regular expressions to search
    over tokenized strings.  The tokenized string is converted to a
    string where tokens are marked with angle brackets -- e.g.,
    ``'<the><window><is><still><open>'``.  The regular expression
    passed to the ``findall()`` method is modified to treat angle
    brackets as non-capturing parentheses, in addition to matching the
    token boundaries; and to have ``'.'`` not match the angle brackets.
    """

    def __init__(self, tokens):
        self._raw = ''.join('<' + w + '>' for w in tokens)

    def findall(self, regexp):
        """
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> from nltk.text import TokenSearcher
        >>> print('hack'); from nltk.book import text1, text5, text9
        hack...
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        """
        # preprocess the regular expression
        regexp = re.sub(r'\s', '', regexp)
        regexp = re.sub(r'<', '(?:<(?:', regexp)
        regexp = re.sub(r'>', ')>)', regexp)
        regexp = re.sub(r'(?<!\\)\.', '[^>]', regexp)

        # perform the search
        hits = re.findall(regexp, self._raw)

        # Sanity check: every hit must be a bracketed token sequence.
        for h in hits:
            if not (h.startswith('<') and h.endswith('>')):
                raise ValueError('Bad regexp for TokenSearcher.findall')

        # postprocess the output
        hits = [h[1:-1].split('><') for h in hits]
        return hits

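
# --- Illustrative usage sketch, not part of the original NLTK module. ---
# Demonstrates the angle-bracket query syntax accepted by
# TokenSearcher.findall(); the helper name is hypothetical.  Each <...>
# matches exactly one token, and '.' is rewritten so it cannot cross a
# token boundary.
def _token_searcher_demo():
    tokens = "the cat sat on the mat and the dog sat on the rug".split()
    searcher = TokenSearcher(tokens)
    # Two-token matches beginning with 'the':
    print(searcher.findall("<the><.*>"))
    # [['the', 'cat'], ['the', 'mat'], ['the', 'dog'], ['the', 'rug']]
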

@python_2_unicode_compatible
class Text(object):
    """
    A wrapper around a sequence of simple (string) tokens, which is
    intended to support initial exploration of texts (via the
    interactive console).  Its methods perform a variety of analyses
    on the text's contexts (e.g., counting, concordancing, collocation
    discovery), and display the results.  If you wish to write a
    program which makes use of these analyses, then you should bypass
    the ``Text`` class, and use the appropriate analysis function or
    class directly instead.

    A ``Text`` is typically initialized from a given document or
    corpus.  E.g.:

    >>> import nltk.corpus
    >>> from nltk.text import Text
    >>> moby = Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))

    """

    # This defeats lazy loading, but makes things faster.  This
    # *shouldn't* be necessary because the corpus view *should* be
    # doing intelligent caching, but without this it's running slow.
    # Look into whether the caching is working correctly.
    _COPY_TOKENS = True

    def __init__(self, tokens, name=None):
        """
        Create a Text object.

        :param tokens: The source text.
        :type tokens: sequence of str
        """
        if self._COPY_TOKENS:
            tokens = list(tokens)
        self.tokens = tokens

        if name:
            self.name = name
        elif ']' in tokens[:20]:
            end = tokens[:20].index(']')
            self.name = " ".join(text_type(tok) for tok in tokens[1:end])
        else:
            self.name = " ".join(text_type(tok) for tok in tokens[:8]) + "..."

    # ////////////////////////////////////////////////////////////
    # Support item & slice access
    # ////////////////////////////////////////////////////////////

    def __getitem__(self, i):
        return self.tokens[i]

    def __len__(self):
        return len(self.tokens)

    # ////////////////////////////////////////////////////////////
    # Interactive console methods
    # ////////////////////////////////////////////////////////////

    def concordance(self, word, width=79, lines=25):
        """
        Prints a concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.

        :param word: The target word
        :type word: str
        :param width: The width of each line, in characters (default=79)
        :type width: int
        :param lines: The number of lines to display (default=25)
        :type lines: int

        :seealso: ``ConcordanceIndex``
        """
        if '_concordance_index' not in self.__dict__:
            self._concordance_index = ConcordanceIndex(
                self.tokens, key=lambda s: s.lower()
            )

        return self._concordance_index.print_concordance(word, width, lines)

    def concordance_list(self, word, width=79, lines=25):
        """
        Generate a concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.

        :param word: The target word
        :type word: str
        :param width: The width of each line, in characters (default=79)
        :type width: int
        :param lines: The number of lines to return (default=25)
        :type lines: int

        :seealso: ``ConcordanceIndex``
        """
        if '_concordance_index' not in self.__dict__:
            self._concordance_index = ConcordanceIndex(
                self.tokens, key=lambda s: s.lower()
            )
        return self._concordance_index.find_concordance(word, width)[:lines]

    def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        :seealso: find_collocations
        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        if not (
            '_collocations' in self.__dict__
            and self._num == num
            and self._window_size == window_size
        ):
            self._num = num
            self._window_size = window_size

            # print("Building collocations list")
            from nltk.corpus import stopwords

            ignored_words = stopwords.words('english')
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_freq_filter(2)
            finder.apply_word_filter(
                lambda w: len(w) < 3 or w.lower() in ignored_words
            )
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
        print(tokenwrap(colloc_strings, separator="; "))

    def count(self, word):
        """
        Count the number of times this word appears in the text.
        """
        return self.tokens.count(word)

    def index(self, word):
        """
        Find the index of the first occurrence of the word in the text.
        """
        return self.tokens.index(word)

    def readability(self, method):
        # code from nltk_contrib.readability
        raise NotImplementedError

    def similar(self, word, num=20):
        """
        Distributional similarity: find other words which appear in the
        same contexts as the specified word; list most similar words first.

        :param word: The word used to seed the similarity search
        :type word: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.similar_words()
        """
        if '_word_context_index' not in self.__dict__:
            # print('Building word-context index...')
            self._word_context_index = ContextIndex(
                self.tokens, filter=lambda x: x.isalpha(), key=lambda s: s.lower()
            )

        # words = self._word_context_index.similar_words(word, num)

        word = word.lower()
        wci = self._word_context_index._word_to_contexts
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = Counter(
                w
                for w in wci.conditions()
                for c in wci[w]
                if c in contexts and not w == word
            )
            words = [w for w, _ in fd.most_common(num)]
            print(tokenwrap(words))
        else:
            print("No matches")

    def common_contexts(self, words, num=20):
        """
        Find contexts where the specified words appear; list
        most frequent common contexts first.

        :param words: The words used to seed the similarity search
        :type words: list(str)
        :param num: The number of common contexts to display (default=20)
        :type num: int
        :seealso: ContextIndex.common_contexts()
        """
        if '_word_context_index' not in self.__dict__:
            # print('Building word-context index...')
            self._word_context_index = ContextIndex(
                self.tokens, key=lambda s: s.lower()
            )

        try:
            fd = self._word_context_index.common_contexts(words, True)
            if not fd:
                print("No common contexts were found")
            else:
                ranked_contexts = [w for w, _ in fd.most_common(num)]
                print(tokenwrap(w1 + "_" + w2 for w1, w2 in ranked_contexts))

        except ValueError as e:
            print(e)

    def dispersion_plot(self, words):
        """
        Produce a plot showing the distribution of the words through the text.
        Requires matplotlib to be installed.

        :param words: The words to be plotted
        :type words: list(str)
        :seealso: nltk.draw.dispersion_plot()
        """
        from nltk.draw import dispersion_plot

        dispersion_plot(self, words)

    def generate(self, words):
        """
        Issues a reminder to users following the book online
        """
        import warnings

        warnings.warn(
            'The generate() method is no longer available.', DeprecationWarning
        )

    def plot(self, *args):
        """
        See documentation for FreqDist.plot()
        :seealso: nltk.probability.FreqDist.plot()
        """
        self.vocab().plot(*args)

    def vocab(self):
        """
        :seealso: nltk.probability.FreqDist
        """
        if "_vocab" not in self.__dict__:
            # print("Building vocabulary index...")
            self._vocab = FreqDist(self)
        return self._vocab

    def findall(self, regexp):
        """
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> print('hack'); from nltk.book import text1, text5, text9
        hack...
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        """

        if "_token_searcher" not in self.__dict__:
            self._token_searcher = TokenSearcher(self)

        hits = self._token_searcher.findall(regexp)
        hits = [' '.join(h) for h in hits]
        print(tokenwrap(hits, "; "))

    # ////////////////////////////////////////////////////////////
    # Helper Methods
    # ////////////////////////////////////////////////////////////

    _CONTEXT_RE = re.compile(r'\w+|[\.\!\?]')

    def _context(self, tokens, i):
        """
        One left & one right token, both case-normalized.  Skip over
        non-sentence-final punctuation.  Used by the ``ContextIndex``
        that is created for ``similar()`` and ``common_contexts()``.
        """
        # Left context
        j = i - 1
        while j >= 0 and not self._CONTEXT_RE.match(tokens[j]):
            j -= 1
        left = tokens[j] if j != 0 else '*START*'

        # Right context
        j = i + 1
        while j < len(tokens) and not self._CONTEXT_RE.match(tokens[j]):
            j += 1
        right = tokens[j] if j != len(tokens) else '*END*'

        return (left, right)

    # ////////////////////////////////////////////////////////////
    # String Display
    # ////////////////////////////////////////////////////////////

    def __str__(self):
        return '<Text: %s>' % self.name

    def __repr__(self):
        return '<Text: %s>' % self.name

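
# --- Illustrative usage sketch, not part of the original NLTK module. ---
# A quick tour of the interactive Text methods over a toy token list; the
# helper name is hypothetical.  Real corpora are wrapped the same way,
# e.g. Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt')).
def _text_demo():
    toy = Text("the cat sat on the mat and the dog sat on the rug".split())
    print(toy.count('the'))   # 4
    print(toy.index('dog'))   # 8
    toy.concordance('sat')    # prints aligned concordance lines for 'sat'
    toy.similar('cat')        # prints words sharing contexts with 'cat'
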
# Prototype only; this approach will be slow to load
class TextCollection(Text):
    """A collection of texts, which can be loaded with a list of texts, or
    with a corpus consisting of one or more texts, and which supports
    counting, concordancing, collocation discovery, etc.  Initialize a
    TextCollection as follows:

    >>> import nltk.corpus
    >>> from nltk.text import TextCollection
    >>> print('hack'); from nltk.book import text1, text2, text3
    hack...
    >>> gutenberg = TextCollection(nltk.corpus.gutenberg)
    >>> mytexts = TextCollection([text1, text2, text3])

    Iterating over a TextCollection produces all the tokens of all the
    texts in order.
    """

    def __init__(self, source):
        if hasattr(source, 'words'):  # bridge to the text corpus reader
            source = [source.words(f) for f in source.fileids()]

        self._texts = source
        Text.__init__(self, LazyConcatenation(source))
        self._idf_cache = {}

    def tf(self, term, text):
        """ The frequency of the term in text. """
        return text.count(term) / len(text)

    def idf(self, term):
        """ The natural log of the number of texts in the corpus divided
        by the number of texts that the term appears in.
        If a term does not appear in the corpus, 0.0 is returned. """
        # idf values are cached for performance.
        idf = self._idf_cache.get(term)
        if idf is None:
            matches = len([True for text in self._texts if term in text])
            if len(self._texts) == 0:
                raise ValueError('IDF undefined for empty document collection')
            idf = log(len(self._texts) / matches) if matches else 0.0
            self._idf_cache[term] = idf
        return idf

    def tf_idf(self, term, text):
        return self.tf(term, text) * self.idf(term)

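
# --- Illustrative worked example, not part of the original NLTK module. ---
# Traces the tf, idf and tf-idf arithmetic on two tiny texts; the helper
# name is hypothetical.  idf uses the natural logarithm (math.log).
def _tf_idf_demo():
    t1 = Text("the cat sat on the mat".split(), name="t1")
    t2 = Text("the dog lay on the rug".split(), name="t2")
    collection = TextCollection([t1, t2])
    print(collection.tf('cat', t1))      # 1 / 6 = 0.1667 (approx.)
    print(collection.idf('cat'))         # log(2 / 1) = 0.6931 (approx.)
    print(collection.tf_idf('cat', t1))  # product of the two, about 0.1155
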

def demo():
    from nltk.corpus import brown

    text = Text(brown.words(categories='news'))
    print(text)
    print()
    print("Concordance:")
    text.concordance('news')
    print()
    print("Distributionally similar words:")
    text.similar('news')
    print()
    print("Collocations:")
    text.collocations()
    print()
    # print("Automatically generated text:")
    # text.generate()
    # print()
    print("Dispersion plot:")
    text.dispersion_plot(['news', 'report', 'said', 'announced'])
    print()
    print("Vocabulary plot:")
    text.plot(50)
    print()
    print("Indexing:")
    print("text[3]:", text[3])
    print("text[3:5]:", text[3:5])
    print("text.vocab()['news']:", text.vocab()['news'])


if __name__ == '__main__':
    demo()

__all__ = [
    "ContextIndex",
    "ConcordanceIndex",
    "TokenSearcher",
    "Text",
    "TextCollection",
]