# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2023 NLTK Project
# Authors: Ilia Kurenkov <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""Language Model Interface."""

import random
import warnings
from abc import ABCMeta, abstractmethod
from bisect import bisect
from itertools import accumulate

from nltk.lm.counter import NgramCounter
from nltk.lm.util import log_base2
from nltk.lm.vocabulary import Vocabulary


class Smoothing(metaclass=ABCMeta):
    """Ngram Smoothing Interface



    Implements Chen & Goodman 1995's idea that all smoothing algorithms have

    certain features in common. This should ideally allow smoothing algorithms to

    work both with Backoff and Interpolation.

    """

    def __init__(self, vocabulary, counter):
        """

        :param vocabulary: The Ngram vocabulary object.

        :type vocabulary: nltk.lm.vocab.Vocabulary

        :param counter: The counts of the vocabulary items.

        :type counter: nltk.lm.counter.NgramCounter

        """
        self.vocab = vocabulary
        self.counts = counter

    @abstractmethod
    def unigram_score(self, word):
        raise NotImplementedError()

    @abstractmethod
    def alpha_gamma(self, word, context):
        raise NotImplementedError()
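

# A minimal sketch of a concrete smoothing scheme (illustrative only, not part
# of this module): fixed-weight Jelinek-Mercer interpolation.  It assumes the
# alpha/gamma contract used by interpolated models, where the final score is
# computed as ``alpha + gamma * lower_order_score``; the class name and the
# default weight below are made up for the example.
class FixedLambdaInterpolation(Smoothing):
    """Blend the MLE estimate with the lower-order score using a constant weight."""

    def __init__(self, vocabulary, counter, lam=0.1):
        super().__init__(vocabulary, counter)
        self.lam = lam

    def unigram_score(self, word):
        # Relative frequency of the word among all observed unigrams.
        return self.counts.unigrams[word] / self.counts.unigrams.N()

    def alpha_gamma(self, word, context):
        context_freqdist = self.counts[len(context) + 1][context]
        if not context_freqdist.N():
            # Unseen context: defer entirely to the lower-order estimate.
            return 0.0, 1.0
        mle = context_freqdist[word] / context_freqdist.N()
        return (1 - self.lam) * mle, self.lam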


def _mean(items):
    """Return average (aka mean) for sequence of items."""
    return sum(items) / len(items)


def _random_generator(seed_or_generator):
    if isinstance(seed_or_generator, random.Random):
        return seed_or_generator
    return random.Random(seed_or_generator)


def _weighted_choice(population, weights, random_generator=None):
    """Like random.choice, but with weights.



    Heavily inspired by python 3.6 `random.choices`.

    """
    if not population:
        raise ValueError("Can't choose from empty population")
    if len(population) != len(weights):
        raise ValueError("The number of weights does not match the population")
    if random_generator is None:
        # Fall back to a fresh, unseeded generator so the default argument is usable.
        random_generator = random.Random()
    cum_weights = list(accumulate(weights))
    total = cum_weights[-1]
    threshold = random_generator.random()
    return population[bisect(cum_weights, total * threshold)]
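
# For illustration (a hypothetical call, not part of the module API):
# ``_weighted_choice(["a", "b"], (3, 1), random.Random(0))`` builds the
# cumulative weights (3, 4), draws a threshold in [0, 1), scales it by the
# total (4), and uses ``bisect`` to pick the slot it falls into, so "a" is
# returned with probability 3/4 and "b" with probability 1/4.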


class LanguageModel(metaclass=ABCMeta):
    """ABC for Language Models.



    Cannot be directly instantiated itself.



    """

    def __init__(self, order, vocabulary=None, counter=None):
        """Creates new LanguageModel.



        :param vocabulary: If provided, this vocabulary will be used instead

            of creating a new one when training.

        :type vocabulary: `nltk.lm.Vocabulary` or None

        :param counter: If provided, use this object to count ngrams.

        :type counter: `nltk.lm.NgramCounter` or None

        :param ngrams_fn: If given, defines how sentences in training text are turned to ngram

            sequences.

        :type ngrams_fn: function or None

        :param pad_fn: If given, defines how sentences in training text are padded.

        :type pad_fn: function or None

        """
        self.order = order
        if vocabulary and not isinstance(vocabulary, Vocabulary):
            warnings.warn(
                f"The `vocabulary` argument passed to {self.__class__.__name__!r} "
                "must be an instance of `nltk.lm.Vocabulary`.",
                stacklevel=3,
            )
        self.vocab = Vocabulary() if vocabulary is None else vocabulary
        self.counts = NgramCounter() if counter is None else counter

    def fit(self, text, vocabulary_text=None):
        """Trains the model on a text.



        :param text: Training text as a sequence of sentences.



        """
        if not self.vocab:
            if vocabulary_text is None:
                raise ValueError(
                    "Cannot fit without a vocabulary or text to create it from."
                )
            self.vocab.update(vocabulary_text)
        self.counts.update(self.vocab.lookup(sent) for sent in text)

    def score(self, word, context=None):
        """Masks out of vocab (OOV) words and computes their model score.



        For model-specific logic of calculating scores, see the `unmasked_score`

        method.

        """
        return self.unmasked_score(
            self.vocab.lookup(word), self.vocab.lookup(context) if context else None
        )

    @abstractmethod
    def unmasked_score(self, word, context=None):
        """Score a word given some optional context.



        Concrete models are expected to provide an implementation.

        Note that this method does not mask its arguments with the OOV label.

        Use the `score` method for that.



        :param str word: Word for which we want the score

        :param tuple(str) context: Context the word is in.

            If `None`, compute unigram score.

        :param context: tuple(str) or None

        :rtype: float

        """
        raise NotImplementedError()

    def logscore(self, word, context=None):
        """Evaluate the log score of this word in this context.



        The arguments are the same as for `score` and `unmasked_score`.



        """
        return log_base2(self.score(word, context))

    def context_counts(self, context):
        """Helper method for retrieving counts for a given context.



        Assumes context has been checked and oov words in it masked.

        :type context: tuple(str) or None



        """
        return (
            self.counts[len(context) + 1][context] if context else self.counts.unigrams
        )

    def entropy(self, text_ngrams):
        """Calculate cross-entropy of model for given evaluation text.



        :param Iterable(tuple(str)) text_ngrams: A sequence of ngram tuples.

        :rtype: float



        """
        return -1 * _mean(
            [self.logscore(ngram[-1], ngram[:-1]) for ngram in text_ngrams]
        )

    def perplexity(self, text_ngrams):
        """Calculates the perplexity of the given text.



        This is simply 2 ** cross-entropy for the text, so the arguments are the same.



        """
        return pow(2.0, self.entropy(text_ngrams))

    def generate(self, num_words=1, text_seed=None, random_seed=None):
        """Generate words from the model.



        :param int num_words: How many words to generate. By default 1.

        :param text_seed: Generation can be conditioned on preceding context.

        :param random_seed: A random seed or an instance of `random.Random`. If provided,

            makes the random sampling part of generation reproducible.

        :return: One (str) word or a list of words generated from model.



        Examples:



        >>> from nltk.lm import MLE

        >>> lm = MLE(2)

        >>> lm.fit([[("a", "b"), ("b", "c")]], vocabulary_text=['a', 'b', 'c'])

        >>> lm.fit([[("a",), ("b",), ("c",)]])

        >>> lm.generate(random_seed=3)

        'a'

        >>> lm.generate(text_seed=['a'])

        'b'



        """
        text_seed = [] if text_seed is None else list(text_seed)
        random_generator = _random_generator(random_seed)
        # This is the base recursion case.
        if num_words == 1:
            context = (
                text_seed[-self.order + 1 :]
                if len(text_seed) >= self.order
                else text_seed
            )
            samples = self.context_counts(self.vocab.lookup(context))
            while context and not samples:
                context = context[1:] if len(context) > 1 else []
                samples = self.context_counts(self.vocab.lookup(context))
            # Sorting samples achieves two things:
            # - reproducible randomness when sampling
            # - turns Mapping into Sequence which `_weighted_choice` expects
            samples = sorted(samples)
            return _weighted_choice(
                samples,
                tuple(self.score(w, context) for w in samples),
                random_generator,
            )
        # We build up text one word at a time using the preceding context.
        generated = []
        for _ in range(num_words):
            generated.append(
                self.generate(
                    num_words=1,
                    text_seed=text_seed + generated,
                    random_seed=random_generator,
                )
            )
        return generated
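

if __name__ == "__main__":
    # A minimal usage sketch (illustrative only; the toy corpus and values are
    # made up): train an MLE bigram model and exercise the methods defined above.
    from nltk.lm import MLE
    from nltk.lm.preprocessing import padded_everygram_pipeline
    from nltk.util import bigrams

    toy_corpus = [["a", "b", "c"], ["a", "c", "b"]]
    train_ngrams, padded_sentences = padded_everygram_pipeline(2, toy_corpus)

    toy_lm = MLE(2)
    toy_lm.fit(train_ngrams, padded_sentences)

    print(toy_lm.score("b", ["a"]))                           # P(b | a)
    print(toy_lm.logscore("b", ["a"]))                        # log2 P(b | a)
    print(toy_lm.perplexity(list(bigrams(["a", "b", "c"]))))  # 2 ** cross-entropy
    print(toy_lm.generate(5, random_seed=3))                  # five sampled words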