"""N-gram Maximum Likelihood Probabilistic Language Model. Supports n-gram up to 5-gram. """ from __future__ import annotations import math as calc import os import pickle from collections import Counter, defaultdict from typing import List from nltk.util import ngrams class NGramMLE: def __init__(self, n: int = 1, corpus_file: str = None) -> None: """Initializes the model. Args: n: n-gram order. Defaults to 1. corpus_file: File containing the corpus words. Defaults to None. Raises: FileNotFoundError: If corpus_file is defined but not found. """ if corpus_file and not os.path.exists(corpus_file): raise FileNotFoundError(f"Corpus file not found: {corpus_file}") self.n = n self.corpus_file = corpus_file self.ngrams = defaultdict(Counter) def _read_corpus(self) -> List[str]: """Reads the corpus from the file. Returns: List of words in the corpus. """ with open(self.corpus_file, "r") as f: return f.read().split("\n") def create_ngrams(self): """Creates n-grams from the corpus.""" corpus_words = self._read_corpus() self.total_words = len(corpus_words) self.ngrams[1] = Counter(corpus_words) for i in range(self.n - 1): self.ngrams[i + 2] = Counter(ngrams(corpus_words, i + 2)) def probability( self, ngram: str, higher_order_ngram: str = "", n: int = 1 ) -> float: """Computes maximum likelihood probability. Args: ngram: n-gram. higher_order_ngram: Higher order n-gram. Defaults to "". n: n-gram order. Defaults to 1. Returns: Maximum likelihood probability. """ if n == 1: return calc.log( (self.ngrams[1][ngram] + 1) / (self.total_words + len(self.ngrams[1])) ) assert n <= self.n, f"n must be less than or equal to {self.n}" return calc.log( (self.ngrams[n][higher_order_ngram] + 1) / (self.ngrams[n - 1][ngram] + len(self.ngrams[1])) ) def sentence_probability(self, sentence: str, n: int = 1) -> float: """Computes cumulative n-gram ML probability of a sentence. Args: sentence: Sentence. n: n-gram order. Defaults to 1. Returns: Cumulative n-gram maximum likelihood probability. """ words = sentence.lower().split() cumulative_prob = 0 if n == 1: for word in words: cumulative_prob += self.probability(word) return cumulative_prob for i, word in enumerate(words): if i >= len(words) - n - 1: break cumulative_prob += self.probability( " ".join(words[i : i + n - 2]), " ".join(words[i : i + n - 1]), n, ) return cumulative_prob @classmethod def load(cls, model_file: str) -> NGramMLE: """Loads the model from a file. Args: model_file: File containing the model. Returns: Loaded model. """ return pickle.load(open(model_file, "rb")) def save(self, model_file: str) -> None: """Saves the model to a file. Args: model_file: File to save the model. """ pickle.dump(self, open(model_file, "wb"))