"""N-gram Maximum Likelihood Probabilistic Language Model.
Supports n-gram up to 5-gram.
"""
from __future__ import annotations
import math as calc
import os
import pickle
from collections import Counter, defaultdict
from typing import List
from nltk.util import ngrams


class NGramMLE:
    def __init__(self, n: int = 1, corpus_file: Optional[str] = None) -> None:
        """Initializes the model.

        Args:
            n: n-gram order. Defaults to 1.
            corpus_file: File containing the corpus words. Defaults to None.

        Raises:
            FileNotFoundError: If corpus_file is defined but not found.
        """
        if corpus_file and not os.path.exists(corpus_file):
            raise FileNotFoundError(f"Corpus file not found: {corpus_file}")
        self.n = n
        self.corpus_file = corpus_file
        self.ngrams = defaultdict(Counter)
        self.total_words = 0

    def _read_corpus(self) -> List[str]:
        """Reads the corpus from the file.

        Returns:
            List of words in the corpus.
        """
        with open(self.corpus_file, "r") as f:
            # The corpus file is expected to contain one word per line.
            return f.read().split("\n")

    def create_ngrams(self) -> None:
        """Creates n-gram counts from the corpus."""
        corpus_words = self._read_corpus()
        self.total_words = len(corpus_words)
        # Unigram counts are keyed by the word itself.
        self.ngrams[1] = Counter(corpus_words)
        # Higher-order n-grams are keyed by their space-joined form so that
        # probability() can look them up from space-joined query strings.
        for order in range(2, self.n + 1):
            self.ngrams[order] = Counter(
                " ".join(gram) for gram in ngrams(corpus_words, order)
            )
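        # Illustrative example (assumed toy corpus, not part of the original
        # code): for a corpus containing the words "the", "cat", "sat" and
        # n == 2, self.ngrams[1] == Counter({"the": 1, "cat": 1, "sat": 1})
        # and self.ngrams[2] == Counter({"the cat": 1, "cat sat": 1}).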

    def probability(
        self, ngram: str, higher_order_ngram: str = "", n: int = 1
    ) -> float:
        """Computes the add-one smoothed maximum likelihood log-probability.

        Args:
            ngram: The word itself when n == 1, otherwise the (n-1)-gram
                context of the n-gram.
            higher_order_ngram: The full n-gram, as a space-joined string.
                Defaults to "".
            n: n-gram order. Defaults to 1.

        Returns:
            Log of the smoothed maximum likelihood probability.
        """
        if n == 1:
            # Unigram case: P(w) = (count(w) + 1) / (N + V), where N is the
            # corpus size and V the vocabulary size (add-one smoothing).
            return calc.log(
                (self.ngrams[1][ngram] + 1)
                / (self.total_words + len(self.ngrams[1]))
            )
        assert n <= self.n, f"n must be less than or equal to {self.n}"
        # P(w_n | context) = (count(context + w_n) + 1) / (count(context) + V).
        return calc.log(
            (self.ngrams[n][higher_order_ngram] + 1)
            / (self.ngrams[n - 1][ngram] + len(self.ngrams[1]))
        )
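    # Illustrative note (assumed counts, not taken from any corpus): with
    # count("the cat") == 2, count("the") == 5, and a vocabulary of 1000
    # distinct words, probability("the", "the cat", n=2) returns
    # log((2 + 1) / (5 + 1000)).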

    def sentence_probability(self, sentence: str, n: int = 1) -> float:
        """Computes the cumulative n-gram ML log-probability of a sentence.

        Args:
            sentence: Sentence.
            n: n-gram order. Defaults to 1.

        Returns:
            Cumulative n-gram maximum likelihood log-probability.
        """
        words = sentence.lower().split()
        cumulative_prob = 0.0
        if n == 1:
            for word in words:
                cumulative_prob += self.probability(word)
            return cumulative_prob
        # Slide an n-word window over the sentence; for each window, the first
        # n - 1 words are the context and the full window is the n-gram.
        for i in range(len(words) - n + 1):
            cumulative_prob += self.probability(
                " ".join(words[i : i + n - 1]),
                " ".join(words[i : i + n]),
                n,
            )
        return cumulative_prob

    @classmethod
    def load(cls, model_file: str) -> NGramMLE:
        """Loads the model from a file.

        Args:
            model_file: File containing the model.

        Returns:
            Loaded model.
        """
        with open(model_file, "rb") as f:
            return pickle.load(f)

    def save(self, model_file: str) -> None:
        """Saves the model to a file.

        Args:
            model_file: File to save the model.
        """
        with open(model_file, "wb") as f:
            pickle.dump(self, f)
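

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): "corpus.txt" is an assumed
    # corpus file with one word per line, not part of the original module.
    model = NGramMLE(n=2, corpus_file="corpus.txt")
    model.create_ngrams()
    # Cumulative bigram log-probability of a sentence.
    print(model.sentence_probability("the cat sat on the mat", n=2))
    # Round-trip the model through pickle and query the reloaded copy.
    model.save("ngram_mle.pkl")
    reloaded = NGramMLE.load("ngram_mle.pkl")
    print(reloaded.sentence_probability("the cat sat on the mat", n=2))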