Spaces:
Sleeping
Sleeping
"""Term-Frequency / Inverse Document Frequency (TF-IDF) model."""
from collections import Counter | |
from functools import cached_property | |
import numpy as np | |
from scipy import sparse | |
class TFIDFModel:
    """Term-Frequency / Inverse Document Frequency (TF-IDF) model.

    Mimics sklearn.feature_extraction.text.TfidfVectorizer with sublinear_tf=True:
    each in-vocabulary term is weighted by ``(1 + log(tf)) * idf`` and the
    resulting vector is scaled to unit Euclidean norm.
    """

    def __init__(self, vocabulary_path: str, idf_path: str):
        """Load the vocabulary and the matching IDF weights from disk.

        Args:
            vocabulary_path: text file with one vocabulary term per line; the
                line order defines each term's column index.
            idf_path: file readable by ``np.load`` holding a 1-D array with
                one IDF weight per vocabulary term.

        Raises:
            AssertionError: if the IDF array is not 1-D or its length differs
                from the vocabulary size.
        """
        with open(vocabulary_path, "r") as f:
            self.vocabulary = f.read().strip().split("\n")
        with open(idf_path, "rb") as f:
            self.idf_ = np.load(f)
        assert self.idf_.ndim == 1
        assert (
            len(self.idf_) == len(self.vocabulary)
        ), f"IDF size must match vocabulary size, got {len(self.idf_)} and {len(self.vocabulary)}"

    @cached_property
    def vocab_to_index(self) -> dict[str, int]:
        """Mapping from term to its column index, built once on first access."""
        return {term: index for index, term in enumerate(self.vocabulary)}

    def encode(self, terms: list[str]) -> sparse.csr_matrix:
        """Encodes terms as a TF-IDF vector.

        Terms that are not in the vocabulary are ignored.

        Args:
            terms: list of terms to encode.

        Returns:
            TF-IDF vector encoded as sparse matrix of shape
            (1, vocabulary size). The row is all zeros when none of the
            terms is in the vocabulary.
        """
        index_of = self.vocab_to_index
        # Membership is tested against the dict, not the vocabulary list,
        # so each lookup is O(1) instead of a linear scan.
        counts = Counter(term for term in terms if term in index_of)
        shape = (1, len(self.vocabulary))
        if not counts:
            # Nothing to encode: return an explicit empty row rather than
            # building a matrix from empty index arrays.
            return sparse.csr_matrix(shape)

        size = len(counts)
        columns = np.fromiter(
            (index_of[term] for term in counts), dtype=np.intp, count=size
        )
        tf = np.fromiter(counts.values(), dtype=np.float64, count=size)
        idf = np.take(self.idf_, columns)
        # Sublinear term-frequency scaling.
        values = (1 + np.log(tf)) * idf
        norm = np.linalg.norm(values)
        if norm > 0:
            # Skip normalisation of an all-zero vector to avoid NaNs.
            values = values / norm
        return sparse.csr_matrix(
            (values, (np.zeros_like(columns), columns)),
            shape=shape,
        )

    def decode(self, vec: sparse.csr_matrix) -> list[str]:
        """Extract the vocabulary terms stored in a TF-IDF vector."""
        return [self.vocabulary[i] for i in vec.indices]