anonymousforpaper's picture
Upload 103 files
224a33f verified
"""Term-Frequency / Inverse Document Frequency (TF-IDF) model."""
from collections import Counter
from functools import cached_property
import numpy as np
from scipy import sparse
class TFIDFModel:
    """Term-Frequency / Inverse Document Frequency (TF-IDF) model.

    Mimics sklearn.feature_extraction.text.TfidfVectorizer with sublinear_tf=True:
    each in-vocabulary term is weighted by ``(1 + log(tf)) * idf`` and the
    resulting vector is L2-normalised.
    """

    def __init__(self, vocabulary_path: str, idf_path: str):
        """Load the vocabulary and the matching IDF weights from disk.

        Args:
            vocabulary_path: text file with one vocabulary term per line; the
                line number (0-based) is the term's feature index.
            idf_path: file readable by ``numpy.load`` holding a 1-D array with
                one IDF weight per vocabulary term, in the same order.

        Raises:
            AssertionError: if the IDF array is not 1-D or its length differs
                from the vocabulary size.
        """
        # Explicit encoding so non-ASCII terms load the same on every platform.
        with open(vocabulary_path, "r", encoding="utf-8") as f:
            self.vocabulary = f.read().strip().split("\n")
        with open(idf_path, "rb") as f:
            self.idf_ = np.load(f)
        assert self.idf_.ndim == 1, f"IDF must be 1-D, got {self.idf_.ndim} dimensions"
        assert (
            len(self.idf_) == len(self.vocabulary)
        ), f"IDF size must match vocabulary size, got {len(self.idf_)} and {len(self.vocabulary)}"

    @cached_property
    def vocab_to_index(self) -> dict[str, int]:
        """Mapping from vocabulary term to its feature index (built once, lazily)."""
        return {term: index for index, term in enumerate(self.vocabulary)}

    def encode(self, terms: list[str]) -> sparse.csr_matrix:
        """Encodes terms as a TF-IDF vector.

        Terms that are not in the vocabulary are ignored.

        Args:
            terms: list of terms to encode.

        Returns:
            L2-normalised TF-IDF vector encoded as a sparse matrix of shape
            (1, vocabulary size). If no term is in the vocabulary the matrix
            has no stored entries.
        """
        vocab_to_index = self.vocab_to_index
        # Dict membership is O(1); scanning the vocabulary list is O(vocabulary size).
        counter = Counter(term for term in terms if term in vocab_to_index)

        # Typed arrays keep the empty case well-defined (an empty Python list
        # would be interpreted by NumPy as a float array, which is not a valid index).
        num_unique = len(counter)
        indices = np.fromiter(
            (vocab_to_index[term] for term in counter), dtype=np.intp, count=num_unique
        )
        tf = np.fromiter(counter.values(), dtype=np.float64, count=num_unique)

        # Sublinear term-frequency scaling, then IDF weighting.
        values = (1.0 + np.log(tf)) * self.idf_[indices]

        # Skip normalisation for a zero vector instead of producing NaNs.
        norm = np.linalg.norm(values)
        if norm > 0:
            values /= norm

        return sparse.csr_matrix(
            (values, (np.zeros_like(indices), indices)),
            shape=(1, len(self.vocabulary)),
        )

    def decode(self, vec: sparse.csr_matrix) -> list[str]:
        """Extract the vocabulary terms stored in a TF-IDF vector.

        Args:
            vec: sparse vector whose column indices are vocabulary indices.

        Returns:
            The vocabulary term for every stored entry of ``vec``.
        """
        return [self.vocabulary[i] for i in vec.indices]