Spaces:

anonymousforpaper
/

M3Site

Sleeping

App Files Files Community

M3Site / esm /utils /function /tfidf.py

anonymousforpaper

Upload 103 files

224a33f verified 2 months ago

raw

history blame contribute delete

1.85 kB

	"""Term-Frequency / Inverse Document Frequency (TF-IDF) model."""
	from collections import Counter
	from functools import cached_property

	import numpy as np
	from scipy import sparse


	class TFIDFModel:
	    """Term-Frequency / Inverse Document Frequency (TF-IDF) model.

	    Mimics sklearn.feature_extraction.text.TfidfVectorizer with sublinear_tf=True:
	    every known term is weighted by ``(1 + log(tf)) * idf`` and the resulting
	    vector is scaled to unit Euclidean length.

	    Attributes:
	        vocabulary: terms read from the vocabulary file, one per line; a term's
	            position in this list is its column in encoded vectors.
	        idf_: 1-D array of IDF weights, aligned with ``vocabulary``.
	    """

	    def __init__(self, vocabulary_path: str, idf_path: str):
	        """Load the vocabulary and its IDF weights from disk.

	        Args:
	            vocabulary_path: text file holding one term per line.
	            idf_path: file readable by ``np.load`` holding a 1-D array with one
	                IDF weight per vocabulary term.

	        Raises:
	            ValueError: if the IDF array is not 1-D or its length differs from
	                the vocabulary size.
	        """
	        # Explicit encoding so decoding does not depend on the platform locale.
	        with open(vocabulary_path, "r", encoding="utf-8") as f:
	            self.vocabulary = f.read().strip().split("\n")

	        with open(idf_path, "rb") as f:
	            self.idf_ = np.load(f)

	        # Raise instead of assert: asserts are stripped under ``python -O``.
	        if self.idf_.ndim != 1:
	            raise ValueError(
	                f"IDF must be a 1-D array, got {self.idf_.ndim} dimensions"
	            )
	        if len(self.idf_) != len(self.vocabulary):
	            raise ValueError(
	                f"IDF size must match vocabulary size, got {len(self.idf_)} and {len(self.vocabulary)}"
	            )

	    @cached_property
	    def vocab_to_index(self) -> dict[str, int]:
	        """Mapping from term to its position in ``vocabulary`` (built once)."""
	        return {term: index for index, term in enumerate(self.vocabulary)}

	    def encode(self, terms: list[str]) -> sparse.csr_matrix:
	        """Encode terms as a single TF-IDF row vector.

	        Terms that are not in the vocabulary are ignored. Each remaining term
	        gets the weight ``(1 + log(count)) * idf`` and the vector is then
	        divided by its Euclidean norm. If the norm is zero (no known terms, or
	        all matched IDF weights are zero) the vector is left unscaled rather
	        than filled with NaN.

	        Args:
	            terms: list of terms to encode; repeated terms raise the count.

	        Returns:
	            TF-IDF vector encoded as sparse matrix of shape
	            (1, len(self.vocabulary)).
	        """
	        index_of = self.vocab_to_index
	        # Dict membership is O(1); scanning the vocabulary list is O(V) per term.
	        counter = Counter(term for term in terms if term in index_of)

	        # Explicit dtypes keep the arrays valid for indexing even when empty.
	        indices = np.fromiter(
	            (index_of[term] for term in counter),
	            dtype=np.intp,
	            count=len(counter),
	        )
	        tf = np.fromiter(counter.values(), dtype=np.float64, count=len(counter))

	        values = (1 + np.log(tf)) * self.idf_[indices]
	        norm = np.linalg.norm(values)
	        if norm > 0:
	            values = values / norm

	        # All entries live in row 0 of the single-row result.
	        return sparse.csr_matrix(
	            (values, (np.zeros_like(indices), indices)),
	            shape=(1, len(self.vocabulary)),
	        )

	    def decode(self, vec: sparse.csr_matrix) -> list[str]:
	        """Extract terms from TF-IDF.

	        Args:
	            vec: sparse vector whose column indices refer to ``vocabulary``.

	        Returns:
	            The vocabulary term for every stored column index of ``vec``.
	        """
	        return [self.vocabulary[i] for i in vec.indices]