# ACL2Vec / utils.py
from __future__ import annotations

import argparse
import logging
import string

import nltk
import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity

logger = logging.getLogger(__name__)

def load_matrix(
    d_file: str,
    r_file: str,
    word_to_id_: dict[str, int],
) -> tuple[np.ndarray, np.memmap]:
    """Load the dense document matrix D and the memory-mapped word matrix R."""
    D = np.load(d_file)
    # R is stored as (embedding_dim, vocab_size); memory-mapping keeps the
    # full word matrix out of RAM.
    R = np.memmap(r_file, dtype='float32', mode='r',
                  shape=(D.shape[-1], len(word_to_id_)))
    logger.info(f'D size: {D.shape}, R size: {R.shape}')
    return D, R
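
# A minimal usage sketch; the artifact file names below are assumptions for
# illustration, not the repo's documented names:
#
#   D, R = load_matrix('doc_matrix.npy', 'word_matrix.dat', word_to_id)
#   # D: (num_docs, dim) document embeddings; R: (dim, vocab_size) word embeddings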

def query_to_ids(
    query: str,
    word_to_id_: dict[str, int],
    stemming: bool,
    lower: bool = True,
) -> list[int]:
    """Tokenize a query and map each in-vocabulary token to its word id."""
    if lower:
        query = query.lower()
    # TODO: weight "*" process
    # Strip punctuation before tokenizing.
    query = "".join(char for char in query if char not in string.punctuation)
    words = nltk.word_tokenize(query)
    if stemming:
        porter = PorterStemmer()
        words = [porter.stem(word) for word in words]
    # Skip out-of-vocabulary words; an empty result means no matched documents.
    return [word_to_id_[word] for word in words if word in word_to_id_]
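
# With stemming enabled, a query is lowered, stripped of punctuation,
# tokenized, and Porter-stemmed before lookup. The tiny vocabulary below is
# illustrative only:
#
#   word_to_id = {'neural': 0, 'machin': 1, 'translat': 2}
#   query_to_ids('Neural Machine Translation!', word_to_id, stemming=True)
#   # -> [0, 1, 2]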

def query_to_vec(
    R: np.ndarray,
    y: list[int],
) -> np.ndarray:
    """Build a query vector as the sum of the word embeddings for ids in y."""
    qvec = np.zeros((R.shape[0],))
    for ind in y:
        qvec += R[:, ind]
    return qvec
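
# The loop above is equivalent to summing the id-indexed columns in one shot;
# a vectorized sketch (note it yields float32, while the loop accumulates
# into a float64 buffer):
#
#   qvec = R[:, y].sum(axis=1)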

def search(
    args: argparse.Namespace,
    df: pd.DataFrame,
    k: int,
    y: list[int],
    R: np.ndarray,
    D: np.ndarray,
) -> tuple[np.ndarray, np.ndarray]:
    """Score every document against the query and return the top-k doc ids."""
    qvec = query_to_vec(R, y)
    if args.metric == 'COSINE':
        scores = cosine_similarity([qvec], D)[0]
    elif args.metric == 'INNER_PRODUCT':
        scores = D @ qvec
    else:
        raise ValueError(f'Unsupported metric: {args.metric}')
    # Rank documents by score, highest first, and keep the top k.
    docids = np.argsort(scores)[::-1][:k]
    return scores, docids
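

if __name__ == '__main__':
    # Minimal smoke-test sketch. The artifact file names and the two-word
    # vocabulary are assumptions for illustration; point them at the matrices
    # actually produced by this repo before running.
    logging.basicConfig(level=logging.INFO)
    word_to_id = {'machin': 0, 'translat': 1}
    args = argparse.Namespace(metric='COSINE')
    D, R = load_matrix('doc_matrix.npy', 'word_matrix.dat', word_to_id)
    df = pd.DataFrame()  # paper metadata; unused by search() itself
    y = query_to_ids('machine translation', word_to_id, stemming=True)
    scores, docids = search(args, df, k=10, y=y, R=R, D=D)
    print(docids, scores[docids])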