|
from __future__ import annotations |
|
|
|
import logging |
|
import argparse |
|
import re |
|
import string |
|
|
|
import nltk |
|
import pandas |
|
import pandas as pd |
|
import numpy as np |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
def load_matrix(
    d_file: str,
    r_file: str,
    word_to_id_: dict[str, int],
) -> tuple[np.ndarray, np.memmap]:
    """Load the ``D`` matrix and memory-map the ``R`` matrix.

    ``D`` is read with ``np.load``. ``R`` is opened with ``np.memmap`` as a
    read-only ``float32`` array, so it is mapped from disk rather than read
    into memory up front.

    Args:
        d_file: Path handed to ``np.load``.
        r_file: Path of the raw ``float32`` file to memory-map.
        word_to_id_: Vocabulary mapping; only its length is used, as the
            number of columns of ``R``.

    Returns:
        The pair ``(D, R)``, where ``R`` has shape
        ``(D.shape[-1], len(word_to_id_))``.
    """
    D = np.load(d_file)
    # np.memmap takes the array shape from the caller, so the 2-D shape is
    # given explicitly: D's last-axis length by the vocabulary size.
    r_shape = (D.shape[-1], len(word_to_id_))
    R = np.memmap(r_file, dtype="float32", mode="r", shape=r_shape)
    # Lazy %-style arguments: the message is only formatted when INFO
    # records are actually emitted.
    logger.info("D size: %s, R size: %s", D.shape, R.shape)
    return D, R
|
|
|
def query_to_ids(
    query: str,
    word_to_id_: dict[str, int],
    stemming: bool,
    lower: bool = True,
):
    """Convert a free-text query into a list of vocabulary ids.

    Processing order: optional lower-casing, deletion of every character in
    ``string.punctuation``, NLTK word tokenization, optional Porter stemming,
    then lookup in ``word_to_id_``.

    Args:
        query: Raw query text.
        word_to_id_: Mapping from token to id.
        stemming: When true, each token is Porter-stemmed before lookup.
        lower: When true (the default), the query is lower-cased first.

    Returns:
        Ids of the tokens found in ``word_to_id_``, in token order with
        repeats kept. Tokens absent from the mapping are skipped.
    """
    from nltk.stem.porter import PorterStemmer

    text = query.lower() if lower else query

    # Punctuation is deleted, not replaced by a space, so "a,b" becomes "ab".
    text = text.translate(str.maketrans("", "", string.punctuation))

    tokens = nltk.word_tokenize(text)
    if stemming:
        stem = PorterStemmer().stem
        tokens = list(map(stem, tokens))

    ids = []
    for token in tokens:
        if token in word_to_id_:
            ids.append(word_to_id_[token])
    return ids
|
|
|
def query_to_vec(
    R: np.ndarray,
    y: list[int],
) -> np.ndarray:
    """Build a query vector by summing the columns of ``R`` selected by ``y``.

    Args:
        R: 2-D matrix indexed as ``R[:, word_id]``.
        y: Column indices to add up. A repeated index contributes its column
            once per occurrence. May be empty.

    Returns:
        A ``float64`` vector of length ``R.shape[0]``; all zeros when ``y``
        is empty.
    """
    # An explicit integer dtype keeps an empty ``y`` usable as an index
    # (``np.asarray([])`` would default to float64, which cannot index).
    columns = np.asarray(y, dtype=np.intp)
    # One gather plus one reduction instead of a Python-level accumulation
    # loop. Accumulating in float64 keeps the result dtype independent of
    # R's own dtype.
    return R[:, columns].sum(axis=1, dtype=np.float64)
|
|
|
|
|
def search(
    args: argparse.Namespace,
    df: pandas.DataFrame,
    k: int,
    y: list[int],
    R: np.ndarray,
    D: np.ndarray,
) -> tuple[np.ndarray, np.ndarray]:
    """Score every row of ``D`` against the query and pick the top ``k``.

    Args:
        args: Parsed options; only ``args.metric`` is read. Supported values
            are ``'COSINE'`` and ``'INNER_PRODUCT'``.
        df: Not used by this function; kept in the signature so existing
            callers keep working.
        k: Maximum number of row indices to return.
        y: Ids of the query terms, forwarded to ``query_to_vec``.
        R: Matrix forwarded to ``query_to_vec``.
        D: Matrix whose rows are scored against the query vector
            (presumably one row per document — confirm against the caller).

    Returns:
        ``(scores, docids)``: the score of every row of ``D``, and the
        indices of the ``k`` highest-scoring rows, best first.

    Raises:
        ValueError: If ``args.metric`` is not a supported metric.
    """
    qvec = query_to_vec(R, y)

    metric = args.metric
    if metric == "COSINE":
        scores = cosine_similarity([qvec], D)[0]
    elif metric == "INNER_PRODUCT":
        scores = D @ qvec
    else:
        # Fail with a clear message; without this branch ``scores`` would be
        # unbound below and the call would die with an UnboundLocalError.
        raise ValueError(
            f"Unsupported metric {metric!r}; "
            "expected 'COSINE' or 'INNER_PRODUCT'"
        )

    # argsort is ascending, so reverse it before taking the first k entries.
    docids = np.argsort(scores)[::-1][:k]
    return scores, docids