from __future__ import annotations import logging import argparse import re import string import nltk import pandas import pandas as pd import numpy as np from sklearn.metrics.pairwise import cosine_similarity logger = logging.getLogger(__name__) def load_matrix( d_file: str, r_file: str, word_to_id_: dict[str, int] ): D = np.load(d_file) R = np.memmap(r_file, dtype='float32', mode='r', shape=(D.shape[-1],len(word_to_id_))) logger.info(f'D size: {D.shape}, R size: {R.shape}') return D, R def query_to_ids( query: str, word_to_id_: dict[str, int], stemming: bool, lower: bool = True, ): from nltk.stem.porter import PorterStemmer if lower: query = query.lower() # TODO: weight "*" process query = "".join([char for char in query if char not in string.punctuation]) words = nltk.word_tokenize(query) if stemming: porter = PorterStemmer() words = [porter.stem(word) for word in words] # Consider out-of-vocabulary cases, if y == []: no matched results y = [word_to_id_[word] for word in words if word in word_to_id_] return y def query_to_vec( R: np.ndarray, y: list[int] ): qvec = np.zeros((R.shape[0], )) for ind in y: qvec += R[:,ind] return qvec def search( args: argparse.Namespace, df: pandas.DataFrame, k: int, y: list[int], R: np.ndarray, D: np.ndarray ): qvec = query_to_vec(R, y) if args.metric=='COSINE': scores = cosine_similarity([qvec], D)[0] elif args.metric=='INNER_PRODUCT': scores = D @ qvec docids = np.argsort(scores)[::-1][:k] return scores, docids