File size: 1,744 Bytes
bcdda34 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
from __future__ import annotations
import logging
import argparse
import re
import string
import nltk
import pandas
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
logger = logging.getLogger(__name__)
def load_matrix(
    d_file: str,
    r_file: str,
    word_to_id_: dict[str, int]
):
    """Load the ``D`` array and memory-map the ``R`` array from disk.

    Args:
        d_file: Path handed to ``np.load`` to read ``D``.
        r_file: Path of the raw ``float32`` file that is memory-mapped
            read-only as ``R``.
        word_to_id_: Vocabulary mapping; only its length is used here, to
            fix the number of columns of ``R``.

    Returns:
        The tuple ``(D, R)``. ``R`` has shape
        ``(D.shape[-1], len(word_to_id_))``.
    """
    D = np.load(d_file)
    # R is stored as a flat binary file, so its shape has to be supplied:
    # rows follow D's trailing axis, columns follow the vocabulary size.
    r_shape = (D.shape[-1], len(word_to_id_))
    R = np.memmap(r_file, dtype='float32', mode='r', shape=r_shape)
    logger.info(f'D size: {D.shape}, R size: {R.shape}')
    return D, R
def query_to_ids(
    query: str,
    word_to_id_: dict[str, int],
    stemming: bool,
    lower: bool = True,
):
    """Convert a free-text query into a list of vocabulary ids.

    The query is optionally lower-cased, stripped of every character in
    ``string.punctuation``, tokenized with ``nltk.word_tokenize`` and
    optionally Porter-stemmed. Tokens missing from ``word_to_id_`` are
    dropped.

    Args:
        query: Raw query text.
        word_to_id_: Mapping from (possibly stemmed) token to integer id.
        stemming: When true, each token is passed through ``PorterStemmer``.
        lower: When true (the default), the query is lower-cased first.

    Returns:
        The ids of the in-vocabulary tokens, in token order. The list is
        empty when no token is in the vocabulary.
    """
    from nltk.stem.porter import PorterStemmer

    text = query.lower() if lower else query
    # TODO: weight "*" process
    # Delete punctuation outright (no replacement character is inserted).
    text = text.translate(str.maketrans("", "", string.punctuation))
    tokens = nltk.word_tokenize(text)
    if stemming:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    # Out-of-vocabulary tokens are skipped; an empty result means nothing matched.
    ids = []
    for token in tokens:
        if token in word_to_id_:
            ids.append(word_to_id_[token])
    return ids
def query_to_vec(
    R: np.ndarray,
    y: list[int]
):
    """Sum the columns of ``R`` selected by ``y`` into one query vector.

    Args:
        R: 2-D array; the column at index ``i`` is the vector added for id ``i``.
        y: Column indices to add up. Repeated indices are added repeatedly.

    Returns:
        A float64 vector of length ``R.shape[0]``. It is all zeros when
        ``y`` is empty.
    """
    accumulator = np.zeros(R.shape[0])
    for column in y:
        # Accumulate in place so the sum stays in the float64 buffer.
        np.add(accumulator, R[:, column], out=accumulator)
    return accumulator
def search(
    args: argparse.Namespace,
    df: pandas.DataFrame,
    k: int,
    y: list[int],
    R: np.ndarray,
    D: np.ndarray
):
    """Score ``D`` against the query ids ``y`` and return the top-``k`` indices.

    The query vector is built with ``query_to_vec(R, y)`` and compared with
    ``D`` using the metric named by ``args.metric``.

    Args:
        args: Namespace whose ``metric`` attribute is ``'COSINE'`` or
            ``'INNER_PRODUCT'``.
        df: Not used by this function; kept so existing callers keep working.
        k: Maximum number of indices to return.
        y: Vocabulary ids making up the query.
        R: Array whose columns are summed to form the query vector.
        D: Array scored against the query vector (presumably one row per
            document; confirm against the caller).

    Returns:
        The tuple ``(scores, docids)``. ``scores`` is the full score array
        and ``docids`` holds the indices of the ``k`` highest scores in
        descending score order.

    Raises:
        ValueError: If ``args.metric`` is not a supported metric name.
    """
    metric = args.metric
    # Reject unknown metrics before any work is done; previously this fell
    # through both branches and failed later on an unbound ``scores``.
    if metric not in ('COSINE', 'INNER_PRODUCT'):
        raise ValueError(
            f"Unsupported metric {metric!r}; expected 'COSINE' or 'INNER_PRODUCT'"
        )

    qvec = query_to_vec(R, y)
    if metric == 'COSINE':
        scores = cosine_similarity([qvec], D)[0]
    else:
        scores = D @ qvec
    # argsort is ascending, so reverse it before taking the first k entries.
    docids = np.argsort(scores)[::-1][:k]
    return scores, docids