Spaces:

Yuichiroh
/

ACL2Vec

Running

File size: 1,744 Bytes

bcdda34

from __future__ import annotations

import logging
import argparse
import re
import string

import nltk
import pandas
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

logger = logging.getLogger(__name__)

def load_matrix(
    d_file: str,
    r_file: str,
    word_to_id_: dict[str, int]
):
    """Load the ``D`` array and memory-map the ``R`` array from disk.

    Args:
        d_file: Path to a file readable by ``numpy.load`` holding ``D``.
        r_file: Path to a raw ``float32`` file holding ``R``.
        word_to_id_: Vocabulary mapping; only its length is used, to fix
            the number of columns of ``R``.

    Returns:
        A ``(D, R)`` tuple. ``R`` is a read-only ``numpy.memmap`` of shape
        ``(D.shape[-1], len(word_to_id_))``.
    """
    doc_matrix = np.load(d_file)
    # The raw file carries no header, so the shape has to be supplied:
    # rows follow the last axis of D, columns follow the vocabulary size.
    r_shape = (doc_matrix.shape[-1], len(word_to_id_))
    word_matrix = np.memmap(r_file, dtype='float32', mode='r', shape=r_shape)
    logger.info(f'D size: {doc_matrix.shape}, R size: {word_matrix.shape}')
    return doc_matrix, word_matrix

def query_to_ids(
        query: str,
        word_to_id_: dict[str, int],
        stemming: bool,
        lower: bool = True,
    ):
    """Convert a free-text query into a list of vocabulary ids.

    The query is optionally lower-cased, stripped of every character in
    ``string.punctuation``, tokenized with ``nltk.word_tokenize`` and
    optionally Porter-stemmed. Tokens missing from ``word_to_id_`` are
    dropped.

    Args:
        query: Raw query text.
        word_to_id_: Mapping from token to integer id.
        stemming: When true, stem each token with NLTK's ``PorterStemmer``.
        lower: When true (default), lower-case the query first.

    Returns:
        The ids of the in-vocabulary tokens, in token order. An empty list
        means no token of the query matched the vocabulary.
    """
    from nltk.stem.porter import PorterStemmer

    text = query.lower() if lower else query
    # TODO: weight "*" process
    # Punctuation characters are deleted outright (not replaced by a space).
    text = text.translate(str.maketrans("", "", string.punctuation))
    tokens = nltk.word_tokenize(text)
    if stemming:
        stem = PorterStemmer().stem
        tokens = [stem(token) for token in tokens]

    # Out-of-vocabulary tokens are skipped; an empty result means no match.
    token_ids = [word_to_id_[token] for token in tokens if token in word_to_id_]
    return token_ids

def query_to_vec(
        R: np.ndarray,
        y: list[int]
    ):
    """Sum the columns of ``R`` selected by ``y`` into one query vector.

    Args:
        R: Two-dimensional array; each id in ``y`` selects one column.
        y: Column indices to add up. Repeated ids are added repeatedly.

    Returns:
        A 1-D ``float64`` array of length ``R.shape[0]``; all zeros when
        ``y`` is empty.
    """
    n_dims = R.shape[0]
    accumulator = np.zeros(n_dims)
    for column in y:
        # In-place add keeps the accumulator in float64 regardless of R's dtype.
        np.add(accumulator, R[:, column], out=accumulator)
    return accumulator


def search(
        args: argparse.Namespace,
        df: pandas.DataFrame,
        k: int,
        y: list[int],
        R: np.ndarray,
        D: np.ndarray
    ):
    """Score every row of ``D`` against the query and return the top ``k``.

    The query vector is built by ``query_to_vec(R, y)`` and compared with
    the rows of ``D`` using the metric named by ``args.metric``.

    Args:
        args: Namespace whose ``metric`` attribute is ``'COSINE'`` or
            ``'INNER_PRODUCT'``.
        df: Not used by this function; kept for interface compatibility.
        k: Number of top-scoring row indices to return.
        y: Vocabulary ids of the query tokens.
        R: Array whose columns are selected by ``y``.
        D: Array scored row-wise against the query vector
            (assumed 2-D, one row per document -- confirm with caller).

    Returns:
        A ``(scores, docids)`` tuple: ``scores`` holds one score per row of
        ``D``; ``docids`` holds the indices of the ``k`` highest scores in
        descending score order.

    Raises:
        ValueError: If ``args.metric`` is not a supported metric name.
    """
    metric = args.metric
    # Fail fast with a clear message; previously an unknown metric left
    # ``scores`` unbound and crashed later with UnboundLocalError.
    if metric not in ('COSINE', 'INNER_PRODUCT'):
        raise ValueError(
            f"Unsupported metric {metric!r}; expected 'COSINE' or 'INNER_PRODUCT'"
        )

    qvec = query_to_vec(R, y)
    if metric == 'COSINE':
        scores = cosine_similarity([qvec], D)[0]
    else:
        scores = D @ qvec

    # argsort is ascending, so reverse it before taking the first k indices.
    docids = np.argsort(scores)[::-1][:k]

    return scores, docids