# ACL2Vec / utils.py
from __future__ import annotations

import argparse
import logging
import string

import nltk
import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity

logger = logging.getLogger(__name__)

def load_matrix(
    d_file: str,
    r_file: str,
    word_to_id_: dict[str, int],
) -> tuple[np.ndarray, np.memmap]:
    """Load the dense document matrix D and the memory-mapped word matrix R."""
    D = np.load(d_file)
    # R is stored as (embedding_dim, vocab_size); memory-mapping keeps the
    # full word matrix out of RAM.
    R = np.memmap(r_file, dtype='float32', mode='r',
                  shape=(D.shape[-1], len(word_to_id_)))
    logger.info(f'D size: {D.shape}, R size: {R.shape}')
    return D, R
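
# A minimal usage sketch; the artifact file names below are assumptions for
# illustration, not the repo's documented names:
#
#   D, R = load_matrix('doc_matrix.npy', 'word_matrix.dat', word_to_id)
#   # D: (num_docs, dim) document embeddings; R: (dim, vocab_size) word embeddings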

def query_to_ids(
    query: str,
    word_to_id_: dict[str, int],
    stemming: bool,
    lower: bool = True,
) -> list[int]:
    """Tokenize a query and map each in-vocabulary token to its word id."""
    if lower:
        query = query.lower()
    # TODO: weight "*" process
    # Strip punctuation before tokenizing.
    query = "".join(char for char in query if char not in string.punctuation)
    words = nltk.word_tokenize(query)
    if stemming:
        porter = PorterStemmer()
        words = [porter.stem(word) for word in words]
    # Skip out-of-vocabulary words; an empty result means no matched documents.
    return [word_to_id_[word] for word in words if word in word_to_id_]
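
# With stemming enabled, a query is lowered, stripped of punctuation,
# tokenized, and Porter-stemmed before lookup. The tiny vocabulary below is
# illustrative only:
#
#   word_to_id = {'neural': 0, 'machin': 1, 'translat': 2}
#   query_to_ids('Neural Machine Translation!', word_to_id, stemming=True)
#   # -> [0, 1, 2]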

def query_to_vec(
    R: np.ndarray,
    y: list[int],
) -> np.ndarray:
    """Build a query vector as the sum of the word embeddings for ids in y."""
    qvec = np.zeros((R.shape[0],))
    for ind in y:
        qvec += R[:, ind]
    return qvec
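
# The loop above is equivalent to summing the id-indexed columns in one shot;
# a vectorized sketch (note it yields float32, while the loop accumulates
# into a float64 buffer):
#
#   qvec = R[:, y].sum(axis=1)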

def search(
    args: argparse.Namespace,
    df: pd.DataFrame,
    k: int,
    y: list[int],
    R: np.ndarray,
    D: np.ndarray,
) -> tuple[np.ndarray, np.ndarray]:
    """Score every document against the query and return the top-k doc ids."""
    qvec = query_to_vec(R, y)
    if args.metric == 'COSINE':
        scores = cosine_similarity([qvec], D)[0]
    elif args.metric == 'INNER_PRODUCT':
        scores = D @ qvec
    else:
        raise ValueError(f'Unsupported metric: {args.metric}')
    # Rank documents by score, highest first, and keep the top k.
    docids = np.argsort(scores)[::-1][:k]
    return scores, docids
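

if __name__ == '__main__':
    # Minimal smoke-test sketch. The artifact file names and the two-word
    # vocabulary are assumptions for illustration; point them at the matrices
    # actually produced by this repo before running.
    logging.basicConfig(level=logging.INFO)
    word_to_id = {'machin': 0, 'translat': 1}
    args = argparse.Namespace(metric='COSINE')
    D, R = load_matrix('doc_matrix.npy', 'word_matrix.dat', word_to_id)
    df = pd.DataFrame()  # paper metadata; unused by search() itself
    y = query_to_ids('machine translation', word_to_id, stemming=True)
    scores, docids = search(args, df, k=10, y=y, R=R, D=D)
    print(docids, scores[docids])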