import logging import re import nltk import pandas as pd import numpy as np import gradio as gr fmt = "%(asctime)s %(levelname)s %(name)s :%(message)s" logging.basicConfig(level=logging.WARNING, format=fmt) logger = logging.getLogger() logger.setLevel(logging.INFO) from utils import load_matrix, query_to_ids, search from _arxiv import extract_title_abst, doc_to_ids nltk.download('punkt') def get_args(): return { 'acl_data_file': 'data/acl-pub-info-2019-title-abst.parquet', 'docs_rep_file': 'data/Docs-rep-2019-h500.npy', 'r_matrix_file': 'data/Rmatrix-2019-h500.dat', 'vocab_file': 'data/vocab_2019.npy', 'topk': 20, 'metric': 'INNER_PRODUCT', # choices=['COSINE', 'INNER_PRODUCT'] } class ObjectView(object): def __init__(self, d): self.__dict__ = d def _format(s: float, year: str, authors: str, title: str, url: str): authors = ', '.join(authors.replace(',','').replace('\\', '').split(' and\n')) authors = re.sub('[{}]', '', authors) title = re.sub('[{\}]', '', title) title_with_url_markdown = f'[{title}]({url})' url = url.rstrip('/') pdf_url = f'[click]({url}.pdf)' return [round(s,2), year, title_with_url_markdown, authors, pdf_url] def main(args: ObjectView): df = pd.read_parquet(args.acl_data_file) #logger.info(f'document size: {len(df)}') word_to_id_ = np.load(args.vocab_file, allow_pickle=True).item() D, R = load_matrix(args.docs_rep_file, args.r_matrix_file, word_to_id_) def _search(query: str): results = [] y = query_to_ids(query, word_to_id_, stemming=True) if y==[]: return [[None,'N/A', 'N/A', 'N/A', 'N/A']] else: scores, docids = search(args, df, args.topk, y, R, D) for s, year, authors, title, url in zip(scores[docids], df.iloc[docids]["year"], df.iloc[docids]["author"], df.iloc[docids]["title"], df.iloc[docids]["url"]): results.append(_format(s, year, authors, title, url)) return results def _search_arxiv(arxiv_id: str): results = [] doc = extract_title_abst(arxiv_id) y = doc_to_ids(doc, word_to_id_, stemming=True) if y==[]: return [[None,'N/A', 'N/A', 
'N/A', 'N/A']] else: scores, docids = search(args, df, args.topk, y, R, D) for s, year, authors, title, url in zip(scores[docids], df.iloc[docids]["year"], df.iloc[docids]["author"], df.iloc[docids]["title"], df.iloc[docids]["url"]): results.append(_format(s, year, authors, title, url)) return results with gr.Blocks() as demo: gr.HTML( """
This is a lightweight version of ACL2Vec keyword search, implemented in a purely statistical manner. Start typing below to search papers limited to 2019 onwards and up to September 2022.