# ACL2Vec / app.py
# 4kasha
# initial commit
# bcdda34
import logging
import re
import nltk
import pandas as pd
import numpy as np
import gradio as gr
# Root-logger configuration. basicConfig applies the record format below with a
# WARNING threshold; the explicit setLevel that follows then lowers the root
# logger's threshold to INFO, so INFO records are emitted as well.
fmt = "%(asctime)s %(levelname)s %(name)s :%(message)s"
logging.basicConfig(level=logging.WARNING, format=fmt)
# getLogger() with no name returns the root logger.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
from utils import load_matrix, query_to_ids, search
from _arxiv import extract_title_abst, doc_to_ids
# Fetch NLTK's 'punkt' resource when this module is loaded.
# NOTE(review): presumably required by the tokenization done in utils / _arxiv — confirm.
nltk.download('punkt')
def get_args() -> dict:
    """Return the default settings of the demo as a fresh dict.

    Keys: the four data-file paths (``acl_data_file``, ``docs_rep_file``,
    ``r_matrix_file``, ``vocab_file``), the number of results to show
    (``topk``) and the similarity ``metric`` name.
    """
    settings = dict(
        acl_data_file='data/acl-pub-info-2019-title-abst.parquet',
        docs_rep_file='data/Docs-rep-2019-h500.npy',
        r_matrix_file='data/Rmatrix-2019-h500.dat',
        vocab_file='data/vocab_2019.npy',
        topk=20,
        # choices=['COSINE', 'INNER_PRODUCT']
        metric='INNER_PRODUCT',
    )
    return settings
class ObjectView:
    """Expose the entries of a dict as attributes of an object.

    The dict handed to the constructor becomes the instance's ``__dict__``
    itself (it is not copied), so the object and the dict share storage:
    a change made through one is visible through the other.
    """

    def __init__(self, d):
        # Rebind instead of copying; attribute access now reads straight from ``d``.
        self.__dict__ = d
def _format(s: float, year: str, authors: str, title: str, url: str):
authors = ', '.join(authors.replace(',','').replace('\\', '').split(' and\n'))
authors = re.sub('[{}]', '', authors)
title = re.sub('[{\}]', '', title)
title_with_url_markdown = f'[{title}]({url})'
url = url.rstrip('/')
pdf_url = f'[click]({url}.pdf)'
return [round(s,2), year, title_with_url_markdown, authors, pdf_url]
def main(args: ObjectView):
    """Load the search data and serve the ACL2Vec Gradio demo.

    Reads the paper table, the vocabulary and the two matrices named by
    ``args``, then builds a UI with two text boxes that feed one results
    table: a keyword box (searched on every change) and an arXiv-number box
    (searched when Enter is pressed). The call ends in ``demo.launch``.

    Args:
        args: settings object providing ``acl_data_file``, ``docs_rep_file``,
            ``r_matrix_file``, ``vocab_file`` and ``topk``; it is also passed
            on to ``search`` as a whole.
    """
    df = pd.read_parquet(args.acl_data_file)

    # The vocabulary is stored as a pickled object inside a .npy file and
    # unwrapped with .item() (presumably a word -> id mapping — confirm).
    # NOTE(review): allow_pickle=True unpickles the file on load; this is only
    # acceptable because the file ships with the app and is not user-supplied.
    word_to_id_ = np.load(args.vocab_file, allow_pickle=True).item()
    D, R = load_matrix(args.docs_rep_file, args.r_matrix_file, word_to_id_)

    def _rank(y):
        """Return formatted top-k rows for token ids ``y`` (shared by both searches)."""
        # Kept as an explicit comparison with [] because the return type of
        # the two tokenizers is not visible from this file.
        if y == []:
            # Placeholder row shown when no usable token was found.
            return [[None, 'N/A', 'N/A', 'N/A', 'N/A']]
        scores, docids = search(args, df, args.topk, y, R, D)
        # Slice the table once instead of once per column.
        hits = df.iloc[docids]
        return [
            _format(s, year, authors, title, url)
            for s, year, authors, title, url in zip(
                scores[docids], hits["year"], hits["author"], hits["title"], hits["url"]
            )
        ]

    def _search(query: str):
        """Search by free-text keywords."""
        return _rank(query_to_ids(query, word_to_id_, stemming=True))

    def _search_arxiv(arxiv_id: str):
        """Search for papers similar to the arXiv entry ``arxiv_id``."""
        doc = extract_title_abst(arxiv_id)
        return _rank(doc_to_ids(doc, word_to_id_, stemming=True))

    with gr.Blocks() as demo:
        # The HTML lines are kept flush-left so the literal stays byte-identical.
        gr.HTML(
            """
<div style="text-align: center; max-width: 650px; margin: 0 auto;">
<div
style="
display: inline-flex;
align-items: center;
gap: 1rem;
font-size: 1.75rem;
"
>
<svg width="68" height="46" xmlns="http://www.w3.org/2000/svg">
<path
d="M 41.977553,-2.8421709e-014 C 41.977553,1.76178 41.977553,1.44211 41.977553,3.0158 L 7.4869054,3.0158 L 0,3.0158 L 0,10.50079 L 0,38.47867 L 0,46 L 7.4869054,46 L 49.500802,46 L 56.987708,46 L 68,46 L 68,30.99368 L 56.987708,30.99368 L 56.987708,10.50079 L 56.987708,3.0158 C 56.987708,1.44211 56.987708,1.76178 56.987708,-2.8421709e-014 L 41.977553,-2.8421709e-014 z M 15.010155,17.98578 L 41.977553,17.98578 L 41.977553,30.99368 L 15.010155,30.99368 L 15.010155,17.98578 z "
style="fill:#ed1c24;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:12.89541149;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
/>
</svg>
<h1 style="font-weight: 900; margin-bottom: 0">
ACL2Vec
</h1>
</div>
<p style="margin: 15px 0 5px; font-size: 100%; text-align: justify">
This is a light-weighted version of <a href=http://clml.ism.ac.jp/ACL2Vec/>ACL2Vec keyword search</a>, implemented in a totally statistical manner.
Start typing below to search papers limited to 2019 onwards and up to September 2022.
</p>
</div>
""")
        with gr.Row():
            inputs = gr.Textbox(placeholder="Input keywords separated by spaces.", show_label=False)
            inputs_arxiv = gr.Textbox(placeholder="Input arxiv number and press Enter to find similar papers.", show_label=False)
        outputs = gr.Dataframe(
            headers=['score', 'year', 'title', 'authors', 'PDF'],
            datatype=["number", "str", "markdown", "str", "markdown"],
            col_count=(5, "fixed"),
            wrap=True,
            label=f"top-{args.topk} results"
        )
        # Keyword search re-runs on every edit; the arXiv lookup only on Enter.
        inputs.change(_search, inputs, outputs)
        inputs_arxiv.submit(_search_arxiv, inputs_arxiv, outputs)

    demo.launch(
        #share=True,
        debug=True
    )
if __name__ == "__main__":
    # Script entry point: wrap the default settings so they can be read as
    # attributes, then build and launch the demo.
    args = ObjectView(get_args())
    main(args)