|
import logging |
|
import re |
|
import nltk |
|
import pandas as pd |
|
import numpy as np |
|
import gradio as gr |
|
|
|
# Root-logger configuration: basicConfig installs a stderr handler formatted
# with `fmt` at WARNING, then the root level is lowered to INFO so this app's
# own info messages get through while the handler/format stay unchanged.
fmt = "%(asctime)s %(levelname)s %(name)s :%(message)s"

logging.basicConfig(level=logging.WARNING, format=fmt)

logger = logging.getLogger()

logger.setLevel(logging.INFO)

# Project-local helpers: retrieval machinery (utils) and arXiv fetching /
# tokenization (_arxiv).
# NOTE(review): these are imported after logging is configured — presumably so
# their module-level loggers inherit this setup; confirm before reordering.
from utils import load_matrix, query_to_ids, search

from _arxiv import extract_title_abst, doc_to_ids

# Tokenizer models required by nltk word tokenization (no-op if already cached).
nltk.download('punkt')
|
|
|
def get_args():
    """Return the static configuration for the 2019+ ACL search demo.

    Keys cover the on-disk artifacts (paper metadata, document
    representations, projection matrix, vocabulary) plus the number of
    hits to show and the similarity metric name.
    """
    config = dict(
        acl_data_file='data/acl-pub-info-2019-title-abst.parquet',
        docs_rep_file='data/Docs-rep-2019-h500.npy',
        r_matrix_file='data/Rmatrix-2019-h500.dat',
        vocab_file='data/vocab_2019.npy',
        topk=20,
        metric='INNER_PRODUCT',
    )
    return config
|
|
|
class ObjectView(object):
    """Expose the keys of a dict as attributes (args.topk ≡ d['topk'])."""

    def __init__(self, d):
        # Adopt (not copy) the mapping as the instance __dict__, so every
        # attribute read goes straight through to the original dict.
        self.__dict__ = d
|
|
|
|
|
def _format(s: float, year: str, authors: str, title: str, url: str): |
|
authors = ', '.join(authors.replace(',','').replace('\\', '').split(' and\n')) |
|
authors = re.sub('[{}]', '', authors) |
|
title = re.sub('[{\}]', '', title) |
|
title_with_url_markdown = f'[{title}]({url})' |
|
url = url.rstrip('/') |
|
pdf_url = f'[click]({url}.pdf)' |
|
return [round(s,2), year, title_with_url_markdown, authors, pdf_url] |
|
|
|
def main(args: ObjectView):
    """Load the corpus artifacts and launch the Gradio search UI.

    Args:
        args: configuration exposing acl_data_file, docs_rep_file,
            r_matrix_file, vocab_file, topk and metric (see get_args()).

    Side effects:
        Reads the data files from disk and starts a blocking Gradio
        server via demo.launch(debug=True).
    """
    # Paper metadata (year/author/title/url) indexed by document id.
    df = pd.read_parquet(args.acl_data_file)

    # Vocabulary mapping plus the document-representation and projection
    # matrices consumed by search().
    word_to_id_ = np.load(args.vocab_file, allow_pickle=True).item()
    D, R = load_matrix(args.docs_rep_file, args.r_matrix_file, word_to_id_)

    def _rank(y):
        # Shared ranking/formatting path for both entry points (the two
        # original handlers duplicated this code verbatim).
        if y == []:
            # No recognized tokens -> single placeholder row for the table.
            return [[None, 'N/A', 'N/A', 'N/A', 'N/A']]
        scores, docids = search(args, df, args.topk, y, R, D)
        hits = df.iloc[docids]
        return [
            _format(s, year, authors, title, url)
            for s, year, authors, title, url in zip(
                scores[docids], hits["year"], hits["author"], hits["title"], hits["url"]
            )
        ]

    def _search(query: str):
        # Keyword search: map the free-text query to vocabulary ids.
        return _rank(query_to_ids(query, word_to_id_, stemming=True))

    def _search_arxiv(arxiv_id: str):
        # Similar-paper search: fetch title+abstract for the arXiv id, then
        # rank with the same machinery as the keyword path.
        doc = extract_title_abst(arxiv_id)
        return _rank(doc_to_ids(doc, word_to_id_, stemming=True))

    with gr.Blocks() as demo:
        gr.HTML(
            """
            <div style="text-align: center; max-width: 650px; margin: 0 auto;">
              <div
                style="
                  display: inline-flex;
                  align-items: center;
                  gap: 1rem;
                  font-size: 1.75rem;
                "
              >
                <svg width="68" height="46" xmlns="http://www.w3.org/2000/svg">
                  <path
                    d="M 41.977553,-2.8421709e-014 C 41.977553,1.76178 41.977553,1.44211 41.977553,3.0158 L 7.4869054,3.0158 L 0,3.0158 L 0,10.50079 L 0,38.47867 L 0,46 L 7.4869054,46 L 49.500802,46 L 56.987708,46 L 68,46 L 68,30.99368 L 56.987708,30.99368 L 56.987708,10.50079 L 56.987708,3.0158 C 56.987708,1.44211 56.987708,1.76178 56.987708,-2.8421709e-014 L 41.977553,-2.8421709e-014 z M 15.010155,17.98578 L 41.977553,17.98578 L 41.977553,30.99368 L 15.010155,30.99368 L 15.010155,17.98578 z "
                    style="fill:#ed1c24;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:12.89541149;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
                  />
                </svg>
                <h1 style="font-weight: 900; margin-bottom: 0">
                  ACL2Vec
                </h1>
              </div>
              <p style="margin: 15px 0 5px; font-size: 100%; text-align: justify">
                This is a light-weighted version of <a href=http://clml.ism.ac.jp/ACL2Vec/>ACL2Vec keyword search</a>, implemented in a totally statistical manner.
                Start typing below to search papers limited to 2019 onwards and up to September 2022.
              </p>
            </div>
            """)
        with gr.Row():
            inputs = gr.Textbox(placeholder="Input keywords separated by spaces.", show_label=False)
            inputs_arxiv = gr.Textbox(placeholder="Input arxiv number and press Enter to find similar papers.", show_label=False)

        outputs = gr.Dataframe(
            headers=['score', 'year', 'title', 'authors', 'PDF'],
            datatype=["number", "str", "markdown", "str", "markdown"],
            col_count=(5, "fixed"),
            wrap=True,
            label=f"top-{args.topk} results"
        )
        # Live search while typing; the arXiv lookup (a network call) only
        # fires on Enter.
        inputs.change(_search, inputs, outputs)
        inputs_arxiv.submit(_search_arxiv, inputs_arxiv, outputs)

    demo.launch(
        debug=True
    )
|
|
|
if __name__ == '__main__':
    # Wrap the config dict so main() can use attribute access (args.topk).
    cli_args = ObjectView(get_args())
    main(cli_args)