|
import logging |
|
import re |
|
import nltk |
|
import pandas as pd |
|
import numpy as np |
|
import gradio as gr |
|
|
|
# Root-logger configuration: basicConfig installs a stderr handler formatted
# with `fmt` at WARNING, then the root level is lowered to INFO so this app's
# own info messages get through while the handler/format stay unchanged.
fmt = "%(asctime)s %(levelname)s %(name)s :%(message)s"

logging.basicConfig(level=logging.WARNING, format=fmt)

logger = logging.getLogger()

logger.setLevel(logging.INFO)

# Project-local helpers: retrieval machinery (utils) and arXiv fetching /
# tokenization (_arxiv).
# NOTE(review): these are imported after logging is configured — presumably so
# their module-level loggers inherit this setup; confirm before reordering.
from utils import load_matrix, query_to_ids, search

from _arxiv import extract_title_abst, doc_to_ids

# Tokenizer models required by nltk word tokenization (no-op if already cached).
nltk.download('punkt')
|
|
|
def get_args():
    """Return the static configuration for the 2019+ ACL search demo.

    Keys cover the on-disk artifacts (paper metadata, document
    representations, projection matrix, vocabulary) plus the number of
    hits to show and the similarity metric name.
    """
    config = dict(
        acl_data_file='data/acl-pub-info-2019-title-abst.parquet',
        docs_rep_file='data/Docs-rep-2019-h500.npy',
        r_matrix_file='data/Rmatrix-2019-h500.dat',
        vocab_file='data/vocab_2019.npy',
        topk=20,
        metric='INNER_PRODUCT',
    )
    return config
|
|
|
class ObjectView(object):
    """Expose the keys of a dict as attributes (args.topk ≡ d['topk'])."""

    def __init__(self, d):
        # Adopt (not copy) the mapping as the instance __dict__, so every
        # attribute read goes straight through to the original dict.
        self.__dict__ = d
|
|
|
|
|
def _format(s: float, year: str, authors: str, title: str, url: str): |
|
authors = ', '.join(authors.replace(',','').replace('\\', '').split(' and\n')) |
|
authors = re.sub('[{}]', '', authors) |
|
title = re.sub('[{\}]', '', title) |
|
title_with_url_markdown = f'[{title}]({url})' |
|
url = url.rstrip('/') |
|
pdf_url = f'[click]({url}.pdf)' |
|
return [round(s,2), year, title_with_url_markdown, authors, pdf_url] |
|
|
|
def main(args: ObjectView):
    """Load the corpus artifacts and launch the Gradio search UI.

    Args:
        args: configuration exposing acl_data_file, docs_rep_file,
            r_matrix_file, vocab_file, topk and metric (see get_args()).

    Side effects:
        Reads the data files from disk and starts a blocking Gradio
        server via demo.launch(debug=True).
    """
    # Paper metadata (year/author/title/url) indexed by document id.
    df = pd.read_parquet(args.acl_data_file)

    # Vocabulary mapping plus the document-representation and projection
    # matrices consumed by search().
    word_to_id_ = np.load(args.vocab_file, allow_pickle=True).item()
    D, R = load_matrix(args.docs_rep_file, args.r_matrix_file, word_to_id_)

    def _rank(y):
        # Shared ranking/formatting path for both entry points (the two
        # original handlers duplicated this code verbatim).
        if y == []:
            # No recognized tokens -> single placeholder row for the table.
            return [[None, 'N/A', 'N/A', 'N/A', 'N/A']]
        scores, docids = search(args, df, args.topk, y, R, D)
        hits = df.iloc[docids]
        return [
            _format(s, year, authors, title, url)
            for s, year, authors, title, url in zip(
                scores[docids], hits["year"], hits["author"], hits["title"], hits["url"]
            )
        ]

    def _search(query: str):
        # Keyword search: map the free-text query to vocabulary ids.
        return _rank(query_to_ids(query, word_to_id_, stemming=True))

    def _search_arxiv(arxiv_id: str):
        # Similar-paper search: fetch title+abstract for the arXiv id, then
        # rank with the same machinery as the keyword path.
        doc = extract_title_abst(arxiv_id)
        return _rank(doc_to_ids(doc, word_to_id_, stemming=True))

    with gr.Blocks() as demo:
        gr.HTML(
            """
            <div style="text-align: center; max-width: 650px; margin: 0 auto;">
              <div
                style="
                  display: inline-flex;
                  align-items: center;
                  gap: 1rem;
                  font-size: 1.75rem;
                "
              >
                <svg width="68" height="46" xmlns="http://www.w3.org/2000/svg">
                  <path
                    d="M 41.977553,-2.8421709e-014 C 41.977553,1.76178 41.977553,1.44211 41.977553,3.0158 L 7.4869054,3.0158 L 0,3.0158 L 0,10.50079 L 0,38.47867 L 0,46 L 7.4869054,46 L 49.500802,46 L 56.987708,46 L 68,46 L 68,30.99368 L 56.987708,30.99368 L 56.987708,10.50079 L 56.987708,3.0158 C 56.987708,1.44211 56.987708,1.76178 56.987708,-2.8421709e-014 L 41.977553,-2.8421709e-014 z M 15.010155,17.98578 L 41.977553,17.98578 L 41.977553,30.99368 L 15.010155,30.99368 L 15.010155,17.98578 z "
                    style="fill:#ed1c24;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:12.89541149;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
                  />
                </svg>
                <h1 style="font-weight: 900; margin-bottom: 0">
                  ACL2Vec
                </h1>
              </div>
              <p style="margin: 15px 0 5px; font-size: 100%; text-align: justify">
                This is a light-weighted version of <a href=http://clml.ism.ac.jp/ACL2Vec/>ACL2Vec keyword search</a>, implemented in a totally statistical manner.
                Start typing below to search papers limited to 2019 onwards and up to September 2022.
              </p>
            </div>
            """)
        with gr.Row():
            inputs = gr.Textbox(placeholder="Input keywords separated by spaces.", show_label=False)
            inputs_arxiv = gr.Textbox(placeholder="Input arxiv number and press Enter to find similar papers.", show_label=False)

        outputs = gr.Dataframe(
            headers=['score', 'year', 'title', 'authors', 'PDF'],
            datatype=["number", "str", "markdown", "str", "markdown"],
            col_count=(5, "fixed"),
            wrap=True,
            label=f"top-{args.topk} results"
        )
        # Live search while typing; the arXiv lookup (a network call) only
        # fires on Enter.
        inputs.change(_search, inputs, outputs)
        inputs_arxiv.submit(_search_arxiv, inputs_arxiv, outputs)

    demo.launch(
        debug=True
    )
|
|
|
if __name__ == '__main__':
    # Wrap the config dict so main() can use attribute access (args.topk).
    cli_args = ObjectView(get_args())
    main(cli_args)