File size: 5,301 Bytes
bcdda34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import logging
import re
import nltk
import pandas as pd
import numpy as np
import gradio as gr

# Root-logger configuration: basicConfig installs a handler at WARNING
# (quieting chatty third-party loggers), then the root logger itself is
# raised to INFO so this app's own messages still get through.
fmt = "%(asctime)s %(levelname)s %(name)s :%(message)s"
logging.basicConfig(level=logging.WARNING, format=fmt)
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# NOTE(review): these project imports are deliberately placed after the
# logging setup — presumably so any logging they do at import time uses
# the configuration above; confirm before reordering.
from utils import load_matrix, query_to_ids, search
from _arxiv import extract_title_abst, doc_to_ids

# Fetch the NLTK 'punkt' tokenizer data (no-op if already downloaded);
# presumably required by the tokenization/stemming inside
# query_to_ids / doc_to_ids — TODO confirm against utils/_arxiv.
nltk.download('punkt')

def get_args():
    """Return the default runtime configuration as a plain dict.

    Paths point at the pre-built 2019+ ACL data files; ``topk`` bounds
    the number of results returned and ``metric`` selects the scoring
    measure used by the search backend.
    """
    config = dict(
        acl_data_file='data/acl-pub-info-2019-title-abst.parquet',
        docs_rep_file='data/Docs-rep-2019-h500.npy',
        r_matrix_file='data/Rmatrix-2019-h500.dat',
        vocab_file='data/vocab_2019.npy',
        topk=20,
        metric='INNER_PRODUCT',  # alternative choice: 'COSINE'
    )
    return config

class ObjectView(object):
    """Lightweight attribute-access view over a plain dict.

    The supplied mapping becomes the instance ``__dict__`` directly, so
    every key is readable (and writable) as an attribute.
    """

    def __init__(self, d):
        # Reuse the caller's dict as instance storage — no copy is made.
        self.__dict__ = d


def _format(s: float, year: str, authors: str, title: str, url: str):
    authors = ', '.join(authors.replace(',','').replace('\\', '').split('  and\n'))
    authors = re.sub('[{}]', '', authors)
    title = re.sub('[{\}]', '', title)
    title_with_url_markdown = f'[{title}]({url})'
    url = url.rstrip('/')
    pdf_url = f'[click]({url}.pdf)'
    return [round(s,2), year, title_with_url_markdown, authors, pdf_url]

def main(args: ObjectView):
    """Load the precomputed matrices and serve the Gradio search UI.

    args: configuration (see get_args()) providing the data-file paths,
    ``topk`` and ``metric``. Blocks in demo.launch() until the app exits.
    """
    df = pd.read_parquet(args.acl_data_file)
    #logger.info(f'document size: {len(df)}')
    # Vocabulary was saved with np.save on a dict (word -> id); .item()
    # unwraps the 0-d object array back into that dict.
    word_to_id_ = np.load(args.vocab_file, allow_pickle=True).item()
    D, R = load_matrix(args.docs_rep_file, args.r_matrix_file, word_to_id_)

    def _search(query: str):
        # Keyword search: convert the free-text query to vocabulary ids,
        # then rank documents. Returns a list of display rows for the
        # Dataframe component.
        results = []
        y = query_to_ids(query, word_to_id_, stemming=True)
        if y==[]:
            # No query term matched the vocabulary: single placeholder row.
            return [[None,'N/A', 'N/A', 'N/A', 'N/A']]
        else:
            scores, docids = search(args, df, args.topk, y, R, D)
            for s, year, authors, title, url in zip(scores[docids], df.iloc[docids]["year"], df.iloc[docids]["author"], df.iloc[docids]["title"], df.iloc[docids]["url"]):
                results.append(_format(s, year, authors, title, url))
        return results

    def _search_arxiv(arxiv_id: str):
        # Similar-paper search: fetch title+abstract for the given arXiv id
        # via extract_title_abst, map it to vocabulary ids, then rank with
        # the same search path as _search above.
        results = []
        doc = extract_title_abst(arxiv_id)
        y = doc_to_ids(doc, word_to_id_, stemming=True)
        if y==[]:
            # Document yielded no in-vocabulary tokens: placeholder row.
            return [[None,'N/A', 'N/A', 'N/A', 'N/A']]
        else:
            scores, docids = search(args, df, args.topk, y, R, D)
            for s, year, authors, title, url in zip(scores[docids], df.iloc[docids]["year"], df.iloc[docids]["author"], df.iloc[docids]["title"], df.iloc[docids]["url"]):
                results.append(_format(s, year, authors, title, url))
        return results

    # UI layout: static header, two input boxes (keyword / arXiv id), and
    # one shared results table that both search paths write into.
    with gr.Blocks() as demo:
        gr.HTML(
        """
            <div style="text-align: center; max-width: 650px; margin: 0 auto;">
                <div
                    style="
                      display: inline-flex;
                      align-items: center;
                      gap: 1rem;
                      font-size: 1.75rem;
                    "
                >
                    <svg width="68" height="46" xmlns="http://www.w3.org/2000/svg">
                        <path
                            d="M 41.977553,-2.8421709e-014 C 41.977553,1.76178 41.977553,1.44211 41.977553,3.0158 L 7.4869054,3.0158 L 0,3.0158 L 0,10.50079 L 0,38.47867 L 0,46 L 7.4869054,46 L 49.500802,46 L 56.987708,46 L 68,46 L 68,30.99368 L 56.987708,30.99368 L 56.987708,10.50079 L 56.987708,3.0158 C 56.987708,1.44211 56.987708,1.76178 56.987708,-2.8421709e-014 L 41.977553,-2.8421709e-014 z M 15.010155,17.98578 L 41.977553,17.98578 L 41.977553,30.99368 L 15.010155,30.99368 L 15.010155,17.98578 z "
                            style="fill:#ed1c24;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:12.89541149;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
                        />
                    </svg>
                    <h1 style="font-weight: 900; margin-bottom: 0">
                        ACL2Vec
                    </h1>
                </div>
                <p style="margin: 15px 0 5px; font-size: 100%; text-align: justify">
                    This is a light-weighted version of <a href=http://clml.ism.ac.jp/ACL2Vec/>ACL2Vec keyword search</a>, implemented in a totally statistical manner.
                    Start typing below to search papers limited to 2019 onwards and up to September 2022.
                </p>
            </div>
        """)
        with gr.Row():
            inputs = gr.Textbox(placeholder="Input keywords separated by spaces.", show_label=False)
            inputs_arxiv = gr.Textbox(placeholder="Input arxiv number and press Enter to find similar papers.", show_label=False)

        outputs = gr.Dataframe(
            headers=['score', 'year', 'title', 'authors', 'PDF'],
            datatype=["number", "str", "markdown", "str", "markdown"],
            col_count=(5, "fixed"),
            wrap=True,
            label=f"top-{args.topk} results"
        )
        # Keyword search re-runs on every text change (search-as-you-type);
        # the arXiv lookup fires only on Enter (submit).
        inputs.change(_search, inputs, outputs)
        inputs_arxiv.submit(_search_arxiv, inputs_arxiv, outputs)

        demo.launch(
            #share=True, 
            debug=True
        )

# Script entry point: wrap the config dict for attribute-style access
# and start the web app (main blocks until the server stops).
if __name__ == '__main__':
    args = ObjectView(get_args())
    main(args)