File size: 5,301 Bytes
bcdda34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import logging
import re
import nltk
import pandas as pd
import numpy as np
import gradio as gr

# Root-logger configuration: basicConfig installs a handler at WARNING
# (quieting chatty third-party loggers), then the root logger itself is
# raised to INFO so this app's own messages still get through.
fmt = "%(asctime)s %(levelname)s %(name)s :%(message)s"
logging.basicConfig(level=logging.WARNING, format=fmt)
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# NOTE(review): these project imports are deliberately placed after the
# logging setup — presumably so any logging they do at import time uses
# the configuration above; confirm before reordering.
from utils import load_matrix, query_to_ids, search
from _arxiv import extract_title_abst, doc_to_ids

# Fetch the NLTK 'punkt' tokenizer data (no-op if already downloaded);
# presumably required by the tokenization/stemming inside
# query_to_ids / doc_to_ids — TODO confirm against utils/_arxiv.
nltk.download('punkt')

def get_args():
    """Return the default runtime configuration as a plain dict.

    Paths point at the pre-built 2019+ ACL data files; ``topk`` bounds
    the number of results returned and ``metric`` selects the scoring
    measure used by the search backend.
    """
    config = dict(
        acl_data_file='data/acl-pub-info-2019-title-abst.parquet',
        docs_rep_file='data/Docs-rep-2019-h500.npy',
        r_matrix_file='data/Rmatrix-2019-h500.dat',
        vocab_file='data/vocab_2019.npy',
        topk=20,
        metric='INNER_PRODUCT',  # alternative choice: 'COSINE'
    )
    return config

class ObjectView(object):
    """Lightweight attribute-access view over a plain dict.

    The supplied mapping becomes the instance ``__dict__`` directly, so
    every key is readable (and writable) as an attribute.
    """

    def __init__(self, d):
        # Reuse the caller's dict as instance storage — no copy is made.
        self.__dict__ = d


def _format(s: float, year: str, authors: str, title: str, url: str):
    authors = ', '.join(authors.replace(',','').replace('\\', '').split('  and\n'))
    authors = re.sub('[{}]', '', authors)
    title = re.sub('[{\}]', '', title)
    title_with_url_markdown = f'[{title}]({url})'
    url = url.rstrip('/')
    pdf_url = f'[click]({url}.pdf)'
    return [round(s,2), year, title_with_url_markdown, authors, pdf_url]

def main(args: ObjectView):
    """Load the precomputed matrices and serve the Gradio search UI.

    args: configuration (see get_args()) providing the data-file paths,
    ``topk`` and ``metric``. Blocks in demo.launch() until the app exits.
    """
    df = pd.read_parquet(args.acl_data_file)
    #logger.info(f'document size: {len(df)}')
    # Vocabulary was saved with np.save on a dict (word -> id); .item()
    # unwraps the 0-d object array back into that dict.
    word_to_id_ = np.load(args.vocab_file, allow_pickle=True).item()
    D, R = load_matrix(args.docs_rep_file, args.r_matrix_file, word_to_id_)

    def _search(query: str):
        # Keyword search: convert the free-text query to vocabulary ids,
        # then rank documents. Returns a list of display rows for the
        # Dataframe component.
        results = []
        y = query_to_ids(query, word_to_id_, stemming=True)
        if y==[]:
            # No query term matched the vocabulary: single placeholder row.
            return [[None,'N/A', 'N/A', 'N/A', 'N/A']]
        else:
            scores, docids = search(args, df, args.topk, y, R, D)
            for s, year, authors, title, url in zip(scores[docids], df.iloc[docids]["year"], df.iloc[docids]["author"], df.iloc[docids]["title"], df.iloc[docids]["url"]):
                results.append(_format(s, year, authors, title, url))
        return results

    def _search_arxiv(arxiv_id: str):
        # Similar-paper search: fetch title+abstract for the given arXiv id
        # via extract_title_abst, map it to vocabulary ids, then rank with
        # the same search path as _search above.
        results = []
        doc = extract_title_abst(arxiv_id)
        y = doc_to_ids(doc, word_to_id_, stemming=True)
        if y==[]:
            # Document yielded no in-vocabulary tokens: placeholder row.
            return [[None,'N/A', 'N/A', 'N/A', 'N/A']]
        else:
            scores, docids = search(args, df, args.topk, y, R, D)
            for s, year, authors, title, url in zip(scores[docids], df.iloc[docids]["year"], df.iloc[docids]["author"], df.iloc[docids]["title"], df.iloc[docids]["url"]):
                results.append(_format(s, year, authors, title, url))
        return results

    # UI layout: static header, two input boxes (keyword / arXiv id), and
    # one shared results table that both search paths write into.
    with gr.Blocks() as demo:
        gr.HTML(
        """
            <div style="text-align: center; max-width: 650px; margin: 0 auto;">
                <div
                    style="
                      display: inline-flex;
                      align-items: center;
                      gap: 1rem;
                      font-size: 1.75rem;
                    "
                >
                    <svg width="68" height="46" xmlns="http://www.w3.org/2000/svg">
                        <path
                            d="M 41.977553,-2.8421709e-014 C 41.977553,1.76178 41.977553,1.44211 41.977553,3.0158 L 7.4869054,3.0158 L 0,3.0158 L 0,10.50079 L 0,38.47867 L 0,46 L 7.4869054,46 L 49.500802,46 L 56.987708,46 L 68,46 L 68,30.99368 L 56.987708,30.99368 L 56.987708,10.50079 L 56.987708,3.0158 C 56.987708,1.44211 56.987708,1.76178 56.987708,-2.8421709e-014 L 41.977553,-2.8421709e-014 z M 15.010155,17.98578 L 41.977553,17.98578 L 41.977553,30.99368 L 15.010155,30.99368 L 15.010155,17.98578 z "
                            style="fill:#ed1c24;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:12.89541149;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
                        />
                    </svg>
                    <h1 style="font-weight: 900; margin-bottom: 0">
                        ACL2Vec
                    </h1>
                </div>
                <p style="margin: 15px 0 5px; font-size: 100%; text-align: justify">
                    This is a light-weighted version of <a href=http://clml.ism.ac.jp/ACL2Vec/>ACL2Vec keyword search</a>, implemented in a totally statistical manner.
                    Start typing below to search papers limited to 2019 onwards and up to September 2022.
                </p>
            </div>
        """)
        with gr.Row():
            inputs = gr.Textbox(placeholder="Input keywords separated by spaces.", show_label=False)
            inputs_arxiv = gr.Textbox(placeholder="Input arxiv number and press Enter to find similar papers.", show_label=False)

        outputs = gr.Dataframe(
            headers=['score', 'year', 'title', 'authors', 'PDF'],
            datatype=["number", "str", "markdown", "str", "markdown"],
            col_count=(5, "fixed"),
            wrap=True,
            label=f"top-{args.topk} results"
        )
        # Keyword search re-runs on every text change (search-as-you-type);
        # the arXiv lookup fires only on Enter (submit).
        inputs.change(_search, inputs, outputs)
        inputs_arxiv.submit(_search_arxiv, inputs_arxiv, outputs)

        demo.launch(
            #share=True, 
            debug=True
        )

# Script entry point: wrap the config dict for attribute-style access
# and start the web app (main blocks until the server stops).
if __name__ == '__main__':
    args = ObjectView(get_args())
    main(args)