4kasha committed on
Commit
bcdda34
·
1 Parent(s): eba5fa8

initial commit

.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.dat filter=lfs diff=lfs merge=lfs -text
_arxiv.py ADDED
@@ -0,0 +1,42 @@
+ from __future__ import annotations
+
+ import logging
+ from typing import Optional
+ import string
+ import nltk
+ import arxiv
+
+ logger = logging.getLogger(__name__)
+
+ def extract_title_abst(arxiv_id: str):
+     try:
+         paper = next(arxiv.Search(id_list=[arxiv_id]).results())
+         doc = paper.title + ' ' + paper.summary
+     except Exception:
+         doc = None
+     return doc
+
+ def doc_to_ids(
+     doc: Optional[str],
+     word_to_id_: dict[str, int],
+     stemming: bool,
+     lower: bool = True,
+ ):
+     from nltk.stem.porter import PorterStemmer
+
+     if not doc:
+         y = []
+     else:
+         if lower:
+             doc = doc.lower()
+         doc = "".join([char for char in doc if char not in string.punctuation])
+         words = nltk.word_tokenize(doc)
+         if stemming:
+             porter = PorterStemmer()
+             words = [porter.stem(word) for word in words]
+
+         # Consider out-of-vocabulary cases, if y == []: no matched results
+         y = [word_to_id_[word] for word in words if word in word_to_id_]
+         # pick up keywords only once
+         #y = list(set(y))
+     return y
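Taken together, extract_title_abst and doc_to_ids turn an arXiv identifier into the vocabulary ids the search index expects. A minimal sketch of how they compose, assuming the data/vocab_2019.npy file from this commit is available locally (the arXiv id below is only an illustration):

import nltk
import numpy as np
from _arxiv import extract_title_abst, doc_to_ids

nltk.download('punkt')  # tokenizer model used by nltk.word_tokenize

doc = extract_title_abst("1706.03762")  # illustrative id; returns title + abstract, or None on failure
word_to_id_ = np.load("data/vocab_2019.npy", allow_pickle=True).item()
y = doc_to_ids(doc, word_to_id_, stemming=True)
print(y[:10])  # an empty list means nothing matched the vocabulary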
app.py ADDED
@@ -0,0 +1,119 @@
+ import logging
+ import re
+ import nltk
+ import pandas as pd
+ import numpy as np
+ import gradio as gr
+
+ fmt = "%(asctime)s %(levelname)s %(name)s :%(message)s"
+ logging.basicConfig(level=logging.WARNING, format=fmt)
+ logger = logging.getLogger()
+ logger.setLevel(logging.INFO)
+
+ from utils import load_matrix, query_to_ids, search
+ from _arxiv import extract_title_abst, doc_to_ids
+
+ nltk.download('punkt')
+
+ def get_args():
+     return {
+         'acl_data_file': 'data/acl-pub-info-2019-title-abst.parquet',
+         'docs_rep_file': 'data/Docs-rep-2019-h500.npy',
+         'r_matrix_file': 'data/Rmatrix-2019-h500.dat',
+         'vocab_file': 'data/vocab_2019.npy',
+         'topk': 20,
+         'metric': 'INNER_PRODUCT', # choices=['COSINE', 'INNER_PRODUCT']
+     }
+
+ class ObjectView(object):
+     def __init__(self, d): self.__dict__ = d
+
+
+ def _format(s: float, year: str, authors: str, title: str, url: str):
+     authors = ', '.join(authors.replace(',','').replace('\\', '').split(' and\n'))
+     authors = re.sub('[{}]', '', authors)
+     title = re.sub('[{}]', '', title)
+     title_with_url_markdown = f'[{title}]({url})'
+     url = url.rstrip('/')
+     pdf_url = f'[click]({url}.pdf)'
+     return [round(s,2), year, title_with_url_markdown, authors, pdf_url]
+
+ def main(args: ObjectView):
+     df = pd.read_parquet(args.acl_data_file)
+     #logger.info(f'document size: {len(df)}')
+     word_to_id_ = np.load(args.vocab_file, allow_pickle=True).item()
+     D, R = load_matrix(args.docs_rep_file, args.r_matrix_file, word_to_id_)
+
+     def _search(query: str):
+         results = []
+         y = query_to_ids(query, word_to_id_, stemming=True)
+         if y==[]:
+             return [[None,'N/A', 'N/A', 'N/A', 'N/A']]
+         else:
+             scores, docids = search(args, df, args.topk, y, R, D)
+             for s, year, authors, title, url in zip(scores[docids], df.iloc[docids]["year"], df.iloc[docids]["author"], df.iloc[docids]["title"], df.iloc[docids]["url"]):
+                 results.append(_format(s, year, authors, title, url))
+             return results
+
+     def _search_arxiv(arxiv_id: str):
+         results = []
+         doc = extract_title_abst(arxiv_id)
+         y = doc_to_ids(doc, word_to_id_, stemming=True)
+         if y==[]:
+             return [[None,'N/A', 'N/A', 'N/A', 'N/A']]
+         else:
+             scores, docids = search(args, df, args.topk, y, R, D)
+             for s, year, authors, title, url in zip(scores[docids], df.iloc[docids]["year"], df.iloc[docids]["author"], df.iloc[docids]["title"], df.iloc[docids]["url"]):
+                 results.append(_format(s, year, authors, title, url))
+             return results
+
+     with gr.Blocks() as demo:
+         gr.HTML(
+             """
+             <div style="text-align: center; max-width: 650px; margin: 0 auto;">
+               <div
+                 style="
+                   display: inline-flex;
+                   align-items: center;
+                   gap: 1rem;
+                   font-size: 1.75rem;
+                 "
+               >
+                 <svg width="68" height="46" xmlns="http://www.w3.org/2000/svg">
+                   <path
+                     d="M 41.977553,-2.8421709e-014 C 41.977553,1.76178 41.977553,1.44211 41.977553,3.0158 L 7.4869054,3.0158 L 0,3.0158 L 0,10.50079 L 0,38.47867 L 0,46 L 7.4869054,46 L 49.500802,46 L 56.987708,46 L 68,46 L 68,30.99368 L 56.987708,30.99368 L 56.987708,10.50079 L 56.987708,3.0158 C 56.987708,1.44211 56.987708,1.76178 56.987708,-2.8421709e-014 L 41.977553,-2.8421709e-014 z M 15.010155,17.98578 L 41.977553,17.98578 L 41.977553,30.99368 L 15.010155,30.99368 L 15.010155,17.98578 z "
+                     style="fill:#ed1c24;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:12.89541149;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
+                   />
+                 </svg>
+                 <h1 style="font-weight: 900; margin-bottom: 0">
+                   ACL2Vec
+                 </h1>
+               </div>
+               <p style="margin: 15px 0 5px; font-size: 100%; text-align: justify">
+                 This is a lightweight version of <a href=http://clml.ism.ac.jp/ACL2Vec/>ACL2Vec keyword search</a>, implemented in a totally statistical manner.
+                 Start typing below to search papers limited to 2019 onwards and up to September 2022.
+               </p>
+             </div>
+             """)
+         with gr.Row():
+             inputs = gr.Textbox(placeholder="Input keywords separated by spaces.", show_label=False)
+             inputs_arxiv = gr.Textbox(placeholder="Input arxiv number and press Enter to find similar papers.", show_label=False)
+
+         outputs = gr.Dataframe(
+             headers=['score', 'year', 'title', 'authors', 'PDF'],
+             datatype=["number", "str", "markdown", "str", "markdown"],
+             col_count=(5, "fixed"),
+             wrap=True,
+             label=f"top-{args.topk} results"
+         )
+         inputs.change(_search, inputs, outputs)
+         inputs_arxiv.submit(_search_arxiv, inputs_arxiv, outputs)
+
+     demo.launch(
+         #share=True,
+         debug=True
+     )
+
+ if __name__ == '__main__':
+     args = ObjectView(get_args())
+     main(args)
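For reference, the retrieval path that _search wires into Gradio can also be exercised directly. A minimal sketch, assuming the data files added in this commit are present under data/ (the query string is only an example):

from types import SimpleNamespace
import nltk
import numpy as np
import pandas as pd
from utils import load_matrix, query_to_ids, search

nltk.download('punkt')  # tokenizer model used by query_to_ids

args = SimpleNamespace(**{  # same settings as get_args() above
    'acl_data_file': 'data/acl-pub-info-2019-title-abst.parquet',
    'docs_rep_file': 'data/Docs-rep-2019-h500.npy',
    'r_matrix_file': 'data/Rmatrix-2019-h500.dat',
    'vocab_file': 'data/vocab_2019.npy',
    'topk': 20,
    'metric': 'INNER_PRODUCT',
})

df = pd.read_parquet(args.acl_data_file)
word_to_id_ = np.load(args.vocab_file, allow_pickle=True).item()
D, R = load_matrix(args.docs_rep_file, args.r_matrix_file, word_to_id_)

y = query_to_ids("machine translation", word_to_id_, stemming=True)
if y:
    scores, docids = search(args, df, args.topk, y, R, D)
    for rank, i in enumerate(docids, 1):
        print(rank, round(float(scores[i]), 2), df.iloc[i]["title"])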
data/Docs-rep-2019-h500.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ea0d111544401419d19ced34fa254e6f1f4a9a1f21056c45857a0290bc0735f3
+ size 39740128
data/Rmatrix-2019-h500.dat ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fbcf1fdb7633825254a2de82008be03371ccabc600704382334161baa6c0cfb0
+ size 5610000
data/acl-pub-info-2019-title-abst.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bc0e1f45980d44ebbfb265a4e56b4cd694d791ad695c8156f8ca09ca71d57146
+ size 12547039
data/vocab_2019.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2dcf906ee75dbc2c8f2c09a2de25c8b90b104e48cf7bd1581953210af6ebabd6
+ size 53098
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ arxiv==1.4.3
+ gradio==3.18.0
+ nltk==3.7
+ numpy==1.21.6
+ pandas==1.3.5
+ scikit-learn==1.0.2
utils.py ADDED
@@ -0,0 +1,73 @@
+ from __future__ import annotations
+
+ import logging
+ import argparse
+ import re
+ import string
+
+ import nltk
+ import pandas
+ import pandas as pd
+ import numpy as np
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ logger = logging.getLogger(__name__)
+
+ def load_matrix(
+     d_file: str,
+     r_file: str,
+     word_to_id_: dict[str, int]
+ ):
+     D = np.load(d_file)
+     R = np.memmap(r_file, dtype='float32', mode='r', shape=(D.shape[-1],len(word_to_id_)))
+     logger.info(f'D size: {D.shape}, R size: {R.shape}')
+     return D, R
+
+ def query_to_ids(
+     query: str,
+     word_to_id_: dict[str, int],
+     stemming: bool,
+     lower: bool = True,
+ ):
+     from nltk.stem.porter import PorterStemmer
+
+     if lower:
+         query = query.lower()
+     # TODO: weight "*" process
+     query = "".join([char for char in query if char not in string.punctuation])
+     words = nltk.word_tokenize(query)
+     if stemming:
+         porter = PorterStemmer()
+         words = [porter.stem(word) for word in words]
+
+     # Consider out-of-vocabulary cases, if y == []: no matched results
+     y = [word_to_id_[word] for word in words if word in word_to_id_]
+
+     return y
+
+ def query_to_vec(
+     R: np.ndarray,
+     y: list[int]
+ ):
+     qvec = np.zeros((R.shape[0], ))
+     for ind in y:
+         qvec += R[:,ind]
+     return qvec
+
+
+ def search(
+     args: argparse.Namespace,
+     df: pandas.DataFrame,
+     k: int,
+     y: list[int],
+     R: np.ndarray,
+     D: np.ndarray
+ ):
+     qvec = query_to_vec(R, y)
+     if args.metric=='COSINE':
+         scores = cosine_similarity([qvec], D)[0]
+     elif args.metric=='INNER_PRODUCT':
+         scores = D @ qvec
+     docids = np.argsort(scores)[::-1][:k]
+
+     return scores, docids
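The ranking in search is plain linear algebra: query_to_vec sums one column of R per query id into an h-dimensional query vector, and with metric='INNER_PRODUCT' documents are scored as D @ qvec and sorted. A tiny synthetic shape check (random matrices standing in for the real data):

import numpy as np
from utils import query_to_vec

rng = np.random.default_rng(0)
n_docs, h, vocab_size = 4, 8, 10
D = rng.standard_normal((n_docs, h))      # one h-dimensional row per document
R = rng.standard_normal((h, vocab_size))  # one column per vocabulary word

y = [2, 5]                                # ids of the (stemmed) query words
qvec = query_to_vec(R, y)                 # == R[:, 2] + R[:, 5], shape (h,)
scores = D @ qvec                         # inner-product scoring, shape (n_docs,)
print(np.argsort(scores)[::-1])           # documents ranked best first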