File size: 5,919 Bytes
ea4d0fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16fdb1b
ea4d0fa
1f24a5f
 
c38b160
 
ea4d0fa
 
466c476
 
ea4d0fa
f5ede9e
 
ea4d0fa
 
 
 
 
 
 
 
 
 
 
 
d106587
 
 
8472fb4
3cd20df
 
d106587
 
 
 
 
 
5966956
d106587
2f27244
 
 
 
 
6c04839
ea4d0fa
 
3f2f31e
 
320d802
 
ea4d0fa
708bff1
8472fb4
 
619873d
 
8472fb4
767009a
5f0076c
708bff1
1fee4b2
8472fb4
 
 
 
1fee4b2
6f5ef35
b5793e9
 
b04cbde
3f2f31e
16fdb1b
 
f558c28
cf8d7e8
b04cbde
3cd20df
 
 
 
0d8ea05
cf8d7e8
ea4d0fa
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import httpx
from cytoolz import groupby
from functools import lru_cache
from rich import print
from functools import partial
import gradio as gr
from typing import Optional


def query_author(author_name: str):
    """Search the Semantic Scholar Graph API for authors matching a name.

    The name is sent through ``params`` rather than interpolated into the URL,
    so httpx percent-encodes it. Names containing spaces, ``&``, ``#``, ``+``
    or non-ASCII characters therefore cannot corrupt the query string.

    Args:
        author_name: Free-text author name to search for.

    Returns:
        The value stored under the ``"data"`` key of the JSON response.

    Raises:
        httpx.HTTPStatusError: If the API answers with a 4xx/5xx status.
    """
    url = "https://api.semanticscholar.org/graph/v1/author/search"
    params = {
        "query": author_name,
        "fields": (
            "name,url,externalIds,papers.externalIds,papers.title,papers.year"
        ),
    }
    resp = httpx.get(url, params=params)
    resp.raise_for_status()
    return resp.json()["data"]


def get_arxiv_paper(papers):
    """Return the papers that have a truthy ``externalIds["ArXiv"]`` value.

    Papers without ``externalIds`` (missing, ``None`` or empty) are skipped.
    The input order is preserved.
    """
    arxiv_papers = []
    for candidate in papers:
        external_ids = candidate.get("externalIds")
        if external_ids and external_ids.get("ArXiv"):
            arxiv_papers.append(candidate)
    return arxiv_papers


def check_arxiv_in_papers(arxiv_ids, papers):
    """Tell whether any paper's arXiv identifier is contained in ``arxiv_ids``.

    Papers lacking ``externalIds`` or a truthy ``"ArXiv"`` entry are ignored.
    Returns ``True`` on the first match, ``False`` when nothing matches.
    """
    for candidate in papers:
        external_ids = candidate.get("externalIds")
        if not external_ids:
            continue
        arxiv_id = external_ids.get("ArXiv")
        if arxiv_id and arxiv_id in arxiv_ids:
            return True
    return False


def get_author_from_options(potential_authors, positive_arxiv_ids):
    """Pick the first candidate author who wrote one of the given arXiv papers.

    Args:
        potential_authors: Iterable of author dicts, each optionally holding a
            ``"papers"`` list.
        positive_arxiv_ids: Iterable of arXiv IDs known to belong to the
            author being looked for.

    Returns:
        The first author dict with a paper whose arXiv ID is in
        ``positive_arxiv_ids``, or ``None`` when no candidate matches.
    """
    # Build the lookup set once instead of once per candidate author.
    known_ids = set(positive_arxiv_ids)
    return next(
        (
            author
            for author in potential_authors
            # An author record without papers (missing or null) cannot match.
            if check_arxiv_in_papers(known_ids, author.get("papers") or [])
        ),
        None,
    )


def sort_by_date(papers):
    """Return a new list of papers ordered from newest to oldest ``year``.

    A paper whose ``year`` is missing or ``None`` is treated as year 0, so it
    sorts after every dated paper instead of raising ``KeyError`` or
    ``TypeError`` during comparison. The sort is stable, so papers with equal
    years keep their relative input order.
    """
    return sorted(papers, key=lambda paper: paper.get("year") or 0, reverse=True)


@lru_cache()
def lookup_hf_paper(arxiv_id):
    """Fetch the Hugging Face papers API record for ``arxiv_id`` as parsed JSON.

    The HTTP status is not checked, so whatever JSON body the endpoint sends
    back is returned as-is. Results are memoised per ``arxiv_id`` by
    ``lru_cache``.
    """
    endpoint = f"https://huggingface.co/api/papers/{arxiv_id}"
    return httpx.get(endpoint).json()


def check_if_index_hf_paper(paper):
    """Return ``True`` when the Hugging Face lookup for this paper has no error.

    The paper must carry ``externalIds["ArXiv"]``; that ID is passed to
    ``lookup_hf_paper`` and a truthy ``"error"`` field in the reply means the
    paper is treated as not indexed.
    """
    hf_record = lookup_hf_paper(paper["externalIds"]["ArXiv"])
    if hf_record.get("error"):
        return False
    return True


def groupby_indexed_by_hf_papers(papers):
    """Partition papers by the result of ``check_if_index_hf_paper``.

    Returns a dict keyed by ``True`` / ``False``; a key is only present when
    at least one paper fell into that group.
    """
    by_indexed_flag = groupby(check_if_index_hf_paper, papers)
    return by_indexed_flag


def check_hf_user_in_authors(paper, hf_user_name):
    """Tell whether ``hf_user_name`` is linked to any author of ``paper``.

    Only authors with a truthy ``"user"`` entry are considered; the match is
    made against that entry's nested ``"user"`` value.
    """
    for author in paper["authors"]:
        linked_account = author.get("user")
        if linked_account and linked_account["user"] == hf_user_name:
            return True
    return False


def groupby_hf_user_papers(papers, hf_user_name):
    """Partition papers by whether ``hf_user_name`` appears among their authors.

    Returns a dict keyed by ``True`` (user is a linked author) and ``False``
    (user is not); a key is only present when its group is non-empty.
    """

    def is_claimed_by_user(paper):
        return check_hf_user_in_authors(paper, hf_user_name=hf_user_name)

    return groupby(is_claimed_by_user, papers)


def get_papers(
    author_name: str, positive_arxiv_ids: str, hf_user_name: Optional[gr.OAuthProfile]
):
    """Build a Markdown report of arXiv papers the logged-in user can claim.

    Candidate authors are looked up by name, and the one who wrote at least
    one of the supplied arXiv IDs is selected. That author's arXiv papers are
    then split into those already indexed by Hugging Face but not linked to
    the user, and those not indexed yet.

    Args:
        author_name: Name the user publishes under.
        positive_arxiv_ids: Comma-separated arXiv IDs of papers the user
            authored, used to pick the right author among the candidates.
        hf_user_name: OAuth profile of the logged-in user, or a falsy value
            when nobody is logged in.

    Returns:
        A Markdown string listing the claimable papers.

    Raises:
        gr.Error: If the user is not logged in, no usable arXiv ID was given,
            no author matches the name, or no matching author wrote any of
            the given papers.
    """
    if not hf_user_name:
        raise gr.Error("You must be logged in to use this Space")
    if not positive_arxiv_ids:
        raise gr.Error("You must enter at least one ArXiv ID")
    username = hf_user_name.preferred_username

    # Strip whitespace and drop empty fragments so input such as "," or a
    # trailing comma does not produce blank IDs.
    arxiv_ids = [
        fragment.strip()
        for fragment in positive_arxiv_ids.split(",")
        if fragment.strip()
    ]
    if not arxiv_ids:
        raise gr.Error("You must enter at least one ArXiv ID")

    potential_authors = query_author(author_name)
    if not potential_authors:
        raise gr.Error("No authors found with that name")

    author = get_author_from_options(potential_authors, arxiv_ids)
    if author is None:
        # Without this guard the lookup below fails with an opaque TypeError.
        raise gr.Error(
            "No author with that name was found for the ArXiv ID(s) you entered"
        )

    papers = sort_by_date(get_arxiv_paper(author["papers"]))
    papers_indexed_by_hf = groupby_indexed_by_hf_papers(papers)

    # The True group is absent when none of the papers is indexed yet.
    indexed_papers = [
        lookup_hf_paper(paper["externalIds"]["ArXiv"])
        for paper in papers_indexed_by_hf.get(True, [])
    ]
    already_claimed = groupby_hf_user_papers(indexed_papers, username)

    sections = []
    if already_claimed.get(False):
        sections.append(
            "# Papers already indexed by Hugging Face which you haven't claimed\n"
            "These papers are already indexed by Hugging Face, but you haven't"
            " claimed them yet. You can claim them by clicking on the link to the"
            " paper and then clicking on your name in the author list.\n"
        )
        for paper in already_claimed[False]:
            url = f"https://huggingface.co/papers/{paper['id']}"
            sections.append(f"- [{paper['title']}]({url})\n")
    else:
        sections.append("You have claimed all papers indexed by Hugging Face!\n")

    if papers_indexed_by_hf.get(False):
        sections.append(
            "# Papers not yet indexed by Hugging Face which you can claim\n"
        )
        for paper in papers_indexed_by_hf[False]:
            paper_title = paper["title"]
            arxiv_id = paper["externalIds"]["ArXiv"]
            url = f"https://huggingface.co/papers/{arxiv_id}"
            sections.append(f"- [{paper_title}]({url})\n")
    return "".join(sections)


def get_name(hf_user_name: Optional[gr.OAuthProfile] = None):
    """Return the profile's ``name``, or an empty string when no profile is given."""
    if not hf_user_name:
        return ""
    return hf_user_name.name


# UI definition: header, explanatory text, login/logout controls, the two
# input fields, and the button that renders the report as Markdown.
with gr.Blocks() as demo:
    gr.HTML(
        "<h1 style='text-align:center;'> &#x1F4C3; Hugging Face Paper Claimer &#x1F4C3;"
        " </h1>"
    )
    gr.HTML(
        """<div style='text-align:center;'>You can use this Space to help you find arXiv papers you can still claim.
        You need to be logged in to use this Space. 
        Once you login your name will be prepopulated but you can change this if the name you publish under is different.</div>"""
    )
    gr.Markdown(
        "**NOTE** This Space uses the [Semantic Scholar"
        " API](https://www.semanticscholar.org/product/api) to find papers you have"
        " authored. Occasionally this API returns false positives i.e. papers which you"
        " did not author"
    )
    with gr.Row():
        gr.LoginButton(size="sm")
        gr.LogoutButton(size="sm")
    # The default value is computed by get_name, which returns the profile
    # name or an empty string.
    author_name = gr.Textbox(
        value=get_name,
        label="The name you publish under",
        interactive=True,
    )
    positive_arxiv_ids = gr.Textbox(
        placeholder="1910.01108",
        label=(
            "ArXiv ID for a paper for which you are an author, separate multiple IDs"
            " with commas"
        ),
        interactive=True,
    )
    btn = gr.Button("Get papers")
    # Only the two textboxes are listed as inputs; get_papers' third parameter
    # is annotated Optional[gr.OAuthProfile] and is not wired here --
    # presumably Gradio supplies it from the login session (TODO confirm).
    btn.click(get_papers, [author_name, positive_arxiv_ids], gr.Markdown())

demo.launch(debug=True)