from functools import lru_cache, partial
from typing import Optional

import gradio as gr
import httpx
from cytoolz import groupby
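
# This Space cross-references an author's Semantic Scholar profile with the
# Hugging Face papers index (huggingface.co/papers) and lists the arXiv papers
# the logged-in user can still claim.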


def query_author(author_name: str):
    url = f"https://api.semanticscholar.org/graph/v1/author/search?query={author_name}&fields=name,url,externalIds,papers.externalIds,papers.title,papers.year"
    resp = httpx.get(url)
    resp.raise_for_status()
    return resp.json()["data"]
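
# A minimal sketch of the payload query_author() returns, inferred from the
# fields requested above (an assumed shape, not verified against the live API):
#
#     [
#         {
#             "authorId": "...",
#             "name": "...",
#             "url": "...",
#             "externalIds": {...},
#             "papers": [
#                 {"title": "...", "year": 2019, "externalIds": {"ArXiv": "1910.01108"}},
#             ],
#         },
#     ]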


def get_arxiv_paper(papers):
    papers_with_externalIds = [paper for paper in papers if paper.get("externalIds")]
    return [
        paper for paper in papers_with_externalIds if paper["externalIds"].get("ArXiv")
    ]


def check_arxiv_in_papers(arxiv_ids, papers):
    papers_with_externalIds = [paper for paper in papers if paper.get("externalIds")]
    papers_with_arxiv_ids = [
        paper for paper in papers_with_externalIds if paper["externalIds"].get("ArXiv")
    ]
    return any(
        paper["externalIds"]["ArXiv"] in arxiv_ids for paper in papers_with_arxiv_ids
    )


def get_author_from_options(potential_authors, positive_arxiv_ids):
    return next(
        (
            author
            for author in potential_authors
            if check_arxiv_in_papers(set(positive_arxiv_ids), author["papers"])
        ),
        None,
    )
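
# Author names are ambiguous, so the user-supplied "positive" arXiv IDs act as
# a fingerprint: the first Semantic Scholar candidate whose paper list contains
# one of those IDs is taken to be the logged-in user.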


def sort_by_date(papers):
    # Semantic Scholar can return year=None; treat those papers as oldest so
    # sorted() doesn't raise a TypeError comparing None with an int.
    return sorted(papers, key=lambda paper: paper["year"] or 0, reverse=True)


@lru_cache()
def lookup_hf_paper(arxiv_id):
    url = f"https://huggingface.co/api/papers/{arxiv_id}"
    resp = httpx.get(url)
    return resp.json()
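
# huggingface.co/api/papers/<arxiv_id> returns the paper's metadata (including
# its author list) when the paper is indexed; for unindexed papers the JSON
# body carries an "error" key instead, which is what the check below relies on.
# Note that @lru_cache memoises these error responses for the session as well.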


def check_if_indexed_hf_paper(paper):
    arxiv_id = paper["externalIds"]["ArXiv"]
    data = lookup_hf_paper(arxiv_id)
    return not data.get("error")


def groupby_indexed_by_hf_papers(papers):
    return groupby(check_if_indexed_hf_paper, papers)


def check_hf_user_in_authors(paper, hf_user_name):
    authors = paper["authors"]
    authors = [author for author in authors if author.get("user")]
    return any(author["user"]["user"] == hf_user_name for author in authors)
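
# In the hf.co/papers metadata, an author who has linked a Hugging Face account
# appears to carry a nested "user" object whose inner "user" field holds the
# account name (a shape inferred from the lookup above, not from API docs), e.g.
#     {"name": "Jane Doe", "user": {"user": "janedoe"}}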


def groupby_hf_user_papers(papers, hf_user_name):
    check_hf_user_in_authors_partial = partial(
        check_hf_user_in_authors, hf_user_name=hf_user_name
    )
    return groupby(check_hf_user_in_authors_partial, papers)
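
# cytoolz.groupby(key, seq) buckets seq by the key function's return value, so
# with a boolean predicate both groupers above yield {True: [...], False: [...]}
# partitions, e.g. groupby(lambda n: n % 2 == 0, [1, 2, 3]) == {False: [1, 3], True: [2]}.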


def get_papers(
    author_name: str, positive_arxiv_ids: str, hf_user_name: Optional[gr.OAuthProfile]
):
    if not hf_user_name:
        raise gr.Error("You must be logged in to use this Space")
    if not positive_arxiv_ids:
        raise gr.Error("You must enter at least one ArXiv ID")
    hf_user_name = hf_user_name.preferred_username
    positive_arxiv_ids = positive_arxiv_ids.split(",")
    # strip whitespace around each ID
    positive_arxiv_ids = [arxiv_id.strip() for arxiv_id in positive_arxiv_ids]
    potential_authors = query_author(author_name)
    if not potential_authors:
        raise gr.Error("No authors found with that name")
    author = get_author_from_options(potential_authors, positive_arxiv_ids)
    if author is None:
        raise gr.Error("None of the matching authors list the ArXiv IDs you entered")
    papers = get_arxiv_paper(author["papers"])
    papers = sort_by_date(papers)
    papers_indexed_by_hf = groupby_indexed_by_hf_papers(papers)
    indexed_papers = [
        lookup_hf_paper(paper["externalIds"]["ArXiv"])
        for paper in papers_indexed_by_hf.get(True, [])
    ]
    # True -> the logged-in user already appears as a claimed author;
    # False -> indexed on hf.co/papers but not yet claimed by this user.
    already_claimed = groupby_hf_user_papers(indexed_papers, hf_user_name)
    if already_claimed.get(False):
        results = (
            "# Papers already indexed by Hugging Face which you haven't claimed\n"
            + "These papers are already indexed by Hugging Face, but you haven't"
            " claimed them yet. You can claim them by clicking on the link to the"
            " paper and then clicking on your name in the author list.\n"
        )
        for paper in already_claimed[False]:
            url = f"https://huggingface.co/papers/{paper['id']}"
            results += f"- [{paper['title']}]({url})\n"
    else:
        results = "You have claimed all papers indexed by Hugging Face!\n"
    if papers_indexed_by_hf.get(False):
        results += "# Papers not yet indexed by Hugging Face which you can claim\n"
        for paper in papers_indexed_by_hf[False]:
            paper_title = paper["title"]
            arxiv_id = paper["externalIds"]["ArXiv"]
            url = f"https://huggingface.co/papers/{arxiv_id}"
            results += f"- [{paper_title}]({url})\n"
    return results
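
# get_papers() returns a Markdown string; a hypothetical rendering might look
# like this (titles and IDs are illustrative, not real output):
#
#     # Papers already indexed by Hugging Face which you haven't claimed
#     These papers are already indexed by Hugging Face, but you haven't claimed them yet. ...
#     - [Some Paper Title](https://huggingface.co/papers/1910.01108)
#     # Papers not yet indexed by Hugging Face which you can claim
#     - [Another Paper Title](https://huggingface.co/papers/2101.00001)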


def get_name(hf_user_name: Optional[gr.OAuthProfile] = None):
    return hf_user_name.name if hf_user_name else ""


with gr.Blocks() as demo:
    gr.HTML(
        "<h1 style='text-align:center;'> 📃 Hugging Face Paper Claimer 📃"
        " </h1>"
    )
    gr.HTML(
        """<div style='text-align:center;'>You can use this Space to help you find arXiv papers you can still claim.
        You need to be logged in to use this Space.
        Once you log in, your name will be prepopulated, but you can change it if the name you publish under is different.</div>"""
    )
    gr.Markdown(
        "**NOTE** This Space uses the [Semantic Scholar"
        " API](https://www.semanticscholar.org/product/api) to find papers you have"
        " authored. Occasionally this API returns false positives, i.e. papers"
        " which you did not author."
    )
    with gr.Row():
        gr.LoginButton(size="sm")
        gr.LogoutButton(size="sm")
    author_name = gr.Textbox(
        value=get_name,
        label="The name you publish under",
        interactive=True,
    )
    positive_arxiv_ids = gr.Textbox(
        placeholder="1910.01108",
        label=(
            "ArXiv ID for a paper for which you are an author, separate multiple IDs"
            " with commas"
        ),
        interactive=True,
    )
    btn = gr.Button("Get papers")
    # Gradio injects the gr.OAuthProfile argument of get_papers automatically,
    # so only the two textboxes are wired up as inputs here.
    results = gr.Markdown()
    btn.click(get_papers, [author_name, positive_arxiv_ids], results)

demo.launch(debug=True)
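
# Note: Hugging Face OAuth (LoginButton / OAuthProfile) is only fully wired up
# when this runs as a Space; launched locally with `python app.py`, the login
# flow may be mocked or unavailable depending on the Gradio version, in which
# case get_papers stops at the "must be logged in" check above.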