import dataclasses
from multiprocessing import cpu_count

import pandas as pd
import requests
import streamlit as st
from datasets import Dataset, load_dataset
from paperswithcode import PapersWithCodeClient
from tqdm.auto import tqdm


@dataclasses.dataclass(frozen=True)
class PaperInfo:
    date: str
    arxiv_id: str
    github: str
    title: str
    paper_page: str
    upvotes: int
    num_comments: int


def get_df() -> pd.DataFrame:
    """
    Load the daily papers dataset, join it with its per-paper stats, and
    return one row per paper with a link to its Hugging Face paper page.
    """
    df = pd.merge(
        left=load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas(),
        right=load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas(),
        on="arxiv_id",
    )
    # reverse the row order of the merged dataframe
    df = df[::-1].reset_index(drop=True)

    paper_info = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        info = PaperInfo(
            **row,
            paper_page=f"https://huggingface.co/papers/{row.arxiv_id}",
        )
        paper_info.append(info)
    return pd.DataFrame([dataclasses.asdict(info) for info in paper_info])
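
# Note: `PaperInfo(**row, paper_page=...)` above relies on the merged
# dataframe carrying exactly the remaining dataclass fields (date, arxiv_id,
# github, title, upvotes, num_comments); because the dataclass takes no extra
# arguments, any surplus or missing column would raise a TypeError here.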


def get_github_url(client: PapersWithCodeClient, paper_title: str) -> str:
    """
    Get the GitHub URL for a paper, or an empty string if none is found.
    """
    repo_url = ""
    try:
        # get the paper ID of the first search hit for the title
        results = client.paper_list(q=paper_title).results
        paper_id = results[0].id
        # get the paper
        paper = client.paper_get(paper_id=paper_id)
        # get its repositories and keep the official one, if any
        repositories = client.paper_repository_list(paper_id=paper.id).results
        for repo in repositories:
            if repo.is_official:
                repo_url = repo.url
    except Exception:
        # no search hit, no repositories, or an API error:
        # fall through and return the empty string
        pass
    return repo_url
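
# A minimal usage sketch (the returned URL depends on the live Papers with
# Code index, so treat it as illustrative rather than a guaranteed result):
#
#   client = PapersWithCodeClient()
#   url = get_github_url(client, "Attention Is All You Need")
#   # -> the official repository URL, or "" if none is marked official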


def add_metadata_batch(batch, client: PapersWithCodeClient):
    """
    Add metadata to a batch of papers.
    """
    # get GitHub URLs for all papers in the batch
    github_urls = []
    for paper_title in batch["title"]:
        github_url = get_github_url(client, paper_title)
        github_urls.append(github_url)

    # overwrite the GitHub links
    batch["github"] = github_urls
    return batch


def add_hf_assets(batch):
    """
    Add the number of Hugging Face models, datasets and Spaces to a batch of papers.
    """
    num_spaces = []
    num_models = []
    num_datasets = []
    for arxiv_id in batch["arxiv_id"]:
        if arxiv_id != "":
            # the Hub API lists all repositories that cite a given arXiv ID
            response = requests.get(f"https://huggingface.co/api/arxiv/{arxiv_id}/repos")
            result = response.json()
            num_spaces_example = len(result["spaces"])
            num_models_example = len(result["models"])
            num_datasets_example = len(result["datasets"])
        else:
            num_spaces_example = 0
            num_models_example = 0
            num_datasets_example = 0
        num_spaces.append(num_spaces_example)
        num_models.append(num_models_example)
        num_datasets.append(num_datasets_example)

    batch["num_models"] = num_models
    batch["num_datasets"] = num_datasets
    batch["num_spaces"] = num_spaces
    return batch
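
# The endpoint queried above returns a JSON object with "models", "datasets"
# and "spaces" keys, each a list of Hub repositories citing the arXiv ID.
# A hedged single-example sketch of the batch transform (the counts depend
# on the live Hub):
#
#   batch = add_hf_assets({"arxiv_id": ["2106.09685"]})
#   # batch now also holds "num_models", "num_datasets" and "num_spaces",
#   # each a list with one count per input arXiv ID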


@st.cache_data
def get_data() -> pd.DataFrame:
    """
    Load the dataset and enrich it with metadata.
    """
    # step 1. load as Pandas dataframe
    df = get_df()
    df['date'] = pd.to_datetime(df['date'])

    # step 2. enrich with GitHub links using the Papers with Code API
    dataset = Dataset.from_pandas(df)
    # TODO remove
    # dataset = dataset.select(range(10))
    dataset = dataset.map(
        add_metadata_batch,
        batched=True,
        batch_size=4,
        num_proc=cpu_count(),
        fn_kwargs={"client": PapersWithCodeClient()},
    )

    # step 3. enrich with asset counts using the Hugging Face Hub API
    dataset = dataset.map(add_hf_assets, batched=True, batch_size=4, num_proc=cpu_count())

    # return as Pandas dataframe
    dataframe = dataset.to_pandas()
    # convert date column to datetime
    dataframe['date'] = pd.to_datetime(dataframe['date'])
    print("First few rows of the dataset:")
    print(dataframe.head())
    return dataframe
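
# A minimal sketch of how `get_data` might be consumed by the surrounding
# Streamlit app (the widget choice below is an assumption, not part of this
# module):
#
#   df = get_data()
#   st.dataframe(df.sort_values("upvotes", ascending=False))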