import dataclasses
from multiprocessing import cpu_count

import pandas as pd
import requests
import streamlit as st
from datasets import Dataset, load_dataset
from paperswithcode import PapersWithCodeClient
from tqdm.auto import tqdm


@dataclasses.dataclass(frozen=True)
class PaperInfo:
    """Metadata for one paper on the Hugging Face daily papers page."""

    date: str
    arxiv_id: str
    github: str
    title: str
    paper_page: str
    upvotes: int
    num_comments: int
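
# Example instance (illustrative values only; real rows come from the
# hysts-bot-data datasets loaded below):
#
#   PaperInfo(
#       date="2024-01-01",
#       arxiv_id="2401.00001",
#       github="",
#       title="An Example Paper",
#       paper_page="https://huggingface.co/papers/2401.00001",
#       upvotes=0,
#       num_comments=0,
#   )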


def get_df() -> pd.DataFrame:
    """
    Load the initial dataset as a Pandas dataframe.
    """
    # Join the paper list with the per-paper stats on the arXiv id.
    df = pd.merge(
        left=load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas(),
        right=load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas(),
        on="arxiv_id",
    )
    # Reverse so that the most recent papers come first.
    df = df[::-1].reset_index(drop=True)

    paper_info = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        info = PaperInfo(
            **row,
            paper_page=f"https://huggingface.co/papers/{row.arxiv_id}",
        )
        paper_info.append(info)
    return pd.DataFrame([dataclasses.asdict(info) for info in paper_info])
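
# The resulting dataframe has one row per paper, with the PaperInfo fields as
# columns: date, arxiv_id, github, title, paper_page, upvotes, num_comments.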


def get_github_url(client: PapersWithCodeClient, paper_title: str) -> str:
    """
    Get the GitHub URL of the official repository for a paper, or an empty
    string if Papers with Code does not list one.
    """
    repo_url = ""
    try:
        # Search Papers with Code by title and take the top hit.
        results = client.paper_list(q=paper_title).results
        paper_id = results[0].id

        paper = client.paper_get(paper_id=paper_id)

        # Keep the URL of the official repository, if one is listed.
        repositories = client.paper_repository_list(paper_id=paper.id).results
        for repo in repositories:
            if repo.is_official:
                repo_url = repo.url
    except Exception:
        # No search results, or the API request failed: leave the URL empty.
        pass

    return repo_url
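
# Illustrative use (hedged: the result depends on the Papers with Code index
# at query time, and is "" when no official repository is listed):
#
#   client = PapersWithCodeClient()
#   url = get_github_url(client, "Attention Is All You Need")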


def add_metadata_batch(batch, client: PapersWithCodeClient):
    """
    Add metadata to a batch of papers.
    """
    # Look up the GitHub repository for each paper title in the batch.
    github_urls = []
    for paper_title in batch["title"]:
        github_url = get_github_url(client, paper_title)
        github_urls.append(github_url)

    batch["github"] = github_urls
    return batch


def add_hf_assets(batch):
    """
    Add the number of Hugging Face Spaces, models, and datasets linked to each
    paper in a batch.
    """
    num_spaces = []
    num_models = []
    num_datasets = []
    for arxiv_id in batch["arxiv_id"]:
        if arxiv_id != "":
            # The Hub API lists all repositories that reference a given arXiv id.
            response = requests.get(f"https://huggingface.co/api/arxiv/{arxiv_id}/repos")
            result = response.json()
            num_spaces_example = len(result["spaces"])
            num_models_example = len(result["models"])
            num_datasets_example = len(result["datasets"])
        else:
            num_spaces_example = 0
            num_models_example = 0
            num_datasets_example = 0

        num_spaces.append(num_spaces_example)
        num_models.append(num_models_example)
        num_datasets.append(num_datasets_example)

    batch["num_models"] = num_models
    batch["num_datasets"] = num_datasets
    batch["num_spaces"] = num_spaces
    return batch
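
# As used above, the JSON returned by
# https://huggingface.co/api/arxiv/{arxiv_id}/repos contains at least the
# keys "models", "datasets", and "spaces" (a sketch, not the full schema):
#
#   {"models": [...], "datasets": [...], "spaces": [...]}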


def check_hf_mention(batch):
    """
    Check whether each paper's GitHub README mentions Hugging Face.
    """
    hf_mentions = []
    for github_url in batch["github"]:
        hf_mention = 0
        if github_url != "":
            owner = github_url.split("/")[-2]
            repo = github_url.split("/")[-1]

            # Fetch the raw README, trying the "main" branch first and
            # falling back to "master".
            branch = "main"
            url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/README.md"
            response = requests.get(url)
            if response.status_code != 200:
                branch = "master"
                url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/README.md"
                response = requests.get(url)

            if response.status_code == 200:
                text = response.text
                if "huggingface" in text.lower() or "hugging face" in text.lower():
                    hf_mention = 1

        hf_mentions.append(hf_mention)

    batch["hf_mention"] = hf_mentions
    return batch


@st.cache_data
def get_data() -> pd.DataFrame:
    """
    Load the dataset and enrich it with metadata.
    """
    df = get_df()
    df["date"] = pd.to_datetime(df["date"])

    dataset = Dataset.from_pandas(df)

    # Enrich the dataset in parallel, batch by batch: official GitHub URLs,
    # Hugging Face asset counts, and README mentions of Hugging Face.
    dataset = dataset.map(
        add_metadata_batch,
        batched=True,
        batch_size=4,
        num_proc=cpu_count(),
        fn_kwargs={"client": PapersWithCodeClient()},
    )
    dataset = dataset.map(add_hf_assets, batched=True, batch_size=4, num_proc=cpu_count())
    dataset = dataset.map(check_hf_mention, batched=True, batch_size=4, num_proc=cpu_count())

    dataframe = dataset.to_pandas()
    # Make sure the date column is a datetime again after the Arrow round-trip.
    dataframe["date"] = pd.to_datetime(dataframe["date"])

    print("First few rows of the dataset:")
    print(dataframe.head())
    return dataframe
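
# Minimal usage sketch (assumes this module is the entry point of a Streamlit
# app, e.g. `streamlit run app.py`):
#
#   df = get_data()
#   st.dataframe(df)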