nielsr (HF staff) committed
Commit 57c87c9
1 Parent(s): acd0e3c

First draft

Files changed (4)
  1. .gitignore +1 -0
  2. app.py +75 -0
  3. load_dataframe.py +144 -0
  4. requirements.txt +5 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ env/
app.py ADDED
@@ -0,0 +1,75 @@
+ from datetime import datetime
+
+ import streamlit as st
+ import pandas as pd
+
+ # from load_dataframe import get_data
+
+
+ # Main Streamlit app
+ def main():
+     st.title("Hugging Face Papers KPI Dashboard")
+
+     # TODO use this instead
+     # df = get_data()
+     df = pd.read_csv('/Users/nielsrogge/Downloads/daily_papers_enriched (1).csv')
+     df = df.drop(['Unnamed: 0'], axis=1)
+     # Use date as index
+     # Note that it's a string, not a datetime
+     df = df.set_index('date')
+     df.index = pd.to_datetime(df.index).strftime('%d-%m-%Y')
+     df = df.sort_index()
+
+     # Selectbox to view the data per day, week or month
+     view_level = st.selectbox(label="View data per day, week or month", options=["day", "week", "month"])
+
+     if view_level == "day":
+         # date picker to select the day, defaulting to today
+         day = st.date_input("Select day", value="today", format="DD/MM/YYYY")
+         # convert to a Pandas Timestamp
+         day = pd.Timestamp(day)
+
+         print("Day:", day)
+
+         df = df.loc[day.strftime('%d-%m-%Y'):day.strftime('%d-%m-%Y')]
+
+         st.write(f"Showing data for {day.strftime('%d/%m/%Y')}")
+
+         st.markdown(f"""
+         ## Number of papers: {df.shape[0]}
+         #### Number of papers with a Github link: {df['github'].notnull().sum()}
+         #### Number of papers with at least one HF artifact: {(df[['num_models', 'num_datasets', 'num_spaces']].sum(axis=1) > 0).sum()}
+         """)
+
+         st.dataframe(df,
+                      hide_index=True,
+                      column_order=("paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
+                      column_config={"github": st.column_config.LinkColumn(),
+                                     "paper_page": st.column_config.LinkColumn()},
+                      width=2000)
+
+     elif view_level == "week":
+         # date picker to select the week (any day within it), defaulting to today
+         week = st.sidebar.date_input("Select week", value=pd.Timestamp.today())
+
+         # keep rows whose ISO week matches the selected week
+         dates = pd.to_datetime(df.index, format='%d-%m-%Y')
+         df = df.loc[dates.isocalendar().week.values == week.isocalendar().week]
+
+         st.write(f"Showing data for week {week.isocalendar().week}")
+         st.dataframe(df)
+
+     elif view_level == "month":
+         # date picker to select the month (any day within it), defaulting to today
+         month = st.sidebar.date_input("Select month", value=pd.Timestamp.today())
+
+         # keep rows whose month name matches the selected month
+         dates = pd.to_datetime(df.index, format='%d-%m-%Y')
+         df = df.loc[dates.month_name() == month.strftime('%B')]
+
+         st.write(f"Showing data for {month.strftime('%B')}")
+         st.dataframe(df)
+
+
+ if __name__ == "__main__":
+     main()
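Note that app.py still reads a hardcoded local CSV. Per the TODO and the commented-out import, the intended wiring once load_dataframe.py (below) is hooked up would presumably be roughly:

    from load_dataframe import get_data

    df = get_data()

with the read_csv and drop lines removed, since get_data() already returns the enriched dataframe.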
load_dataframe.py ADDED
@@ -0,0 +1,144 @@
+ import dataclasses
+ from multiprocessing import cpu_count
+
+ import requests
+ import streamlit as st
+ import pandas as pd
+ from datasets import Dataset, load_dataset
+ from paperswithcode import PapersWithCodeClient
+ from tqdm.auto import tqdm
+
+
+ @dataclasses.dataclass(frozen=True)
+ class PaperInfo:
+     date: str
+     arxiv_id: str
+     github: str
+     title: str
+     paper_page: str
+     upvotes: int
+     num_comments: int
+
+
+ def get_df() -> pd.DataFrame:
+     df = pd.merge(
+         left=load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas(),
+         right=load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas(),
+         on="arxiv_id",
+     )
+     df = df[::-1].reset_index(drop=True)
+
+     paper_info = []
+     for _, row in tqdm(df.iterrows(), total=len(df)):
+         info = PaperInfo(
+             **row,
+             paper_page=f"https://huggingface.co/papers/{row.arxiv_id}",
+         )
+         paper_info.append(info)
+     return pd.DataFrame([dataclasses.asdict(info) for info in paper_info])
+
+
+ def get_github_url(client: PapersWithCodeClient, paper_title: str) -> str:
+     """
+     Get the Github URL for a paper.
+     """
+     repo_url = ""
+     try:
+         # get paper ID
+         results = client.paper_list(q=paper_title).results
+         paper_id = results[0].id
+
+         # get paper
+         paper = client.paper_get(paper_id=paper_id)
+
+         # get repositories
+         repositories = client.paper_repository_list(paper_id=paper.id).results
+
+         for repo in repositories:
+             if repo.is_official:
+                 repo_url = repo.url
+
+     except Exception:
+         # paper not found (or no repositories): leave the URL empty
+         pass
+
+     return repo_url
+
+
+ def add_metadata_batch(batch, client: PapersWithCodeClient):
+     """
+     Add metadata to a batch of papers.
+     """
+     # get Github URLs for all papers in the batch
+     github_urls = []
+     for paper_title in batch["title"]:
+         github_url = get_github_url(client, paper_title)
+         github_urls.append(github_url)
+
+     # overwrite the Github links
+     batch["github"] = github_urls
+
+     return batch
+
+
+ def add_hf_assets(batch):
+     """
+     Add Hugging Face assets to a batch of papers.
+     """
+     num_spaces = []
+     num_models = []
+     num_datasets = []
+     for arxiv_id in batch["arxiv_id"]:
+         if arxiv_id != "":
+             response = requests.get(f"https://huggingface.co/api/arxiv/{arxiv_id}/repos")
+             result = response.json()
+             num_spaces_example = len(result["spaces"])
+             num_models_example = len(result["models"])
+             num_datasets_example = len(result["datasets"])
+         else:
+             num_spaces_example = 0
+             num_models_example = 0
+             num_datasets_example = 0
+
+         num_spaces.append(num_spaces_example)
+         num_models.append(num_models_example)
+         num_datasets.append(num_datasets_example)
+
+     batch["num_models"] = num_models
+     batch["num_datasets"] = num_datasets
+     batch["num_spaces"] = num_spaces
+
+     return batch
+
+
+ @st.cache_data
+ def get_data() -> pd.DataFrame:
+     """
+     Load the dataset and enrich it with metadata.
+     """
+     # step 1. load as Pandas dataframe
+     df = get_df()
+     df['date'] = pd.to_datetime(df['date'])
+
+     # step 2. enrich using PapersWithCode API
+     dataset = Dataset.from_pandas(df)
+
+     # TODO remove
+     # dataset = dataset.select(range(10))
+
+     dataset = dataset.map(add_metadata_batch, batched=True, batch_size=4, num_proc=cpu_count(), fn_kwargs={"client": PapersWithCodeClient()})
+
+     # step 3. enrich using Hugging Face API
+     dataset = dataset.map(add_hf_assets, batched=True, batch_size=4, num_proc=cpu_count())
+
+     # return as Pandas dataframe
+     dataframe = dataset.to_pandas()
+
+     # convert date column to datetime
+     dataframe['date'] = pd.to_datetime(dataframe['date'])
+
+     print("First few rows of the dataset:")
+     print(dataframe.head())
+
+     return dataframe
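add_hf_assets relies on the Hub's arXiv endpoint returning a JSON object with "models", "datasets" and "spaces" lists, as the code above assumes. A minimal sketch of that call in isolation (the arXiv id here is just an example):

    import requests

    # count the Hub repos linked to one arXiv paper
    result = requests.get("https://huggingface.co/api/arxiv/2106.09685/repos").json()
    print(len(result["models"]), len(result["datasets"]), len(result["spaces"]))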
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ streamlit
+ plotly
+ tqdm
+ datasets
+ paperswithcode
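To try the draft locally, the usual Streamlit workflow should apply (assuming the hardcoded CSV path in app.py is first pointed at a file on your machine):

    pip install -r requirements.txt
    streamlit run app.py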