Update requirements
- app.py +37 -15
- load_dataframe.py +43 -0
- requirements.txt +5 -5
app.py
CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
 import matplotlib.pyplot as plt
 
 # from load_dataframe import get_data
+from urllib.parse import quote
 
 
 def aggregated_data(df, aggregation_level="week"):
@@ -22,6 +23,13 @@ def aggregated_data(df, aggregation_level="week"):
     # Calculate the percentage of papers with artifacts
     percentage_papers_with_artifacts = (weekly_papers_with_artifacts / weekly_total_papers) * 100
 
+    # Calculate the growth rate
+    growth_rate = percentage_papers_with_artifacts.pct_change() * 100
+
+    # Display the latest growth rate as a big number
+    latest_growth_rate = growth_rate.iloc[-1] if not growth_rate.empty else 0
+    st.metric(label=f"{aggregation_level.capitalize()}ly Growth Rate", value=f"{latest_growth_rate:.2f}%")
+
     # Create the plot
     plt.figure(figsize=(12, 6))
     plt.plot(percentage_papers_with_artifacts.index, percentage_papers_with_artifacts, marker='o', linestyle='-', color='b', label='Percentage of Papers with at least 1 Artifact')
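As a quick check on what the new metric shows: `pct_change()` computes the relative change between consecutive values of the percentage series, so `st.metric` displays growth of the percentage itself, not a percentage-point difference. A minimal sketch with invented numbers:

import pandas as pd

# Weekly percentages of papers with artifacts (values invented for illustration).
percentage = pd.Series(
    [40.0, 44.0, 52.8],
    index=pd.to_datetime(["2024-06-03", "2024-06-10", "2024-06-17"]),
)

# pct_change() yields (current - previous) / previous; the first entry is NaN.
growth_rate = percentage.pct_change() * 100
print(growth_rate.iloc[-1])  # 20.0, i.e. (52.8 - 44.0) / 44.0 * 100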
@@ -40,38 +48,49 @@ def aggregated_data(df, aggregation_level="week"):
 
 
 def display_data(df):
-
+    df['has_artifact'] = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)
+    num_artifacts = df['has_artifact'].sum()
+    percentage_of_at_least_one_artifact = num_artifacts / df.shape[0] if df.shape[0] > 0 else 0
+    percentage_of_at_least_one_artifact = round(percentage_of_at_least_one_artifact * 100, 2)
+
+    # add reached out column
+    df['reached_out'] = [False for _ in range(df.shape[0])]
 
     st.markdown(f"""
-    ##
-
-
+    ## {percentage_of_at_least_one_artifact}% papers with at least one 🤗 artifact
+
+    * Number of papers: {df.shape[0]}
+    * Number of papers with a Github link: {df['github'].notnull().sum()}
+    * Number of papers with at least one HF artifact: {num_artifacts}
     """)
 
     st.write("Papers with at least one artifact")
-
-    st.dataframe(df[df['has_artifact']],
+    st.data_editor(df[df['has_artifact']],
                  hide_index=True,
-                 column_order=("paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
+                 column_order=("reached_out", "paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
                  column_config={"github": st.column_config.LinkColumn(),
-                                "paper_page": st.column_config.LinkColumn()
-                 width=2000)
+                                "paper_page": st.column_config.LinkColumn(),
+                                "paper_page_with_title": st.column_config.LinkColumn(display_text=r'\|(.*)')},
+                 width=2000,
+                 key="papers_with_artifacts")
 
     st.write("Papers without artifacts")
-    st.dataframe(df[~df['has_artifact']],
+    st.data_editor(df[~df['has_artifact']],
                  hide_index=True,
-                 column_order=("paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
+                 column_order=("reached_out", "paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
                  column_config={"github": st.column_config.LinkColumn(),
                                 "paper_page": st.column_config.LinkColumn()},
-                 width=2000)
+                 width=2000,
+                 key="papers_without_artifacts")
 
     st.write("Papers with a HF mention in README but no artifacts")
-    st.dataframe(df[(df['hf_mention'] == 1) & (~df['has_artifact'])],
+    st.data_editor(df[(df['hf_mention'] == 1) & (~df['has_artifact'])],
                  hide_index=True,
-                 column_order=("paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
+                 column_order=("reached_out", "paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
                  column_config={"github": st.column_config.LinkColumn(),
                                 "paper_page": st.column_config.LinkColumn()},
-                 width=2000)
+                 width=2000,
+                 key="papers_with_hf_mention_no_artifacts")
 
 
 def main():
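The switch from `st.dataframe` to `st.data_editor` is what makes the new `reached_out` column tickable per paper. A standalone sketch with invented data; the commit relies on the boolean dtype rendering as a checkbox, while an explicit `CheckboxColumn` (shown here) also lets you set a header label:

import pandas as pd
import streamlit as st

df = pd.DataFrame({"title": ["Paper A", "Paper B"], "reached_out": [False, False]})

edited = st.data_editor(
    df,
    hide_index=True,
    column_config={"reached_out": st.column_config.CheckboxColumn("Reached out?")},
    key="example_editor",
)
# st.data_editor returns the edited copy; the commit ignores the return value,
# so ticked boxes are not persisted anywhere yet.

The distinct `key` arguments in the diff matter: Streamlit raises a DuplicateWidgetID error when several widgets of the same type are otherwise indistinguishable on one page.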
@@ -90,6 +109,9 @@ def main():
     df.index = pd.to_datetime(df.index)
     df = df.sort_index()
 
+    # hack: include title in URL column
+    df['updated_url'] = df.apply(lambda row: f'{row["paper_page"]}/title/{quote(row["title"])}', axis=1)
+
     if selection == "Daily/weekly/monthly data":
         # Button to select day, month or week
         # Add streamlit selectbox.
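The `updated_url` hack appends the percent-escaped title to the paper URL so a link column can carry the title along. A small sketch of what `quote` produces (URL and title invented):

from urllib.parse import quote

paper_page = "https://huggingface.co/papers/2406.00001"  # invented example
title = "Scaling Laws & You: A Survey"

updated_url = f"{paper_page}/title/{quote(title)}"
print(updated_url)
# https://huggingface.co/papers/2406.00001/title/Scaling%20Laws%20%26%20You%3A%20A%20Survey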
load_dataframe.py
CHANGED
@@ -21,6 +21,10 @@ class PaperInfo:
 
 
 def get_df() -> pd.DataFrame:
+    """
+    Load the initial dataset as a Pandas dataframe.
+    """
+
     df = pd.merge(
         left=load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas(),
         right=load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas(),
@@ -112,6 +116,42 @@ def add_hf_assets(batch):
     return batch
 
 
+def check_hf_mention(batch):
+    """
+    Check if a paper mentions Hugging Face in the README.
+    """
+
+    hf_mentions = []
+    for github_url in batch["github"]:
+        hf_mention = 0
+        if github_url != "":
+            # get the raw README text from GitHub
+            owner = github_url.split("/")[-2]
+            repo = github_url.split("/")[-1]
+            branch = "main"
+            url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/README.md"
+            response = requests.get(url)
+
+            if response.status_code != 200:
+                # try master branch as second attempt
+                branch = "master"
+                url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/README.md"
+                response = requests.get(url)
+
+            if response.status_code == 200:
+                # get text
+                text = response.text
+                if "huggingface" in text.lower() or "hugging face" in text.lower():
+                    hf_mention = 1
+
+        hf_mentions.append(hf_mention)
+
+    # add the result as a new column
+    batch["hf_mention"] = hf_mentions
+
+    return batch
+
+
 @st.cache_data
 def get_data() -> pd.DataFrame:
     """
@@ -132,6 +172,9 @@ def get_data() -> pd.DataFrame:
     # step 3. enrich using Hugging Face API
    dataset = dataset.map(add_hf_assets, batched=True, batch_size=4, num_proc=cpu_count())
 
+    # step 4. check if Hugging Face is mentioned in the README
+    dataset = dataset.map(check_hf_mention, batched=True, batch_size=4, num_proc=cpu_count())
+
     # return as Pandas dataframe
     dataframe = dataset.to_pandas()
 
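`check_hf_mention` fetches raw README files rather than calling the GitHub REST API, so it needs no authentication, but it only finds files named exactly README.md on the main or master branch, and it assumes `requests` is imported at the top of load_dataframe.py (not visible in this hunk). A condensed standalone sketch of the same logic, with a request timeout added (the committed version sets none):

import requests

def readme_mentions_hf(owner: str, repo: str) -> bool:
    # Try the two most common default branches for a raw README.
    for branch in ("main", "master"):
        url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/README.md"
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            text = response.text.lower()
            return "huggingface" in text or "hugging face" in text
    return False

print(readme_mentions_hf("huggingface", "transformers"))  # expected: True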
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
-streamlit
-matplotlib
-tqdm
-datasets
-paperswithcode
+streamlit==1.36.0
+matplotlib==3.7.0
+tqdm==4.66.4
+datasets==2.20.0
+paperswithcode-client==0.3.1
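Pinning exact versions keeps the Space build reproducible, and the last line fixes the dependency name: the PyPI distribution that provides the `paperswithcode` client is `paperswithcode-client`. matplotlib, which app.py already imports, is now declared explicitly. A quick sanity check that the pins resolved (assuming the environment was built from this file):

import streamlit, matplotlib, datasets

# Should print the pinned versions: 1.36.0 3.7.0 2.20.0
print(streamlit.__version__, matplotlib.__version__, datasets.__version__)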