File size: 7,524 Bytes
57c87c9 404478b 57c87c9 1396667 57c87c9 404478b 4dd059d 404478b 3170ddb 1396667 9325c4d 1396667 3170ddb 404478b 99a2513 2adbdb9 99a2513 2adbdb9 99a2513 3170ddb 404478b 3170ddb 99a2513 3170ddb 404478b b58eec2 99a2513 2adbdb9 b58eec2 99a2513 4dd059d 99a2513 404478b 57c87c9 b58eec2 57c87c9 570845b 99a2513 404478b 570845b 99a2513 2adbdb9 570845b b58eec2 99a2513 570845b 99a2513 570845b 404478b 99a2513 404478b 2adbdb9 570845b 404478b 570845b 99a2513 570845b 99a2513 570845b 404478b 99a2513 404478b 2adbdb9 570845b b58eec2 570845b 99a2513 570845b 2adbdb9 404478b 570845b 57c87c9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 |
from datetime import datetime
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
from load_dataframe import get_data
def aggregated_data(df, aggregation_level="week"):
    """Plot the share of papers with artifacts over time and show its average growth.

    Args:
        df: Papers dataframe. It is resampled over time, so it needs a
            datetime-like index, and it must provide the ``num_models``,
            ``num_datasets`` and ``num_spaces`` columns. It is not modified.
        aggregation_level: ``"week"`` resamples weekly (``'W'``); any other
            value resamples by month end (``'ME'``).
    """
    st.write(f"Aggregated data by {aggregation_level}")

    # A paper has an artifact when it links at least one model, dataset or
    # Space. Kept as a local Series so the caller's dataframe is left untouched.
    has_artifact = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)

    freq = 'W' if aggregation_level == "week" else 'ME'
    per_period = has_artifact.resample(freq)
    total_papers = per_period.size()
    papers_with_artifacts = per_period.sum()

    # Periods without any paper yield NaN here (0 / 0).
    percentage_papers_with_artifacts = (papers_with_artifacts / total_papers) * 100

    # Period-over-period growth; infinite values (growth from a 0% period)
    # and NaNs are excluded from the average.
    growth_rate = percentage_papers_with_artifacts.pct_change() * 100
    growth_rate = growth_rate.replace([float('inf'), float('-inf')], float('nan')).dropna()

    # With fewer than two usable periods there is no growth to average.
    average_growth_rate = growth_rate.mean()
    growth_text = "N/A" if pd.isna(average_growth_rate) else f"{average_growth_rate:.2f}%"
    st.metric(label=f"{aggregation_level.capitalize()}ly Average Growth Rate", value=growth_text)

    # Draw on an explicit figure instead of pyplot's global state, and close it
    # afterwards so figures do not pile up across Streamlit reruns.
    fig, ax = plt.subplots(figsize=(12, 6))
    try:
        ax.plot(
            percentage_papers_with_artifacts.index,
            percentage_papers_with_artifacts,
            marker='o',
            linestyle='-',
            color='b',
            label='Percentage of Papers with at least 1 Artifact',
        )
        ax.set_ylim(0, 100)
        ax.set_xlabel(aggregation_level)
        ax.set_ylabel('Percentage')
        ax.set_title('Percentage of Papers with Artifacts (Models, Datasets, Spaces) Over Time')
        ax.legend()
        ax.grid(True)
        st.pyplot(fig)
    finally:
        plt.close(fig)
def show_data_editor(filtered_df: pd.DataFrame, key: str):
    """Show ``filtered_df`` in an editable table and copy edits into the session dataframe.

    Args:
        filtered_df: Rows to display in the editor.
        key: Streamlit widget key for this editor instance.
    """
    visible_columns = (
        "reached_out",
        "reached_out_link",
        "paper_page",
        "title",
        "github",
        "num_models",
        "num_datasets",
        "num_spaces",
    )
    link_columns = {
        "github": st.column_config.LinkColumn(),
        "paper_page": st.column_config.LinkColumn(),
        "paper_page_with_title": st.column_config.LinkColumn(display_text=r'\|(.*)'),
    }

    edited_df = st.data_editor(
        filtered_df,
        hide_index=True,
        column_order=visible_columns,
        column_config=link_columns,
        width=2000,
        key=key,
    )

    # Nothing to write back when no frame was returned or no cell was changed.
    if edited_df is None or edited_df.equals(filtered_df):
        return

    # Merge the edited rows into the dataframe stored in session state.
    # TODO there seems to be a bug in here
    # NOTE(review): DataFrame.update aligns rows on index labels; if the session
    # dataframe's index is not unique this may be the cause - confirm.
    session_df = st.session_state.df
    session_df.update(edited_df)
    st.session_state.df = session_df
def display_data(filtered_df: pd.DataFrame):
    """Render summary statistics and three editable tables for ``filtered_df``.

    Args:
        filtered_df: Papers to summarise. The ``has_artifact``, ``github`` and
            ``hf_mention`` columns are read here.
    """
    paper_count = filtered_df.shape[0]
    num_artifacts = filtered_df['has_artifact'].sum()

    # Guard against an empty selection before dividing.
    if paper_count > 0:
        artifact_share = num_artifacts / paper_count
    else:
        artifact_share = 0
    artifact_percentage = round(artifact_share * 100, 2)

    # Rows whose github field is anything other than the empty string.
    github_count = (filtered_df['github'].values != '').sum()

    st.markdown(f"""
## {artifact_percentage}% papers with at least one 🤗 artifact
* Number of papers: {paper_count}
* Number of papers with a Github link: {github_count}
* Number of papers with at least one HF artifact: {num_artifacts}
""")

    with_artifact = filtered_df['has_artifact']

    st.write("Papers with at least one artifact")
    show_data_editor(
        filtered_df=filtered_df[with_artifact],
        key="papers_with_artifacts",
    )

    st.write("Papers without artifacts")
    show_data_editor(
        filtered_df=filtered_df[~with_artifact],
        key="papers_without_artifacts",
    )

    st.write("Papers with a HF mention in README but no artifacts")
    mentioned_without_artifact = (filtered_df['hf_mention'] == 1) & (~with_artifact)
    show_data_editor(
        filtered_df=filtered_df[mentioned_without_artifact],
        key="papers_with_hf_mention_no_artifacts",
    )
def _load_session_dataframe():
    """Return the dataframe kept in session state, loading it on first use."""
    if 'df' not in st.session_state:
        df = get_data()
        # add has_artifact, reached out and reached out link columns
        # TODO remove since this will overwrite everything if we have added data before
        df['has_artifact'] = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)
        df['reached_out'] = False
        df['reached_out_link'] = ""
        st.session_state.df = df
    return st.session_state.df


def _render_day_view(df):
    """Let the user pick a day (default: today) and show the papers of that day."""
    day = pd.Timestamp(st.date_input("Select day", value="today", format="DD/MM/YYYY"))
    filtered_df = df[df.index.date == day.date()]
    st.write(f"Showing data for {day.day_name()} {day.strftime('%d/%m/%Y')}")
    display_data(filtered_df=filtered_df)


def _render_week_view(df):
    """Let the user pick an ISO week number (default: current week) and show its papers."""
    # ISO years can have 53 weeks; a cap of 52 would reject the default value
    # whenever the dashboard is opened during week 53.
    week_number = st.number_input(
        "Select week",
        value=datetime.today().isocalendar()[1],
        min_value=1,
        max_value=53,
    )
    # Compare against the index directly rather than storing a helper column
    # in the session dataframe on every rerun.
    # NOTE(review): this matches the week number in every year present in the
    # data - confirm whether a year filter is wanted here as well.
    week_numbers = df.index.isocalendar().week
    filtered_df = df[week_numbers == week_number]
    st.write(f"Showing data for week {week_number}")
    display_data(filtered_df=filtered_df)


def _render_month_view(df):
    """Let the user pick a month and year (default: current month, latest year) and show its papers."""
    month_names = [
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    ]
    today = datetime.today()
    month_str = st.selectbox("Select month", options=month_names, index=today.month - 1)

    # Offer the years actually present in the data; fall back to the current
    # year when the dataframe has no dated rows.
    years = sorted(int(year) for year in df.index.year.dropna().unique())
    if not years:
        years = [today.year]
    year = st.selectbox("Select year", options=years, index=len(years) - 1)

    month = month_names.index(month_str) + 1
    filtered_df = df[(df.index.month == month) & (df.index.year == year)]
    st.write(f"Showing data for {month_str} {year}")
    display_data(filtered_df=filtered_df)


def main():
    """Render the dashboard: sidebar navigation plus the selected page.

    The "Daily/weekly/monthly data" page works on the dataframe stored in
    session state; the "Aggregated data" page reloads it via ``get_data()``.
    """
    st.title("Hugging Face Artifacts KPI Dashboard")

    # Two pages: per-period tables and aggregated charts
    st.sidebar.title("Navigation")
    selection = st.sidebar.selectbox("Go to", ["Daily/weekly/monthly data", "Aggregated data"])

    # Initialize session state (done regardless of the selected page)
    session_df = _load_session_dataframe()

    if selection == "Daily/weekly/monthly data":
        view_level = st.selectbox(label="View data per day, week or month", options=["day", "week", "month"])
        if view_level == "day":
            _render_day_view(session_df)
        elif view_level == "week":
            _render_week_view(session_df)
        elif view_level == "month":
            _render_month_view(session_df)
    elif selection == "Aggregated data":
        # get the latest dataframe
        df = get_data()
        aggregated_data(df)
        aggregated_data(df, aggregation_level="month")
    else:
        st.write("Error: selection not recognized")
# Run the dashboard only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()