nielsr's picture
nielsr HF staff
Improve data processing
1396667
raw
history blame
7.29 kB
from datetime import datetime
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from load_dataframe import get_data
def aggregated_data(df, aggregation_level="week"):
    """Render the share of papers that have at least one artifact, per period.

    Writes three things to the Streamlit page: a header line, a metric with
    the mean period-over-period growth rate of that share, and a line chart
    of the share over time.

    Args:
        df: Paper data with ``num_models``, ``num_datasets`` and ``num_spaces``
            columns. The data is resampled directly, so the index is presumably
            a ``DatetimeIndex`` -- confirm against the loader. The frame itself
            is not modified.
        aggregation_level: ``"week"`` resamples weekly (``'W'``); any other
            value resamples by month end (``'ME'``).
    """
    st.write(f"Aggregated data by {aggregation_level}")

    # Keep the flag in a standalone Series so the caller's frame is not
    # altered as a side effect of rendering.
    has_artifact = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)

    # Resample by week or by month end, depending on the requested level.
    freq = 'W' if aggregation_level == "week" else 'ME'
    papers_per_period = has_artifact.resample(freq).size()
    papers_with_artifacts = has_artifact.resample(freq).sum()

    # Periods without any paper yield 0 / 0 = NaN and show up as gaps in the plot.
    percentage_papers_with_artifacts = (papers_with_artifacts / papers_per_period) * 100

    # Growth from a 0% period is infinite; drop those so they do not dominate
    # the mean. np.nan (rather than pd.NA) keeps the series in float dtype.
    growth_rate = percentage_papers_with_artifacts.pct_change() * 100
    growth_rate = growth_rate.replace([np.inf, -np.inf], np.nan).dropna()

    # With fewer than two usable periods there is no growth rate to report.
    if growth_rate.empty:
        growth_label = "N/A"
    else:
        growth_label = f"{growth_rate.mean():.2f}%"
    st.metric(label=f"{aggregation_level.capitalize()}ly Average Growth Rate", value=growth_label)

    # Draw on an explicit figure instead of pyplot's implicit global one.
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(
        percentage_papers_with_artifacts.index,
        percentage_papers_with_artifacts,
        marker='o',
        linestyle='-',
        color='b',
        label='Percentage of Papers with at least 1 Artifact',
    )
    ax.set_ylim(0, 100)
    ax.set_xlabel(aggregation_level)
    ax.set_ylabel('Percentage')
    ax.set_title('Percentage of Papers with Artifacts (Models, Datasets, Spaces) Over Time')
    ax.legend()
    ax.grid(True)

    st.pyplot(fig)
    # Streamlit re-executes the script on every interaction; close the figure
    # so pyplot does not keep accumulating open figures.
    plt.close(fig)
def _render_paper_table(papers, key, extra_column_config=None):
    """Show ``papers`` in an editable table with the shared column layout."""
    column_config = {
        "github": st.column_config.LinkColumn(),
        "paper_page": st.column_config.LinkColumn(),
    }
    if extra_column_config:
        column_config.update(extra_column_config)
    st.data_editor(
        papers,
        hide_index=True,
        column_order=(
            "reached_out",
            "reached_out_link",
            "paper_page",
            "title",
            "github",
            "num_models",
            "num_datasets",
            "num_spaces",
        ),
        column_config=column_config,
        width=2000,
        key=key,
    )


def display_data(df):
    """Render summary statistics and three editable paper tables.

    The tables list papers with at least one artifact, papers without any
    artifact, and papers whose ``hf_mention`` column equals 1 but which have
    no artifact.

    Args:
        df: Paper data with ``num_models``, ``num_datasets``, ``num_spaces``,
            ``github`` and ``hf_mention`` columns, plus the columns shown in
            the tables (``paper_page``, ``title``). The frame is not modified.
    """
    # Work on a copy: the helper columns added below must not leak into the
    # caller's frame (which may itself be a filtered slice of a larger one).
    df = df.copy()
    df['has_artifact'] = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)

    num_papers = df.shape[0]
    num_artifacts = int(df['has_artifact'].sum())
    # Guard against an empty selection (e.g. a day without papers).
    if num_papers > 0:
        percentage_of_at_least_one_artifact = round(num_artifacts / num_papers * 100, 2)
    else:
        percentage_of_at_least_one_artifact = 0

    # Blank bookkeeping columns the user can fill in from the editor.
    df['reached_out'] = False
    df['reached_out_link'] = ""

    summary_lines = [
        f"## {percentage_of_at_least_one_artifact}% papers with at least one 🤗 artifact",
        f"* Number of papers: {num_papers}",
        f"* Number of papers with a Github link: {df['github'].notnull().sum()}",
        f"* Number of papers with at least one HF artifact: {num_artifacts}",
    ]
    st.markdown("\n".join(summary_lines))

    st.write("Papers with at least one artifact")
    _render_paper_table(
        df[df['has_artifact']],
        key="papers_with_artifacts",
        extra_column_config={
            "paper_page_with_title": st.column_config.LinkColumn(display_text=r'\|(.*)'),
        },
    )

    st.write("Papers without artifacts")
    _render_paper_table(df[~df['has_artifact']], key="papers_without_artifacts")

    st.write("Papers with a HF mention in README but no artifacts")
    _render_paper_table(
        df[(df['hf_mention'] == 1) & (~df['has_artifact'])],
        key="papers_with_hf_mention_no_artifacts",
    )
def _year_choices(years):
    """Return the distinct values of ``years``, newest first (current year if none)."""
    distinct = sorted({int(year) for year in pd.Series(years).dropna().unique()}, reverse=True)
    return distinct or [datetime.today().year]


def _default_year_index(years, preferred):
    """Return the position of ``preferred`` in ``years``, or 0 (the newest) if absent."""
    return years.index(preferred) if preferred in years else 0


def main():
    """Build the dashboard: a sidebar page selector plus the selected view.

    The "Daily/weekly/monthly data" page filters the papers to one day, one
    ISO week or one calendar month and passes them to ``display_data``. The
    "Aggregated data" page renders the weekly and monthly aggregates.

    The date filters use ``df.index.date``, ``.isocalendar()``, ``.month`` and
    ``.year``, so ``get_data()`` is assumed to return a frame indexed by a
    ``DatetimeIndex`` -- confirm against the loader.
    """
    st.title("Hugging Face Artifacts KPI Dashboard")

    # Two pages: per-period detail tables and aggregated trends.
    st.sidebar.title("Navigation")
    selection = st.sidebar.selectbox("Go to", ["Daily/weekly/monthly data", "Aggregated data"])

    df = get_data()
    # Drop the stray index column that a CSV round trip leaves behind.
    if 'Unnamed: 0' in df.columns:
        df = df.drop(columns=['Unnamed: 0'])
    df = df.sort_index()

    if selection == "Daily/weekly/monthly data":
        view_level = st.selectbox(label="View data per day, week or month", options=["day", "week", "month"])
        today = datetime.today()

        if view_level == "day":
            # Day picker, defaulting to today.
            day = pd.Timestamp(st.date_input("Select day", value="today", format="DD/MM/YYYY"))
            df = df[df.index.date == day.date()]
            st.write(f"Showing data for {day.day_name()} {day.strftime('%d/%m/%Y')}")
            display_data(df)

        elif view_level == "week":
            iso_today = today.isocalendar()
            iso_index = df.index.isocalendar()

            # ISO years can have 53 weeks; a cap of 52 would reject the
            # default value during week 53.
            week_number = st.number_input("Select week", value=iso_today[1], min_value=1, max_value=53)
            # Pair the week with its ISO year so the same week number from
            # different years is not mixed together.
            years = _year_choices(iso_index["year"])
            year = st.selectbox("Select year", options=years, index=_default_year_index(years, iso_today[0]))

            df = df[(iso_index["year"] == year) & (iso_index["week"] == week_number)]
            st.write(f"Showing data for week {week_number} of {year}")
            display_data(df)

        elif view_level == "month":
            month_names = [
                "January", "February", "March", "April", "May", "June",
                "July", "August", "September", "October", "November", "December",
            ]
            # Default to the current month and offer the years present in the data.
            month_str = st.selectbox("Select month", options=month_names, index=today.month - 1)
            years = _year_choices(df.index.year)
            year = st.selectbox("Select year", options=years, index=_default_year_index(years, today.year))

            month = month_names.index(month_str) + 1
            df = df[(df.index.month == month) & (df.index.year == year)]
            st.write(f"Showing data for {month_str} {year}")
            display_data(df)

    elif selection == "Aggregated data":
        aggregated_data(df)
        aggregated_data(df, aggregation_level="month")

    else:
        st.write("Error: selection not recognized")
# Entry point: build the dashboard only when this file is executed as the main module.
if __name__ == "__main__":
    main()