Spaces:

nielsr
/

community-science-progress

Running

App Files Files Community

community-science-progress / app.py

nielsr HF Staff

Improve data processing

1396667 12 months ago

raw

history blame

7.29 kB

	from datetime import datetime

	import streamlit as st
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt

	from load_dataframe import get_data


	def aggregated_data(df, aggregation_level="week"):
	    """Plot the share of papers with at least one artifact, aggregated over time.

	    Writes a heading, a metric with the average period-over-period growth
	    rate, and a line chart of the percentage of papers that have at least
	    one model, dataset or Space.

	    Args:
	        df: DataFrame with the columns ``num_models``, ``num_datasets`` and
	            ``num_spaces``. It is resampled, so its index must be
	            datetime-like. The frame is not modified.
	        aggregation_level: ``"week"`` resamples weekly (``'W'``); any other
	            value resamples by month end (``'ME'``).
	    """
	    st.write(f"Aggregated data by {aggregation_level}")

	    # Keep the flag as a standalone Series so the caller's frame is not mutated.
	    has_artifact = (
	        (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)
	    )

	    freq = 'W' if aggregation_level == "week" else 'ME'
	    resampled = has_artifact.resample(freq)
	    total_papers = resampled.size()
	    papers_with_artifacts = resampled.sum()

	    # Periods without any paper yield 0 / 0 = NaN and show up as gaps in the plot.
	    percentage_papers_with_artifacts = (papers_with_artifacts / total_papers) * 100

	    # A change starting from 0% is infinite; drop those so they do not skew the mean.
	    growth_rate = percentage_papers_with_artifacts.pct_change() * 100
	    growth_rate = growth_rate.replace([np.inf, -np.inf], np.nan).dropna()

	    # With fewer than two usable periods there is no growth rate to report.
	    average_growth_rate = growth_rate.mean()
	    if pd.notna(average_growth_rate):
	        metric_value = f"{average_growth_rate:.2f}%"
	    else:
	        metric_value = "N/A"
	    st.metric(
	        label=f"{aggregation_level.capitalize()}ly Average Growth Rate",
	        value=metric_value,
	    )

	    # Use an explicit figure instead of pyplot's global state so it can be
	    # handed to Streamlit and released afterwards.
	    fig, ax = plt.subplots(figsize=(12, 6))
	    ax.plot(
	        percentage_papers_with_artifacts.index,
	        percentage_papers_with_artifacts,
	        marker='o',
	        linestyle='-',
	        color='b',
	        label='Percentage of Papers with at least 1 Artifact',
	    )
	    ax.set_ylim(0, 100)
	    ax.set_xlabel(aggregation_level)
	    ax.set_ylabel('Percentage')
	    ax.set_title('Percentage of Papers with Artifacts (Models, Datasets, Spaces) Over Time')
	    ax.legend()
	    ax.grid(True)

	    st.pyplot(fig)
	    # Close the figure so figures do not pile up across reruns.
	    plt.close(fig)


	# Columns shown in every papers table, in display order.
	_PAPER_COLUMN_ORDER = (
	    "reached_out", "reached_out_link", "paper_page", "title",
	    "github", "num_models", "num_datasets", "num_spaces",
	)


	def _show_papers_table(caption, rows, key, extra_column_config=None):
	    """Write a caption followed by an editable table of the given paper rows."""
	    column_config = {
	        "github": st.column_config.LinkColumn(),
	        "paper_page": st.column_config.LinkColumn(),
	    }
	    if extra_column_config:
	        column_config.update(extra_column_config)

	    st.write(caption)
	    st.data_editor(
	        rows,
	        hide_index=True,
	        column_order=_PAPER_COLUMN_ORDER,
	        column_config=column_config,
	        width=2000,
	        key=key,
	    )


	def display_data(df):
	    """Show summary statistics and three editable tables for a set of papers.

	    The tables list papers with at least one artifact, papers without any
	    artifact, and papers whose ``hf_mention`` equals 1 but have no artifact.

	    Args:
	        df: DataFrame with the columns ``num_models``, ``num_datasets``,
	            ``num_spaces``, ``github`` and ``hf_mention``. The frame is not
	            modified; helper columns are added to an internal copy.
	    """
	    # Callers pass filtered slices; copying avoids writing into their data
	    # (and the SettingWithCopyWarning that comes with it).
	    df = df.copy()
	    df['has_artifact'] = (
	        (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)
	    )

	    num_papers = df.shape[0]
	    num_artifacts = df['has_artifact'].sum()
	    # Guard against an empty selection to avoid dividing by zero.
	    share_with_artifact = num_artifacts / num_papers if num_papers > 0 else 0
	    percentage_of_at_least_one_artifact = round(share_with_artifact * 100, 2)

	    # Editable bookkeeping columns, blank for every paper.
	    df['reached_out'] = [False] * num_papers
	    df["reached_out_link"] = [""] * num_papers

	    st.markdown(
	        f"## {percentage_of_at_least_one_artifact}% papers with at least one 🤗 artifact\n"
	        "\n"
	        f"* Number of papers: {num_papers}\n"
	        f"* Number of papers with a Github link: {df['github'].notnull().sum()}\n"
	        f"* Number of papers with at least one HF artifact: {num_artifacts}"
	    )

	    has_artifact = df['has_artifact']
	    _show_papers_table(
	        "Papers with at least one artifact",
	        df[has_artifact],
	        key="papers_with_artifacts",
	        extra_column_config={
	            "paper_page_with_title": st.column_config.LinkColumn(display_text=r'\|(.*)'),
	        },
	    )
	    _show_papers_table(
	        "Papers without artifacts",
	        df[~has_artifact],
	        key="papers_without_artifacts",
	    )
	    _show_papers_table(
	        "Papers with a HF mention in README but no artifacts",
	        df[(df['hf_mention'] == 1) & (~has_artifact)],
	        key="papers_with_hf_mention_no_artifacts",
	    )


	def main():
	    """Build the dashboard: load the papers and render the selected view.

	    The sidebar switches between a per-day/week/month listing of papers
	    (rendered by ``display_data``) and weekly plus monthly aggregated plots
	    (rendered by ``aggregated_data``).
	    """
	    st.title("Hugging Face Artifacts KPI Dashboard")

	    st.sidebar.title("Navigation")
	    selection = st.sidebar.selectbox("Go to", ["Daily/weekly/monthly data", "Aggregated data"])

	    df = get_data()

	    print(df.head())

	    # Drop the stray index column left behind by a CSV round trip, if present.
	    if 'Unnamed: 0' in df.columns:
	        df = df.drop(['Unnamed: 0'], axis=1)
	    # NOTE(review): the filters below assume get_data() returns a frame
	    # indexed by a DatetimeIndex - confirm against load_dataframe.
	    df = df.sort_index()

	    today = datetime.today()

	    if selection == "Daily/weekly/monthly data":
	        view_level = st.selectbox(label="View data per day, week or month", options=["day", "week", "month"])

	        if view_level == "day":
	            # Date picker defaulting to today, converted to a pandas Timestamp.
	            day = pd.Timestamp(st.date_input("Select day", value="today", format="DD/MM/YYYY"))

	            df = df[df.index.date == day.date()]

	            st.write(f"Showing data for {day.day_name()} {day.strftime('%d/%m/%Y')}")

	        elif view_level == "week":
	            iso_today = today.isocalendar()
	            iso = df.index.isocalendar()

	            # Week numbers repeat every year, so the ISO year must be chosen too.
	            # Fall back to the current ISO year when there is no data.
	            iso_years = sorted({int(y) for y in iso["year"].dropna()}) or [iso_today[0]]
	            year = st.selectbox("Select year", options=iso_years, index=len(iso_years) - 1, key="week_year")
	            # ISO years can have 53 weeks; capping at 52 would reject today's
	            # week number as the default value during week 53.
	            week_number = st.number_input("Select week", value=iso_today[1], min_value=1, max_value=53)

	            # Filter with a mask rather than adding a helper column to the frame.
	            in_week = (iso["year"] == year) & (iso["week"] == week_number)
	            df = df[in_week.to_numpy(dtype=bool, na_value=False)]

	            st.write(f"Showing data for week {week_number} of {year}")

	        elif view_level == "month":
	            month_names = [
	                "January", "February", "March", "April", "May", "June",
	                "July", "August", "September", "October", "November", "December",
	            ]
	            # Default to the current month and to the most recent year in the data.
	            month_str = st.selectbox("Select month", options=month_names, index=today.month - 1)
	            available_years = sorted({int(y) for y in df.index.year.dropna()}) or [today.year]
	            year = st.selectbox("Select year", options=available_years, index=len(available_years) - 1, key="month_year")

	            month = month_names.index(month_str) + 1
	            df = df[(df.index.month == month) & (df.index.year == year)]

	            st.write(f"Showing data for {month_str} {year}")

	        display_data(df)

	    elif selection == "Aggregated data":
	        aggregated_data(df)
	        aggregated_data(df, aggregation_level="month")

	    else:
	        st.write("Error: selection not recognized")


	# Script entry point: build the dashboard when this file is executed directly.
	if __name__ == "__main__":
	    main()