nielsr HF staff committed on
Commit
404478b
·
1 Parent(s): 570845b

Add aggregated data

Browse files
Files changed (1) hide show
  1. app.py +87 -33
app.py CHANGED
@@ -2,11 +2,60 @@ from datetime import datetime
2
 
3
  import streamlit as st
4
  import pandas as pd
 
5
 
6
  # from load_dataframe import get_data
7
 
8
 
9
- # Main Streamlit app
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  def main():
11
  st.title("Hugging Face Papers KPI Dashboard")
12
 
@@ -14,16 +63,16 @@ def main():
14
  st.sidebar.title("Navigation")
15
  selection = st.sidebar.selectbox("Go to", ["Daily/weekly/monthly data", "Aggregated data"])
16
 
17
- if selection == "Daily/weekly/monthly data":
18
- # TODO use this instead
19
- # df = get_data()
20
- df = pd.read_csv('/Users/nielsrogge/Downloads/daily_papers_enriched (1).csv')
21
- df = df.drop(['Unnamed: 0'], axis=1)
22
- # Use date as index
23
- df = df.set_index('date')
24
- df.index = pd.to_datetime(df.index)
25
- df = df.sort_index()
26
 
 
27
  # Button to select day, month or week
28
  # Add streamlit selectbox.
29
  view_level = st.selectbox(label="View data per day, week or month", options=["day", "week", "month"])
@@ -40,41 +89,46 @@ def main():
40
 
41
  st.write(f"Showing data for {day.strftime('%d/%m/%Y')}")
42
 
43
- num_artifacts = df[(df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)].shape[0]
44
-
45
- st.markdown(f"""
46
- ## Number of papers: {df.shape[0]}
47
- #### Number of papers with a Github link: {df['github'].notnull().sum()}
48
- #### Number of papers with at least one HF artifact: {num_artifacts}
49
- """)
50
-
51
- st.dataframe(df,
52
- hide_index=True,
53
- column_order=("paper_page", "title", "github", "num_models", "num_datasets", "num_spaces"),
54
- column_config={"github": st.column_config.LinkColumn(),
55
- "paper_page": st.column_config.LinkColumn()},
56
- width=2000)
57
 
58
  elif view_level == "week":
59
  # make a button to select the week
60
- week = st.number_input("Select week", value=datetime.today().isocalendar()[1], min_value=1, max_value=52)
 
 
 
 
 
 
61
 
62
- df = df.loc[df['date'].dt.isocalendar().week == week.isocalendar().week]
63
 
64
- st.write(f"Showing data for {day}")
65
- st.dataframe(df)
66
 
67
  elif view_level == "month":
68
  # make a button to select the month, defaulting to current month
69
- month = st.sidebar.date_input("Select month", value=pd.Timestamp.today().month_name())
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
- df = df.loc[df['date'].dt.month_name() == month]
72
 
73
- st.write(f"Showing data for {day}")
74
- st.dataframe(df)
75
 
76
  elif selection == "Aggregated data":
77
- st.write("Aggregated data")
 
78
 
79
  else:
80
  st.write("Error: selection not recognized")
 
2
 
3
  import streamlit as st
4
  import pandas as pd
5
+ import matplotlib.pyplot as plt
6
 
7
  # from load_dataframe import get_data
8
 
9
 
10
def aggregated_data(df, aggregation_level="week"):
    """Plot the share of papers with at least one HF artifact over time.

    Papers are bucketed per week or per month, and for every bucket the
    percentage of papers that have at least one linked model, dataset or
    Space is drawn as a line chart in the Streamlit app.

    Args:
        df: DataFrame with one row per paper. It must be indexed by a
            ``DatetimeIndex`` (required by ``resample``) and contain the
            ``num_models``, ``num_datasets`` and ``num_spaces`` columns.
            The frame is not modified.
        aggregation_level: ``"week"`` buckets by week; any other value
            buckets by month.
    """
    st.write(f"Aggregated data by {aggregation_level}")

    # Flag papers that have any artifact. This is kept as a standalone Series
    # instead of a new column so the caller's DataFrame is left untouched.
    has_artifact = (
        (df['num_models'] > 0)
        | (df['num_datasets'] > 0)
        | (df['num_spaces'] > 0)
    )

    # Bucket by the requested period ('W' = weekly, 'M' = monthly).
    freq = 'W' if aggregation_level == "week" else 'M'
    per_period = has_artifact.resample(freq)
    total_papers = per_period.size()
    papers_with_artifacts = per_period.sum()

    # Periods without any paper yield 0 / 0 = NaN and show up as gaps.
    percentage_papers_with_artifacts = (papers_with_artifacts / total_papers) * 100

    # Draw on an explicit figure rather than pyplot's global state so the
    # figure can be handed to Streamlit and released afterwards.
    fig, ax = plt.subplots(figsize=(12, 6))
    try:
        ax.plot(
            percentage_papers_with_artifacts.index,
            percentage_papers_with_artifacts,
            marker='o',
            linestyle='-',
            color='b',
            label='Percentage of Papers with at least 1 Artifact',
        )

        # Percentages always live in [0, 100]; fix the axis so charts are comparable.
        ax.set_ylim(0, 100)

        ax.set_xlabel(aggregation_level)
        ax.set_ylabel('Percentage')
        ax.set_title('Percentage of Papers with Artifacts (Models, Datasets, Spaces) Over Time')
        ax.legend()
        ax.grid(True)

        # Use Streamlit to display the plot
        st.pyplot(fig)
    finally:
        # Streamlit reruns the script on every interaction; close the figure
        # so figures do not accumulate in memory across reruns.
        plt.close(fig)
40
+
41
+
42
def display_data(df):
    """Show headline paper counts followed by the table of papers.

    Writes a markdown summary (total papers, papers with a Github link,
    papers with at least one HF artifact) and then renders ``df`` as a
    Streamlit dataframe restricted to the paper/link/artifact-count columns.

    Args:
        df: DataFrame with one row per paper; the ``github``, ``num_models``,
            ``num_datasets`` and ``num_spaces`` columns are read here, and
            ``paper_page`` and ``title`` are listed for display.
    """
    # A paper counts as having an artifact if any of the three counters is positive.
    artifact_mask = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)
    num_artifacts = len(df.loc[artifact_mask])

    st.markdown(f"""
    ## Number of papers: {df.shape[0]}
    #### Number of papers with a Github link: {df['github'].notnull().sum()}
    #### Number of papers with at least one HF artifact: {num_artifacts}
    """)

    # Render both URL columns as clickable links.
    link_columns = {
        "github": st.column_config.LinkColumn(),
        "paper_page": st.column_config.LinkColumn(),
    }
    visible_columns = ("paper_page", "title", "github", "num_models", "num_datasets", "num_spaces")

    st.dataframe(
        df,
        hide_index=True,
        column_order=visible_columns,
        column_config=link_columns,
        width=2000,
    )
57
+
58
+
59
  def main():
60
  st.title("Hugging Face Papers KPI Dashboard")
61
 
 
63
  st.sidebar.title("Navigation")
64
  selection = st.sidebar.selectbox("Go to", ["Daily/weekly/monthly data", "Aggregated data"])
65
 
66
+ # TODO use this instead
67
+ # df = get_data()
68
+ df = pd.read_csv('/Users/nielsrogge/Downloads/daily_papers_enriched (1).csv')
69
+ df = df.drop(['Unnamed: 0'], axis=1)
70
+ # Use date as index
71
+ df = df.set_index('date')
72
+ df.index = pd.to_datetime(df.index)
73
+ df = df.sort_index()
 
74
 
75
+ if selection == "Daily/weekly/monthly data":
76
  # Button to select day, month or week
77
  # Add streamlit selectbox.
78
  view_level = st.selectbox(label="View data per day, week or month", options=["day", "week", "month"])
 
89
 
90
  st.write(f"Showing data for {day.strftime('%d/%m/%Y')}")
91
 
92
+ display_data(df)
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
  elif view_level == "week":
95
  # make a button to select the week
96
+ week_number = st.number_input("Select week", value=datetime.today().isocalendar()[1], min_value=1, max_value=52)
97
+
98
+ # Extract week number from the index
99
+ df['week'] = df.index.isocalendar().week
100
+
101
+ # Filter the dataframe for the desired week number
102
+ df = df[df['week'] == week_number]
103
 
104
+ st.write(f"Showing data for week {week_number}")
105
 
106
+ display_data(df)
 
107
 
108
  elif view_level == "month":
109
  # make a button to select the month, defaulting to current month
110
+ month_str = st.selectbox("Select month", options=["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"])
111
+ year_str = st.selectbox("Select year", options=["2024"])
112
+
113
+ # Map month names to month numbers so the dataframe can be filtered by month
114
+ month_map = {
115
+ 'January': 1, 'February': 2, 'March': 3, 'April': 4,
116
+ 'May': 5, 'June': 6, 'July': 7, 'August': 8,
117
+ 'September': 9, 'October': 10, 'November': 11, 'December': 12
118
+ }
119
+
120
+ # Convert month string to number
121
+ month = month_map[month_str]
122
+ year = int(year_str)
123
+ df = df[(df.index.month == month) & (df.index.year == year)]
124
 
125
+ st.write(f"Showing data for month {month}")
126
 
127
+ display_data(df)
 
128
 
129
  elif selection == "Aggregated data":
130
+ aggregated_data(df)
131
+ aggregated_data(df, aggregation_level="month")
132
 
133
  else:
134
  st.write("Error: selection not recognized")