File size: 7,524 Bytes
57c87c9
 
 
 
404478b
57c87c9
1396667
57c87c9
 
404478b
 
 
 
 
 
 
 
4dd059d
404478b
 
 
 
 
 
3170ddb
 
1396667
9325c4d
 
 
1396667
3170ddb
404478b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99a2513
 
2adbdb9
 
 
 
 
 
 
 
99a2513
 
 
 
 
 
2adbdb9
 
99a2513
 
 
3170ddb
 
404478b
3170ddb
 
99a2513
 
3170ddb
404478b
b58eec2
 
99a2513
 
2adbdb9
b58eec2
99a2513
 
4dd059d
 
99a2513
 
404478b
 
57c87c9
b58eec2
57c87c9
570845b
 
 
 
99a2513
 
 
 
 
 
 
 
 
 
404478b
570845b
 
 
 
 
 
 
 
 
 
99a2513
 
 
2adbdb9
570845b
b58eec2
99a2513
570845b
99a2513
570845b
404478b
99a2513
 
 
404478b
 
 
 
 
2adbdb9
570845b
404478b
570845b
99a2513
570845b
99a2513
570845b
404478b
 
99a2513
 
 
404478b
 
 
 
 
 
 
 
 
 
 
2adbdb9
570845b
b58eec2
570845b
99a2513
570845b
 
2adbdb9
 
 
 
404478b
 
570845b
 
 
57c87c9
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
from datetime import datetime

import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt

from load_dataframe import get_data


def aggregated_data(df, aggregation_level="week"):
    """Render the share of papers with at least one artifact, aggregated over time.

    Writes a caption, a metric holding the mean period-over-period growth rate
    and a line chart of the percentage of papers with artifacts to the
    Streamlit page.

    Args:
        df: Papers dataframe. It is resampled, so its index must be
            datetime-like, and it must provide the ``num_models``,
            ``num_datasets`` and ``num_spaces`` columns. The frame is not
            modified.
        aggregation_level: ``"week"`` resamples with the ``'W'`` frequency;
            any other value resamples with ``'ME'`` (month end).
    """
    st.write(f"Aggregated data by {aggregation_level}")

    # Flag papers that have any artifact. Kept as a standalone Series so the
    # caller's dataframe is not mutated as a side effect.
    has_artifact = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)

    # Resample by the requested period (weekly or month-end)
    freq = 'W' if aggregation_level == "week" else 'ME'
    total_papers = has_artifact.resample(freq).size()
    papers_with_artifacts = has_artifact.resample(freq).sum()

    # Percentage of papers with artifacts; periods without papers give NaN (0/0)
    percentage_papers_with_artifacts = (papers_with_artifacts / total_papers) * 100

    # Period-over-period growth rate. A previous value of 0 yields +/-inf, which
    # would poison the mean, so those entries are dropped together with NaN.
    growth_rate = percentage_papers_with_artifacts.pct_change() * 100
    growth_rate = growth_rate.replace([float('inf'), float('-inf')], float('nan')).dropna()

    # Display the average growth rate as a big number; with fewer than two
    # usable periods there is no growth rate, so show "N/A" instead of "nan%"
    average_growth_rate = growth_rate.mean()
    growth_text = "N/A" if pd.isna(average_growth_rate) else f"{average_growth_rate:.2f}%"
    st.metric(label=f"{aggregation_level.capitalize()}ly Average Growth Rate", value=growth_text)

    # Draw on an explicit figure instead of pyplot's implicit global one so it
    # can be handed to Streamlit and released afterwards.
    fig, ax = plt.subplots(figsize=(12, 6))
    try:
        ax.plot(
            percentage_papers_with_artifacts.index,
            percentage_papers_with_artifacts,
            marker='o',
            linestyle='-',
            color='b',
            label='Percentage of Papers with at least 1 Artifact',
        )

        # Percentages always live in [0, 100]
        ax.set_ylim(0, 100)

        ax.set_xlabel(aggregation_level)
        ax.set_ylabel('Percentage')
        ax.set_title('Percentage of Papers with Artifacts (Models, Datasets, Spaces) Over Time')
        ax.legend()
        ax.grid(True)

        # Use Streamlit to display the plot
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure it created alive until it is closed; without
        # this each rerun of the script would leak one more figure.
        plt.close(fig)


def show_data_editor(filtered_df: pd.DataFrame, key: str):
    """Show ``filtered_df`` in an editable table and store edits in the session.

    Args:
        filtered_df: The rows to display in the editor.
        key: Streamlit widget key for this editor instance.

    When the editor returns a frame that differs from ``filtered_df``, the
    dataframe kept in ``st.session_state.df`` is updated with the edited frame.
    """
    visible_columns = (
        "reached_out",
        "reached_out_link",
        "paper_page",
        "title",
        "github",
        "num_models",
        "num_datasets",
        "num_spaces",
    )
    link_columns = {
        "github": st.column_config.LinkColumn(),
        "paper_page": st.column_config.LinkColumn(),
        "paper_page_with_title": st.column_config.LinkColumn(display_text=r'\|(.*)'),
    }

    edited_df = st.data_editor(
        filtered_df,
        hide_index=True,
        column_order=visible_columns,
        column_config=link_columns,
        width=2000,
        key=key,
    )

    # Nothing to persist when the editor returned nothing or no cell changed
    if edited_df is None or edited_df.equals(filtered_df):
        return

    # Write the edited rows back into the dataframe held in the session state.
    # TODO there seems to be a bug in here
    # NOTE(review): DataFrame.update aligns rows on index labels; if several
    # papers share the same index value this may be the source of the bug -
    # confirm against the frame returned by get_data.
    session_df = st.session_state.df
    session_df.update(edited_df)
    st.session_state.df = session_df


def display_data(filtered_df: pd.DataFrame):
    """Render summary statistics and three editable tables for ``filtered_df``.

    Args:
        filtered_df: Papers to summarise. The ``has_artifact``, ``github`` and
            ``hf_mention`` columns are read here.

    The summary reports the percentage of papers with at least one artifact,
    the paper count, the count of non-empty ``github`` values and the number of
    papers with an artifact. It is followed by one data editor each for papers
    with artifacts, papers without artifacts, and papers whose ``hf_mention``
    equals 1 but that have no artifact.
    """
    has_artifact = filtered_df['has_artifact']
    paper_count = filtered_df.shape[0]
    num_artifacts = has_artifact.sum()

    # Guard against dividing by zero when the selection is empty
    if paper_count > 0:
        artifact_share = num_artifacts / paper_count
    else:
        artifact_share = 0
    artifact_percentage = round(artifact_share * 100, 2)

    github_link_count = (filtered_df['github'].values != '').sum()

    st.markdown(f"""
    ## {artifact_percentage}% papers with at least one 🤗 artifact
    
    * Number of papers: {paper_count}
    * Number of papers with a Github link: {github_link_count}
    * Number of papers with at least one HF artifact: {num_artifacts}
    """)

    # (caption, editor key, row selector) for each table, in display order.
    # Selectors are evaluated lazily, right before their editor is drawn.
    sections = (
        ("Papers with at least one artifact",
         "papers_with_artifacts",
         lambda: has_artifact),
        ("Papers without artifacts",
         "papers_without_artifacts",
         lambda: ~has_artifact),
        ("Papers with a HF mention in README but no artifacts",
         "papers_with_hf_mention_no_artifacts",
         lambda: (filtered_df['hf_mention'] == 1) & (~has_artifact)),
    )
    for caption, editor_key, select_rows in sections:
        st.write(caption)
        show_data_editor(filtered_df=filtered_df[select_rows()], key=editor_key)


def main():
    """Build the dashboard page: sidebar navigation plus the selected view.

    The "Daily/weekly/monthly data" view filters the dataframe kept in
    ``st.session_state.df`` by a chosen day, ISO week number or month/year and
    passes the result to ``display_data``. The "Aggregated data" view loads a
    fresh dataframe with ``get_data`` and renders weekly and monthly
    aggregates with ``aggregated_data``.
    """
    st.title("Hugging Face Artifacts KPI Dashboard")

    # Sidebar navigation between the per-period view and the aggregated view
    st.sidebar.title("Navigation")
    selection = st.sidebar.selectbox("Go to", ["Daily/weekly/monthly data", "Aggregated data"])

    # Initialize session state
    if 'df' not in st.session_state:
        df = get_data()
        # add has_artifact, reached out and reached out link columns
        # TODO remove since this will overwrite everything if we have added data before
        df['has_artifact'] = (df['num_models'] > 0) | (df['num_datasets'] > 0) | (df['num_spaces'] > 0)
        df['reached_out'] = False
        df["reached_out_link"] = ""
        st.session_state.df = df

    if selection == "Daily/weekly/monthly data":
        # Select whether to view the data per day, week or month
        view_level = st.selectbox(label="View data per day, week or month", options=["day", "week", "month"])

        if view_level == "day":
            # make a button to select the day, defaulting to today
            day = st.date_input("Select day", value="today", format="DD/MM/YYYY")
            # convert to the day of a Pandas Timestamp
            day = pd.Timestamp(day)

            # fetch df from session state
            df = st.session_state.df

            filtered_df = df[df.index.date == day.date()]

            st.write(f"Showing data for {day.day_name()} {day.strftime('%d/%m/%Y')}")
            display_data(filtered_df=filtered_df)

        elif view_level == "week":
            # make a button to select the week, defaulting to the current ISO week.
            # ISO years can have 53 weeks, so the maximum must be 53; with 52 the
            # default value would exceed the maximum during week 53.
            current_week = datetime.today().isocalendar()[1]
            week_number = st.number_input("Select week", value=current_week, min_value=1, max_value=53)

            # fetch df from session state
            df = st.session_state.df

            # Filter on the ISO week number of the index. A mask is used instead
            # of writing a 'week' column so the session dataframe is not modified.
            # NOTE: the year is not part of the filter, so the same week number of
            # different years is shown together.
            week_mask = (df.index.isocalendar().week == week_number).to_numpy(dtype=bool, na_value=False)
            filtered_df = df[week_mask]

            st.write(f"Showing data for week {week_number}")

            display_data(filtered_df=filtered_df)

        elif view_level == "month":
            month_names = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]

            # fetch df from session state
            df = st.session_state.df

            # make a button to select the month, defaulting to current month
            month_str = st.selectbox("Select month", options=month_names, index=datetime.today().month - 1)

            # Offer every year present in the data (latest preselected); fall
            # back to 2024 when the dataframe holds no dated rows.
            years_in_data = sorted({str(int(year)) for year in df.index.year if pd.notna(year)})
            year_options = years_in_data or ["2024"]
            year_str = st.selectbox("Select year", options=year_options, index=len(year_options) - 1)

            # Convert the selection to numbers and filter on month and year
            month = month_names.index(month_str) + 1
            year = int(year_str)
            filtered_df = df[(df.index.month == month) & (df.index.year == year)]

            st.write(f"Showing data for {month_str} {year_str}")

            display_data(filtered_df=filtered_df)

    elif selection == "Aggregated data":

        # get the latest dataframe
        df = get_data()

        aggregated_data(df)
        aggregated_data(df, aggregation_level="month")

    else:
        st.write("Error: selection not recognized")


# Build the dashboard only when this file is run as a script, not on import
if __name__ == "__main__":
    main()