import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.manifold import TSNE
unfiltered_spaces_with_outliers = pd.read_csv('hugging_face_spaces.csv')
spaces = unfiltered_spaces_with_outliers[unfiltered_spaces_with_outliers['likes'] >= 3]
# spaces = spaces_with_outliers[(np.abs(stats.zscore(spaces_with_outliers['likes'])) < 3)]
# print(spaces.describe())
# print(spaces.describe().columns)
# print(spaces.describe().astype(str).to_numpy().tolist())
# spaces = spaces[spaces[['total_commits', 'community_interactions']].notna()]
# spaces[['total_commits', 'community_interactions']] = spaces[['total_commits', 'community_interactions']].astype('Int64')
descriptions = [
'Unique id of a space, composed of the username followed by the space name, separated by a "/"',
'SDK of space, one of gradio, streamlit or static',
'Total number of likes associated with the space',
'Username of the user',
'Name of the space',
'URL associated to the space',
'Various forms of input the space takes in',
'Various entities the space outputs',
'Is True if the space repo has an app.py',
'AI/ML related Python packages that were used in building the space',
'The date of the last known commit on the space\'s repository',
'The total number of commits on the space\'s repository',
'The current status of space',
'The total number of contributions or interactions from the community on the space\'s repository',
]
def info(df_in):
df_info = df_in.columns.to_frame(name='Column')
df_info['Non-Null Count'] = df_in.notna().sum()
df_info['Datatype'] = df_in.dtypes
df_info.reset_index(drop=True, inplace=True)
df_info['#'] = df_info.index
df_info['Description'] = descriptions
return df_info[['#', 'Column', 'Non-Null Count', 'Datatype', 'Description']].astype(str)
def get_nulls():
fig, ax = plt.subplots(figsize=(25, 5))
sns.barplot(x=spaces.columns, y=spaces.notnull().sum(), ax=ax, palette="tab20c_r")
ax.set_xticklabels(spaces.columns)
ax.set_title(f"Non-Null values in each column ( Total rows: {len(spaces.index)} )\n", fontsize="x-large")
for i, val in enumerate(spaces.notnull().sum()):
plt.text(i, val+9, val, horizontalalignment='center', verticalalignment='bottom', fontdict={'fontweight':500, 'size':12})
ax.set_yticks([])  # hide y-axis ticks; the exact counts are annotated on the bars
sns.despine(top=True, right=True, left=True)
return fig
def get_corr():
fig = plt.figure(figsize=(10, 5))
sns.heatmap(spaces.corr(numeric_only=True), annot=True, linewidths=.5, fmt='.1f')
plt.tight_layout()
return fig
def get_corr_df():
corr = spaces.corr(numeric_only=True)
corr = corr.reset_index()
return corr.astype(str)
def get_corr_scatter_total_commits():
fig = plt.figure(figsize=(15, 5))
plt.scatter(spaces['likes'], spaces['total_commits'])
plt.tight_layout()
return fig
def get_corr_scatter_community_interactions():
fig = plt.figure(figsize=(15, 5))
plt.scatter(spaces['likes'], spaces['community_interactions'])
plt.tight_layout()
return fig
def get_top_spaces(quantity, min_value, max_value, filter_by, sort_by):
top_spaces = spaces[(spaces[filter_by] >= min_value) & (spaces[filter_by] <= max_value)]
top_spaces = top_spaces.sort_values(filter_by, ascending=True if sort_by == 'ascending' else False)
top_spaces = top_spaces[['repo_id', filter_by]].iloc[:quantity]
fig = plt.figure(figsize=(10, 20))
plt.bar(top_spaces['repo_id'], top_spaces[filter_by])
plt.xticks(rotation=30)
plt.tight_layout()
return gr.Dataframe.update(value=top_spaces.astype(str).to_numpy().tolist()), gr.Plot.update(value=fig)
def change_limits(filter_by):
# reset both sliders to the range of the newly selected column
col_min, col_max = spaces[filter_by].min(), spaces[filter_by].max()
return gr.Slider.update(minimum=col_min, maximum=col_max, value=col_min), gr.Slider.update(minimum=col_min, maximum=col_max, value=col_max)
def get_most_spaces():
most_spaces = spaces['user_name'].value_counts().sort_values(ascending=False).reset_index().iloc[:7]
fig = plt.figure(figsize=(20, 10))
plt.barh(most_spaces.iloc[:, 0], most_spaces.iloc[:, 1])
plt.xticks(rotation=30)
plt.tight_layout()
return most_spaces, fig
def get_most_liked_users():
y = pd.pivot_table(
spaces,
index=['user_name'],
aggfunc={'likes': np.sum, 'user_name': len}
).sort_values('likes', ascending=False).rename(columns={'user_name': 'space_count'}).iloc[:10].reset_index()
y['likes'] = y['likes'].astype(int)
y['space_count'] = y['space_count'].astype(int)
sns.set_theme()
# relplot is figure-level and creates its own figure, so return that figure instead of an empty one
g = sns.relplot(data=y.iloc[:7], x='user_name', y='likes', col='space_count')
g.fig.set_size_inches(20, 8)
return y.iloc[:7].astype(str), g.fig
def pie_plot(data, figsize=(10, 5)):
colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99']
fig = plt.figure(figsize=figsize)
for i, (categories, counts) in enumerate(data.items()):
plt.subplot(1, len(data), i+1)
plt.pie(list(map(int, counts.split(","))), colors=colors, labels=categories.split(","), autopct='%1.1f%%', startangle=90)
# draw circle
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
plt.gcf().gca().add_artist(centre_circle)
# Equal aspect ratio ensures that pie is drawn as a circle
plt.axis('equal')
plt.tight_layout()
return fig
def get_sdk_proportions():
df = spaces.groupby('sdk').size().reset_index(name='counts')
return pie_plot({",".join(list(df['sdk'])): ",".join(df['counts'].astype(str))})
def get_sdk_frequencies():
fig = plt.figure()
sns.stripplot(x='sdk', y='likes', data=spaces, jitter=True)
plt.tight_layout()
return fig
def get_io_proportions():
inputs = [y.split(',') for y in spaces[spaces['inputs'].notnull()]['inputs'].values]
inputs = [x for xs in inputs for x in xs]
inputs = pd.Series(inputs).value_counts()
mask = (inputs/inputs.sum() * 100).lt(2)
updated_inputs = inputs[~mask]
updated_inputs['Other'] = inputs[mask].sum()
outputs = [y.split(',') for y in spaces[spaces['outputs'].notnull()]['outputs'].values]
outputs = [x for xs in outputs for x in xs]
outputs = pd.Series(outputs).value_counts()
mask = (outputs/outputs.sum() * 100).lt(2)
updated_outputs = outputs[~mask]
updated_outputs['Other'] = outputs[mask].sum()
return pie_plot({
",".join(list(updated_inputs.index.astype(str))): ",".join(updated_inputs.values.astype(str)),
",".join(list(updated_outputs.index.astype(str))): ",".join(updated_outputs.values.astype(str))
})
def get_packages_proportions():
spaces_ai_reqs = [y.split(',') for y in spaces[spaces['ai_ml_reqs'].notnull()]['ai_ml_reqs'].values]
spaces_ai_reqs = [x for xs in spaces_ai_reqs for x in xs]
spaces_ai_reqs = pd.Series(spaces_ai_reqs).value_counts()
mask = (spaces_ai_reqs/spaces_ai_reqs.sum() * 100).lt(3)
updated_spaces_ai_reqs = spaces_ai_reqs[~mask]
updated_spaces_ai_reqs['Other'] = spaces_ai_reqs[mask].sum()
print(updated_spaces_ai_reqs)
return pie_plot({",".join(list(updated_spaces_ai_reqs.index.astype(str))): ",".join(updated_spaces_ai_reqs.values.astype(str))})
def get_processable_spaces_proportions():
spaces_status = spaces['status'].value_counts()
mask = (spaces_status/spaces_status.sum() * 100).lt(5)
updated_spaces_status = spaces_status[~mask]
updated_spaces_status['Error'] = spaces_status[mask].sum()
return pie_plot({",".join(list(updated_spaces_status.index.astype(str))): ",".join(updated_spaces_status.values.astype(str))})
def get_tsne():
# copy to avoid SettingWithCopy warnings when the t-SNE columns are added below
spaces_numeric = spaces[['likes', 'total_commits', 'community_interactions']].dropna().copy()
fig = plt.figure()
m = TSNE(learning_rate=50)
tsne_features = m.fit_transform(spaces_numeric)
spaces_numeric['x'] = tsne_features[:, 0]
spaces_numeric['y'] = tsne_features[:, 1]
spaces_numeric['sdk'] = spaces['sdk']
spaces_numeric['like_class'] = spaces_numeric['likes'].apply(get_likes_description)
sns.scatterplot(x='x', y='y', hue='sdk', style='like_class', data=spaces_numeric)
return fig
def get_likes_description(likes):
if likes < spaces['likes'].mean():
return "Below Average"
elif likes < (spaces['likes'].mean() + (spaces['likes'].std() * 1.5)):
return "Good"
else:
return "Awesome"
with gr.Blocks(css="""
#md {width: 60%; padding: 0px 10px 0px}
#plot {width: 40%; margin: auto;}
#spacer {padding: 15px 15px 15px 15px}
""") as demo:
gr.Markdown("""
# Exploring the statistics of 🤗 Spaces
Hugging Face Spaces offer a simple way to host ML demo apps directly on your profile or your organization’s profile. This allows you to create your ML portfolio, showcase your projects at conferences or to stakeholders, and work collaboratively with other people in the ML ecosystem.
This is an interactive blog that provides an overview of all the Spaces currently present on 🤗
**Data Set**: The above and related information was collected by DeepKlarity using the HuggingFace Spaces API endpoint and by scraping data individually from each space's repository
The dataset can be accessed from the git repository here
## Read the data
```
spaces = pd.read_csv('hugging_face_spaces.csv')
```
### Data columns info and descriptions:
The following table gives an overview of each column in the aforementioned dataset along with its description
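Under the hood, the table is assembled with a small pandas helper; a simplified sketch of it:
```
df_info = spaces.columns.to_frame(name='Column')
df_info['Non-Null Count'] = spaces.notna().sum()
df_info['Datatype'] = spaces.dtypes
```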
""")
gr.Dataframe(type="numpy",
headers=['#', 'Column', 'Non-Null Count', 'Datatype', 'Description'],
value=info(unfiltered_spaces_with_outliers).to_numpy().tolist(),
datatype=['number', 'str', 'number', 'str', 'str'],
row_count=14,
col_count=5,
)
gr.Markdown(f"""
### Identifying valid spaces
For the sake of keeping things simple and logical, let's assume that a space is valid only if it has a minimum of 3 likes. With this filter applied, the dataset shrinks from {len(unfiltered_spaces_with_outliers.index)} rows to {len(spaces.index)}.\n
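In pandas, that filter is a one-liner:
```
spaces = spaces[spaces['likes'] >= 3]
```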
""")
gr.Dataframe(type="numpy",
headers=list(spaces.columns),
value=spaces.astype(str).iloc[:5].to_numpy().tolist(),
row_count=5,
col_count=14,
)
gr.Markdown(f"""
The plot below summarizes the number of non-null values present in each column of the filtered dataset.
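The counts themselves come straight from pandas:
```
non_null_counts = spaces.notnull().sum()
```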
""")
gr.Plot(value=get_nulls())
gr.Markdown("""
There seem to be quite a few columns with empty/null values. Familiarity with each column and what exactly it captures will help in identifying which fields are useful for the analysis and which can be ignored. Here are the reasons for `NaN`s in each of these columns:
- **inputs, outputs**: An absence of any type of inputs/outputs represents a null. Moreover, the script corresponding to this dataset was only able to extract inputs/outputs for Gradio spaces. Therefore, any spaces that have Streamlit or Static as their sdk will also have a null in these columns
- **ai_ml_reqs**: Any spaces that do not use any of the AI/ML packages will have a null in this column
- **last_commit, total_commits, status, community_interactions**: These columns should have around the same number of nulls; a null here means that an error occurred while scraping the space's remote repository
### Finding correlations between characteristics
""")
with gr.Row():
# with gr.Column():
gr.Markdown("""
Looking at the correlation plot, we can see that no two columns correlate particularly well, as every pair has a Pearson's r-value below 0.4. Of all the columns, however, `likes` correlates best with `total_commits` and `community_interactions`.
Although not always true, this does make sense: well-made HuggingFace Spaces are bound to get more likes, and building them usually means the authors worked on them for quite a while and therefore made a lot of commits. Secondly, a good space is also bound to get more attention, i.e. more interactions from the community.
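The correlation matrix behind the heatmap comes straight from pandas; a rough sketch:
```
corr = spaces.corr(numeric_only=True)  # Pearson correlation over the numeric columns
sns.heatmap(corr, annot=True, fmt='.1f')
```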
The corresponding correlation heatmap is shown alongside.
""", elem_id='md')
# with gr.Row():
# gr.Plot(value=get_corr_scatter_total_commits())
# gr.Plot(value=get_corr_scatter_community_interactions())
# with gr.Column():
gr.Plot(value=get_corr(), elem_id='plot')
gr.Markdown("""
## Questions that we can try to answer
One of the key reasons for writing this blog is to perform Exploratory Data Analysis (EDA) on the Huggingface Spaces dataset. We will accomplish this and gain insights by answering some high-level questions, as follows
### What are the top n spaces on HuggingFace filtered by x?
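Under the hood, the controls below filter and sort the dataframe along these lines (`filter_by`, `min_value`, `max_value` and `quantity` are the widget inputs):
```
top = spaces[(spaces[filter_by] >= min_value) & (spaces[filter_by] <= max_value)]
top = top.sort_values(filter_by, ascending=False)[['repo_id', filter_by]].head(quantity)
```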
""")
with gr.Row():
with gr.Column():
quantity = gr.Slider(minimum=1, maximum=10, value=10, step=1, label="Quantity:", show_label=True)
min_value = gr.Slider(minimum=spaces['likes'].min(), maximum=spaces['likes'].max(), value=spaces['likes'].min(), step=1, label="Min Value:", show_label=True)
max_value = gr.Slider(minimum=spaces['likes'].min(), maximum=spaces['likes'].max(), value=spaces['likes'].max(), step=1, label="Max Value:", show_label=True)
filter_by = gr.Radio(choices=['likes', 'total_commits', 'community_interactions'], value='likes', label="Filter by:", show_label=True)
sort_by = gr.Radio(choices=['ascending', 'descending'], value='descending', label="Sort by:", show_label=True)
submit = gr.Button(value='Submit')
with gr.Column():
data_points = gr.Dataframe(
type="numpy",
headers=["Repo ID", 'Value'],
datatype=["str", "number"],
value=spaces.sort_values('likes', ascending=False)[['repo_id', 'likes']].iloc[:10].astype(str).to_numpy().tolist(),
)
with gr.Column():
data_plot = gr.Plot()
filter_by.change(fn=change_limits, inputs=[filter_by], outputs=[min_value, max_value])
submit.click(fn=get_top_spaces, inputs=[quantity, min_value, max_value, filter_by, sort_by], outputs=[data_points, data_plot])
gr.Markdown("", elem_id='spacer')
with gr.Row():
gr.Markdown(f"""
### What is the highest number of spaces created by any one user?
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec id rutrum diam, sed euismod lacus. Vivamus posuere, nibh sit amet dictum bibendum, tortor ligula faucibus odio, id mattis dolor erat eu ante. Quisque et velit nec libero viverra commodo in a augue. Quisque posuere leo arcu, in pretium ipsum dignissim non.
Curabitur in purus est. Proin vitae egestas orci, sit amet elementum urna. Sed condimentum rutrum erat, in vulputate purus consectetur sit amet. Cras rutrum mattis ante id malesuada. Duis luctus, erat vel imperdiet condimentum, elit lorem tincidunt sem, sit amet maximus arcu erat at ex.
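The counts shown alongside come from a simple count of spaces per user:
```
most_spaces = spaces['user_name'].value_counts().head(7)
```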
""", elem_id='md')
gr.Plot(value=get_most_spaces()[1], elem_id='plot')
gr.Markdown("", elem_id='spacer')
with gr.Row():
gr.Plot(value=get_most_liked_users()[1], elem_id='plot')
gr.Markdown("""
### Which users have achieved the most likes?
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec id rutrum diam, sed euismod lacus. Vivamus posuere, nibh sit amet dictum bibendum, tortor ligula faucibus odio, id mattis dolor erat eu ante. Quisque et velit nec libero viverra commodo in a augue. Quisque posuere leo arcu, in pretium ipsum dignissim non.
Curabitur in purus est. Proin vitae egestas orci, sit amet elementum urna. Sed condimentum rutrum erat, in vulputate purus consectetur sit amet. Cras rutrum mattis ante id malesuada. Duis luctus, erat vel imperdiet condimentum, elit lorem tincidunt sem, sit amet maximus arcu erat at ex.
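The figure alongside is built from a pivot table that sums likes and counts spaces per user; roughly:
```
by_user = pd.pivot_table(spaces, index=['user_name'], aggfunc={'likes': np.sum, 'user_name': len})
by_user = by_user.rename(columns={'user_name': 'space_count'}).sort_values('likes', ascending=False)
```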
""", elem_id='md')
gr.Markdown("", elem_id='spacer')
with gr.Row():
gr.Markdown("""
### What are the proportions of the different SDKs used in creating spaces?
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec id rutrum diam, sed euismod lacus. Vivamus posuere, nibh sit amet dictum bibendum, tortor ligula faucibus odio, id mattis dolor erat eu ante. Quisque et velit nec libero viverra commodo in a augue. Quisque posuere leo arcu, in pretium ipsum dignissim non.
Curabitur in purus est. Proin vitae egestas orci, sit amet elementum urna. Sed condimentum rutrum erat, in vulputate purus consectetur sit amet. Cras rutrum mattis ante id malesuada. Duis luctus, erat vel imperdiet condimentum, elit lorem tincidunt sem, sit amet maximus arcu erat at ex.
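The proportions plotted alongside come from a simple group-by on the `sdk` column:
```
sdk_counts = spaces.groupby('sdk').size().reset_index(name='counts')
```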
""", elem_id='md')
gr.Plot(value=get_sdk_proportions(), elem_id='plot')
gr.Markdown("", elem_id='spacer')
with gr.Row():
gr.Plot(value=get_processable_spaces_proportions(), elem_id='plot')
gr.Markdown("""
### How many spaces are processable?
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec id rutrum diam, sed euismod lacus. Vivamus posuere, nibh sit amet dictum bibendum, tortor ligula faucibus odio, id mattis dolor erat eu ante. Quisque et velit nec libero viverra commodo in a augue. Quisque posuere leo arcu, in pretium ipsum dignissim non.
Curabitur in purus est. Proin vitae egestas orci, sit amet elementum urna. Sed condimentum rutrum erat, in vulputate purus consectetur sit amet. Cras rutrum mattis ante id malesuada. Duis luctus, erat vel imperdiet condimentum, elit lorem tincidunt sem, sit amet maximus arcu erat at ex.
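The slices are derived from the `status` column, with statuses making up less than 5% of the data folded into an `Error` slice:
```
status_counts = spaces['status'].value_counts()
rare = (status_counts / status_counts.sum() * 100).lt(5)
```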
""", elem_id='md')
gr.Markdown("", elem_id='spacer')
with gr.Row():
gr.Markdown("""
### What are the different types of input and output components used and which of them are used widely?
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec id rutrum diam, sed euismod lacus. Vivamus posuere, nibh sit amet dictum bibendum, tortor ligula faucibus odio, id mattis dolor erat eu ante. Quisque et velit nec libero viverra commodo in a augue. Quisque posuere leo arcu, in pretium ipsum dignissim non.
Curabitur in purus est. Proin vitae egestas orci, sit amet elementum urna. Sed condimentum rutrum erat, in vulputate purus consectetur sit amet. Cras rutrum mattis ante id malesuada. Duis luctus, erat vel imperdiet condimentum, elit lorem tincidunt sem, sit amet maximus arcu erat at ex.
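Since each space can use several components, stored as comma-separated strings, the counts are computed by splitting and flattening first; roughly:
```
inputs = [x for row in spaces['inputs'].dropna() for x in row.split(',')]
input_counts = pd.Series(inputs).value_counts()
```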
""", elem_id='md')
gr.Plot(value=get_io_proportions(), elem_id='plot')
gr.Markdown("", elem_id='spacer')
with gr.Row():
gr.Plot(value=get_packages_proportions(), elem_id='plot')
gr.Markdown("""
### Which AI/ML python packages are used the most?
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec id rutrum diam, sed euismod lacus. Vivamus posuere, nibh sit amet dictum bibendum, tortor ligula faucibus odio, id mattis dolor erat eu ante. Quisque et velit nec libero viverra commodo in a augue. Quisque posuere leo arcu, in pretium ipsum dignissim non.
Curabitur in purus est. Proin vitae egestas orci, sit amet elementum urna. Sed condimentum rutrum erat, in vulputate purus consectetur sit amet. Cras rutrum mattis ante id malesuada. Duis luctus, erat vel imperdiet condimentum, elit lorem tincidunt sem, sit amet maximus arcu erat at ex.
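The `ai_ml_reqs` column is handled the same way, with packages below a 3% share grouped into `Other`:
```
packages = [x for row in spaces['ai_ml_reqs'].dropna() for x in row.split(',')]
package_counts = pd.Series(packages).value_counts()
```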
""", elem_id='md')
gr.Markdown("", elem_id='spacer')
gr.Markdown("""
## Dataset in a nutshell
""")
# with gr.Row():
# with gr.Column():
# gr.Plot()
# with gr.Column():
# gr.Plot()
# with gr.Column():
# gr.Plot(value=get_tsne())
demo.launch()