Spaces:
Sleeping
Sleeping
fixing minor issues
Browse files
app.py
CHANGED
@@ -4,6 +4,7 @@ import numpy as np
|
|
4 |
import plotly.express as px
|
5 |
from datasets import load_dataset
|
6 |
|
|
|
7 |
def load_transform_data():
|
8 |
"""
|
9 |
Load the spaces dataset via `load_dataset` and transform it.
|
@@ -11,9 +12,9 @@ def load_transform_data():
|
|
11 |
Returns:
|
12 |
pandas.DataFrame: Transformed dataframe.
|
13 |
"""
|
14 |
-
spaces_dataset =
|
15 |
dataset = load_dataset(spaces_dataset)
|
16 |
-
df = dataset[
|
17 |
# combine the sdk and tags columns, one of which is a string and the other is an array of strings
|
18 |
df["sdk"] = df["sdk"].apply(lambda x: np.array([str(x)]))
|
19 |
df["licenses"] = df["license"].apply(
|
@@ -25,7 +26,7 @@ def load_transform_data():
|
|
25 |
)
|
26 |
|
27 |
# Fill the NaN values with an empty string
|
28 |
-
df[
|
29 |
|
30 |
# where the custom_domains column is not null, use that as the url, otherwise, use the host column
|
31 |
df["url"] = np.where(
|
@@ -37,9 +38,9 @@ def load_transform_data():
|
|
37 |
# Build up a pretty url that's clickable with the emoji
|
38 |
df["url"] = df[["url", "emoji"]].apply(
|
39 |
lambda x: (
|
40 |
-
f
|
41 |
-
if x.iloc[0] is not None
|
42 |
-
else f
|
43 |
),
|
44 |
axis=1,
|
45 |
)
|
@@ -145,9 +146,7 @@ def filtered_df(
|
|
145 |
}
|
146 |
)
|
147 |
if filtered_devmode:
|
148 |
-
_df = _df[
|
149 |
-
_df["devMode"] == filtered_devmode
|
150 |
-
]
|
151 |
|
152 |
return _df[["URL", "Likes", "Models", "Datasets", "Licenses"]]
|
153 |
|
@@ -158,7 +157,7 @@ def count_items(items):
|
|
158 |
Parameters:
|
159 |
items (dataframe column): A dataframe column containing a list of items.
|
160 |
Returns:
|
161 |
-
tuple: A tuple containing two dictionaries. The first dictionary contains the count of each item,
|
162 |
and the second dictionary contains the count of each author.
|
163 |
"""
|
164 |
items = np.concatenate([arr for arr in items.values if arr is not None])
|
@@ -169,14 +168,15 @@ def count_items(items):
|
|
169 |
item_count[item] += 1
|
170 |
else:
|
171 |
item_count[item] = 1
|
172 |
-
author = item.split(
|
173 |
if author in item_author_count:
|
174 |
item_author_count[author] += 1
|
175 |
else:
|
176 |
item_author_count[author] = 1
|
177 |
-
|
178 |
return item_count, item_author_count
|
179 |
|
|
|
180 |
def flatten_column(_df, column):
|
181 |
"""
|
182 |
Flattens a column in a DataFrame.
|
@@ -203,7 +203,7 @@ with gr.Blocks(fill_width=True) as demo:
|
|
203 |
# The Pandas dataframe has a datetime column. Plot the growth of spaces (row entries) over time.
|
204 |
# The x-axis should be the date and the y-axis should be the cumulative number of spaces created up to that date.
|
205 |
df = df.sort_values("created_at")
|
206 |
-
df[
|
207 |
fig1 = px.line(
|
208 |
df,
|
209 |
x="created_at",
|
@@ -216,16 +216,29 @@ with gr.Blocks(fill_width=True) as demo:
|
|
216 |
|
217 |
with gr.Row():
|
218 |
# Create a pie chart showing the distribution of spaces by SDK
|
219 |
-
fig2 = px.pie(
|
|
|
|
|
|
|
|
|
|
|
220 |
gr.Plot(fig2)
|
221 |
|
222 |
# create a pie chart showing the distribution of spaces by emoji for the top 10 used emojis
|
223 |
-
emoji_counts = df[
|
224 |
-
fig3 = px.pie(
|
|
|
|
|
|
|
|
|
|
|
|
|
225 |
gr.Plot(fig3)
|
226 |
|
227 |
# Create a scatter plot showing the relationship between the number of likes and the number of spaces created by an author
|
228 |
-
author_likes =
|
|
|
|
|
229 |
fig4 = px.scatter(
|
230 |
author_likes,
|
231 |
x="id",
|
@@ -238,7 +251,13 @@ with gr.Blocks(fill_width=True) as demo:
|
|
238 |
gr.Plot(fig4)
|
239 |
|
240 |
# Create a scatter plot showing the relationship between the number of likes and the number of spaces per emoji
|
241 |
-
emoji_likes =
|
|
|
|
|
|
|
|
|
|
|
|
|
242 |
fig10 = px.scatter(
|
243 |
emoji_likes,
|
244 |
x="id",
|
@@ -251,8 +270,8 @@ with gr.Blocks(fill_width=True) as demo:
|
|
251 |
gr.Plot(fig10)
|
252 |
|
253 |
# Create a bar chart of hardware in use
|
254 |
-
hardware = df[
|
255 |
-
hardware.columns = [
|
256 |
fig5 = px.bar(
|
257 |
hardware,
|
258 |
x="Hardware",
|
@@ -268,8 +287,10 @@ with gr.Blocks(fill_width=True) as demo:
|
|
268 |
fig5.update_layout(yaxis_type="log")
|
269 |
gr.Plot(fig5)
|
270 |
|
271 |
-
model_count, model_author_count = count_items(df[
|
272 |
-
model_author_count = pd.DataFrame(
|
|
|
|
|
273 |
fig8 = px.bar(
|
274 |
model_author_count.sort_values("Number of Spaces", ascending=False).head(
|
275 |
20
|
@@ -281,7 +302,9 @@ with gr.Blocks(fill_width=True) as demo:
|
|
281 |
template="plotly_dark",
|
282 |
)
|
283 |
gr.Plot(fig8)
|
284 |
-
model_count = pd.DataFrame(
|
|
|
|
|
285 |
# then make a bar chart
|
286 |
fig6 = px.bar(
|
287 |
model_count.sort_values("Number of Spaces", ascending=False).head(20),
|
@@ -293,9 +316,13 @@ with gr.Blocks(fill_width=True) as demo:
|
|
293 |
)
|
294 |
gr.Plot(fig6)
|
295 |
|
296 |
-
dataset_count, dataset_author_count = count_items(df[
|
297 |
-
dataset_count = pd.DataFrame(
|
298 |
-
|
|
|
|
|
|
|
|
|
299 |
fig9 = px.bar(
|
300 |
dataset_author_count.sort_values("Number of Spaces", ascending=False).head(
|
301 |
20
|
@@ -323,26 +350,30 @@ with gr.Blocks(fill_width=True) as demo:
|
|
323 |
|
324 |
with gr.Row():
|
325 |
# Get the most duplicated spaces
|
326 |
-
duplicated_spaces =
|
|
|
|
|
327 |
duplicated_spaces["duplicated_from"] = duplicated_spaces[
|
328 |
"duplicated_from"
|
329 |
].apply(
|
330 |
lambda x: f"<a target='_blank' href=https://huggingface.co/spaces/{x}>{x}</a>"
|
331 |
)
|
332 |
duplicated_spaces.columns = ["Space", "Number of Duplicates"]
|
333 |
-
gr.DataFrame(duplicated_spaces, datatype="html"
|
334 |
|
335 |
# Get the most liked spaces
|
336 |
-
liked_spaces =
|
|
|
|
|
337 |
liked_spaces["id"] = liked_spaces["id"].apply(
|
338 |
lambda x: f"<a target='_blank' href=https://huggingface.co/spaces/{x}>{x}</a>"
|
339 |
)
|
340 |
-
liked_spaces.columns = [
|
341 |
gr.DataFrame(liked_spaces, datatype="html")
|
342 |
|
343 |
with gr.Row():
|
344 |
# Create a dataframe with the top 10 authors and the number of spaces they have created
|
345 |
-
author_counts = df[
|
346 |
author_counts["author"] = author_counts["author"].apply(
|
347 |
lambda x: f"<a target='_blank' href=https://huggingface.co/{x}>{x}</a>"
|
348 |
)
|
@@ -350,22 +381,25 @@ with gr.Blocks(fill_width=True) as demo:
|
|
350 |
gr.DataFrame(author_counts, datatype="html")
|
351 |
|
352 |
# create a dataframe where we groupby author and sum their likes
|
353 |
-
author_likes = df.groupby(
|
354 |
-
author_likes = author_likes.sort_values(by=
|
|
|
|
|
355 |
author_likes["author"] = author_likes["author"].apply(
|
356 |
lambda x: f"<a target='_blank' href=https://huggingface.co/{x}>{x}</a>"
|
357 |
)
|
358 |
author_likes.columns = ["Author", "Number of Likes"]
|
359 |
gr.DataFrame(author_likes, datatype="html")
|
360 |
|
361 |
-
|
362 |
with gr.Tab(label="Spaces Search"):
|
363 |
-
df = df[df[
|
364 |
|
365 |
# Layout
|
366 |
with gr.Row():
|
367 |
emoji = gr.Dropdown(
|
368 |
-
df["emoji"].unique().tolist(),
|
|
|
|
|
369 |
) # Dropdown to select the emoji
|
370 |
likes = gr.Slider(
|
371 |
minimum=df["likes"].min(),
|
@@ -375,7 +409,9 @@ with gr.Blocks(fill_width=True) as demo:
|
|
375 |
) # Slider to filter by likes
|
376 |
with gr.Row():
|
377 |
author = gr.Dropdown(
|
378 |
-
df["author"].unique().tolist(),
|
|
|
|
|
379 |
)
|
380 |
# get the list of unique strings in the sdk_tags column
|
381 |
sdk_tags = np.unique(np.concatenate(df["sdk_tags"].values))
|
@@ -405,15 +441,17 @@ with gr.Blocks(fill_width=True) as demo:
|
|
405 |
)
|
406 |
|
407 |
devmode = gr.Checkbox(label="Show Dev Mode Spaces")
|
408 |
-
clear = gr.ClearButton(
|
|
|
409 |
emoji,
|
410 |
author,
|
411 |
hardware,
|
412 |
sdk_tags,
|
413 |
models,
|
414 |
datasets,
|
415 |
-
space_license
|
416 |
-
|
|
|
417 |
|
418 |
df = pd.DataFrame(
|
419 |
df[
|
@@ -432,7 +470,7 @@ with gr.Blocks(fill_width=True) as demo:
|
|
432 |
"r_models",
|
433 |
"r_datasets",
|
434 |
"r_licenses",
|
435 |
-
|
436 |
]
|
437 |
]
|
438 |
)
|
@@ -450,9 +488,9 @@ with gr.Blocks(fill_width=True) as demo:
|
|
450 |
devmode,
|
451 |
],
|
452 |
datatype="html",
|
453 |
-
wrap=True,
|
454 |
-
column_widths=["25%", "5%", "25%", "25%", "20%"]
|
455 |
)
|
456 |
|
457 |
|
458 |
-
demo.launch()
|
|
|
4 |
import plotly.express as px
|
5 |
from datasets import load_dataset
|
6 |
|
7 |
+
|
8 |
def load_transform_data():
|
9 |
"""
|
10 |
Load the `jsulz/space-stats` dataset via `load_dataset` and transform its train split.
|
|
|
12 |
Returns:
|
13 |
pandas.DataFrame: Transformed dataframe.
|
14 |
"""
|
15 |
+
spaces_dataset = "jsulz/space-stats"
|
16 |
dataset = load_dataset(spaces_dataset)
|
17 |
+
df = dataset["train"].to_pandas()
|
18 |
# combine the sdk and tags columns, one of which is a string and the other is an array of strings
|
19 |
df["sdk"] = df["sdk"].apply(lambda x: np.array([str(x)]))
|
20 |
df["licenses"] = df["license"].apply(
|
|
|
26 |
)
|
27 |
|
28 |
# Fill the NaN values with an empty string
|
29 |
+
df["emoji"] = np.where(df["emoji"].isnull(), "", df["emoji"])
|
30 |
|
31 |
# where the custom_domains column is not null, use that as the url, otherwise, use the host column
|
32 |
df["url"] = np.where(
|
|
|
38 |
# Build up a pretty url that's clickable with the emoji
|
39 |
df["url"] = df[["url", "emoji"]].apply(
|
40 |
lambda x: (
|
41 |
+
f'<a target="_blank" href=https://huggingface.co/spaces/{x.iloc[0]}>{str(x.iloc[1]) + " " + x.iloc[0]}</a>'
|
42 |
+
if x.iloc[0] is not None
|
43 |
+
else f'<a target="_blank" href=https://{x.iloc[0][0]}>{str(x.iloc[1]) + " " + x.iloc[0][0]}</a>'
|
44 |
),
|
45 |
axis=1,
|
46 |
)
|
|
|
146 |
}
|
147 |
)
|
148 |
if filtered_devmode:
|
149 |
+
_df = _df[_df["devMode"] == filtered_devmode]
|
|
|
|
|
150 |
|
151 |
return _df[["URL", "Likes", "Models", "Datasets", "Licenses"]]
|
152 |
|
|
|
157 |
Parameters:
|
158 |
items (dataframe column): A dataframe column containing a list of items.
|
159 |
Returns:
|
160 |
+
tuple: A tuple containing two dictionaries. The first dictionary contains the count of each item,
|
161 |
and the second dictionary contains the count of each author.
|
162 |
"""
|
163 |
items = np.concatenate([arr for arr in items.values if arr is not None])
|
|
|
168 |
item_count[item] += 1
|
169 |
else:
|
170 |
item_count[item] = 1
|
171 |
+
author = item.split("/")[0]
|
172 |
if author in item_author_count:
|
173 |
item_author_count[author] += 1
|
174 |
else:
|
175 |
item_author_count[author] = 1
|
176 |
+
|
177 |
return item_count, item_author_count
|
178 |
|
179 |
+
|
180 |
def flatten_column(_df, column):
|
181 |
"""
|
182 |
Flattens a column in a DataFrame.
|
|
|
203 |
# The Pandas dataframe has a datetime column. Plot the growth of spaces (row entries) over time.
|
204 |
# The x-axis should be the date and the y-axis should be the cumulative number of spaces created up to that date.
|
205 |
df = df.sort_values("created_at")
|
206 |
+
df["cumulative_spaces"] = df["created_at"].rank(method="first").astype(int)
|
207 |
fig1 = px.line(
|
208 |
df,
|
209 |
x="created_at",
|
|
|
216 |
|
217 |
with gr.Row():
|
218 |
# Create a pie chart showing the distribution of spaces by SDK
|
219 |
+
fig2 = px.pie(
|
220 |
+
df,
|
221 |
+
names="sdk",
|
222 |
+
title="Distribution of Spaces by SDK",
|
223 |
+
template="plotly_dark",
|
224 |
+
)
|
225 |
gr.Plot(fig2)
|
226 |
|
227 |
# create a pie chart showing the distribution of spaces by emoji for the top 10 used emojis
|
228 |
+
emoji_counts = df["emoji"].value_counts().head(10).reset_index()
|
229 |
+
fig3 = px.pie(
|
230 |
+
emoji_counts,
|
231 |
+
names="emoji",
|
232 |
+
values="count",
|
233 |
+
title="Distribution of Spaces by Emoji",
|
234 |
+
template="plotly_dark",
|
235 |
+
)
|
236 |
gr.Plot(fig3)
|
237 |
|
238 |
# Create a scatter plot showing the relationship between the number of likes and the number of spaces created by an author
|
239 |
+
author_likes = (
|
240 |
+
df.groupby("author").agg({"likes": "sum", "id": "count"}).reset_index()
|
241 |
+
)
|
242 |
fig4 = px.scatter(
|
243 |
author_likes,
|
244 |
x="id",
|
|
|
251 |
gr.Plot(fig4)
|
252 |
|
253 |
# Create a scatter plot showing the relationship between total likes and the number of spaces for the 20 most-liked emojis
|
254 |
+
emoji_likes = (
|
255 |
+
df.groupby("emoji")
|
256 |
+
.agg({"likes": "sum", "id": "count"})
|
257 |
+
.sort_values(by="likes", ascending=False)
|
258 |
+
.head(20)
|
259 |
+
.reset_index()
|
260 |
+
)
|
261 |
fig10 = px.scatter(
|
262 |
emoji_likes,
|
263 |
x="id",
|
|
|
270 |
gr.Plot(fig10)
|
271 |
|
272 |
# Create a bar chart of hardware in use
|
273 |
+
hardware = df["hardware"].value_counts().reset_index()
|
274 |
+
hardware.columns = ["Hardware", "Number of Spaces"]
|
275 |
fig5 = px.bar(
|
276 |
hardware,
|
277 |
x="Hardware",
|
|
|
287 |
fig5.update_layout(yaxis_type="log")
|
288 |
gr.Plot(fig5)
|
289 |
|
290 |
+
model_count, model_author_count = count_items(df["models"])
|
291 |
+
model_author_count = pd.DataFrame(
|
292 |
+
model_author_count.items(), columns=["Model Author", "Number of Spaces"]
|
293 |
+
)
|
294 |
fig8 = px.bar(
|
295 |
model_author_count.sort_values("Number of Spaces", ascending=False).head(
|
296 |
20
|
|
|
302 |
template="plotly_dark",
|
303 |
)
|
304 |
gr.Plot(fig8)
|
305 |
+
model_count = pd.DataFrame(
|
306 |
+
model_count.items(), columns=["Model", "Number of Spaces"]
|
307 |
+
)
|
308 |
# then make a bar chart
|
309 |
fig6 = px.bar(
|
310 |
model_count.sort_values("Number of Spaces", ascending=False).head(20),
|
|
|
316 |
)
|
317 |
gr.Plot(fig6)
|
318 |
|
319 |
+
dataset_count, dataset_author_count = count_items(df["datasets"])
|
320 |
+
dataset_count = pd.DataFrame(
|
321 |
+
dataset_count.items(), columns=["Datasets", "Number of Spaces"]
|
322 |
+
)
|
323 |
+
dataset_author_count = pd.DataFrame(
|
324 |
+
dataset_author_count.items(), columns=["Dataset Author", "Number of Spaces"]
|
325 |
+
)
|
326 |
fig9 = px.bar(
|
327 |
dataset_author_count.sort_values("Number of Spaces", ascending=False).head(
|
328 |
20
|
|
|
350 |
|
351 |
with gr.Row():
|
352 |
# Get the most duplicated spaces
|
353 |
+
duplicated_spaces = (
|
354 |
+
df["duplicated_from"].value_counts().head(20).reset_index()
|
355 |
+
)
|
356 |
duplicated_spaces["duplicated_from"] = duplicated_spaces[
|
357 |
"duplicated_from"
|
358 |
].apply(
|
359 |
lambda x: f"<a target='_blank' href=https://huggingface.co/spaces/{x}>{x}</a>"
|
360 |
)
|
361 |
duplicated_spaces.columns = ["Space", "Number of Duplicates"]
|
362 |
+
gr.DataFrame(duplicated_spaces, datatype="html")
|
363 |
|
364 |
# Get the most liked spaces
|
365 |
+
liked_spaces = (
|
366 |
+
df[["id", "likes"]].sort_values(by="likes", ascending=False).head(20)
|
367 |
+
)
|
368 |
liked_spaces["id"] = liked_spaces["id"].apply(
|
369 |
lambda x: f"<a target='_blank' href=https://huggingface.co/spaces/{x}>{x}</a>"
|
370 |
)
|
371 |
+
liked_spaces.columns = ["Space", "Number of Likes"]
|
372 |
gr.DataFrame(liked_spaces, datatype="html")
|
373 |
|
374 |
with gr.Row():
|
375 |
# Create a dataframe with the top 20 authors and the number of spaces they have created
|
376 |
+
author_counts = df["author"].value_counts().head(20).reset_index()
|
377 |
author_counts["author"] = author_counts["author"].apply(
|
378 |
lambda x: f"<a target='_blank' href=https://huggingface.co/{x}>{x}</a>"
|
379 |
)
|
|
|
381 |
gr.DataFrame(author_counts, datatype="html")
|
382 |
|
383 |
# create a dataframe where we groupby author and sum their likes
|
384 |
+
author_likes = df.groupby("author").agg({"likes": "sum"}).reset_index()
|
385 |
+
author_likes = author_likes.sort_values(by="likes", ascending=False).head(
|
386 |
+
20
|
387 |
+
)
|
388 |
author_likes["author"] = author_likes["author"].apply(
|
389 |
lambda x: f"<a target='_blank' href=https://huggingface.co/{x}>{x}</a>"
|
390 |
)
|
391 |
author_likes.columns = ["Author", "Number of Likes"]
|
392 |
gr.DataFrame(author_likes, datatype="html")
|
393 |
|
|
|
394 |
with gr.Tab(label="Spaces Search"):
|
395 |
+
df = df[df["stage"] == "RUNNING"]
|
396 |
|
397 |
# Layout
|
398 |
with gr.Row():
|
399 |
emoji = gr.Dropdown(
|
400 |
+
df["emoji"].unique().tolist(),
|
401 |
+
label="Search by Emoji 🤗",
|
402 |
+
multiselect=True,
|
403 |
) # Dropdown to select the emoji
|
404 |
likes = gr.Slider(
|
405 |
minimum=df["likes"].min(),
|
|
|
409 |
) # Slider to filter by likes
|
410 |
with gr.Row():
|
411 |
author = gr.Dropdown(
|
412 |
+
df["author"].unique().tolist(),
|
413 |
+
label="Search by Author",
|
414 |
+
multiselect=True,
|
415 |
)
|
416 |
# get the list of unique strings in the sdk_tags column
|
417 |
sdk_tags = np.unique(np.concatenate(df["sdk_tags"].values))
|
|
|
441 |
)
|
442 |
|
443 |
devmode = gr.Checkbox(label="Show Dev Mode Spaces")
|
444 |
+
clear = gr.ClearButton(
|
445 |
+
components=[
|
446 |
emoji,
|
447 |
author,
|
448 |
hardware,
|
449 |
sdk_tags,
|
450 |
models,
|
451 |
datasets,
|
452 |
+
space_license,
|
453 |
+
]
|
454 |
+
)
|
455 |
|
456 |
df = pd.DataFrame(
|
457 |
df[
|
|
|
470 |
"r_models",
|
471 |
"r_datasets",
|
472 |
"r_licenses",
|
473 |
+
"devMode",
|
474 |
]
|
475 |
]
|
476 |
)
|
|
|
488 |
devmode,
|
489 |
],
|
490 |
datatype="html",
|
491 |
+
wrap=True,
|
492 |
+
column_widths=["25%", "5%", "25%", "25%", "20%"],
|
493 |
)
|
494 |
|
495 |
|
496 |
+
demo.launch(share=True)
|