Spaces:
Sleeping
Sleeping
Commit
·
bcf8ba9
1
Parent(s):
52415b9
Update app.py
Browse files
app.py
CHANGED
@@ -1,16 +1,14 @@
|
|
1 |
-
import contextlib
|
2 |
-
import gradio as gr
|
3 |
-
import polars as pl
|
4 |
-
from functools import lru_cache
|
5 |
-
from cytoolz import concat, frequencies, topk
|
6 |
-
from datasets import load_dataset
|
7 |
from ast import literal_eval
|
8 |
-
from
|
9 |
-
import numpy as np
|
10 |
from itertools import combinations
|
11 |
-
from toolz import unique
|
12 |
-
import pandas as pd
|
13 |
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
pd.options.plotting.backend = "plotly"
|
16 |
|
@@ -35,15 +33,15 @@ def _clean_tags(tags: Optional[Union[str, List[str]]]):
|
|
35 |
def prep_dataset():
|
36 |
ds = download_dataset()
|
37 |
df = ds.to_pandas()
|
38 |
-
df[
|
39 |
-
df[
|
40 |
-
df[
|
41 |
-
df[
|
42 |
-
df[
|
43 |
-
df[
|
44 |
-
df[
|
45 |
-
df[
|
46 |
-
df = df.drop(columns=[
|
47 |
df.to_parquet("data.parquet")
|
48 |
return df
|
49 |
|
@@ -51,14 +49,14 @@ def prep_dataset():
|
|
51 |
def load_data():
|
52 |
return (
|
53 |
pd.read_parquet("data.parquet")
|
54 |
-
if Path(
|
55 |
else prep_dataset()
|
56 |
)
|
57 |
|
58 |
|
59 |
-
def filter_df_by_library(filter=
|
60 |
df = load_data()
|
61 |
-
return df[df[
|
62 |
|
63 |
|
64 |
@lru_cache()
|
@@ -71,7 +69,7 @@ def get_library_choices(min_freq: int = 50):
|
|
71 |
@lru_cache()
|
72 |
def get_all_tags():
|
73 |
df = load_data()
|
74 |
-
tags = df[
|
75 |
return list(concat(tags))
|
76 |
|
77 |
|
@@ -103,17 +101,19 @@ def tag_frequency(case_sensitive=True):
|
|
103 |
if not case_sensitive:
|
104 |
tags = (tag.lower() for tag in tags)
|
105 |
tags_frequencies = dict(frequencies(tags))
|
106 |
-
df = pd.DataFrame.from_dict(
|
107 |
-
|
|
|
108 |
return df.reset_index()
|
109 |
|
110 |
|
111 |
def tag_frequency_by_library(library_filter):
|
112 |
df = filter_df_by_library(library_filter)
|
113 |
-
tags = concat(df[
|
114 |
tags = dict(frequencies(tags))
|
115 |
-
df = pd.DataFrame.from_dict(tags, orient=
|
116 |
-
by=
|
|
|
117 |
return df.reset_index()
|
118 |
|
119 |
|
@@ -123,7 +123,12 @@ def has_model_card_by_library(top_n):
|
|
123 |
top_libs = df.library.value_counts().head(int(top_n)).index.to_list()
|
124 |
# min_thresh = df.library.value_counts()[:min_number].index.to_list()
|
125 |
df = df[df.library.isin(top_libs)]
|
126 |
-
return
|
|
|
|
|
|
|
|
|
|
|
127 |
|
128 |
|
129 |
def model_card_length_by_library(top_n):
|
@@ -132,14 +137,15 @@ def model_card_length_by_library(top_n):
|
|
132 |
top_libs = df.library.value_counts().head(int(top_n)).index.to_list()
|
133 |
# min_thresh = df.library.value_counts()[:min_number].index.to_list()
|
134 |
df = df[df.library.isin(top_libs)]
|
135 |
-
return df.groupby(
|
136 |
# df = df.groupby('library')['text_length'].describe().round().reset_index()
|
137 |
# df['library'] = df.library.apply(lambda library: f"[{library}](https://huggingface.co/models?library={library})")
|
138 |
# return df.to_markdown()
|
139 |
|
|
|
140 |
def metadata_coverage_by_library(metadata_field):
|
141 |
df = load_data()
|
142 |
-
return df.groupby(
|
143 |
|
144 |
|
145 |
df = load_data()
|
@@ -154,14 +160,19 @@ with gr.Blocks() as demo:
|
|
154 |
with gr.Row():
|
155 |
gr.Markdown("thsh")
|
156 |
with gr.Row():
|
157 |
-
case_sensitive = gr.Checkbox(
|
|
|
|
|
|
|
158 |
mk = gr.Markdown()
|
159 |
case_sensitive.change(get_number_of_tags, [case_sensitive], mk, queue=False)
|
160 |
with gr.Accordion("Tag Frequencies", open=False):
|
161 |
df = gr.Dataframe()
|
162 |
case_sensitive.change(tag_frequency, [case_sensitive], df, queue=False)
|
163 |
with gr.Row():
|
164 |
-
gr.Markdown(
|
|
|
|
|
165 |
with gr.Row():
|
166 |
with gr.Accordion("View case sensitive tag pairs", open=False):
|
167 |
gr.Dataframe(display_case_sensitive_duplicate_tags())
|
@@ -169,23 +180,35 @@ with gr.Blocks() as demo:
|
|
169 |
gr.Markdown("Tags by library")
|
170 |
library_choice = gr.Dropdown(choices=libraries, label="select library")
|
171 |
df = gr.Dataframe()
|
172 |
-
library_choice.change(
|
|
|
|
|
173 |
with gr.Tab("Tag health by library"):
|
174 |
metadata_field = gr.Dropdown(choices=metadata_coverage_columns)
|
175 |
plot = gr.Plot()
|
176 |
-
metadata_field.change(
|
|
|
|
|
177 |
|
178 |
with gr.Tab("Model Cards"):
|
179 |
-
gr.Markdown(
|
|
|
180 |
information created by a human i.e. outlining the goals behind the creation of the model and information
|
181 |
created by a training framework. This automatically generated information can contain information about
|
182 |
-
number of epochs, learning rate, weight decay etc. """
|
183 |
-
|
|
|
|
|
|
|
184 |
with gr.Column():
|
185 |
plot = gr.Plot()
|
186 |
-
min_lib_frequency.change(
|
|
|
|
|
187 |
with gr.Column():
|
188 |
df = gr.Dataframe()
|
189 |
-
min_lib_frequency.change(
|
|
|
|
|
190 |
|
191 |
demo.launch(debug=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from ast import literal_eval
|
2 |
+
from functools import lru_cache
|
|
|
3 |
from itertools import combinations
|
|
|
|
|
4 |
from pathlib import Path
|
5 |
+
from typing import List, Optional, Union
|
6 |
+
|
7 |
+
import gradio as gr
|
8 |
+
import numpy as np
|
9 |
+
import pandas as pd
|
10 |
+
from cytoolz import concat, frequencies, topk, unique
|
11 |
+
from datasets import load_dataset
|
12 |
|
13 |
pd.options.plotting.backend = "plotly"
|
14 |
|
|
|
33 |
def prep_dataset():
    """Download the dataset, add metadata-coverage columns and cache it.

    The downloaded dataset is converted to a pandas DataFrame, the
    ``languages``, ``datasets`` and ``tags`` columns are passed through
    ``_clean_tags``, boolean ``has_*`` columns are derived from them, and the
    result is written to ``data.parquet`` in the current working directory.

    Returns:
        The prepared DataFrame (the same data that was written to disk).
    """
    df = download_dataset().to_pandas()

    for column in ("languages", "datasets", "tags"):
        df[column] = df[column].apply(_clean_tags)

    # A row "has" a field when its cleaned value is non-empty.
    df["has_languages"] = df["languages"].apply(len) > 0
    df["has_tags"] = df["tags"].apply(len) > 0
    df["has_dataset"] = df["datasets"].apply(len) > 0

    # notna() treats both None and NaN as missing. The previous code first
    # stored the inverted mask (isnull) and then overwrote it with an
    # `is not None` check, which reports NaN values as present.
    df["has_co2"] = df["co2"].notna()

    df = df.drop(columns=["Unnamed: 0"])
    df.to_parquet("data.parquet")
    return df
|
47 |
|
|
|
49 |
def load_data():
    """Return the prepared DataFrame, building the parquet cache if needed.

    Reads ``data.parquet`` from the current working directory when it exists;
    otherwise falls back to ``prep_dataset()``, which creates the file.
    """
    cache_file = Path("data.parquet")
    if not cache_file.exists():
        return prep_dataset()
    return pd.read_parquet("data.parquet")
|
55 |
|
56 |
|
57 |
+
def filter_df_by_library(filter="transformers"):
    """Return the rows of the loaded data whose ``library`` equals ``filter``.

    A falsy ``filter`` (e.g. ``None`` or ``""``) disables filtering and the
    full DataFrame is returned.
    """
    # NOTE: ``filter`` shadows the builtin; the name is kept because it is
    # part of the function's keyword interface.
    frame = load_data()
    if not filter:
        return frame
    return frame[frame["library"] == filter]
|
60 |
|
61 |
|
62 |
@lru_cache()
|
|
|
69 |
@lru_cache()
def get_all_tags():
    """Return every tag from the ``tags`` column as one flat list.

    The per-row tag collections are chained together in row order. The result
    is memoised by ``lru_cache``, so repeated calls reuse the first result.
    """
    per_row_tags = load_data()["tags"].to_list()
    return list(concat(per_row_tags))
|
74 |
|
75 |
|
|
|
101 |
if not case_sensitive:
|
102 |
tags = (tag.lower() for tag in tags)
|
103 |
tags_frequencies = dict(frequencies(tags))
|
104 |
+
df = pd.DataFrame.from_dict(
|
105 |
+
tags_frequencies, orient="index", columns=["Count"]
|
106 |
+
).sort_values(by="Count", ascending=False)
|
107 |
return df.reset_index()
|
108 |
|
109 |
|
110 |
def tag_frequency_by_library(library_filter):
    """Count how often each tag occurs among models of one library.

    Args:
        library_filter: Library name forwarded to ``filter_df_by_library``.

    Returns:
        A DataFrame with the tag in the ``index`` column and its number of
        occurrences in ``Count``, sorted by ``Count`` in descending order.
    """
    library_df = filter_df_by_library(library_filter)
    counts = dict(frequencies(concat(library_df["tags"])))
    table = pd.DataFrame.from_dict(counts, orient="index", columns=["Count"])
    table = table.sort_values(by="Count", ascending=False)
    return table.reset_index()
|
118 |
|
119 |
|
|
|
123 |
top_libs = df.library.value_counts().head(int(top_n)).index.to_list()
|
124 |
# min_thresh = df.library.value_counts()[:min_number].index.to_list()
|
125 |
df = df[df.library.isin(top_libs)]
|
126 |
+
return (
|
127 |
+
df.groupby("library")["has_text"]
|
128 |
+
.apply(lambda x: np.sum(x) / len(x))
|
129 |
+
.sort_values()
|
130 |
+
.plot.barh()
|
131 |
+
)
|
132 |
|
133 |
|
134 |
def model_card_length_by_library(top_n):
|
|
|
137 |
top_libs = df.library.value_counts().head(int(top_n)).index.to_list()
|
138 |
# min_thresh = df.library.value_counts()[:min_number].index.to_list()
|
139 |
df = df[df.library.isin(top_libs)]
|
140 |
+
return df.groupby("library")["text_length"].describe().round().reset_index()
|
141 |
# df = df.groupby('library')['text_length'].describe().round().reset_index()
|
142 |
# df['library'] = df.library.apply(lambda library: f"[{library}](https://huggingface.co/models?library={library})")
|
143 |
# return df.to_markdown()
|
144 |
|
145 |
+
|
146 |
def metadata_coverage_by_library(metadata_field):
    """Plot the per-library mean of a metadata column as a horizontal bar chart.

    Args:
        metadata_field: Name of the column to average within each library
            (presumably one of the boolean ``has_*`` columns — confirm
            against ``metadata_coverage_columns``).

    Returns:
        The object produced by pandas' ``.plot.barh()``; this module sets the
        pandas plotting backend to ``"plotly"``.
    """
    per_library = load_data().groupby("library")[metadata_field]
    coverage = per_library.mean().sort_values()
    return coverage.plot.barh()
|
149 |
|
150 |
|
151 |
df = load_data()
|
|
|
160 |
with gr.Row():
|
161 |
gr.Markdown("thsh")
|
162 |
with gr.Row():
|
163 |
+
case_sensitive = gr.Checkbox(
|
164 |
+
True,
|
165 |
+
label="Case sensitive",
|
166 |
+
)
|
167 |
mk = gr.Markdown()
|
168 |
case_sensitive.change(get_number_of_tags, [case_sensitive], mk, queue=False)
|
169 |
with gr.Accordion("Tag Frequencies", open=False):
|
170 |
df = gr.Dataframe()
|
171 |
case_sensitive.change(tag_frequency, [case_sensitive], df, queue=False)
|
172 |
with gr.Row():
|
173 |
+
gr.Markdown(
|
174 |
+
f"Number of tags which are case sensitive {len(get_case_sensitive_duplicate_tags())}"
|
175 |
+
)
|
176 |
with gr.Row():
|
177 |
with gr.Accordion("View case sensitive tag pairs", open=False):
|
178 |
gr.Dataframe(display_case_sensitive_duplicate_tags())
|
|
|
180 |
gr.Markdown("Tags by library")
|
181 |
library_choice = gr.Dropdown(choices=libraries, label="select library")
|
182 |
df = gr.Dataframe()
|
183 |
+
library_choice.change(
|
184 |
+
tag_frequency_by_library, [library_choice], df, queue=False
|
185 |
+
)
|
186 |
with gr.Tab("Tag health by library"):
|
187 |
metadata_field = gr.Dropdown(choices=metadata_coverage_columns)
|
188 |
plot = gr.Plot()
|
189 |
+
metadata_field.change(
|
190 |
+
metadata_coverage_by_library, [metadata_field], plot, queue=False
|
191 |
+
)
|
192 |
|
193 |
with gr.Tab("Model Cards"):
|
194 |
+
gr.Markdown(
|
195 |
+
"""Model cards are a key component of metadata for a model. Model cards can include both
|
196 |
information created by a human i.e. outlining the goals behind the creation of the model and information
|
197 |
created by a training framework. This automatically generated information can contain information about
|
198 |
+
number of epochs, learning rate, weight decay etc. """
|
199 |
+
)
|
200 |
+
min_lib_frequency = gr.Slider(
|
201 |
+
minimum=1, maximum=top_n, value=10, label="filter by top n libraries"
|
202 |
+
)
|
203 |
with gr.Column():
|
204 |
plot = gr.Plot()
|
205 |
+
min_lib_frequency.change(
|
206 |
+
has_model_card_by_library, [min_lib_frequency], plot, queue=False
|
207 |
+
)
|
208 |
with gr.Column():
|
209 |
df = gr.Dataframe()
|
210 |
+
min_lib_frequency.change(
|
211 |
+
model_card_length_by_library, [min_lib_frequency], df, queue=False
|
212 |
+
)
|
213 |
|
214 |
demo.launch(debug=True)
|