Commit
·
a159f5a
1
Parent(s):
1000f2a
Update app.py
Browse files
app.py
CHANGED
@@ -14,7 +14,11 @@ pd.options.plotting.backend = "plotly"
|
|
14 |
|
15 |
|
16 |
def download_dataset():
|
17 |
-
return load_dataset(
|
|
|
|
|
|
|
|
|
18 |
|
19 |
|
20 |
def _clean_tags(tags: Optional[Union[str, List[str]]]):
|
@@ -51,8 +55,8 @@ def prep_dataset():
|
|
51 |
df["has_dataset"] = df.datasets.apply(len) > 0
|
52 |
df["has_co2"] = df.co2.notnull()
|
53 |
df["has_co2"] = df.co2.apply(lambda x: x is not None)
|
54 |
-
df[
|
55 |
-
df[
|
56 |
df = df.drop(columns=["Unnamed: 0"])
|
57 |
df.to_parquet("data.parquet")
|
58 |
return df
|
@@ -160,12 +164,50 @@ def metadata_coverage_by_library(metadata_field):
|
|
160 |
return df.groupby("library")[metadata_field].mean().sort_values().plot.barh()
|
161 |
|
162 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
def metadata_coverage_by_autogenerated(metadata_field):
|
164 |
df = load_data()
|
165 |
-
subset_df = df[df[
|
166 |
subset_df.reset_index()
|
167 |
-
subset_df[
|
168 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
169 |
|
170 |
|
171 |
df = load_data()
|
@@ -221,11 +263,31 @@ with gr.Blocks() as demo:
|
|
221 |
metadata_coverage_by_library, [metadata_field], plot, queue=False
|
222 |
)
|
223 |
with gr.Tab("Auto generated model cards"):
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
|
|
|
|
228 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
229 |
|
230 |
with gr.Tab("Model Cards"):
|
231 |
gr.Markdown(
|
@@ -249,4 +311,4 @@ with gr.Blocks() as demo:
|
|
249 |
model_card_length_by_library, [min_lib_frequency], df, queue=False
|
250 |
)
|
251 |
|
252 |
-
demo.launch(
|
|
|
14 |
|
15 |
|
16 |
def download_dataset():
    """Load the ``train`` split of ``open-source-metrics/model-repos-stats``.

    Delegates to ``load_dataset`` and returns whatever it returns.
    ``ignore_verifications=True`` is forwarded unchanged -- presumably to skip
    the loader's integrity checks; confirm against the ``datasets`` version
    this app is pinned to.
    """
    loader_options = {"split": "train", "ignore_verifications": True}
    return load_dataset("open-source-metrics/model-repos-stats", **loader_options)
|
22 |
|
23 |
|
24 |
def _clean_tags(tags: Optional[Union[str, List[str]]]):
|
|
|
55 |
df["has_dataset"] = df.datasets.apply(len) > 0
|
56 |
df["has_co2"] = df.co2.notnull()
|
57 |
df["has_co2"] = df.co2.apply(lambda x: x is not None)
|
58 |
+
df["has_license"] = df.license.notnull()
|
59 |
+
df["is_generated"] = df.tags.apply(_is_generated_from_tag)
|
60 |
df = df.drop(columns=["Unnamed: 0"])
|
61 |
df.to_parquet("data.parquet")
|
62 |
return df
|
|
|
164 |
return df.groupby("library")[metadata_field].mean().sort_values().plot.barh()
|
165 |
|
166 |
|
167 |
+
def metatadata_coverage_autogenerated_vs_test():
    """Tabulate mean metadata coverage for autogenerated vs. other model cards.

    Loads the prepared frame via ``load_data()``, groups it by the
    ``is_generated`` column and averages every column whose name starts with
    ``"has"``. The result is transposed so that each ``has*`` field becomes a
    row.

    Returns:
        A DataFrame with a "Metadata/tag field" column holding the ``has*``
        column names, plus a "From autogenerated" and/or "Not autogenerated"
        column (one per ``is_generated`` group present), rounded to 6
        decimal places.
    """
    df = load_data()
    # The previous version also built a deep copy of the autogenerated rows
    # (and called reset_index() on it without keeping the result); that copy
    # was never read, so it has been dropped.
    coverage_columns = [c for c in df.columns if c.startswith("has")]
    return (
        df.groupby("is_generated")[coverage_columns]
        .mean()
        .transpose()
        .round(6)
        # After transpose the former column names form an unnamed index, so
        # reset_index() exposes them under the default label "index".
        .reset_index()
        .rename(
            columns={
                True: "From autogenerated",
                False: "Not autogenerated",
                "index": "Metadata/tag field",
            }
        )
    )
|
185 |
+
|
186 |
+
|
187 |
def metadata_coverage_by_autogenerated(metadata_field):
    """Plot mean coverage of one metadata field per autogeneration source.

    Args:
        metadata_field: Name of the column to average within each
            "autogenerated-from" group.

    Returns:
        The object produced by ``.plot.barh()`` on the sorted per-group means
        (its concrete type depends on the configured pandas plotting
        backend).
    """
    df = load_data()
    # Deep copy so that adding the helper column below never writes into the
    # frame handed back by load_data().
    subset_df = df[df["is_generated"]].copy(deep=True)
    # NOTE: a bare ``subset_df.reset_index()`` used to follow here. It returns
    # a new frame rather than modifying in place, so the discarded call had no
    # effect and was removed.
    subset_df["autogenerated-from"] = subset_df.tags.apply(_parse_tags_for_generated)
    return (
        subset_df.groupby("autogenerated-from")[metadata_field]
        .mean()
        .sort_values()
        .plot.barh()
    )
|
198 |
+
|
199 |
+
|
200 |
+
def model_card_length_by_autogenerated():
    """Summarise model card text length per autogeneration source.

    Restricts the data from ``load_data()`` to rows where ``is_generated`` is
    true, labels each row with ``_parse_tags_for_generated(tags)`` and runs
    ``describe()`` on ``text_length`` within each label.

    Returns:
        A DataFrame with one row per "autogenerated-from" value and the
        ``describe()`` statistics of ``text_length``, rounded to whole
        numbers, with the group label as a regular column.
    """
    df = load_data()
    # Deep copy so that adding the helper column below never writes into the
    # frame handed back by load_data().
    subset_df = df[df["is_generated"]].copy(deep=True)
    # NOTE: a bare ``subset_df.reset_index()`` used to follow here. It returns
    # a new frame rather than modifying in place, so the discarded call had no
    # effect and was removed.
    subset_df["autogenerated-from"] = subset_df.tags.apply(_parse_tags_for_generated)
    return (
        subset_df.groupby("autogenerated-from")["text_length"]
        .describe()
        .round()
        .reset_index()
    )
|
211 |
|
212 |
|
213 |
df = load_data()
|
|
|
263 |
metadata_coverage_by_library, [metadata_field], plot, queue=False
|
264 |
)
|
265 |
with gr.Tab("Auto generated model cards"):
|
266 |
+
gr.Markdown(
    "Some libraries/training frameworks automatically generate a model card when pushing models to "
    "the hub. The below dataframe compares the metadata coverage across several tags for models "
    "which are pushed with autogenerated model cards compared to those without. **Note** this "
    # Adjacent literals are concatenated verbatim: the trailing space keeps
    # this sentence from running into the next one ("in them.As a result").
    "breakdown relies on tags with `autogenerated` in them. "
    "As a result some model cards might be in the wrong category. "
)
|
273 |
+
gr.Dataframe(metatadata_coverage_autogenerated_vs_test())
|
274 |
+
with gr.Row():
|
275 |
+
metadata_field = gr.Dropdown(choices=metadata_coverage_columns)
|
276 |
+
plot = gr.Plot()
|
277 |
+
metadata_field.change(
|
278 |
+
metadata_coverage_by_autogenerated, [metadata_field], plot, queue=False
|
279 |
+
)
|
280 |
+
# )
|
281 |
+
# with gr.Row():
|
282 |
+
#
|
283 |
+
# # with gr.Column():
|
284 |
+
# # plot = gr.Plot()
|
285 |
+
# # min_lib_frequency.change(
|
286 |
+
# # model_card_length_by_autogenerated, [min_lib_frequency], plot, queue=False
|
287 |
+
# # )
|
288 |
+
# with gr.Column():
|
289 |
+
# gr.Markdown("Mean length of model card for autogenerated_from * model cards")
|
290 |
+
# df = gr.Dataframe(model_card_length_by_autogenerated)
|
291 |
|
292 |
with gr.Tab("Model Cards"):
|
293 |
gr.Markdown(
|
|
|
311 |
model_card_length_by_library, [min_lib_frequency], df, queue=False
|
312 |
)
|
313 |
|
314 |
+
demo.launch()
|