davanstrien HF Staff committed on
Commit
a159f5a
·
1 Parent(s): 1000f2a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -11
app.py CHANGED
@@ -14,7 +14,11 @@ pd.options.plotting.backend = "plotly"
14
 
15
 
16
  def download_dataset():
17
- return load_dataset("open-source-metrics/model-repos-stats", split="train", ignore_verifications=True)
 
 
 
 
18
 
19
 
20
  def _clean_tags(tags: Optional[Union[str, List[str]]]):
@@ -51,8 +55,8 @@ def prep_dataset():
51
  df["has_dataset"] = df.datasets.apply(len) > 0
52
  df["has_co2"] = df.co2.notnull()
53
  df["has_co2"] = df.co2.apply(lambda x: x is not None)
54
- df['has_license'] = df.license.notnull()
55
- df['is_generated'] = df.tags.apply(_is_generated_from_tag)
56
  df = df.drop(columns=["Unnamed: 0"])
57
  df.to_parquet("data.parquet")
58
  return df
@@ -160,12 +164,50 @@ def metadata_coverage_by_library(metadata_field):
160
  return df.groupby("library")[metadata_field].mean().sort_values().plot.barh()
161
 
162
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  def metadata_coverage_by_autogenerated(metadata_field):
164
  df = load_data()
165
- subset_df = df[df['is_generated']].copy(deep=True)
166
  subset_df.reset_index()
167
- subset_df['autogenerated-from'] = subset_df.tags.apply(_parse_tags_for_generated)
168
- return subset_df.groupby("autogenerated-from")[metadata_field].mean().sort_values().plot.barh()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
 
171
  df = load_data()
@@ -221,11 +263,31 @@ with gr.Blocks() as demo:
221
  metadata_coverage_by_library, [metadata_field], plot, queue=False
222
  )
223
  with gr.Tab("Auto generated model cards"):
224
- metadata_field = gr.Dropdown(choices=metadata_coverage_columns)
225
- plot = gr.Plot()
226
- metadata_field.change(
227
- metadata_coverage_by_autogenerated, [metadata_field], plot, queue=False
 
 
228
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
 
230
  with gr.Tab("Model Cards"):
231
  gr.Markdown(
@@ -249,4 +311,4 @@ with gr.Blocks() as demo:
249
  model_card_length_by_library, [min_lib_frequency], df, queue=False
250
  )
251
 
252
- demo.launch(debug=True)
 
14
 
15
 
16
  def download_dataset():
17
+ return load_dataset(
18
+ "open-source-metrics/model-repos-stats",
19
+ split="train",
20
+ ignore_verifications=True,
21
+ )
22
 
23
 
24
  def _clean_tags(tags: Optional[Union[str, List[str]]]):
 
55
  df["has_dataset"] = df.datasets.apply(len) > 0
56
  df["has_co2"] = df.co2.notnull()
57
  df["has_co2"] = df.co2.apply(lambda x: x is not None)
58
+ df["has_license"] = df.license.notnull()
59
+ df["is_generated"] = df.tags.apply(_is_generated_from_tag)
60
  df = df.drop(columns=["Unnamed: 0"])
61
  df.to_parquet("data.parquet")
62
  return df
 
164
  return df.groupby("library")[metadata_field].mean().sort_values().plot.barh()
165
 
166
 
167
def metatadata_coverage_autogenerated_vs_test():
    """Compare metadata coverage of autogenerated vs. other model cards.

    Groups the frame returned by ``load_data()`` on ``is_generated``, takes the
    mean of every column whose name starts with ``has``, and transposes the
    result so that each row corresponds to one of those columns.

    Returns:
        A DataFrame with a "Metadata/tag field" column and the per-group means
        (rounded to six decimal places) under "From autogenerated" and
        "Not autogenerated".
    """
    # NOTE: the misspelt function name is kept on purpose; callers use it.
    df = load_data()
    # An earlier version also deep-copied the ``is_generated`` rows into a
    # ``subset_df`` that was never read; that dead work has been dropped.
    coverage_columns = [c for c in df.columns if c.startswith("has")]
    coverage = df.groupby("is_generated")[coverage_columns].mean()
    return (
        coverage.transpose()
        .round(6)
        .reset_index()
        .rename(
            columns={
                True: "From autogenerated",
                False: "Not autogenerated",
                "index": "Metadata/tag field",
            }
        )
    )
185
+
186
+
187
  def metadata_coverage_by_autogenerated(metadata_field):
188
  df = load_data()
189
+ subset_df = df[df["is_generated"]].copy(deep=True)
190
  subset_df.reset_index()
191
+ subset_df["autogenerated-from"] = subset_df.tags.apply(_parse_tags_for_generated)
192
+ return (
193
+ subset_df.groupby("autogenerated-from")[metadata_field]
194
+ .mean()
195
+ .sort_values()
196
+ .plot.barh()
197
+ )
198
+
199
+
200
def model_card_length_by_autogenerated():
    """Summarise model card text length per autogeneration source.

    Returns:
        A DataFrame with one row per ``autogenerated-from`` value holding the
        rounded ``describe()`` statistics of ``text_length``.
    """
    df = load_data()
    # Copy the filtered rows so adding a column does not write into a view
    # of the frame returned by ``load_data()``.
    subset_df = df[df["is_generated"]].copy(deep=True)
    # The former bare ``subset_df.reset_index()`` call was removed: its
    # result was discarded, so it never changed ``subset_df``.
    subset_df["autogenerated-from"] = subset_df.tags.apply(_parse_tags_for_generated)
    length_stats = subset_df.groupby("autogenerated-from")["text_length"].describe()
    return length_stats.round().reset_index()
211
 
212
 
213
  df = load_data()
 
263
  metadata_coverage_by_library, [metadata_field], plot, queue=False
264
  )
265
  with gr.Tab("Auto generated model cards"):
266
+ gr.Markdown(
267
+ "Some libraries/training frameworks automatically generate a model card when pushing models to "
268
+ "the hub. The below dataframe compares the metadata coverage across several tags for models "
269
+ "which are pushed with autogenerated model cards compared to those without. **Note** this "
270
+ "breakdown relies on tags with `autogenerated` in them."
271
+ "As a result some model cards might be in the wrong category. "
272
  )
273
+ gr.Dataframe(metatadata_coverage_autogenerated_vs_test())
274
+ with gr.Row():
275
+ metadata_field = gr.Dropdown(choices=metadata_coverage_columns)
276
+ plot = gr.Plot()
277
+ metadata_field.change(
278
+ metadata_coverage_by_autogenerated, [metadata_field], plot, queue=False
279
+ )
280
+ # )
281
+ # with gr.Row():
282
+ #
283
+ # # with gr.Column():
284
+ # # plot = gr.Plot()
285
+ # # min_lib_frequency.change(
286
+ # # model_card_length_by_autogenerated, [min_lib_frequency], plot, queue=False
287
+ # # )
288
+ # with gr.Column():
289
+ # gr.Markdown("Mean length of model card for autogenerated_from * model cards")
290
+ # df = gr.Dataframe(model_card_length_by_autogenerated)
291
 
292
  with gr.Tab("Model Cards"):
293
  gr.Markdown(
 
311
  model_card_length_by_library, [min_lib_frequency], df, queue=False
312
  )
313
 
314
+ demo.launch()