davanstrien (HF Staff) committed on
Commit bcf8ba9 · 1 Parent(s): 52415b9

Update app.py

Files changed (1):
  1. app.py +63 -40
app.py CHANGED
@@ -1,16 +1,14 @@
-import contextlib
-import gradio as gr
-import polars as pl
-from functools import lru_cache
-from cytoolz import concat, frequencies, topk
-from datasets import load_dataset
 from ast import literal_eval
-from typing import Union, List, Optional
-import numpy as np
+from functools import lru_cache
 from itertools import combinations
-from toolz import unique
-import pandas as pd
 from pathlib import Path
+from typing import List, Optional, Union
+
+import gradio as gr
+import numpy as np
+import pandas as pd
+from cytoolz import concat, frequencies, topk, unique
+from datasets import load_dataset
 
 pd.options.plotting.backend = "plotly"
 
@@ -35,15 +33,15 @@ def _clean_tags(tags: Optional[Union[str, List[str]]]):
 def prep_dataset():
     ds = download_dataset()
     df = ds.to_pandas()
-    df['languages'] = df['languages'].apply(_clean_tags)
-    df['datasets'] = df['datasets'].apply(_clean_tags)
-    df['tags'] = df['tags'].apply(_clean_tags)
-    df['has_languages'] = df.languages.apply(len) > 0
-    df['has_tags'] = df.tags.apply(len) > 0
-    df['has_dataset'] = df.datasets.apply(len) > 0
-    df['has_co2'] = df.co2.isnull()
-    df['has_co2'] = df.co2.apply(lambda x: x is not None)
-    df = df.drop(columns=['Unnamed: 0'])
+    df["languages"] = df["languages"].apply(_clean_tags)
+    df["datasets"] = df["datasets"].apply(_clean_tags)
+    df["tags"] = df["tags"].apply(_clean_tags)
+    df["has_languages"] = df.languages.apply(len) > 0
+    df["has_tags"] = df.tags.apply(len) > 0
+    df["has_dataset"] = df.datasets.apply(len) > 0
+    df["has_co2"] = df.co2.isnull()
+    df["has_co2"] = df.co2.apply(lambda x: x is not None)
+    df = df.drop(columns=["Unnamed: 0"])
     df.to_parquet("data.parquet")
     return df
 
@@ -51,14 +49,14 @@ def prep_dataset():
 def load_data():
     return (
         pd.read_parquet("data.parquet")
-        if Path('data.parquet').exists()
+        if Path("data.parquet").exists()
         else prep_dataset()
     )
 
 
-def filter_df_by_library(filter='transformers'):
+def filter_df_by_library(filter="transformers"):
     df = load_data()
-    return df[df['library'] == filter] if filter else df
+    return df[df["library"] == filter] if filter else df
 
 
 @lru_cache()
@@ -71,7 +69,7 @@ def get_library_choices(min_freq: int = 50):
 @lru_cache()
 def get_all_tags():
     df = load_data()
-    tags = df['tags'].to_list()
+    tags = df["tags"].to_list()
     return list(concat(tags))
 
 
@@ -103,17 +101,19 @@ def tag_frequency(case_sensitive=True):
     if not case_sensitive:
         tags = (tag.lower() for tag in tags)
     tags_frequencies = dict(frequencies(tags))
-    df = pd.DataFrame.from_dict(tags_frequencies, orient='index', columns=['Count']).sort_values(
-        by='Count', ascending=False)
+    df = pd.DataFrame.from_dict(
+        tags_frequencies, orient="index", columns=["Count"]
+    ).sort_values(by="Count", ascending=False)
     return df.reset_index()
 
 
 def tag_frequency_by_library(library_filter):
     df = filter_df_by_library(library_filter)
-    tags = concat(df['tags'])
+    tags = concat(df["tags"])
     tags = dict(frequencies(tags))
-    df = pd.DataFrame.from_dict(tags, orient='index', columns=['Count']).sort_values(
-        by='Count', ascending=False)
+    df = pd.DataFrame.from_dict(tags, orient="index", columns=["Count"]).sort_values(
+        by="Count", ascending=False
+    )
     return df.reset_index()
 
 
@@ -123,7 +123,12 @@ def has_model_card_by_library(top_n):
     top_libs = df.library.value_counts().head(int(top_n)).index.to_list()
     # min_thresh = df.library.value_counts()[:min_number].index.to_list()
     df = df[df.library.isin(top_libs)]
-    return df.groupby('library')['has_text'].apply(lambda x: np.sum(x) / len(x)).sort_values().plot.barh()
+    return (
+        df.groupby("library")["has_text"]
+        .apply(lambda x: np.sum(x) / len(x))
+        .sort_values()
+        .plot.barh()
+    )
 
 
 def model_card_length_by_library(top_n):
@@ -132,14 +137,15 @@ def model_card_length_by_library(top_n):
     top_libs = df.library.value_counts().head(int(top_n)).index.to_list()
    # min_thresh = df.library.value_counts()[:min_number].index.to_list()
     df = df[df.library.isin(top_libs)]
-    return df.groupby('library')['text_length'].describe().round().reset_index()
+    return df.groupby("library")["text_length"].describe().round().reset_index()
     # df = df.groupby('library')['text_length'].describe().round().reset_index()
     # df['library'] = df.library.apply(lambda library: f"[{library}](https://huggingface.co/models?library={library})")
     # return df.to_markdown()
 
+
 def metadata_coverage_by_library(metadata_field):
     df = load_data()
-    return df.groupby('library')[metadata_field].mean().sort_values().plot.barh()
+    return df.groupby("library")[metadata_field].mean().sort_values().plot.barh()
 
 
 df = load_data()
@@ -154,14 +160,19 @@ with gr.Blocks() as demo:
     with gr.Row():
         gr.Markdown("thsh")
     with gr.Row():
-        case_sensitive = gr.Checkbox(True, label="Case sensitive", )
+        case_sensitive = gr.Checkbox(
+            True,
+            label="Case sensitive",
+        )
         mk = gr.Markdown()
         case_sensitive.change(get_number_of_tags, [case_sensitive], mk, queue=False)
     with gr.Accordion("Tag Frequencies", open=False):
         df = gr.Dataframe()
         case_sensitive.change(tag_frequency, [case_sensitive], df, queue=False)
     with gr.Row():
-        gr.Markdown(f"Number of tags which are case sensitive {len(get_case_sensitive_duplicate_tags())}")
+        gr.Markdown(
+            f"Number of tags which are case sensitive {len(get_case_sensitive_duplicate_tags())}"
+        )
     with gr.Row():
         with gr.Accordion("View case sensitive tag pairs", open=False):
             gr.Dataframe(display_case_sensitive_duplicate_tags())
@@ -169,23 +180,35 @@
         gr.Markdown("Tags by library")
         library_choice = gr.Dropdown(choices=libraries, label="select library")
         df = gr.Dataframe()
-        library_choice.change(tag_frequency_by_library, [library_choice], df, queue=False)
+        library_choice.change(
+            tag_frequency_by_library, [library_choice], df, queue=False
+        )
     with gr.Tab("Tag health by library"):
         metadata_field = gr.Dropdown(choices=metadata_coverage_columns)
         plot = gr.Plot()
-        metadata_field.change(metadata_coverage_by_library, [metadata_field], plot, queue=False)
+        metadata_field.change(
+            metadata_coverage_by_library, [metadata_field], plot, queue=False
+        )
 
     with gr.Tab("Model Cards"):
-        gr.Markdown("""Model cards are a key component of metadata for a model. Model cards can include both
+        gr.Markdown(
+            """Model cards are a key component of metadata for a model. Model cards can include both
         information created by a human i.e. outlining the goals behind the creation of the model and information
         created by a training framework. This automatically generated information can contain information about
-        number of epochs, learning rate, weight decay etc. """)
-        min_lib_frequency = gr.Slider(minimum=1, maximum=top_n, value=10, label='filter by top n libraries')
+        number of epochs, learning rate, weight decay etc. """
+        )
+        min_lib_frequency = gr.Slider(
+            minimum=1, maximum=top_n, value=10, label="filter by top n libraries"
+        )
        with gr.Column():
             plot = gr.Plot()
-            min_lib_frequency.change(has_model_card_by_library, [min_lib_frequency], plot, queue=False)
+            min_lib_frequency.change(
+                has_model_card_by_library, [min_lib_frequency], plot, queue=False
+            )
         with gr.Column():
             df = gr.Dataframe()
-            min_lib_frequency.change(model_card_length_by_library, [min_lib_frequency], df, queue=False)
+            min_lib_frequency.change(
+                model_card_length_by_library, [min_lib_frequency], df, queue=False
+            )
 
 demo.launch(debug=True)