Spaces:

librarian-bots
/

metadata_explorer

Sleeping

App Files Files Community

davanstrien HF Staff committed on Jan 20, 2023

Commit

bcf8ba9

1 Parent(s): 52415b9

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -40

app.py CHANGED Viewed

@@ -1,16 +1,14 @@
-import contextlib
-import gradio as gr
-import polars as pl
-from functools import lru_cache
-from cytoolz import concat, frequencies, topk
-from datasets import load_dataset
 from ast import literal_eval
-from typing import Union, List, Optional
-import numpy as np
 from itertools import combinations
-from toolz import unique
-import pandas as pd
 from pathlib import Path
 pd.options.plotting.backend = "plotly"
@@ -35,15 +33,15 @@ def _clean_tags(tags: Optional[Union[str, List[str]]]):
 def prep_dataset():
     ds = download_dataset()
     df = ds.to_pandas()
-    df['languages'] = df['languages'].apply(_clean_tags)
-    df['datasets'] = df['datasets'].apply(_clean_tags)
-    df['tags'] = df['tags'].apply(_clean_tags)
-    df['has_languages'] = df.languages.apply(len) > 0
-    df['has_tags'] = df.tags.apply(len) > 0
-    df['has_dataset'] = df.datasets.apply(len) > 0
-    df['has_co2'] = df.co2.isnull()
-    df['has_co2'] = df.co2.apply(lambda x: x is not None)
-    df = df.drop(columns=['Unnamed: 0'])
     df.to_parquet("data.parquet")
     return df
@@ -51,14 +49,14 @@ def prep_dataset():
 def load_data():
     return (
         pd.read_parquet("data.parquet")
-        if Path('data.parquet').exists()
         else prep_dataset()
     )
-def filter_df_by_library(filter='transformers'):
     df = load_data()
-    return df[df['library'] == filter] if filter else df
 @lru_cache()
@@ -71,7 +69,7 @@ def get_library_choices(min_freq: int = 50):
 @lru_cache()
 def get_all_tags():
     df = load_data()
-    tags = df['tags'].to_list()
     return list(concat(tags))
@@ -103,17 +101,19 @@ def tag_frequency(case_sensitive=True):
     if not case_sensitive:
         tags = (tag.lower() for tag in tags)
     tags_frequencies = dict(frequencies(tags))
-    df = pd.DataFrame.from_dict(tags_frequencies, orient='index', columns=['Count']).sort_values(
-        by='Count', ascending=False)
     return df.reset_index()
 def tag_frequency_by_library(library_filter):
     df = filter_df_by_library(library_filter)
-    tags = concat(df['tags'])
     tags = dict(frequencies(tags))
-    df = pd.DataFrame.from_dict(tags, orient='index', columns=['Count']).sort_values(
-        by='Count', ascending=False)
     return df.reset_index()
@@ -123,7 +123,12 @@ def has_model_card_by_library(top_n):
         top_libs = df.library.value_counts().head(int(top_n)).index.to_list()
         # min_thresh = df.library.value_counts()[:min_number].index.to_list()
         df = df[df.library.isin(top_libs)]
-    return df.groupby('library')['has_text'].apply(lambda x: np.sum(x) / len(x)).sort_values().plot.barh()
 def model_card_length_by_library(top_n):
@@ -132,14 +137,15 @@ def model_card_length_by_library(top_n):
         top_libs = df.library.value_counts().head(int(top_n)).index.to_list()
         # min_thresh = df.library.value_counts()[:min_number].index.to_list()
         df = df[df.library.isin(top_libs)]
-    return df.groupby('library')['text_length'].describe().round().reset_index()
     # df = df.groupby('library')['text_length'].describe().round().reset_index()
     # df['library'] = df.library.apply(lambda library: f"[{library}](https://huggingface.co/models?library={library})")
     # return df.to_markdown()
 def metadata_coverage_by_library(metadata_field):
     df = load_data()
-    return df.groupby('library')[metadata_field].mean().sort_values().plot.barh()
 df = load_data()
@@ -154,14 +160,19 @@ with gr.Blocks() as demo:
         with gr.Row():
             gr.Markdown("thsh")
         with gr.Row():
-            case_sensitive = gr.Checkbox(True, label="Case sensitive", )
             mk = gr.Markdown()
             case_sensitive.change(get_number_of_tags, [case_sensitive], mk, queue=False)
         with gr.Accordion("Tag Frequencies", open=False):
             df = gr.Dataframe()
             case_sensitive.change(tag_frequency, [case_sensitive], df, queue=False)
         with gr.Row():
-            gr.Markdown(f"Number of tags which are case sensitive {len(get_case_sensitive_duplicate_tags())}")
         with gr.Row():
             with gr.Accordion("View case sensitive tag pairs", open=False):
                 gr.Dataframe(display_case_sensitive_duplicate_tags())
@@ -169,23 +180,35 @@ with gr.Blocks() as demo:
         gr.Markdown("Tags by library")
         library_choice = gr.Dropdown(choices=libraries, label="select library")
         df = gr.Dataframe()
-        library_choice.change(tag_frequency_by_library, [library_choice], df, queue=False)
     with gr.Tab("Tag health by library"):
         metadata_field = gr.Dropdown(choices=metadata_coverage_columns)
         plot = gr.Plot()
-        metadata_field.change(metadata_coverage_by_library, [metadata_field], plot, queue=False)
     with gr.Tab("Model Cards"):
-        gr.Markdown("""Model cards are a key component of metadata for a model. Model cards can include both
         information created by a human i.e. outlining the goals behind the creation of the model and information
         created by a training framework. This automatically generated information can contain information about
-        number of epochs, learning rate, weight decay etc. """)
-        min_lib_frequency = gr.Slider(minimum=1, maximum=top_n, value=10, label='filter by top n libraries')
         with gr.Column():
             plot = gr.Plot()
-            min_lib_frequency.change(has_model_card_by_library, [min_lib_frequency], plot, queue=False)
         with gr.Column():
             df = gr.Dataframe()
-            min_lib_frequency.change(model_card_length_by_library, [min_lib_frequency], df, queue=False)
 demo.launch(debug=True)

 from ast import literal_eval
+from functools import lru_cache
 from itertools import combinations
 from pathlib import Path
+from typing import List, Optional, Union
+import gradio as gr
+import numpy as np
+import pandas as pd
+from cytoolz import concat, frequencies, topk, unique
+from datasets import load_dataset
 pd.options.plotting.backend = "plotly"
 def prep_dataset():
     ds = download_dataset()
     df = ds.to_pandas()
+    df["languages"] = df["languages"].apply(_clean_tags)
+    df["datasets"] = df["datasets"].apply(_clean_tags)
+    df["tags"] = df["tags"].apply(_clean_tags)
+    df["has_languages"] = df.languages.apply(len) > 0
+    df["has_tags"] = df.tags.apply(len) > 0
+    df["has_dataset"] = df.datasets.apply(len) > 0
+    df["has_co2"] = df.co2.isnull()
+    df["has_co2"] = df.co2.apply(lambda x: x is not None)
+    df = df.drop(columns=["Unnamed: 0"])
     df.to_parquet("data.parquet")
     return df
 def load_data():
     return (
         pd.read_parquet("data.parquet")
+        if Path("data.parquet").exists()
         else prep_dataset()
     )
+def filter_df_by_library(filter="transformers"):
     df = load_data()
+    return df[df["library"] == filter] if filter else df
 @lru_cache()
 @lru_cache()
 def get_all_tags():
     df = load_data()
+    tags = df["tags"].to_list()
     return list(concat(tags))
     if not case_sensitive:
         tags = (tag.lower() for tag in tags)
     tags_frequencies = dict(frequencies(tags))
+    df = pd.DataFrame.from_dict(
+        tags_frequencies, orient="index", columns=["Count"]
+    ).sort_values(by="Count", ascending=False)
     return df.reset_index()
 def tag_frequency_by_library(library_filter):
     df = filter_df_by_library(library_filter)
+    tags = concat(df["tags"])
     tags = dict(frequencies(tags))
+    df = pd.DataFrame.from_dict(tags, orient="index", columns=["Count"]).sort_values(
+        by="Count", ascending=False
+    )
     return df.reset_index()
         top_libs = df.library.value_counts().head(int(top_n)).index.to_list()
         # min_thresh = df.library.value_counts()[:min_number].index.to_list()
         df = df[df.library.isin(top_libs)]
+    return (
+        df.groupby("library")["has_text"]
+        .apply(lambda x: np.sum(x) / len(x))
+        .sort_values()
+        .plot.barh()
+    )
 def model_card_length_by_library(top_n):
         top_libs = df.library.value_counts().head(int(top_n)).index.to_list()
         # min_thresh = df.library.value_counts()[:min_number].index.to_list()
         df = df[df.library.isin(top_libs)]
+    return df.groupby("library")["text_length"].describe().round().reset_index()
     # df = df.groupby('library')['text_length'].describe().round().reset_index()
     # df['library'] = df.library.apply(lambda library: f"[{library}](https://huggingface.co/models?library={library})")
     # return df.to_markdown()
 def metadata_coverage_by_library(metadata_field):
     df = load_data()
+    return df.groupby("library")[metadata_field].mean().sort_values().plot.barh()
 df = load_data()
         with gr.Row():
             gr.Markdown("thsh")
         with gr.Row():
+            case_sensitive = gr.Checkbox(
+                True,
+                label="Case sensitive",
+            )
             mk = gr.Markdown()
             case_sensitive.change(get_number_of_tags, [case_sensitive], mk, queue=False)
         with gr.Accordion("Tag Frequencies", open=False):
             df = gr.Dataframe()
             case_sensitive.change(tag_frequency, [case_sensitive], df, queue=False)
         with gr.Row():
+            gr.Markdown(
+                f"Number of tags which are case sensitive {len(get_case_sensitive_duplicate_tags())}"
+            )
         with gr.Row():
             with gr.Accordion("View case sensitive tag pairs", open=False):
                 gr.Dataframe(display_case_sensitive_duplicate_tags())
         gr.Markdown("Tags by library")
         library_choice = gr.Dropdown(choices=libraries, label="select library")
         df = gr.Dataframe()
+        library_choice.change(
+            tag_frequency_by_library, [library_choice], df, queue=False
+        )
     with gr.Tab("Tag health by library"):
         metadata_field = gr.Dropdown(choices=metadata_coverage_columns)
         plot = gr.Plot()
+        metadata_field.change(
+            metadata_coverage_by_library, [metadata_field], plot, queue=False
+        )
     with gr.Tab("Model Cards"):
+        gr.Markdown(
+            """Model cards are a key component of metadata for a model. Model cards can include both
         information created by a human i.e. outlining the goals behind the creation of the model and information
         created by a training framework. This automatically generated information can contain information about
+        number of epochs, learning rate, weight decay etc. """
+        )
+        min_lib_frequency = gr.Slider(
+            minimum=1, maximum=top_n, value=10, label="filter by top n libraries"
+        )
         with gr.Column():
             plot = gr.Plot()
+            min_lib_frequency.change(
+                has_model_card_by_library, [min_lib_frequency], plot, queue=False
+            )
         with gr.Column():
             df = gr.Dataframe()
+            min_lib_frequency.change(
+                model_card_length_by_library, [min_lib_frequency], df, queue=False
+            )
 demo.launch(debug=True)