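"""Gradio dashboard exploring metadata coverage for models hosted on the Hugging Face Hub."""
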
from ast import literal_eval
from functools import lru_cache
from itertools import combinations
from pathlib import Path
from typing import List, Optional, Union

import gradio as gr
import numpy as np
import pandas as pd
from cytoolz import concat, frequencies, unique
from datasets import load_dataset

# Use the plotly backend so pandas `.plot` calls return figures that gr.Plot can render
pd.options.plotting.backend = "plotly"


def download_dataset():
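    """Download the `model-repos-stats` dataset from the Hugging Face Hub.

    Note: newer releases of `datasets` replace `ignore_verifications` with
    `verification_mode`, so this assumes an older version of the library.
    """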
    return load_dataset(
        "open-source-metrics/model-repos-stats",
        split="train",
        ignore_verifications=True,
    )


def _clean_tags(tags: Optional[Union[str, List[str]]]):
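    """Parse a raw tags value into a list of string tags, returning [] for malformed input."""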
    try:
        tags = literal_eval(tags)
        if isinstance(tags, str):
            return [tags]
        if isinstance(tags, list):
            return [tag for tag in tags if isinstance(tag, str)]
        return []
    except (ValueError, SyntaxError):
        return []


def _is_generated_from_tag(tags):
    return any("generated" in tag for tag in tags)


def _parse_tags_for_generated(tags):
    for tag in tags:
        if "generated" in tag:
            return tag
    return None


def prep_dataset():
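    """Clean the raw dataset, add boolean `has_*`/`is_generated` columns, and cache to parquet."""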
    ds = download_dataset()
    df = ds.to_pandas()
    df["languages"] = df["languages"].apply(_clean_tags)
    df["datasets"] = df["datasets"].apply(_clean_tags)
    df["tags"] = df["tags"].apply(_clean_tags)
    df["has_languages"] = df.languages.apply(len) > 0
    df["has_tags"] = df.tags.apply(len) > 0
    df["has_dataset"] = df.datasets.apply(len) > 0
    df["has_co2"] = df.co2.notnull()
    df["has_license"] = df.license.notnull()
    df["is_generated"] = df.tags.apply(_is_generated_from_tag)
    df = df.drop(columns=["Unnamed: 0"])
    df.to_parquet("data.parquet")
    return df


def load_data():
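    """Load the cached parquet file if present, otherwise build it with `prep_dataset`."""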
    return (
        pd.read_parquet("data.parquet")
        if Path("data.parquet").exists()
        else prep_dataset()
    )


def filter_df_by_library(library="transformers"):
    df = load_data()
    return df[df["library"] == library] if library else df


@lru_cache()
def get_library_choices(min_freq: int = 50):
    df = load_data()
    library_counts = df.library.value_counts()
    return library_counts[library_counts > min_freq].index.to_list()


@lru_cache()
def get_all_tags():
    df = load_data()
    tags = df["tags"].to_list()
    return list(concat(tags))


@lru_cache()
def get_case_sensitive_duplicate_tags():
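    """Return pairs of distinct tags that are identical once lowercased (O(n^2) in unique tags)."""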
    tags = get_all_tags()
    unique_tags = unique(tags)
    return [
        tag_combo
        for tag_combo in combinations(unique_tags, 2)
        if tag_combo[0].lower() == tag_combo[1].lower()
    ]


def display_case_sensitive_duplicate_tags():
    return pd.DataFrame(get_case_sensitive_duplicate_tags())


def get_number_of_tags(case_sensitive=True):
    tags = set(get_all_tags())
    if case_sensitive:
        return f"Total number of case sensitive tags: {len(tags)}"
    tags = {tag.lower() for tag in tags}
    return f"Total number of case insensitive tags: {len(tags)}"


def tag_frequency(case_sensitive=True):
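    """Count occurrences of every tag across all models, optionally lowercasing first."""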
    tags = get_all_tags()
    if not case_sensitive:
        tags = (tag.lower() for tag in tags)
    tags_frequencies = dict(frequencies(tags))
    df = pd.DataFrame.from_dict(
        tags_frequencies, orient="index", columns=["Count"]
    ).sort_values(by="Count", ascending=False)
    return df.reset_index()


def tag_frequency_by_library(library_filter):
    df = filter_df_by_library(library_filter)
    tags = concat(df["tags"])
    tags = dict(frequencies(tags))
    df = pd.DataFrame.from_dict(tags, orient="index", columns=["Count"]).sort_values(
        by="Count", ascending=False
    )
    return df.reset_index()


def has_model_card_by_library(top_n):
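    """Plot the share of models with a model card (`has_text`) for the `top_n` largest libraries."""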
    df = load_data()
    if top_n:
        top_libs = df.library.value_counts().head(int(top_n)).index.to_list()
        df = df[df.library.isin(top_libs)]
    return (
        df.groupby("library")["has_text"]
        .apply(lambda x: np.sum(x) / len(x))
        .sort_values()
        .plot.barh()
    )


def model_card_length_by_library(top_n):
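    """Summarize model card length (`text_length`) statistics for the `top_n` largest libraries."""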
    df = load_data()
    if top_n:
        top_libs = df.library.value_counts().head(int(top_n)).index.to_list()
        df = df[df.library.isin(top_libs)]
    return df.groupby("library")["text_length"].describe().round().reset_index()


def metadata_coverage_by_library(metadata_field):
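    """Plot the mean coverage of one `has_*` metadata field, grouped by library."""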
    df = load_data()
    return df.groupby("library")[metadata_field].mean().sort_values().plot.barh()


def metadata_coverage_autogenerated_vs_rest():
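    """Compare mean metadata coverage for models with autogenerated model cards against the rest."""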
    df = load_data()
    return (
        df.groupby("is_generated")[[c for c in df.columns if c.startswith("has")]]
        .mean()
        .transpose()
        .round(6)
        .reset_index()
        .rename(
            columns={
                True: "From autogenerated",
                False: "Not autogenerated",
                "index": "Metadata/tag field",
            }
        )
    )


def metadata_coverage_by_autogenerated(metadata_field):
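    """Plot mean coverage of one metadata field, grouped by the model's `generated` tag."""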
    df = load_data()
    subset_df = df[df["is_generated"]].copy(deep=True)
    subset_df["autogenerated-from"] = subset_df.tags.apply(_parse_tags_for_generated)
    return (
        subset_df.groupby("autogenerated-from")[metadata_field]
        .mean()
        .sort_values()
        .plot.barh()
    )


def model_card_length_by_autogenerated():
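    """Summarize model card length statistics, grouped by the model's `generated` tag."""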
    df = load_data()
    subset_df = df[df["is_generated"]].copy(deep=True)
    subset_df["autogenerated-from"] = subset_df.tags.apply(_parse_tags_for_generated)
    return (
        subset_df.groupby("autogenerated-from")["text_length"]
        .describe()
        .round()
        .reset_index()
    )


_ABSTRACT = """
tl;dr: this dashboard provides an overview of the metadata associated with models hosted on the Hugging Face Hub.
\n
Each tab of the dashboard focuses on a different aspect of model metadata on the Hub.
Many of the tabs pay particular attention to metadata coverage for the different libraries represented on the Hub.
"""


df = load_data()
top_n = df.library.value_counts().shape[0]
libraries = [library for library in df.library.unique() if library]
metadata_coverage_columns = [c for c in df.columns if c.startswith("has")]

with gr.Blocks() as demo:
    gr.Markdown("# 🤗 Hub Metadata Explorer")
    gr.Markdown(_ABSTRACT)
    with gr.Tab("Tag frequencies"):
        gr.Markdown(
            "Tags are one of the key ways in which users identify models of interest. This tab provides "
            "some visualizations of tags across *all* models (regardless of library)."
        )
        with gr.Row():
            gr.Markdown(
                "The accordion below allows you to see the top tags for models on the hub (optionally making "
                "tags case insensitive)."
            )
        with gr.Row():
            case_sensitive = gr.Checkbox(
                True,
                label="case sensitive",
            )
            mk = gr.Markdown()
        case_sensitive.change(get_number_of_tags, [case_sensitive], mk, queue=False)
        with gr.Accordion("Tag Frequencies", open=False):
            tag_freq_df = gr.Dataframe()
            case_sensitive.change(
                tag_frequency, [case_sensitive], tag_freq_df, queue=False
            )
        with gr.Row():
            gr.Markdown(
                "Some tags currently appear in both cased and uncased forms, e.g. 'translation' vs. 'Translation'."
            )
        with gr.Row():
            gr.Markdown(
                f"Number of tag pairs which differ only by case: {len(get_case_sensitive_duplicate_tags())}"
            )
        with gr.Row():
            with gr.Accordion("View case sensitive tag pairs", open=False):
                gr.Dataframe(display_case_sensitive_duplicate_tags())
    with gr.Tab("Tag frequencies by library"):
        gr.Markdown(
            "The 🤗 hub hosts models from a wide range of machine learning libraries. These libraries use tags in "
            "slightly different ways. The table below gives a breakdown of the most frequent tags for each library."
        )
        library_choice = gr.Dropdown(choices=libraries, label="select library")
        library_tag_df = gr.Dataframe()
        library_choice.change(
            tag_frequency_by_library, [library_choice], library_tag_df, queue=False
        )
    with gr.Tab("Metadata coverage by library"):
        gr.Markdown(
            "Libraries hosting models on the Hugging Face hub take different approaches to "
            "metadata, i.e. some libraries automatically generate metadata for a model at the end of a "
            "training run. These libraries may also have different types of users who take differing "
            "approaches to creating metadata for models they share on the hub. The chart below allows you to "
            "see which libraries have better coverage for key areas of model metadata."
        )
        metadata_field = gr.Dropdown(choices=metadata_coverage_columns)
        plot = gr.Plot()
        metadata_field.change(
            metadata_coverage_by_library, [metadata_field], plot, queue=False
        )
    with gr.Tab("Auto generated model cards"):
        gr.Markdown(
            "Some libraries/training frameworks automatically generate a model card when pushing models to "
            "the hub. The dataframe below compares metadata coverage across several fields for models "
            "pushed with autogenerated model cards against those pushed without. "
            "**Note**: this breakdown relies on tags containing `generated`, "
            "so some model cards might end up in the wrong category."
        )
        gr.Dataframe(metadata_coverage_autogenerated_vs_rest())
        with gr.Row():
            metadata_field = gr.Dropdown(choices=metadata_coverage_columns)
            plot = gr.Plot()
            metadata_field.change(
                metadata_coverage_by_autogenerated, [metadata_field], plot, queue=False
            )
with gr.Tab("Model Cards"): |
|
gr.Markdown( |
|
"""Model cards are a key component of metadata for a model. Model cards can include both |
|
information created by a human i.e. outlining the goals behind the creation of the model and information |
|
created by a training framework. This automatically generated information can contain information about |
|
number of epochs, learning rate, weight decay etc. """ |
|
) |
|
min_lib_frequency = gr.Slider( |
|
minimum=1, maximum=top_n, value=10, label="filter by top n libraries" |
|
) |
|
with gr.Column(): |
|
plot = gr.Plot() |
|
min_lib_frequency.change( |
|
has_model_card_by_library, [min_lib_frequency], plot, queue=False |
|
) |
|
with gr.Column(): |
|
gr.Markdown("Mean length of model card by library") |
|
df = gr.Dataframe() |
|
min_lib_frequency.change( |
|
model_card_length_by_library, [min_lib_frequency], df, queue=False |
|
) |
|
|
|
demo.launch() |