# Spaces: Running
import json | |
from smolagents import Tool | |
from huggingface_hub import HfApi, hf_hub_download, ModelCard | |
from datasets import Dataset, Features, Value | |
def get_model_ids(pipeline_tag: str) -> list[str]:
    """Return the Hub ids of non-gated transformers models for a pipeline tag.

    Args:
        pipeline_tag: Pipeline tag passed to ``HfApi.list_models`` to filter
            the listing (e.g. ``"object-detection"``).

    Returns:
        The ``id`` attribute of every model produced by the listing, in the
        order the listing yields them.
    """
    listing = HfApi().list_models(
        library=["transformers"],
        pipeline_tag=pipeline_tag,
        gated=False,
        fetch_config=True,
    )
    # The listing is consumed in full while collecting the ids.
    return [entry.id for entry in listing]
def get_model_card(model_id: str) -> str:
    """Return the text of a model's card, or ``""`` when it cannot be loaded.

    Args:
        model_id: Hub repository id of the model.

    Returns:
        The ``text`` attribute of ``ModelCard.load(model_id)``, or an empty
        string if loading the card raises any ``Exception``.
    """
    try:
        return ModelCard.load(model_id).text
    except Exception:
        # Deliberate best-effort: any failure to load the card is reported
        # to the caller as an empty string rather than an exception.
        return ""
def get_model_labels(model_id: str) -> list[str]:
    """Return the lower-cased labels declared in a model's ``config.json``.

    Args:
        model_id: Hub repository id of the model.

    Returns:
        ``str(label).lower()`` for every value of the config's ``id2label``
        mapping. The single-element sentinel ``[""]`` is returned when the
        repository has no ``config.json``, when the file cannot be decoded or
        parsed as JSON, or when it carries no ``id2label`` mapping.
    """
    no_labels = [""]

    if not HfApi().file_exists(model_id, filename="config.json"):
        return no_labels

    config_path = hf_hub_download(model_id, filename="config.json")
    try:
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale encoding, which can mis-decode non-ASCII label names.
        with open(config_path, "r", encoding="utf-8") as f:
            model_config = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError):
        return no_labels

    # Guard against a non-object top-level JSON value and against an
    # ``id2label`` that is null or not a mapping; both would otherwise raise.
    id2label = (
        model_config.get("id2label") if isinstance(model_config, dict) else None
    )
    if not isinstance(id2label, dict):
        return no_labels

    return [str(label).lower() for label in id2label.values()]
def create_dataset(pipeline_tag: str, num_proc: int = 12):
    """Build a dataset of model cards and labels for one pipeline tag.

    Model ids are obtained from ``get_model_ids(pipeline_tag)``. For each id
    the model card text and label list are fetched, and a row is emitted only
    when the model has more than one label and a non-empty card.

    Args:
        pipeline_tag: Pipeline tag used to select the models.
        num_proc: Value forwarded to ``Dataset.from_generator`` as
            ``num_proc``. Defaults to 12, the value that was previously
            hard-coded.

    Returns:
        The object returned by ``Dataset.from_generator`` with string columns
        ``model_id`` and ``model_card`` and a list-of-strings column
        ``model_labels``.
    """

    def dataset_gen(model_ids: list[str]):
        # Yield one row per model that has both a card and several labels.
        for model_id in model_ids:
            model_card = get_model_card(model_id)
            model_labels = get_model_labels(model_id)
            # ``> 1`` also rejects the ``[""]`` sentinel that
            # get_model_labels returns when no labels are available.
            if len(model_labels) > 1 and model_card:
                yield {
                    "model_id": model_id,
                    "model_card": model_card,
                    "model_labels": model_labels,
                }

    model_ids = get_model_ids(pipeline_tag)
    features = Features(
        {
            "model_id": Value("string"),
            "model_card": Value("string"),
            "model_labels": [Value("string")],
        }
    )
    return Dataset.from_generator(
        dataset_gen,
        gen_kwargs={"model_ids": model_ids},
        features=features,
        num_proc=num_proc,
    )
if __name__ == "__main__":
    # Build the dataset for the object-detection task, print its summary,
    # then publish it to the Hub repository below.
    task = "object-detection"
    repo_id = "stevenbucaille/object-detection-models-dataset"
    built = create_dataset(task)
    print(built)
    built.push_to_hub(repo_id)