ScouterAI / rag /create_dataset.py
stevenbucaille's picture
Add initial project structure with core functionality for image processing agents
7e327f2
raw
history blame
2.4 kB
import json
from smolagents import Tool
from huggingface_hub import HfApi, hf_hub_download, ModelCard
from datasets import Dataset, Features, Value
def get_model_ids(pipeline_tag: str) -> list[str]:
    """Return the Hub IDs of the non-gated ``transformers`` models for a task.

    Args:
        pipeline_tag: Pipeline tag used to filter the Hub listing
            (the script below passes ``"object-detection"``).

    Returns:
        The ``id`` attribute of every listed model, in listing order.
    """
    listing = HfApi().list_models(
        library=["transformers"],
        pipeline_tag=pipeline_tag,
        gated=False,
        fetch_config=True,
    )
    # The listing is consumed exactly once while collecting the identifiers.
    return [entry.id for entry in listing]
def get_model_card(model_id: str) -> str:
    """Return the text of a model's card, or ``""`` when it cannot be loaded.

    Loading is deliberately best-effort: a model whose card is missing or
    cannot be loaded must not abort the whole dataset build, so any failure
    is reported to the caller as an empty string.

    Args:
        model_id: Hub repository ID of the model.

    Returns:
        The ``text`` attribute of the loaded ``ModelCard``, or ``""`` if
        loading raised.
    """
    # Imported locally so this function does not depend on a file-level
    # ``import logging``.
    import logging

    try:
        return ModelCard.load(model_id).text
    except Exception:
        # Keep the failure discoverable (at DEBUG, with traceback) instead of
        # discarding it silently, while preserving the "" fallback.
        logging.getLogger(__name__).debug(
            "Could not load model card for %s", model_id, exc_info=True
        )
        return ""
def get_model_labels(model_id: str) -> list[str]:
    """Return the lower-cased labels declared in a model's ``config.json``.

    Args:
        model_id: Hub repository ID of the model.

    Returns:
        The values of the config's ``id2label`` mapping, each converted with
        ``str`` and lower-cased. The single-element sentinel ``[""]`` is
        returned when the repository has no ``config.json``, when the file
        cannot be decoded or parsed as JSON, or when it does not contain an
        ``id2label`` mapping.
    """
    no_labels = [""]

    if not HfApi().file_exists(model_id, filename="config.json"):
        return no_labels

    config_path = hf_hub_download(model_id, filename="config.json")
    try:
        # Decode as UTF-8 explicitly so reading does not depend on the
        # platform's default encoding.
        with open(config_path, "r", encoding="utf-8") as f:
            model_config = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError):
        return no_labels

    # A config whose top level is not an object (e.g. ``null`` or a number)
    # cannot carry an ``id2label`` entry.
    if not isinstance(model_config, dict):
        return no_labels

    id2label = model_config.get("id2label")
    # Covers both a missing key and a non-mapping value such as ``null``,
    # which would otherwise fail on ``.values()``.
    if not isinstance(id2label, dict):
        return no_labels

    return [str(label).lower() for label in id2label.values()]
def create_dataset(pipeline_tag: str, num_proc: int = 12) -> "Dataset":
    """Build a dataset of model cards and labels for one pipeline tag.

    Every model ID returned by ``get_model_ids(pipeline_tag)`` is looked up;
    a row is emitted only for models that have a non-empty model card and
    more than one label.

    Args:
        pipeline_tag: Pipeline tag used to select the models.
        num_proc: Value forwarded as ``num_proc`` to
            ``Dataset.from_generator``. Defaults to 12, the value that was
            previously hard-coded.

    Returns:
        The dataset produced by ``Dataset.from_generator`` with string
        columns ``model_id`` and ``model_card`` and a list-of-strings column
        ``model_labels``.
    """

    def dataset_gen(model_ids: list[str]):
        """Yield one row per model that has both a card and several labels."""
        for model_id in model_ids:
            model_card = get_model_card(model_id)
            if not model_card:
                # Without a card the model is dropped regardless of its
                # labels, so skip the config lookup entirely.
                continue

            model_labels = get_model_labels(model_id)
            # ``> 1`` rejects the one-element ``[""]`` sentinel that
            # get_model_labels returns when no labels are available; it also
            # drops models that declare a single label.
            if len(model_labels) <= 1:
                continue

            yield {
                "model_id": model_id,
                "model_card": model_card,
                "model_labels": model_labels,
            }

    model_ids = get_model_ids(pipeline_tag)
    dataset = Dataset.from_generator(
        dataset_gen,
        gen_kwargs={"model_ids": model_ids},
        features=Features(
            {
                "model_id": Value("string"),
                "model_card": Value("string"),
                "model_labels": [Value("string")],
            }
        ),
        num_proc=num_proc,
    )
    return dataset
if __name__ == "__main__":
    # Build the object-detection model dataset, print its summary, and
    # publish it to the Hub repository named below.
    hub_repo_id = "stevenbucaille/object-detection-models-dataset"
    detection_dataset = create_dataset("object-detection")
    print(detection_dataset)
    detection_dataset.push_to_hub(hub_repo_id)