ScouterAI / rag /create_instance_segmentation_dataset.py
stevenbucaille's picture
Enhance app.py with improved user interface and instructions, update model ID in llm.py, and add image classification capabilities across various components. Introduce segment anything functionality and refine README for clarity on model capabilities.
518d841
import json
from datasets import Dataset, Features, Value
from huggingface_hub import HfApi, ModelCard, hf_hub_download
def get_model_ids(pipeline_tag: str) -> list[str]:
    """Return the ids of Hub models matching ``pipeline_tag``.

    The Hub query is restricted to non-gated models that list the
    ``transformers`` library.

    Args:
        pipeline_tag: Pipeline tag passed to the Hub query
            (the script entry point uses ``"image-segmentation"``).

    Returns:
        The ``id`` attribute of every model returned by the query.
    """
    api = HfApi()
    # The query result is consumed in full here so callers get a plain list.
    return [
        model_info.id
        for model_info in api.list_models(
            library=["transformers"],
            pipeline_tag=pipeline_tag,
            gated=False,
            fetch_config=True,
        )
    ]
def get_model_card(model_id: str) -> str:
    """Return the text of the model card for ``model_id``.

    Loading is best-effort: if anything goes wrong while loading the card
    or reading its text, an empty string is returned instead of raising.

    Args:
        model_id: Hub repository id of the model.

    Returns:
        The card text, or ``""`` when the card could not be loaded.
    """
    try:
        card = ModelCard.load(model_id)
        card_text = card.text
    except Exception:
        # Deliberate catch-all: a missing or broken card just means "no text".
        card_text = ""
    return card_text
def get_model_config(model_id: str) -> dict | None:
    """Download and parse the ``config.json`` of a Hub model.

    Args:
        model_id: Hub repository id of the model.

    Returns:
        The parsed content of ``config.json``, or ``None`` when the
        repository has no ``config.json`` or the file cannot be parsed.
    """
    hf_api = HfApi()
    if not hf_api.file_exists(model_id, filename="config.json"):
        return None

    config_path = hf_hub_download(model_id, filename="config.json")
    try:
        # Read as UTF-8 explicitly so parsing does not depend on the
        # platform's default encoding.
        with open(config_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError):
        # A malformed config is treated like a missing one, so a single
        # broken repository cannot abort a scan over many models.
        return None
def get_model_labels(model_id: str) -> list[str]:
    """Return the lower-cased class labels declared in a model's config.

    Labels are read from the ``id2label`` mapping of the repository's
    ``config.json``. Every label is converted with ``str`` and lower-cased.

    Args:
        model_id: Hub repository id of the model.

    Returns:
        The list of labels. The single-element sentinel ``[""]`` is returned
        when the repository has no ``config.json``, when the file cannot be
        parsed, or when it holds no usable ``id2label`` mapping.
    """
    hf_api = HfApi()
    if not hf_api.file_exists(model_id, filename="config.json"):
        return [""]

    config_path = hf_hub_download(model_id, filename="config.json")
    try:
        # Read as UTF-8 explicitly so parsing does not depend on the
        # platform's default encoding.
        with open(config_path, "r", encoding="utf-8") as f:
            model_config = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError):
        return [""]

    # Valid JSON is not necessarily an object, and ``id2label`` may be null
    # or some other non-mapping value; treat all of those as "no labels"
    # instead of crashing on ``.values()``.
    if not isinstance(model_config, dict):
        return [""]
    id2label = model_config.get("id2label")
    if not isinstance(id2label, dict):
        return [""]

    return [str(label).lower() for label in id2label.values()]
def create_dataset(pipeline_tag: str, required_architectures: list[str] | None = None):
    """Build a dataset describing Hub models with a matching architecture.

    Every model returned by ``get_model_ids(pipeline_tag)`` is kept when:

    * its ``config.json`` lists at least one architecture name containing
      one of ``required_architectures`` as a substring,
    * ``get_model_labels`` returns more than one label, and
    * its model card text is not empty.

    Args:
        pipeline_tag: Pipeline tag used to list candidate models.
        required_architectures: Substrings to look for in the architecture
            names of each model config. With ``None`` or an empty list no
            model matches, so the resulting dataset has no rows.

    Returns:
        A ``datasets.Dataset`` with the string columns ``model_id`` and
        ``model_card`` and the list-of-strings column ``model_labels``.
    """
    # ``None`` replaces a mutable default argument; copy so the generator
    # never shares state with the caller's list.
    wanted_architectures = list(required_architectures or [])

    def dataset_gen(model_ids: list[str]):
        for model_id in model_ids:
            try:
                model_config = get_model_config(model_id)
            except json.JSONDecodeError:
                # Skip repos whose config.json cannot be parsed rather than
                # aborting the whole build.
                continue
            if not isinstance(model_config, dict):
                continue

            # Configs may omit "architectures" or set it to null.
            architectures = model_config.get("architectures")
            if not isinstance(architectures, list):
                continue
            if not any(
                required_architecture in architecture
                for architecture in architectures
                if isinstance(architecture, str)
                for required_architecture in wanted_architectures
            ):
                continue

            model_labels = get_model_labels(model_id)
            if len(model_labels) <= 1:
                continue

            # The card is only fetched for models that passed every other
            # filter, which avoids a Hub request for each rejected model.
            model_card = get_model_card(model_id)
            if len(model_card) == 0:
                continue

            print(f"Found instance segmentation model: {model_id}")
            yield {
                "model_id": model_id,
                "model_card": model_card,
                "model_labels": model_labels,
            }

    model_ids = get_model_ids(pipeline_tag)
    dataset = Dataset.from_generator(
        dataset_gen,
        gen_kwargs={"model_ids": model_ids},
        features=Features(
            {
                "model_id": Value("string"),
                "model_card": Value("string"),
                "model_labels": [Value("string")],
            }
        ),
        num_proc=12,
    )
    return dataset
if __name__ == "__main__":
    # Script entry point: build the dataset for the "image-segmentation"
    # pipeline tag, print it, and push it to the Hub.
    instance_segmentation_dataset = create_dataset(
        pipeline_tag="image-segmentation",
        required_architectures=["InstanceSegmentationModel"],
    )
    print(instance_segmentation_dataset)
    instance_segmentation_dataset.push_to_hub("stevenbucaille/instance-segmentation-models")