import os
from datetime import datetime
from functools import lru_cache
from typing import List, Optional

from datasets import Dataset
from huggingface_hub import list_models
from pydantic import BaseModel, field_validator
from toolz import groupby
from tqdm.auto import tqdm

# Token used to push the resulting dataset to the Hub (read from the environment).
HF_TOKEN = os.environ.get("HF_TOKEN")
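

# List every model on the Hub (with card metadata), most-downloaded first.
# Cached with lru_cache so repeated calls within one run reuse the same listing.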
@lru_cache(maxsize=None)
def get_all_models():
    models = list(
        tqdm(
            list_models(
                cardData=True, full=True, limit=None, sort="downloads", direction=-1
            )
        )
    )
    return [model for model in models if model is not None]
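

# A model "has base model info" if its card metadata declares a `base_model` field.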
def has_base_model_info(model):
    return hasattr(model.card_data, "base_model")
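

# Slim Pydantic schema for the subset of model metadata kept in the dataset.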
class HubModel(BaseModel):
    author: Optional[str] = None
    last_modified: Optional[datetime] = None
    createdAt: Optional[datetime] = None
    downloads: Optional[int] = None
    likes: Optional[int] = None
    library_name: Optional[str] = None
    modelId: Optional[str] = None
    datasets: Optional[List[str]] = None
    language: Optional[List[str]] = None
    base_model: Optional[str] = None

    @field_validator("datasets", "language", mode="before")
    @classmethod
    def ensure_list(cls, v):
        # Card metadata sometimes holds a bare string where a list is expected.
        return [v] if isinstance(v, str) else v

    @classmethod
    def from_original(cls, original_data: dict) -> "HubModel":
        # `card_data` may be missing, None, or a ModelCardData object; normalise to a dict.
        card_data = original_data.get("card_data", {})
        if card_data is None:
            card_data = {}
        if not isinstance(card_data, dict):
            card_data = card_data.__dict__
        return cls(
            author=original_data.get("author"),
            last_modified=original_data.get("last_modified"),
            createdAt=original_data.get("createdAt"),
            downloads=original_data.get("downloads"),
            likes=original_data.get("likes"),
            library_name=original_data.get("library_name"),
            modelId=original_data.get("modelId"),
            datasets=card_data.get("datasets"),
            language=card_data.get("language"),
            base_model=card_data.get("base_model"),
        )
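

# Build the dataset: models that declare a base model, plus the base models they
# reference, and push the combined records to the Hub.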
def load_data():
    # Split Hub models into those that declare a base model and those that don't.
    grouped_by_has_base_model_info = groupby(has_base_model_info, get_all_models())
    models_with_base_model_info = grouped_by_has_base_model_info.get(True, [])
    models_without_base_models = grouped_by_has_base_model_info.get(False, [])
    parsed_models = [
        HubModel.from_original(model.__dict__).model_dump()
        for model in models_with_base_model_info
    ]
    # Keep the referenced base models themselves so both sides of the link are present.
    base_model_ids = {model["base_model"] for model in parsed_models}
    base_models = [
        model
        for model in tqdm(models_without_base_models)
        if model is not None and model.id in base_model_ids
    ]
    parsed_base_models = [
        HubModel.from_original(model.__dict__).model_dump() for model in base_models
    ]
    ds = Dataset.from_list(parsed_models + parsed_base_models)
    ds.push_to_hub("librarian-bots/hub_models_with_base_model_info", token=HF_TOKEN)
    print("Pushed to hub")
    return ds
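

# Minimal entry point sketch, assuming the script is meant to be run directly;
# the original Space may invoke load_data() differently (e.g. on a schedule).
if __name__ == "__main__":
    load_data()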