""" |
|
Helpful utility functions and classes in relation to exploring API endpoints |
|
with the aim for a user-friendly interface. |
|
""" |
|
import math |
|
import re |
|
from dataclasses import dataclass |
|
from typing import TYPE_CHECKING, List, Optional, Union |
|
|
|
from ..repocard_data import ModelCardData |
|
|
|
|
|
if TYPE_CHECKING: |
|
from ..hf_api import ModelInfo |
|
|
|
|
|
def _is_emission_within_threshold(
    model_info: "ModelInfo", minimum_threshold: Optional[float], maximum_threshold: Optional[float]
) -> bool:
|
"""Checks if a model's emission is within a given threshold. |
|
|
|
Args: |
|
model_info (`ModelInfo`): |
|
A model info object containing the model's emission information. |
|
        minimum_threshold (`float`, *optional*):
            A minimum carbon threshold to filter by, such as 1. At least one
            of `minimum_threshold` and `maximum_threshold` must be provided.
        maximum_threshold (`float`, *optional*):
            A maximum carbon threshold to filter by, such as 10.

    Returns:
        `bool`: Whether the model's emission falls within the given thresholds.
|
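    Example (an illustrative sketch; the repo id below is hypothetical, and the
    model info is assumed to come from `HfApi.model_info`):

    ```py
    >>> from huggingface_hub import HfApi
    >>> info = HfApi().model_info("username/model-id")  # hypothetical repo id
    >>> _is_emission_within_threshold(info, minimum_threshold=1, maximum_threshold=10)
    ```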
""" |
|
    if minimum_threshold is None and maximum_threshold is None:
        raise ValueError("`minimum_threshold` and `maximum_threshold` cannot both be `None`")
|
if minimum_threshold is None: |
|
minimum_threshold = -1 |
|
if maximum_threshold is None: |
|
maximum_threshold = math.inf |
|
|
|
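    # Emission data is declared in the model card; bail out if no usable card data is attached.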
card_data = getattr(model_info, "card_data", None) |
|
if card_data is None or not isinstance(card_data, (dict, ModelCardData)): |
|
return False |
|
|
|
|
|
    emission = card_data.get("co2_eq_emissions", None)
    if isinstance(emission, dict):
        # The value may be nested under an "emissions" key; a missing key yields None
        # instead of raising a KeyError on malformed card data.
        emission = emission.get("emissions", None)
|
if not emission: |
|
return False |
|
|
|
|
|
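    # The emission may be a plain number or a string such as "10.5 g"; extract the first numeric token.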
matched = re.search(r"\d+\.\d+|\d+", str(emission)) |
|
if matched is None: |
|
return False |
|
|
|
emission_value = float(matched.group(0)) |
|
return minimum_threshold <= emission_value <= maximum_threshold |
|
|
|
|
|
@dataclass |
|
class DatasetFilter: |
|
""" |
|
    A class that converts human-readable dataset search parameters into ones
    compatible with the REST API. For all parameters, capitalization does not
    matter.
|
|
|
Args: |
|
        author (`str`, *optional*):
            A string that can be used to identify datasets on the Hub by the
            original uploader (author or organization), such as `facebook` or
            `huggingface`.
|
benchmark (`str` or `List`, *optional*): |
|
A string or list of strings that can be used to identify datasets on |
|
the Hub by their official benchmark. |
|
        dataset_name (`str`, *optional*):
            A string that can be used to identify datasets on the Hub by name,
            such as `SQAC` or `wikineural`.
|
language_creators (`str` or `List`, *optional*): |
|
A string or list of strings that can be used to identify datasets on |
|
the Hub with how the data was curated, such as `crowdsourced` or |
|
`machine_generated`. |
|
        language (`str` or `List`, *optional*):
            A string or list of strings representing a two-character language
            code (such as `en`) to filter datasets by on the Hub.
|
multilinguality (`str` or `List`, *optional*): |
|
A string or list of strings representing a filter for datasets that |
|
contain multiple languages. |
|
size_categories (`str` or `List`, *optional*): |
|
A string or list of strings that can be used to identify datasets on |
|
the Hub by the size of the dataset such as `100K<n<1M` or |
|
`1M<n<10M`. |
|
task_categories (`str` or `List`, *optional*): |
|
A string or list of strings that can be used to identify datasets on |
|
the Hub by the designed task, such as `audio_classification` or |
|
`named_entity_recognition`. |
|
task_ids (`str` or `List`, *optional*): |
|
A string or list of strings that can be used to identify datasets on |
|
the Hub by the specific task such as `speech_emotion_recognition` or |
|
`paraphrase`. |
|
|
|
Examples: |
|
|
|
```py |
|
>>> from huggingface_hub import DatasetFilter |
|
|
|
>>> # Using author |
|
>>> new_filter = DatasetFilter(author="facebook") |
|
|
|
>>> # Using benchmark |
|
>>> new_filter = DatasetFilter(benchmark="raft") |
|
|
|
>>> # Using dataset_name |
|
>>> new_filter = DatasetFilter(dataset_name="wikineural") |
|
|
|
>>> # Using language_creator |
|
>>> new_filter = DatasetFilter(language_creator="crowdsourced") |
|
|
|
>>> # Using language |
|
>>> new_filter = DatasetFilter(language="en") |
|
|
|
>>> # Using multilinguality |
|
>>> new_filter = DatasetFilter(multilinguality="multilingual") |
|
|
|
>>> # Using size_categories |
|
>>> new_filter = DatasetFilter(size_categories="100K<n<1M") |
|
|
|
>>> # Using task_categories |
|
>>> new_filter = DatasetFilter(task_categories="audio_classification") |
|
|
|
>>> # Using task_ids |
|
>>> new_filter = DatasetFilter(task_ids="paraphrase") |
|
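    >>> # Illustrative: pass the filter to `HfApi.list_datasets` (assuming it accepts a `filter` argument):
    >>> from huggingface_hub import HfApi
    >>> api = HfApi()
    >>> datasets = api.list_datasets(filter=new_filter)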
``` |
|
""" |
|
|
|
author: Optional[str] = None |
|
benchmark: Optional[Union[str, List[str]]] = None |
|
dataset_name: Optional[str] = None |
|
language_creators: Optional[Union[str, List[str]]] = None |
|
language: Optional[Union[str, List[str]]] = None |
|
multilinguality: Optional[Union[str, List[str]]] = None |
|
size_categories: Optional[Union[str, List[str]]] = None |
|
task_categories: Optional[Union[str, List[str]]] = None |
|
task_ids: Optional[Union[str, List[str]]] = None |
|
|
|
|
|
@dataclass |
|
class ModelFilter: |
|
""" |
|
    A class that converts human-readable model search parameters into ones
    compatible with the REST API. For all parameters, capitalization does not
    matter.
|
|
|
Args: |
|
author (`str`, *optional*): |
|
A string that can be used to identify models on the Hub by the |
|
original uploader (author or organization), such as `facebook` or |
|
`huggingface`. |
|
        library (`str` or `List`, *optional*):
            A string or list of strings of foundational libraries models were
            originally trained with, such as `pytorch`, `tensorflow`, or
            `allennlp`.
        language (`str` or `List`, *optional*):
            A string or list of strings of languages, specified by name or
            language code, such as "en" or "English".
        model_name (`str`, *optional*):
            A string that contains complete or partial names for models on the
            Hub, such as "bert" or "bert-base-cased".
        task (`str` or `List`, *optional*):
            A string or list of strings of tasks models were designed for,
            such as "fill-mask" or "automatic-speech-recognition".
|
tags (`str` or `List`, *optional*): |
|
A string tag or a list of tags to filter models on the Hub by, such |
|
as `text-generation` or `spacy`. |
|
trained_dataset (`str` or `List`, *optional*): |
|
A string tag or a list of string tags of the trained dataset for a |
|
model on the Hub. |
|
|
|
|
|
    Examples:

    ```py
|
>>> from huggingface_hub import ModelFilter |
|
|
|
>>> # For the author_or_organization |
|
>>> new_filter = ModelFilter(author_or_organization="facebook") |
|
|
|
>>> # For the library |
|
>>> new_filter = ModelFilter(library="pytorch") |
|
|
|
>>> # For the language |
|
>>> new_filter = ModelFilter(language="french") |
|
|
|
>>> # For the model_name |
|
>>> new_filter = ModelFilter(model_name="bert") |
|
|
|
>>> # For the task |
|
>>> new_filter = ModelFilter(task="text-classification") |
|
|
|
>>> from huggingface_hub import HfApi |
|
|
|
>>> api = HfApi() |
|
# To list model tags |
|
|
|
>>> new_filter = ModelFilter(tags="benchmark:raft") |
|
|
|
>>> # Related to the dataset |
|
>>> new_filter = ModelFilter(trained_dataset="common_voice") |
|
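    >>> # Illustrative: pass the filter to `HfApi.list_models` (assuming it accepts a `filter` argument):
    >>> from huggingface_hub import HfApi
    >>> models = HfApi().list_models(filter=new_filter)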
``` |
|
""" |
|
|
|
author: Optional[str] = None |
|
library: Optional[Union[str, List[str]]] = None |
|
language: Optional[Union[str, List[str]]] = None |
|
model_name: Optional[str] = None |
|
task: Optional[Union[str, List[str]]] = None |
|
trained_dataset: Optional[Union[str, List[str]]] = None |
|
tags: Optional[Union[str, List[str]]] = None |
|
|