# EcoMindAI - services/huggingface.py
import json

from huggingface_hub import login
from datasets import load_dataset, Dataset, concatenate_datasets

from config import HF_TOKEN, DATASET_NAME


def init_huggingface():
    """Initialize Hugging Face authentication."""
    if HF_TOKEN is None:
        raise ValueError("Hugging Face token not found in environment variables.")
    login(token=HF_TOKEN)


def update_dataset(json_data):
    """Update the Hugging Face dataset with new data."""
    if json_data is None or json_data.startswith("The following fields are required"):
        return json_data or "No data to submit. Please fill in all required fields."

    try:
        data = json.loads(json_data)
    except json.JSONDecodeError:
        return "Invalid JSON data. Please ensure all required fields are filled correctly."

    try:
        dataset = load_dataset(DATASET_NAME, split="train")
    except Exception:
        # If the dataset does not exist yet (or cannot be loaded), start from an empty one.
        dataset = Dataset.from_dict({})

    new_data = create_flattened_data(data)
    new_dataset = Dataset.from_dict(new_data)

    if len(dataset) > 0:
        updated_dataset = concatenate_datasets([dataset, new_dataset])
    else:
        updated_dataset = new_dataset

    updated_dataset.push_to_hub(DATASET_NAME)
    return "Data submitted successfully and dataset updated!"


def create_flattened_data(data):
    """Create a flattened data structure for the dataset."""
    # Handle hyperparameters
    hyperparameters = data.get("task", {}).get("algorithms", [{}])[0].get("hyperparameters", {}).get("values", [])

    # Process hyperparameters
    hyperparameter_names = []
    hyperparameter_values = []
    for hp in hyperparameters:
        if "name" in hp and "value" in hp:  # Match the keys used in the JSON
            hyperparameter_names.append(hp["name"])
            hyperparameter_values.append(str(hp["value"]))
    hyperparameter_name_str = ", ".join(hyperparameter_names) if hyperparameter_names else None
    hyperparameter_value_str = ", ".join(hyperparameter_values) if hyperparameter_values else None
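    # For example, hyperparameters [{"name": "learning_rate", "value": 0.001},
    # {"name": "epochs", "value": 3}] collapse into the two parallel strings
    # "learning_rate, epochs" and "0.001, 3".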

    # Handle inference properties
    inference_props = data.get("task", {}).get("dataset", [{}])[0].get("inferenceProperties", [])

    # Process inference properties
    inference_data = []
    for props in inference_props:
        if props:
            inference_data.append({
                "nbRequest": props.get("nbRequest"),
                "nbTokensInput": props.get("nbTokensInput"),
                "nbWordsInput": props.get("nbWordsInput"),
                "nbTokensOutput": props.get("nbTokensOutput"),
                "nbWordsOutput": props.get("nbWordsOutput"),
                "contextWindowSize": props.get("contextWindowSize"),
                "cache": props.get("cache")
            })
    nbRequest_str = ", ".join([str(p["nbRequest"]) for p in inference_data if p.get("nbRequest")]) if inference_data else None
    nbTokensInput_str = ", ".join([str(p["nbTokensInput"]) for p in inference_data if p.get("nbTokensInput")]) if inference_data else None
    nbWordsInput_str = ", ".join([str(p["nbWordsInput"]) for p in inference_data if p.get("nbWordsInput")]) if inference_data else None
    nbTokensOutput_str = ", ".join([str(p["nbTokensOutput"]) for p in inference_data if p.get("nbTokensOutput")]) if inference_data else None
    nbWordsOutput_str = ", ".join([str(p["nbWordsOutput"]) for p in inference_data if p.get("nbWordsOutput")]) if inference_data else None
    contextWindowSize_str = ", ".join([str(p["contextWindowSize"]) for p in inference_data if p.get("contextWindowSize")]) if inference_data else None
    cache_str = ", ".join([str(p["cache"]) for p in inference_data if p.get("cache")]) if inference_data else None
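    # For example, two inference runs with nbRequest 100 and 250 become the
    # single comma-separated string "100, 250"; missing or falsy fields are
    # simply skipped in the join.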

    # Handle components
    components = data.get("infrastructure", {}).get("components", [])
    component_data = []
    for comp in components:
        if comp:
            component_data.append({
                "componentName": comp.get("componentName"),
                "nbComponent": comp.get("nbComponent"),
                "memorySize": comp.get("memorySize"),
                "manufacturer": comp.get("manufacturer"),
                "family": comp.get("family"),
                "series": comp.get("series"),
                "share": comp.get("share")
            })
    componentName_str = ", ".join([str(p["componentName"]) for p in component_data if p.get("componentName")]) if component_data else None
    nbComponent_str = ", ".join([str(p["nbComponent"]) for p in component_data if p.get("nbComponent")]) if component_data else None
    memorySize_str = ", ".join([str(p["memorySize"]) for p in component_data if p.get("memorySize")]) if component_data else None
    manufacturer_infra_str = ", ".join([str(p["manufacturer"]) for p in component_data if p.get("manufacturer")]) if component_data else None
    family_str = ", ".join([str(p["family"]) for p in component_data if p.get("family")]) if component_data else None
    series_str = ", ".join([str(p["series"]) for p in component_data if p.get("series")]) if component_data else None
    share_str = ", ".join([str(p["share"]) for p in component_data if p.get("share")]) if component_data else None
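
    # Every value below is wrapped in a single-element list so that
    # Dataset.from_dict() builds exactly one row per submitted report;
    # repeated sub-items (hyperparameters, inference runs, components) have
    # already been collapsed into comma-separated strings above.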
    return {
        # Header
        "licensing": [data["header"]["licensing"]],
        "formatVersion": [data["header"]["formatVersion"]],
        "formatVersionSpecificationUri": [data["header"]["formatVersionSpecificationUri"]],
        "reportId": [data["header"]["reportId"]],
        "reportDatetime": [data["header"]["reportDatetime"]],
        "reportStatus": [data["header"]["reportStatus"]],
        "publisher_name": [data["header"]["publisher"]["name"]],
        "publisher_division": [data["header"]["publisher"]["division"]],
        "publisher_projectName": [data["header"]["publisher"]["projectName"]],
        "publisher_confidentialityLevel": [data["header"]["publisher"]["confidentialityLevel"]],
        "publisher_publicKey": [data["header"]["publisher"]["publicKey"]],
        # Task
        "taskType": [data["task"]["taskType"]],
        "taskFamily": [data["task"]["taskFamily"]],
        "taskStage": [data["task"]["taskStage"]],
        "algorithmName": [data["task"]["algorithms"][0]["algorithmName"]],
        "framework": [data["task"]["algorithms"][0]["framework"]],
        "frameworkVersion": [data["task"]["algorithms"][0]["frameworkVersion"]],
        "classPath": [data["task"]["algorithms"][0]["classPath"]],
        "tuning_method": [data["task"]["algorithms"][0]["hyperparameters"]["tuning_method"]],
        "hyperparameterName": [hyperparameter_name_str],
        "hyperparameterValue": [hyperparameter_value_str],
        "quantization": [data["task"]["algorithms"][0]["quantization"]],
        "dataType": [data["task"]["dataset"][0]["dataType"]],
        "fileType": [data["task"]["dataset"][0]["fileType"]],
        "volume": [data["task"]["dataset"][0]["volume"]],
        "volumeUnit": [data["task"]["dataset"][0]["volumeUnit"]],
        "items": [data["task"]["dataset"][0]["items"]],
        "shape_item": [data["task"]["dataset"][0]["shape"][0]["item"]],
        "nbRequest": [nbRequest_str],
        "nbTokensInput": [nbTokensInput_str],
        "nbWordsInput": [nbWordsInput_str],
        "nbTokensOutput": [nbTokensOutput_str],
        "nbWordsOutput": [nbWordsOutput_str],
        "contextWindowSize": [contextWindowSize_str],
        "cache": [cache_str],
        "source": [data["task"]["dataset"][0]["source"]],
        "sourceUri": [data["task"]["dataset"][0]["sourceUri"]],
        "owner": [data["task"]["dataset"][0]["owner"]],
        "measuredAccuracy": [data["task"]["measuredAccuracy"]],
        "estimatedAccuracy": [data["task"]["estimatedAccuracy"]],
        # Measures
        "measurementMethod": [data["measures"][0]["measurementMethod"]],
        "manufacturer": [data["measures"][0]["manufacturer"]],
        "version": [data["measures"][0]["version"]],
        "cpuTrackingMode": [data["measures"][0]["cpuTrackingMode"]],
        "gpuTrackingMode": [data["measures"][0]["gpuTrackingMode"]],
        "averageUtilizationCpu": [data["measures"][0]["averageUtilizationCpu"]],
        "averageUtilizationGpu": [data["measures"][0]["averageUtilizationGpu"]],
        "serverSideInference": [data["measures"][0]["serverSideInference"]],
        "unit": [data["measures"][0]["unit"]],
        "powerCalibrationMeasurement": [data["measures"][0]["powerCalibrationMeasurement"]],
        "durationCalibrationMeasurement": [data["measures"][0]["durationCalibrationMeasurement"]],
        "powerConsumption": [data["measures"][0]["powerConsumption"]],
        "measurementDuration": [data["measures"][0]["measurementDuration"]],
        "measurementDateTime": [data["measures"][0]["measurementDateTime"]],
        # System
        "os": [data["system"]["os"]],
        "distribution": [data["system"]["distribution"]],
        "distributionVersion": [data["system"]["distributionVersion"]],
        # Software
        "language": [data["software"]["language"]],
        "version_software": [data["software"]["version"]],
        # Infrastructure
        "infraType": [data["infrastructure"]["infraType"]],
        "cloudProvider": [data["infrastructure"]["cloudProvider"]],
        "cloudInstance": [data["infrastructure"]["cloudInstance"]],
        "componentName": [componentName_str],
        "nbComponent": [nbComponent_str],
        "memorySize": [memorySize_str],
        "manufacturer_infra": [manufacturer_infra_str],
        "family": [family_str],
        "series": [series_str],
        "share": [share_str],
        # Environment
        "country": [data["environment"]["country"]],
        "latitude": [data["environment"]["latitude"]],
        "longitude": [data["environment"]["longitude"]],
        "location": [data["environment"]["location"]],
        "powerSupplierType": [data["environment"]["powerSupplierType"]],
        "powerSource": [data["environment"]["powerSource"]],
        "powerSourceCarbonIntensity": [data["environment"]["powerSourceCarbonIntensity"]],
        # Quality
        "quality": [data["quality"]],
        # Hash
        "hashAlgorithm": [data["$hash"]["hashAlgorithm"]],
        "cryptographicAlgorithm": [data["$hash"]["cryptographicAlgorithm"]],
        "value": [data["$hash"]["ecryptedValue"]]
    }