import json

from huggingface_hub import login
from datasets import load_dataset, Dataset, concatenate_datasets

from config import HF_TOKEN, DATASET_NAME


def init_huggingface():
    """Log in to the Hugging Face Hub with the configured token.

    Raises:
        ValueError: if ``HF_TOKEN`` is None.
    """
    if HF_TOKEN is None:
        raise ValueError("Hugging Face token not found in environment variables.")
    login(token=HF_TOKEN)


def update_dataset(json_data):
    """Append one report to the Hub dataset ``DATASET_NAME`` and push it.

    Args:
        json_data: JSON text of the report. None, or a string starting with
            "The following fields are required", is treated as "nothing to
            submit" and echoed back instead of being processed.

    Returns:
        A human-readable status message (success or the reason nothing was
        submitted).

    Raises:
        Whatever ``create_flattened_data`` raises for a report that lacks a
        field it indexes directly, and whatever ``push_to_hub`` raises.
    """
    if json_data is None or json_data.startswith("The following fields are required"):
        return json_data or "No data to submit. Please fill in all required fields."

    try:
        data = json.loads(json_data)
    except json.JSONDecodeError:
        return "Invalid JSON data. Please ensure all required fields are filled correctly."

    try:
        dataset = load_dataset(DATASET_NAME, split="train")
    except Exception:
        # Best effort: fall back to an empty dataset when the existing one
        # cannot be loaded. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        # NOTE(review): any load failure ends up here, in which case the push
        # below contains only the new row — confirm this is acceptable.
        dataset = Dataset.from_dict({})

    new_data = create_flattened_data(data)
    new_dataset = Dataset.from_dict(new_data)

    if len(dataset) > 0:
        updated_dataset = concatenate_datasets([dataset, new_dataset])
    else:
        updated_dataset = new_dataset

    updated_dataset.push_to_hub(DATASET_NAME)
    return "Data submitted successfully and dataset updated!"


def _join_present(records, key):
    """Join the usable ``key`` values of ``records`` into one ", "-separated string.

    Returns None when ``records`` is empty. A value is skipped only when it is
    missing, None, or an empty string, so legitimate falsy values such as 0 or
    False are kept (a plain truthiness test would drop them).
    """
    if not records:
        return None
    kept = []
    for record in records:
        value = record.get(key)
        if value is None or value == "":
            continue
        kept.append(str(value))
    return ", ".join(kept)


def create_flattened_data(data):
    """Flatten a nested report dict into single-row, column-oriented data.

    Args:
        data: the parsed report. The hyperparameter values, inference
            properties and infrastructure components are read leniently
            (missing keys fall back to empty defaults); every other field is
            indexed directly.

    Returns:
        A dict mapping each column name to a one-element list. List-like
        sections (hyperparameters, inference properties, components) are
        collapsed into ", "-joined strings, or None when the section has no
        usable entries.

    Raises:
        KeyError / IndexError: if a directly indexed field is absent.
    """
    # Hyperparameters: only entries carrying both "name" and "value" are used.
    hyperparameters = (
        data.get("task", {})
        .get("algorithms", [{}])[0]
        .get("hyperparameters", {})
        .get("values", [])
    )
    hyperparameter_names = []
    hyperparameter_values = []
    for hp in hyperparameters:
        if "name" in hp and "value" in hp:
            hyperparameter_names.append(hp["name"])
            hyperparameter_values.append(str(hp["value"]))
    hyperparameter_name_str = ", ".join(hyperparameter_names) if hyperparameter_names else None
    hyperparameter_value_str = ", ".join(hyperparameter_values) if hyperparameter_values else None

    # Inference properties and components: empty entries are ignored entirely.
    inference_props = data.get("task", {}).get("dataset", [{}])[0].get("inferenceProperties", [])
    inference_data = [props for props in inference_props if props]

    components = data.get("infrastructure", {}).get("components", [])
    component_data = [comp for comp in components if comp]

    # Short aliases for the strictly indexed sections.
    header = data["header"]
    publisher = header["publisher"]
    task = data["task"]
    algorithm = task["algorithms"][0]
    dataset = task["dataset"][0]
    measure = data["measures"][0]
    system = data["system"]
    software = data["software"]
    infrastructure = data["infrastructure"]
    environment = data["environment"]
    report_hash = data["$hash"]

    return {
        # Header
        "licensing": [header["licensing"]],
        "formatVersion": [header["formatVersion"]],
        "formatVersionSpecificationUri": [header["formatVersionSpecificationUri"]],
        "reportId": [header["reportId"]],
        "reportDatetime": [header["reportDatetime"]],
        "reportStatus": [header["reportStatus"]],
        "publisher_name": [publisher["name"]],
        "publisher_division": [publisher["division"]],
        "publisher_projectName": [publisher["projectName"]],
        "publisher_confidentialityLevel": [publisher["confidentialityLevel"]],
        "publisher_publicKey": [publisher["publicKey"]],
        # Task
        "taskType": [task["taskType"]],
        "taskFamily": [task["taskFamily"]],
        "taskStage": [task["taskStage"]],
        "algorithmName": [algorithm["algorithmName"]],
        "framework": [algorithm["framework"]],
        "frameworkVersion": [algorithm["frameworkVersion"]],
        "classPath": [algorithm["classPath"]],
        "tuning_method": [algorithm["hyperparameters"]["tuning_method"]],
        "hyperparameterName": [hyperparameter_name_str],
        "hyperparameterValue": [hyperparameter_value_str],
        "quantization": [algorithm["quantization"]],
        "dataType": [dataset["dataType"]],
        "fileType": [dataset["fileType"]],
        "volume": [dataset["volume"]],
        "volumeUnit": [dataset["volumeUnit"]],
        "items": [dataset["items"]],
        "shape_item": [dataset["shape"][0]["item"]],
        "nbRequest": [_join_present(inference_data, "nbRequest")],
        "nbTokensInput": [_join_present(inference_data, "nbTokensInput")],
        "nbWordsInput": [_join_present(inference_data, "nbWordsInput")],
        "nbTokensOutput": [_join_present(inference_data, "nbTokensOutput")],
        "nbWordsOutput": [_join_present(inference_data, "nbWordsOutput")],
        "contextWindowSize": [_join_present(inference_data, "contextWindowSize")],
        "cache": [_join_present(inference_data, "cache")],
        "source": [dataset["source"]],
        "sourceUri": [dataset["sourceUri"]],
        "owner": [dataset["owner"]],
        "measuredAccuracy": [task["measuredAccuracy"]],
        "estimatedAccuracy": [task["estimatedAccuracy"]],
        # Measures
        "measurementMethod": [measure["measurementMethod"]],
        "manufacturer": [measure["manufacturer"]],
        "version": [measure["version"]],
        "cpuTrackingMode": [measure["cpuTrackingMode"]],
        "gpuTrackingMode": [measure["gpuTrackingMode"]],
        "averageUtilizationCpu": [measure["averageUtilizationCpu"]],
        "averageUtilizationGpu": [measure["averageUtilizationGpu"]],
        "serverSideInference": [measure["serverSideInference"]],
        "unit": [measure["unit"]],
        "powerCalibrationMeasurement": [measure["powerCalibrationMeasurement"]],
        "durationCalibrationMeasurement": [measure["durationCalibrationMeasurement"]],
        "powerConsumption": [measure["powerConsumption"]],
        "measurementDuration": [measure["measurementDuration"]],
        "measurementDateTime": [measure["measurementDateTime"]],
        # System
        "os": [system["os"]],
        "distribution": [system["distribution"]],
        "distributionVersion": [system["distributionVersion"]],
        # Software
        "language": [software["language"]],
        "version_software": [software["version"]],
        # Infrastructure
        "infraType": [infrastructure["infraType"]],
        "cloudProvider": [infrastructure["cloudProvider"]],
        "cloudInstance": [infrastructure["cloudInstance"]],
        "componentName": [_join_present(component_data, "componentName")],
        "nbComponent": [_join_present(component_data, "nbComponent")],
        "memorySize": [_join_present(component_data, "memorySize")],
        "manufacturer_infra": [_join_present(component_data, "manufacturer")],
        "family": [_join_present(component_data, "family")],
        "series": [_join_present(component_data, "series")],
        "share": [_join_present(component_data, "share")],
        # Environment
        "country": [environment["country"]],
        "latitude": [environment["latitude"]],
        "longitude": [environment["longitude"]],
        "location": [environment["location"]],
        "powerSupplierType": [environment["powerSupplierType"]],
        "powerSource": [environment["powerSource"]],
        "powerSourceCarbonIntensity": [environment["powerSourceCarbonIntensity"]],
        # Quality
        "quality": [data["quality"]],
        # Hash
        "hashAlgorithm": [report_hash["hashAlgorithm"]],
        "cryptographicAlgorithm": [report_hash["cryptographicAlgorithm"]],
        # Key spelling below is the one read from the input report.
        "value": [report_hash["ecryptedValue"]],
    }