import logging
from typing import Dict, List, Optional

import pandas as pd
from datasets import Dataset, load_dataset

from src.api.exceptions import DatasetNotFoundError, DatasetPushError

# Set up structured logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


class HuggingFaceService:
    """Read, modify and publish tabular datasets on the Hugging Face Hub.

    All methods are declared ``async`` but call the ``datasets`` library
    directly without awaiting, so each call runs to completion before
    control returns to the event loop.
    """

    async def push_to_hub(self, df: pd.DataFrame, dataset_name: str) -> None:
        """Push the dataset to Hugging Face Hub.

        Args:
            df: Data to publish. The DataFrame index is not uploaded.
            dataset_name: Repository name passed to ``Dataset.push_to_hub``.

        Raises:
            DatasetPushError: If converting or uploading the dataset fails.
        """
        try:
            logger.info("Creating Hugging Face Dataset: %s...", dataset_name)
            # preserve_index=False keeps the pandas index out of the dataset.
            # The previous approach removed "__index_level_0__" after the
            # conversion, which raises when that column was never created
            # (e.g. a default RangeIndex, or an index that has a name).
            ds = Dataset.from_pandas(df, preserve_index=False)
            ds.push_to_hub(dataset_name)
            logger.info("Dataset pushed to Hugging Face Hub: %s", dataset_name)
        except Exception as e:
            logger.error("Failed to push dataset to Hugging Face Hub: %s", e)
            raise DatasetPushError(f"Failed to push dataset: {e}") from e

    async def read_dataset(self, dataset_name: str) -> Optional[pd.DataFrame]:
        """Read a dataset from Hugging Face Hub.

        Only the ``"train"`` split is loaded.

        Args:
            dataset_name: Name passed to ``load_dataset``.

        Returns:
            The ``"train"`` split converted to a pandas DataFrame.

        Raises:
            DatasetNotFoundError: If loading or converting the dataset fails
                for any reason (including a missing ``"train"`` split).
        """
        try:
            logger.info(
                "Loading dataset from Hugging Face Hub: %s...", dataset_name
            )
            ds = load_dataset(dataset_name)
            return ds["train"].to_pandas()
        except Exception as e:
            logger.error("Failed to read dataset: %s", e)
            raise DatasetNotFoundError(f"Dataset not found: {e}") from e

    async def update_dataset(
        self, dataset_name: str, updates: Dict[str, List]
    ) -> Optional[pd.DataFrame]:
        """Update a dataset on Hugging Face Hub.

        Existing columns named in ``updates`` are overwritten with the given
        values; names that are not columns of the dataset are skipped with a
        warning. The result is pushed back to the Hub.

        Args:
            dataset_name: Name of the dataset to read and re-publish.
            updates: Mapping of column name to the new values for that column.

        Returns:
            The updated DataFrame that was pushed.

        Raises:
            DatasetPushError: If reading, updating or pushing fails. Read
                failures are also reported with this exception type.
        """
        try:
            df = await self.read_dataset(dataset_name)
            for column, values in updates.items():
                if column in df.columns:
                    df[column] = values
                else:
                    logger.warning("Column '%s' not found in dataset.", column)
            await self.push_to_hub(df, dataset_name)
            return df
        except Exception as e:
            logger.error("Failed to update dataset: %s", e)
            raise DatasetPushError(f"Failed to update dataset: {e}") from e

    async def delete_columns(
        self, dataset_name: str, columns: List[str]
    ) -> Optional[pd.DataFrame]:
        """Delete columns from a dataset on Hugging Face Hub.

        Names that are not columns of the dataset are skipped with a warning.
        The result is pushed back to the Hub.

        Args:
            dataset_name: Name of the dataset to read and re-publish.
            columns: Names of the columns to remove.

        Returns:
            The DataFrame without the removed columns, as pushed.

        Raises:
            DatasetPushError: If reading, dropping or pushing fails. Read
                failures are also reported with this exception type.
        """
        try:
            df = await self.read_dataset(dataset_name)
            for column in columns:
                if column in df.columns:
                    # Rebind instead of mutating in place.
                    df = df.drop(columns=[column])
                else:
                    logger.warning("Column '%s' not found in dataset.", column)
            await self.push_to_hub(df, dataset_name)
            return df
        except Exception as e:
            logger.error("Failed to delete columns: %s", e)
            raise DatasetPushError(f"Failed to delete columns: {e}") from e