"""Service wrapper around Hugging Face Hub dataset operations."""

import asyncio
import functools
import logging
from typing import Dict, List, Optional

import pandas as pd
from datasets import Dataset, load_dataset
from huggingface_hub import HfApi, HfFolder

from src.api.exceptions import (
    DatasetNotFoundError,
    DatasetPushError,
    DatasetDeleteError,
)

# Configure logging at import time with a timestamped, named format.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


async def _run_blocking(func, *args, **kwargs):
    """Run a blocking callable in the default executor and await its result.

    The Hub client calls used below are synchronous; invoking them directly
    inside a coroutine would stall the event loop until they return.
    """
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, functools.partial(func, *args, **kwargs))


class HuggingFaceService:
    """Push, read, update and delete datasets on the Hugging Face Hub."""

    def __init__(self, hf_token: Optional[str] = None):
        """Initialize the HuggingFaceService with an optional token.

        Args:
            hf_token: Hugging Face access token. When truthy it is stored via
                ``HfFolder.save_token`` so it can be used for authentication.
        """
        self.hf_api = HfApi()
        if hf_token:
            HfFolder.save_token(hf_token)  # Save the token for authentication

    async def push_to_hub(self, df: pd.DataFrame, dataset_name: str) -> None:
        """Push the dataset to Hugging Face Hub.

        Args:
            df: Data to convert into a ``Dataset`` and upload.
            dataset_name: Repository id of the target dataset.

        Raises:
            DatasetPushError: If building or uploading the dataset fails.
        """

        def _push() -> None:
            Dataset.from_pandas(df).push_to_hub(dataset_name)

        try:
            logger.info("Creating Hugging Face Dataset: %s...", dataset_name)
            await _run_blocking(_push)
            logger.info("Dataset pushed to Hugging Face Hub: %s", dataset_name)
        except Exception as e:
            logger.error("Failed to push dataset to Hugging Face Hub: %s", e)
            raise DatasetPushError(f"Failed to push dataset: {e}") from e

    async def read_dataset(self, dataset_name: str) -> Optional[pd.DataFrame]:
        """Read a dataset from Hugging Face Hub.

        Only the ``"train"`` split is loaded and converted to a DataFrame.

        Args:
            dataset_name: Repository id of the dataset to load.

        Returns:
            The ``"train"`` split as a pandas DataFrame.

        Raises:
            DatasetNotFoundError: If loading or converting the dataset fails
                for any reason (including a missing ``"train"`` split).
        """

        def _load() -> pd.DataFrame:
            return load_dataset(dataset_name)["train"].to_pandas()

        try:
            logger.info("Loading dataset from Hugging Face Hub: %s...", dataset_name)
            return await _run_blocking(_load)
        except Exception as e:
            logger.error("Failed to read dataset: %s", e)
            raise DatasetNotFoundError(f"Dataset not found: {e}") from e

    async def update_dataset(
        self, dataset_name: str, updates: Dict[str, List]
    ) -> Optional[pd.DataFrame]:
        """Update a dataset on Hugging Face Hub.

        Existing columns named in ``updates`` are overwritten with the given
        values; names that are not columns of the dataset are skipped with a
        warning. The resulting frame is pushed back to the Hub.

        Args:
            dataset_name: Repository id of the dataset to update.
            updates: Mapping of column name to the replacement values.

        Returns:
            The updated DataFrame that was pushed.

        Raises:
            DatasetPushError: If reading, updating or pushing fails. Read
                failures are also reported as this type, since every error in
                the update flow is wrapped here.
        """
        try:
            df = await self.read_dataset(dataset_name)
            for column, values in updates.items():
                if column in df.columns:
                    df[column] = values
                else:
                    logger.warning("Column '%s' not found in dataset.", column)
            await self.push_to_hub(df, dataset_name)
            return df
        except Exception as e:
            logger.error("Failed to update dataset: %s", e)
            raise DatasetPushError(f"Failed to update dataset: {e}") from e

    async def delete_dataset(self, dataset_name: str) -> None:
        """Delete a dataset from Hugging Face Hub.

        Args:
            dataset_name: Repository id of the dataset repo to delete.

        Raises:
            DatasetDeleteError: If the Hub rejects or fails the deletion.
        """
        try:
            logger.info("Deleting dataset from Hugging Face Hub: %s...", dataset_name)
            await _run_blocking(
                self.hf_api.delete_repo, repo_id=dataset_name, repo_type="dataset"
            )
            logger.info("Dataset deleted from Hugging Face Hub: %s", dataset_name)
        except Exception as e:
            logger.error("Failed to delete dataset: %s", e)
            raise DatasetDeleteError(f"Failed to delete dataset: {e}") from e