from datasets import Dataset, load_dataset, concatenate_datasets
from huggingface_hub import HfApi, HfFolder
import logging
import os
from typing import Optional, Dict, List

from src.api.services.embedding_service import EmbeddingService
from src.api.exceptions import (
    DatasetNotFoundError,
    DatasetPushError,
    DatasetDeleteError,
)

# Set up structured logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


class HuggingFaceService:
    def __init__(self, hf_token: Optional[str] = None):
        """Initialize the HuggingFaceService with an optional token."""
        self.hf_api = HfApi()
        if hf_token:
            HfFolder.save_token(hf_token)  # Save the token for authentication

    async def push_to_hub(self, dataset: Dataset, dataset_name: str) -> None:
        """Push the dataset to Hugging Face Hub."""
        try:
            logger.info(f"Pushing dataset to Hugging Face Hub: {dataset_name}...")
            dataset.push_to_hub(dataset_name)
            logger.info(f"Dataset pushed to Hugging Face Hub: {dataset_name}")
        except Exception as e:
            logger.error(f"Failed to push dataset to Hugging Face Hub: {e}")
            raise DatasetPushError(f"Failed to push dataset: {e}")

    async def read_dataset(self, dataset_name: str) -> Optional[Dataset]:
        """Read a dataset from Hugging Face Hub."""
        try:
            logger.info(f"Loading dataset from Hugging Face Hub: {dataset_name}...")
            dataset = load_dataset(dataset_name)
            return dataset["train"]
        except Exception as e:
            logger.error(f"Failed to read dataset: {e}")
            raise DatasetNotFoundError(f"Dataset not found: {e}")

    async def update_dataset(
        self,
        dataset_name: str,
        updates: Dict[str, List],
        target_column: str,
        output_column: str = "embeddings",
    ) -> Optional[Dataset]:
        """Update a dataset on Hugging Face Hub by generating embeddings for
        new data and concatenating it with the existing dataset."""
        try:
            # Step 1: Load the existing dataset from Hugging Face Hub
            logger.info(
                f"Loading existing dataset from Hugging Face Hub: {dataset_name}..."
            )
            existing_dataset = await self.read_dataset(dataset_name)

            # Step 2: Convert the new updates into a Dataset
            logger.info("Converting updates to Dataset...")
            new_dataset = Dataset.from_dict(updates)

            # Step 3: Generate embeddings for the new data
            logger.info("Generating embeddings for the new data...")
            embedding_service = EmbeddingService(
                openai_api_key=os.getenv("OPENAI_API_KEY")
            )  # Get the embedding service
            new_dataset = await embedding_service.create_embeddings(
                new_dataset, target_column, output_column
            )

            # Step 4: Concatenate the existing Dataset with the new Dataset
            logger.info("Concatenating existing dataset with new data...")
            updated_dataset = concatenate_datasets([existing_dataset, new_dataset])

            # Step 5: Push the updated dataset back to Hugging Face Hub
            logger.info(
                f"Pushing updated dataset to Hugging Face Hub: {dataset_name}..."
            )
            await self.push_to_hub(updated_dataset, dataset_name)
            return updated_dataset
        except Exception as e:
            logger.error(f"Failed to update dataset: {e}")
            raise DatasetPushError(f"Failed to update dataset: {e}")

    async def delete_dataset(self, dataset_name: str) -> None:
        """Delete a dataset from Hugging Face Hub."""
        try:
            logger.info(f"Deleting dataset from Hugging Face Hub: {dataset_name}...")
            self.hf_api.delete_repo(repo_id=dataset_name, repo_type="dataset")
            logger.info(f"Dataset deleted from Hugging Face Hub: {dataset_name}")
        except Exception as e:
            logger.error(f"Failed to delete dataset: {e}")
            raise DatasetDeleteError(f"Failed to delete dataset: {e}")
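

# A minimal usage sketch, not part of the service itself. It assumes HF_TOKEN
# and OPENAI_API_KEY are set in the environment, and "username/my-dataset" is
# a placeholder repo id; the create_embeddings signature is inferred from its
# use in update_dataset above.
if __name__ == "__main__":
    import asyncio

    async def main() -> None:
        service = HuggingFaceService(hf_token=os.getenv("HF_TOKEN"))
        updated = await service.update_dataset(
            dataset_name="username/my-dataset",  # hypothetical repo id
            updates={"text": ["a new document to embed"]},
            target_column="text",
        )
        logger.info(f"Updated dataset now has {updated.num_rows} rows")

    asyncio.run(main())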