import logging
import os
from typing import Dict, List, Optional

import pandas as pd
from datasets import Dataset, load_dataset
from huggingface_hub import HfApi, HfFolder

from src.api.exceptions import (
    DatasetDeleteError,
    DatasetNotFoundError,
    DatasetPushError,
)
from src.api.services.embedding_service import EmbeddingService

# Set up structured logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


class HuggingFaceService:
    def __init__(self, hf_token: Optional[str] = None):
        """Initialize the HuggingFaceService with an optional token."""
        self.hf_api = HfApi()
        if hf_token:
            HfFolder.save_token(hf_token)  # Persist the token for authenticated Hub calls

    async def push_to_hub(self, df: pd.DataFrame, dataset_name: str) -> None:
        """Push the dataset to Hugging Face Hub."""
        try:
            logger.info(f"Creating Hugging Face Dataset: {dataset_name}...")
            ds = Dataset.from_pandas(df)
            ds.push_to_hub(dataset_name)
            logger.info(f"Dataset pushed to Hugging Face Hub: {dataset_name}")
        except Exception as e:
            logger.error(f"Failed to push dataset to Hugging Face Hub: {e}")
            raise DatasetPushError(f"Failed to push dataset: {e}")

    async def read_dataset(self, dataset_name: str) -> Optional[pd.DataFrame]:
        """Read a dataset from Hugging Face Hub."""
        try:
            logger.info(f"Loading dataset from Hugging Face Hub: {dataset_name}...")
            ds = load_dataset(dataset_name)
            # Convert the train split to a DataFrame so the return value
            # matches the signature and docstring.
            df = ds["train"].to_pandas()
            return df
        except Exception as e:
            logger.error(f"Failed to read dataset: {e}")
            raise DatasetNotFoundError(f"Dataset not found: {e}")

    async def update_dataset(
        self,
        dataset_name: str,
        updates: Dict[str, List],
        target_column: str,
        output_column: str = "embeddings",
    ) -> Optional[pd.DataFrame]:
        """Update a dataset on Hugging Face Hub by generating embeddings for
        new data and concatenating it with the existing dataset."""
        try:
            # Step 1: Load the existing dataset from Hugging Face Hub
            logger.info(
                f"Loading existing dataset from Hugging Face Hub: {dataset_name}..."
            )
            existing_df = await self.read_dataset(dataset_name)

            # Step 2: Convert the new updates into a DataFrame
            logger.info("Converting updates to DataFrame...")
            new_df = pd.DataFrame(updates)

            # Step 3: Generate embeddings for the new data
            logger.info("Generating embeddings for the new data...")
            embedding_service = EmbeddingService(
                openai_api_key=os.getenv("OPENAI_API_KEY")
            )
            new_df = await embedding_service.create_embeddings(
                new_df, target_column, output_column
            )

            # Step 4: Concatenate the existing DataFrame with the new DataFrame
            logger.info("Concatenating existing dataset with new data...")
            updated_df = pd.concat([existing_df, new_df], ignore_index=True)

            # Step 5: Push the updated dataset back to Hugging Face Hub
            logger.info(
                f"Pushing updated dataset to Hugging Face Hub: {dataset_name}..."
            )
            await self.push_to_hub(updated_df, dataset_name)
            return updated_df
        except Exception as e:
            logger.error(f"Failed to update dataset: {e}")
            raise DatasetPushError(f"Failed to update dataset: {e}")

    async def delete_dataset(self, dataset_name: str) -> None:
        """Delete a dataset from Hugging Face Hub."""
        try:
            logger.info(f"Deleting dataset from Hugging Face Hub: {dataset_name}...")
            self.hf_api.delete_repo(repo_id=dataset_name, repo_type="dataset")
            logger.info(f"Dataset deleted from Hugging Face Hub: {dataset_name}")
        except Exception as e:
            logger.error(f"Failed to delete dataset: {e}")
            raise DatasetDeleteError(f"Failed to delete dataset: {e}")
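

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the service itself). It assumes a
# hypothetical Hub repo id "username/demo-dataset", valid HF_TOKEN and
# OPENAI_API_KEY environment variables, and that EmbeddingService adds the
# embeddings column as described in update_dataset above. Adjust the names
# to your own setup before running.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import asyncio

    async def main() -> None:
        service = HuggingFaceService(hf_token=os.getenv("HF_TOKEN"))

        # Push an initial two-row dataset to the Hub.
        df = pd.DataFrame({"text": ["hello world", "goodbye world"]})
        await service.push_to_hub(df, "username/demo-dataset")

        # Read it back as a DataFrame (train split only).
        loaded = await service.read_dataset("username/demo-dataset")
        print(loaded.head())

        # Append a new row; embeddings are generated for the "text" column
        # and the merged dataset is pushed back to the Hub.
        updated = await service.update_dataset(
            "username/demo-dataset",
            updates={"text": ["another document"]},
            target_column="text",
        )
        print(updated.tail())

        # Clean up the demo repo.
        await service.delete_dataset("username/demo-dataset")

    asyncio.run(main())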