"""Service layer for creating, reading, updating, and deleting datasets on the Hugging Face Hub."""
from datasets import Dataset, load_dataset, concatenate_datasets | |
from huggingface_hub import HfApi, HfFolder | |
import logging | |
from typing import Optional, Dict, List | |
import pandas as pd | |
from src.api.dependency import get_embedding_service, get_huggingface_service | |
from src.api.exceptions import ( | |
DatasetNotFoundError, | |
DatasetPushError, | |
DatasetDeleteError, | |
) | |
# Configure root logging once at import time with a structured line format.
_LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)

# Module-scoped logger, named after this module.
logger = logging.getLogger(__name__)
class HuggingFaceService:
    """CRUD-style wrapper around the Hugging Face Hub for tabular datasets.

    All methods log their progress and translate low-level failures into the
    project's domain exceptions (DatasetNotFoundError, DatasetPushError,
    DatasetDeleteError) so callers never handle raw Hub errors.
    """

    def __init__(self, hf_token: Optional[str] = None):
        """Initialize the service.

        Args:
            hf_token: Optional Hugging Face API token. When given it is
                persisted locally so subsequent Hub calls authenticate
                implicitly.
        """
        self.hf_api = HfApi()
        if hf_token:
            # Persist the token so datasets/huggingface_hub pick it up
            # without it being threaded through every call.
            HfFolder.save_token(hf_token)

    async def push_to_hub(self, df: pd.DataFrame, dataset_name: str) -> None:
        """Push a DataFrame to the Hugging Face Hub as a dataset.

        Args:
            df: The data to upload.
            dataset_name: Target repo id on the Hub.

        Raises:
            DatasetPushError: If conversion or upload fails.
        """
        try:
            logger.info("Creating Hugging Face Dataset: %s...", dataset_name)
            ds = Dataset.from_pandas(df)
            ds.push_to_hub(dataset_name)
            logger.info("Dataset pushed to Hugging Face Hub: %s", dataset_name)
        except Exception as e:
            logger.error("Failed to push dataset to Hugging Face Hub: %s", e)
            raise DatasetPushError(f"Failed to push dataset: {e}") from e

    async def read_dataset(self, dataset_name: str) -> Optional[pd.DataFrame]:
        """Load the 'train' split of a Hub dataset as a DataFrame.

        Args:
            dataset_name: Repo id of the dataset on the Hub.

        Returns:
            The 'train' split as a pandas DataFrame.

        Raises:
            DatasetNotFoundError: If the dataset cannot be loaded.
        """
        try:
            logger.info("Loading dataset from Hugging Face Hub: %s...", dataset_name)
            ds = load_dataset(dataset_name)
            # Return an actual DataFrame to honor the declared return type
            # (the previous implementation returned a plain dict).
            return ds["train"].to_pandas()
        except Exception as e:
            logger.error("Failed to read dataset: %s", e)
            raise DatasetNotFoundError(f"Dataset not found: {e}") from e

    async def update_dataset(
        self,
        dataset_name: str,
        updates: Dict[str, List],
        target_column: str,
        output_column: str = "embeddings",
    ) -> Optional[pd.DataFrame]:
        """Append new rows (with freshly generated embeddings) to a Hub dataset.

        Args:
            dataset_name: Repo id of the dataset to update.
            updates: Column-name -> values mapping for the new rows.
            target_column: Column whose text is embedded.
            output_column: Column that receives the embeddings.

        Raises:
            DatasetPushError: If any step of the update fails (including the
                initial read — a missing dataset surfaces as a push error).
        """
        try:
            # Step 1: Load the existing dataset from the Hub.
            logger.info(
                "Loading existing dataset from Hugging Face Hub: %s...", dataset_name
            )
            existing_ds = await self.read_dataset(dataset_name)
            existing_df = pd.DataFrame(existing_ds)

            # Step 2: Convert the new rows into a DataFrame.
            logger.info("Converting updates to DataFrame...")
            new_df = pd.DataFrame(updates)

            # Step 3: Generate embeddings for the new rows only.
            logger.info("Generating embeddings for the new data...")
            embedding_service = get_embedding_service()
            new_df = await embedding_service.create_embeddings(
                new_df, target_column, output_column
            )

            # Step 4: Append the new rows to the existing data.
            logger.info("Concatenating existing dataset with new data...")
            updated_df = pd.concat([existing_df, new_df], ignore_index=True)

            # Step 5: Push the merged dataset back to the Hub.
            logger.info(
                "Pushing updated dataset to Hugging Face Hub: %s...", dataset_name
            )
            await self.push_to_hub(updated_df, dataset_name)
        except Exception as e:
            logger.error("Failed to update dataset: %s", e)
            raise DatasetPushError(f"Failed to update dataset: {e}") from e

    async def delete_dataset(self, dataset_name: str) -> None:
        """Delete a dataset repository from the Hugging Face Hub.

        Args:
            dataset_name: Repo id of the dataset to delete.

        Raises:
            DatasetDeleteError: If the deletion fails.
        """
        try:
            logger.info("Deleting dataset from Hugging Face Hub: %s...", dataset_name)
            self.hf_api.delete_repo(repo_id=dataset_name, repo_type="dataset")
            logger.info("Dataset deleted from Hugging Face Hub: %s", dataset_name)
        except Exception as e:
            logger.error("Failed to delete dataset: %s", e)
            raise DatasetDeleteError(f"Failed to delete dataset: {e}") from e