# src/api/services/huggingface_service.py
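"""Service layer for storing similarity-search data on the Hugging Face Hub.

Wraps ``datasets`` and ``huggingface_hub`` to push, read, update, and delete
Hub-hosted datasets, delegating embedding generation to ``EmbeddingService``.
"""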
import asyncio
import logging
import os
from typing import Dict, List, Optional

from datasets import Dataset, concatenate_datasets, load_dataset
from huggingface_hub import HfApi, HfFolder

from src.api.exceptions import (
    DatasetDeleteError,
    DatasetNotFoundError,
    DatasetPushError,
)
from src.api.services.embedding_service import EmbeddingService

# Set up structured logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

class HuggingFaceService:
    def __init__(self, hf_token: Optional[str] = None):
        """Initialize the HuggingFaceService with an optional token."""
        self.hf_api = HfApi()
        if hf_token:
            # Persist the token locally so subsequent Hub calls authenticate.
            # Note: HfFolder.save_token is deprecated in recent huggingface_hub
            # releases in favour of huggingface_hub.login().
            HfFolder.save_token(hf_token)

    async def push_to_hub(self, dataset: Dataset, dataset_name: str) -> None:
        """Push a dataset to the Hugging Face Hub."""
        try:
            logger.info(f"Pushing dataset to Hugging Face Hub: {dataset_name}...")
            # Run the blocking upload in a worker thread so it does not stall
            # the event loop.
            await asyncio.to_thread(dataset.push_to_hub, dataset_name)
            logger.info(f"Dataset pushed to Hugging Face Hub: {dataset_name}")
        except Exception as e:
            logger.error(f"Failed to push dataset to Hugging Face Hub: {e}")
            raise DatasetPushError(f"Failed to push dataset: {e}") from e

    async def read_dataset(self, dataset_name: str) -> Dataset:
        """Read a dataset from the Hugging Face Hub."""
        try:
            logger.info(f"Loading dataset from Hugging Face Hub: {dataset_name}...")
            # load_dataset blocks on network I/O, so run it in a worker thread.
            dataset = await asyncio.to_thread(load_dataset, dataset_name)
            # load_dataset returns a DatasetDict; this service assumes the
            # default "train" split holds the data.
            return dataset["train"]
        except Exception as e:
            logger.error(f"Failed to read dataset: {e}")
            raise DatasetNotFoundError(f"Dataset not found: {e}") from e

    async def update_dataset(
        self,
        dataset_name: str,
        updates: Dict[str, List],
        target_column: str,
        output_column: str = "embeddings",
    ) -> Dataset:
        """Update a dataset on the Hugging Face Hub by generating embeddings
        for the new rows and concatenating them with the existing dataset."""
        try:
            # Step 1: Load the existing dataset from the Hub.
            logger.info(
                f"Loading existing dataset from Hugging Face Hub: {dataset_name}..."
            )
            existing_dataset = await self.read_dataset(dataset_name)

            # Step 2: Convert the new rows into a Dataset.
            logger.info("Converting updates to Dataset...")
            new_dataset = Dataset.from_dict(updates)

            # Step 3: Generate embeddings for the new rows.
            logger.info("Generating embeddings for the new data...")
            embedding_service = EmbeddingService(
                openai_api_key=os.getenv("OPENAI_API_KEY")
            )
            new_dataset = await embedding_service.create_embeddings(
                new_dataset, target_column, output_column
            )

            # Step 4: Concatenate the existing and new datasets. This assumes
            # both share the same schema, including the embeddings column.
            logger.info("Concatenating existing dataset with new data...")
            updated_dataset = concatenate_datasets([existing_dataset, new_dataset])

            # Step 5: Push the updated dataset back to the Hub.
            logger.info(
                f"Pushing updated dataset to Hugging Face Hub: {dataset_name}..."
            )
            await self.push_to_hub(updated_dataset, dataset_name)
            return updated_dataset
        except DatasetNotFoundError:
            # Preserve the more specific error raised by read_dataset.
            raise
        except Exception as e:
            logger.error(f"Failed to update dataset: {e}")
            raise DatasetPushError(f"Failed to update dataset: {e}") from e

    async def delete_dataset(self, dataset_name: str) -> None:
        """Delete a dataset from the Hugging Face Hub."""
        try:
            logger.info(f"Deleting dataset from Hugging Face Hub: {dataset_name}...")
            # delete_repo is a blocking HTTP call; run it in a worker thread.
            await asyncio.to_thread(
                self.hf_api.delete_repo, repo_id=dataset_name, repo_type="dataset"
            )
            logger.info(f"Dataset deleted from Hugging Face Hub: {dataset_name}")
        except Exception as e:
            logger.error(f"Failed to delete dataset: {e}")
            raise DatasetDeleteError(f"Failed to delete dataset: {e}") from e
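

# A minimal usage sketch, not part of the service itself: it shows the
# intended call pattern end to end. The repo id "username/my-dataset", the
# "text" column, and the HF_TOKEN environment variable are assumptions for
# illustration only.
if __name__ == "__main__":

    async def _demo() -> None:
        service = HuggingFaceService(hf_token=os.getenv("HF_TOKEN"))
        updated = await service.update_dataset(
            dataset_name="username/my-dataset",  # hypothetical repo id
            updates={"text": ["a brand-new document to embed"]},
            target_column="text",
        )
        logger.info(f"Updated dataset now has {updated.num_rows} rows.")

    asyncio.run(_demo())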