Similarity_Search / src /api /services /huggingface_service.py
amaye15
Initial Deployment
2cb9dec
raw
history blame
2.9 kB
from datasets import Dataset, load_dataset
import logging
from typing import Optional, Dict, List
import pandas as pd
from src.api.exceptions import DatasetNotFoundError, DatasetPushError
# Configure root logging at import time (timestamp - logger name - level -
# message) and create this module's logger.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
class HuggingFaceService:
    """Read, modify and publish pandas DataFrames as Hugging Face Hub datasets.

    All methods are declared ``async`` but call the ``datasets`` library
    directly, without awaiting it.
    NOTE(review): those library calls appear to be blocking, so they presumably
    hold the event loop while they run - confirm before using this service on
    a latency-sensitive path.
    """

    async def push_to_hub(self, df: pd.DataFrame, dataset_name: str) -> None:
        """Push ``df`` to the Hugging Face Hub under ``dataset_name``.

        Args:
            df: Data to publish.
            dataset_name: Repository name passed to ``Dataset.push_to_hub``.

        Raises:
            DatasetPushError: If converting or uploading the dataset fails.
                The original exception is chained as ``__cause__``.
        """
        index_column = "__index_level_0__"
        try:
            logger.info(f"Creating Hugging Face Dataset: {dataset_name}...")
            ds = Dataset.from_pandas(df)
            # ``from_pandas`` only stores the index as an extra column for a
            # non-default index, and ``remove_columns`` raises on a missing
            # column. Drop the helper column only when it is really there so
            # frames with a plain RangeIndex (e.g. anything returned by
            # ``read_dataset``) can be pushed as well.
            if index_column in ds.column_names:
                ds = ds.remove_columns(index_column)
            ds.push_to_hub(dataset_name)
            logger.info(f"Dataset pushed to Hugging Face Hub: {dataset_name}")
        except Exception as e:
            logger.error(f"Failed to push dataset to Hugging Face Hub: {e}")
            raise DatasetPushError(f"Failed to push dataset: {e}") from e

    async def read_dataset(self, dataset_name: str) -> Optional[pd.DataFrame]:
        """Load the ``"train"`` split of ``dataset_name`` as a DataFrame.

        Args:
            dataset_name: Name passed to ``load_dataset``.

        Returns:
            The ``"train"`` split converted with ``to_pandas()``.

        Raises:
            DatasetNotFoundError: If loading or converting fails for any
                reason (including a missing ``"train"`` split). The original
                exception is chained as ``__cause__``.
        """
        try:
            logger.info(f"Loading dataset from Hugging Face Hub: {dataset_name}...")
            ds = load_dataset(dataset_name)
            df = ds["train"].to_pandas()
            return df
        except Exception as e:
            logger.error(f"Failed to read dataset: {e}")
            raise DatasetNotFoundError(f"Dataset not found: {e}") from e

    async def update_dataset(
        self, dataset_name: str, updates: Dict[str, List]
    ) -> Optional[pd.DataFrame]:
        """Overwrite existing columns of a Hub dataset and push it back.

        Args:
            dataset_name: Dataset to read, modify and re-publish.
            updates: Mapping of column name to the new values for that
                column. Columns that do not already exist in the dataset are
                skipped with a warning; no new columns are created.

        Returns:
            The updated DataFrame that was pushed.

        Raises:
            DatasetPushError: If reading, updating or pushing fails. Read
                failures are reported with this type too, because every
                exception raised inside the method is wrapped; the original
                exception is chained as ``__cause__``.
        """
        try:
            df = await self.read_dataset(dataset_name)
            for column, values in updates.items():
                if column in df.columns:
                    df[column] = values
                else:
                    logger.warning(f"Column '{column}' not found in dataset.")
            await self.push_to_hub(df, dataset_name)
            return df
        except Exception as e:
            logger.error(f"Failed to update dataset: {e}")
            raise DatasetPushError(f"Failed to update dataset: {e}") from e

    async def delete_columns(
        self, dataset_name: str, columns: List[str]
    ) -> Optional[pd.DataFrame]:
        """Remove columns from a Hub dataset and push the result back.

        Args:
            dataset_name: Dataset to read, modify and re-publish.
            columns: Names of the columns to drop. Names that are not present
                in the dataset are skipped with a warning.

        Returns:
            The DataFrame without the dropped columns, as pushed.

        Raises:
            DatasetPushError: If reading, dropping or pushing fails. Read
                failures are reported with this type too, because every
                exception raised inside the method is wrapped; the original
                exception is chained as ``__cause__``.
        """
        try:
            df = await self.read_dataset(dataset_name)
            for column in columns:
                if column in df.columns:
                    df.drop(column, axis=1, inplace=True)
                else:
                    logger.warning(f"Column '{column}' not found in dataset.")
            await self.push_to_hub(df, dataset_name)
            return df
        except Exception as e:
            logger.error(f"Failed to delete columns: {e}")
            raise DatasetPushError(f"Failed to delete columns: {e}") from e