Spaces:
Running
Running
from datasets import Dataset, load_dataset | |
import logging | |
from typing import Optional, Dict, List | |
import pandas as pd | |
from src.api.exceptions import DatasetNotFoundError, DatasetPushError | |
# Configure root logging once at import time. Each record is rendered as
# "<timestamp> - <logger name> - <level> - <message>".
_LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-scoped logger shared by everything defined in this file.
logger = logging.getLogger(__name__)
class HuggingFaceService:
    """Read, modify, and publish Hugging Face Hub datasets as pandas DataFrames.

    The methods are declared ``async`` but call the ``datasets`` library
    without awaiting it, so each Hub operation runs to completion inside the
    coroutine that invoked it.
    """

    async def push_to_hub(self, df: pd.DataFrame, dataset_name: str) -> None:
        """Push the dataset to Hugging Face Hub.

        Args:
            df: Table to publish.
            dataset_name: Repository id handed to ``Dataset.push_to_hub``.

        Raises:
            DatasetPushError: If converting the frame or uploading it fails.
        """
        try:
            logger.info(f"Creating Hugging Face Dataset: {dataset_name}...")
            # preserve_index=False leaves the pandas index out of the dataset
            # at conversion time. Stripping an "__index_level_0__" column
            # afterwards would raise for frames with a default RangeIndex,
            # because no such column is created for them.
            ds = Dataset.from_pandas(df, preserve_index=False)
            ds.push_to_hub(dataset_name)
            logger.info(f"Dataset pushed to Hugging Face Hub: {dataset_name}")
        except Exception as e:
            logger.error(f"Failed to push dataset to Hugging Face Hub: {e}")
            raise DatasetPushError(f"Failed to push dataset: {e}") from e

    async def read_dataset(self, dataset_name: str) -> Optional[pd.DataFrame]:
        """Read a dataset from Hugging Face Hub.

        Args:
            dataset_name: Repository id handed to ``load_dataset``.

        Returns:
            The ``"train"`` split of the dataset converted to a DataFrame.

        Raises:
            DatasetNotFoundError: If loading, selecting the ``"train"`` split,
                or converting to pandas fails for any reason.
        """
        try:
            logger.info(f"Loading dataset from Hugging Face Hub: {dataset_name}...")
            ds = load_dataset(dataset_name)
            return ds["train"].to_pandas()
        except Exception as e:
            logger.error(f"Failed to read dataset: {e}")
            raise DatasetNotFoundError(f"Dataset not found: {e}") from e

    async def update_dataset(
        self, dataset_name: str, updates: Dict[str, List]
    ) -> Optional[pd.DataFrame]:
        """Update a dataset on Hugging Face Hub.

        Existing columns named in ``updates`` are overwritten with the given
        values; names that are not columns of the dataset are skipped with a
        warning. The modified table is then pushed back to the Hub.

        Args:
            dataset_name: Repository id of the dataset to modify.
            updates: Mapping of column name to the replacement values.

        Returns:
            The updated DataFrame that was pushed.

        Raises:
            DatasetPushError: If any step fails. Read failures are reported
                with this type as well, because every exception raised in the
                body is wrapped.
        """
        try:
            df = await self.read_dataset(dataset_name)
            for column, values in updates.items():
                if column in df.columns:
                    df[column] = values
                else:
                    logger.warning(f"Column '{column}' not found in dataset.")
            await self.push_to_hub(df, dataset_name)
            return df
        except Exception as e:
            logger.error(f"Failed to update dataset: {e}")
            raise DatasetPushError(f"Failed to update dataset: {e}") from e

    async def delete_columns(
        self, dataset_name: str, columns: List[str]
    ) -> Optional[pd.DataFrame]:
        """Delete columns from a dataset on Hugging Face Hub.

        Columns present in the dataset are dropped; names that are not
        columns of the dataset are skipped with a warning. The reduced table
        is then pushed back to the Hub.

        Args:
            dataset_name: Repository id of the dataset to modify.
            columns: Names of the columns to remove.

        Returns:
            The DataFrame without the removed columns, as pushed.

        Raises:
            DatasetPushError: If any step fails. Read failures are reported
                with this type as well, because every exception raised in the
                body is wrapped.
        """
        try:
            df = await self.read_dataset(dataset_name)
            for column in columns:
                if column in df.columns:
                    # Rebind instead of mutating in place; the frame is local
                    # to this call, so the result is the same.
                    df = df.drop(columns=[column])
                else:
                    logger.warning(f"Column '{column}' not found in dataset.")
            await self.push_to_hub(df, dataset_name)
            return df
        except Exception as e:
            logger.error(f"Failed to delete columns: {e}")
            raise DatasetPushError(f"Failed to delete columns: {e}") from e