Spaces:
Running
Running
File size: 6,018 Bytes
a106258 2cb9dec a106258 2cb9dec a106258 2cb9dec a106258 2cb9dec a106258 2cb9dec a106258 2cb9dec a106258 2cb9dec a106258 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
# from datasets import Dataset, load_dataset
# import logging
# from typing import Optional, Dict, List
# import pandas as pd
# from src.api.exceptions import DatasetNotFoundError, DatasetPushError
# # Set up structured logging
# logging.basicConfig(
# level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
# )
# logger = logging.getLogger(__name__)
# class HuggingFaceService:
# async def push_to_hub(self, df: pd.DataFrame, dataset_name: str) -> None:
# """Push the dataset to Hugging Face Hub."""
# try:
# logger.info(f"Creating Hugging Face Dataset: {dataset_name}...")
# ds = Dataset.from_pandas(df)
# ds.push_to_hub(dataset_name)
# logger.info(f"Dataset pushed to Hugging Face Hub: {dataset_name}")
# except Exception as e:
# logger.error(f"Failed to push dataset to Hugging Face Hub: {e}")
# raise DatasetPushError(f"Failed to push dataset: {e}")
# async def read_dataset(self, dataset_name: str) -> Optional[pd.DataFrame]:
# """Read a dataset from Hugging Face Hub."""
# try:
# logger.info(f"Loading dataset from Hugging Face Hub: {dataset_name}...")
# ds = load_dataset(dataset_name)
# df = ds["train"].to_pandas()
# return df
# except Exception as e:
# logger.error(f"Failed to read dataset: {e}")
# raise DatasetNotFoundError(f"Dataset not found: {e}")
# async def update_dataset(
# self, dataset_name: str, updates: Dict[str, List]
# ) -> Optional[pd.DataFrame]:
# """Update a dataset on Hugging Face Hub."""
# try:
# df = await self.read_dataset(dataset_name)
# for column, values in updates.items():
# if column in df.columns:
# df[column] = values
# else:
# logger.warning(f"Column '{column}' not found in dataset.")
# await self.push_to_hub(df, dataset_name)
# return df
# except Exception as e:
# logger.error(f"Failed to update dataset: {e}")
# raise DatasetPushError(f"Failed to update dataset: {e}")
# async def delete_columns(
# self, dataset_name: str, columns: List[str]
# ) -> Optional[pd.DataFrame]:
# """Delete columns from a dataset on Hugging Face Hub."""
# try:
# df = await self.read_dataset(dataset_name)
# for column in columns:
# if column in df.columns:
# df.drop(column, axis=1, inplace=True)
# else:
# logger.warning(f"Column '{column}' not found in dataset.")
# await self.push_to_hub(df, dataset_name)
# return df
# except Exception as e:
# logger.error(f"Failed to delete columns: {e}")
# raise DatasetPushError(f"Failed to delete columns: {e}")
from datasets import Dataset, load_dataset
from huggingface_hub import HfApi, HfFolder
import logging
from typing import Optional, Dict, List
import pandas as pd
from src.api.exceptions import (
DatasetNotFoundError,
DatasetPushError,
DatasetDeleteError,
)
# Configure logging when this module is imported: INFO level, with every
# record formatted as "<timestamp> - <logger name> - <level> - <message>".
# NOTE(review): this is a module-level side effect on the root logger; confirm
# the application does not also configure logging elsewhere.
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
# Module-level logger used by HuggingFaceService below.
logger = logging.getLogger(__name__)
class HuggingFaceService:
    """Wrapper around dataset operations on the Hugging Face Hub.

    Offers push, read, update and delete operations and converts any failure
    into the project's ``DatasetPushError`` / ``DatasetNotFoundError`` /
    ``DatasetDeleteError`` exceptions, with the original exception chained as
    ``__cause__``.

    NOTE(review): the ``datasets`` / ``huggingface_hub`` calls made by these
    ``async def`` methods are plain synchronous calls, so each one holds the
    event loop until it returns. Offloading them to a worker thread would
    change how concurrent read-modify-write updates interleave, so it is left
    as a follow-up decision rather than done here.
    """

    def __init__(self, hf_token: Optional[str] = None):
        """Initialize the HuggingFaceService with an optional token.

        Args:
            hf_token: Hugging Face access token. When given (non-empty) it is
                handed to the ``HfApi`` client and also saved through
                ``HfFolder.save_token`` for authentication of later calls.
        """
        # Give the client the token explicitly so its calls do not depend
        # solely on ambient credentials; an empty string counts as "no token".
        self.hf_api = HfApi(token=hf_token or None)
        if hf_token:
            HfFolder.save_token(hf_token)  # Save the token for authentication

    async def push_to_hub(self, df: pd.DataFrame, dataset_name: str) -> None:
        """Push the dataset to Hugging Face Hub.

        Args:
            df: Data to upload; converted with ``Dataset.from_pandas``.
            dataset_name: Repository id the dataset is pushed to.

        Raises:
            DatasetPushError: If the conversion or the upload fails.
        """
        try:
            logger.info("Creating Hugging Face Dataset: %s...", dataset_name)
            ds = Dataset.from_pandas(df)
            ds.push_to_hub(dataset_name)
            logger.info("Dataset pushed to Hugging Face Hub: %s", dataset_name)
        except Exception as e:
            # logger.exception keeps the traceback alongside the message.
            logger.exception("Failed to push dataset to Hugging Face Hub: %s", e)
            raise DatasetPushError(f"Failed to push dataset: {e}") from e

    async def read_dataset(
        self, dataset_name: str, split: str = "train"
    ) -> Optional[pd.DataFrame]:
        """Read one split of a dataset from Hugging Face Hub as a DataFrame.

        Args:
            dataset_name: Repository id of the dataset to load.
            split: Name of the split to convert; defaults to ``"train"``,
                which was previously the only supported split.

        Returns:
            The requested split converted with ``to_pandas()``. Every failure
            path raises, so ``None`` is not returned by this implementation.

        Raises:
            DatasetNotFoundError: If loading fails, the split does not exist,
                or the conversion to pandas fails.
        """
        try:
            logger.info("Loading dataset from Hugging Face Hub: %s...", dataset_name)
            ds = load_dataset(dataset_name)
            if split not in ds:
                # Without this check a bare KeyError would surface as the
                # unhelpful message "Dataset not found: 'train'".
                raise LookupError(
                    f"split '{split}' not found in '{dataset_name}' "
                    f"(available: {list(ds)})"
                )
            return ds[split].to_pandas()
        except Exception as e:
            logger.exception("Failed to read dataset: %s", e)
            raise DatasetNotFoundError(f"Dataset not found: {e}") from e

    async def update_dataset(
        self, dataset_name: str, updates: Dict[str, List]
    ) -> Optional[pd.DataFrame]:
        """Update a dataset on Hugging Face Hub.

        Reads the dataset, overwrites each existing column named in
        ``updates`` with the supplied values, then pushes the result back.
        Columns that are not present in the dataset are skipped with a
        warning; the push still happens.

        Args:
            dataset_name: Repository id of the dataset to update.
            updates: Mapping of column name to the full list of new values
                for that column (assigned as ``df[column] = values``).

        Returns:
            The updated DataFrame that was pushed.

        Raises:
            DatasetPushError: On any failure, including a failed read (a
                ``DatasetNotFoundError`` from ``read_dataset`` is re-wrapped
                here and remains reachable through ``__cause__``).
        """
        try:
            df = await self.read_dataset(dataset_name)
            for column, values in updates.items():
                if column in df.columns:
                    df[column] = values
                else:
                    logger.warning("Column '%s' not found in dataset.", column)
            await self.push_to_hub(df, dataset_name)
            return df
        except Exception as e:
            logger.exception("Failed to update dataset: %s", e)
            raise DatasetPushError(f"Failed to update dataset: {e}") from e

    async def delete_dataset(self, dataset_name: str) -> None:
        """Delete a dataset from Hugging Face Hub.

        Args:
            dataset_name: Repository id of the dataset repo to delete.

        Raises:
            DatasetDeleteError: If the ``delete_repo`` call fails.
        """
        try:
            logger.info("Deleting dataset from Hugging Face Hub: %s...", dataset_name)
            self.hf_api.delete_repo(repo_id=dataset_name, repo_type="dataset")
            logger.info("Dataset deleted from Hugging Face Hub: %s", dataset_name)
        except Exception as e:
            logger.exception("Failed to delete dataset: %s", e)
            raise DatasetDeleteError(f"Failed to delete dataset: {e}") from e
|