File size: 2,901 Bytes
2cb9dec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from datasets import Dataset, load_dataset
import logging
from typing import Optional, Dict, List
import pandas as pd
from src.api.exceptions import DatasetNotFoundError, DatasetPushError

# Configure root logging at INFO level with a
# "timestamp - logger name - level - message" line format, then create the
# logger used throughout this module.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)


class HuggingFaceService:
    """Read, update and publish pandas DataFrames as Hugging Face Hub datasets.

    NOTE(review): every method is a coroutine, but the ``datasets`` calls they
    make (``load_dataset``, ``Dataset.push_to_hub``) are not awaited, so they
    run synchronously inside the coroutine.
    """

    async def push_to_hub(self, df: pd.DataFrame, dataset_name: str) -> None:
        """Push the dataset to Hugging Face Hub.

        Args:
            df: The data to upload.
            dataset_name: Name passed to ``Dataset.push_to_hub``.

        Raises:
            DatasetPushError: If converting or uploading the data fails for
                any reason; the original exception is attached as the cause.
        """
        try:
            logger.info(f"Creating Hugging Face Dataset: {dataset_name}...")
            ds = Dataset.from_pandas(df)
            # ``from_pandas`` does not materialize a default RangeIndex as a
            # column (see ``preserve_index`` in the datasets docs), so the
            # helper column may be absent. Removing it unconditionally raises
            # for such frames -- including every frame produced by
            # ``read_dataset`` -- so only drop it when it is actually there.
            if "__index_level_0__" in ds.column_names:
                ds = ds.remove_columns("__index_level_0__")
            ds.push_to_hub(dataset_name)
            logger.info(f"Dataset pushed to Hugging Face Hub: {dataset_name}")
        except Exception as e:
            logger.error(f"Failed to push dataset to Hugging Face Hub: {e}")
            raise DatasetPushError(f"Failed to push dataset: {e}") from e

    async def read_dataset(self, dataset_name: str) -> Optional[pd.DataFrame]:
        """Read a dataset from Hugging Face Hub.

        Loads ``dataset_name`` and converts its ``"train"`` split to a pandas
        DataFrame.

        Args:
            dataset_name: Name passed to ``load_dataset``.

        Returns:
            The ``"train"`` split as a DataFrame.

        Raises:
            DatasetNotFoundError: If loading or converting fails for any
                reason (including a missing ``"train"`` split); the original
                exception is attached as the cause.
        """
        try:
            logger.info(f"Loading dataset from Hugging Face Hub: {dataset_name}...")
            ds = load_dataset(dataset_name)
            return ds["train"].to_pandas()
        except Exception as e:
            logger.error(f"Failed to read dataset: {e}")
            raise DatasetNotFoundError(f"Dataset not found: {e}") from e

    async def update_dataset(
        self, dataset_name: str, updates: Dict[str, List]
    ) -> Optional[pd.DataFrame]:
        """Update a dataset on Hugging Face Hub.

        Reads the dataset, overwrites each existing column named in
        ``updates`` with the supplied values, and pushes the result back.
        Columns that do not exist in the dataset are skipped with a warning.

        Args:
            dataset_name: Name of the dataset to read and push back.
            updates: Mapping of column name to the new values for that column.

        Returns:
            The updated DataFrame that was pushed.

        Raises:
            DatasetPushError: If reading, updating or pushing fails. Because
                of the broad ``except`` below, a ``DatasetNotFoundError`` from
                the read step is also reported as ``DatasetPushError``.
        """
        try:
            df = await self.read_dataset(dataset_name)
            for column, values in updates.items():
                if column in df.columns:
                    df[column] = values
                else:
                    logger.warning(f"Column '{column}' not found in dataset.")
            await self.push_to_hub(df, dataset_name)
            return df
        except Exception as e:
            logger.error(f"Failed to update dataset: {e}")
            raise DatasetPushError(f"Failed to update dataset: {e}") from e

    async def delete_columns(
        self, dataset_name: str, columns: List[str]
    ) -> Optional[pd.DataFrame]:
        """Delete columns from a dataset on Hugging Face Hub.

        Reads the dataset, drops each listed column that exists, and pushes
        the result back. Columns that are not present (including a name that
        was already dropped earlier in ``columns``) are skipped with a warning.

        Args:
            dataset_name: Name of the dataset to read and push back.
            columns: Names of the columns to remove.

        Returns:
            The DataFrame that was pushed, without the removed columns.

        Raises:
            DatasetPushError: If reading, dropping or pushing fails. Because
                of the broad ``except`` below, a ``DatasetNotFoundError`` from
                the read step is also reported as ``DatasetPushError``.
        """
        try:
            df = await self.read_dataset(dataset_name)
            for column in columns:
                if column in df.columns:
                    df = df.drop(columns=column)
                else:
                    logger.warning(f"Column '{column}' not found in dataset.")
            await self.push_to_hub(df, dataset_name)
            return df
        except Exception as e:
            logger.error(f"Failed to delete columns: {e}")
            raise DatasetPushError(f"Failed to delete columns: {e}") from e