import logging
import os
from typing import Dict, List, Optional

import pandas as pd
from datasets import Dataset, load_dataset
from huggingface_hub import HfApi, HfFolder

from src.api.exceptions import (
    DatasetDeleteError,
    DatasetNotFoundError,
    DatasetPushError,
)
from src.api.services.embedding_service import EmbeddingService

# Set up structured logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


class HuggingFaceService:
    def __init__(self, hf_token: Optional[str] = None):
        """Initialize the HuggingFaceService with an optional token."""
        self.hf_api = HfApi()
        if hf_token:
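            # Note: HfFolder.save_token is deprecated in recent huggingface_hub
            # releases; huggingface_hub.login(token=...) is the newer equivalent.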
            HfFolder.save_token(hf_token)  # Save the token for authentication

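    # Note: the async methods below invoke blocking datasets/huggingface_hub
    # calls synchronously; under a busy event loop they could be offloaded via
    # asyncio.to_thread (a possible refactor, not part of the original design).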
    async def push_to_hub(self, df: pd.DataFrame, dataset_name: str) -> None:
        """Push the dataset to Hugging Face Hub."""
        try:
            logger.info(f"Creating Hugging Face Dataset: {dataset_name}...")
            ds = Dataset.from_pandas(df)
            ds.push_to_hub(dataset_name)
            logger.info(f"Dataset pushed to Hugging Face Hub: {dataset_name}")
        except Exception as e:
            logger.error(f"Failed to push dataset to Hugging Face Hub: {e}")
            raise DatasetPushError(f"Failed to push dataset: {e}")

    async def read_dataset(self, dataset_name: str) -> Optional[pd.DataFrame]:
        """Read a dataset from Hugging Face Hub."""
        try:
            logger.info(f"Loading dataset from Hugging Face Hub: {dataset_name}...")
            ds = load_dataset(dataset_name)
            df = ds["train"].to_dict()
            return df
        except Exception as e:
            logger.error(f"Failed to read dataset: {e}")
            raise DatasetNotFoundError(f"Dataset not found: {e}")

    async def update_dataset(
        self,
        dataset_name: str,
        updates: Dict[str, List],
        target_column: str,
        output_column: str = "embeddings",
    ) -> Optional[pd.DataFrame]:
        """Update a dataset on Hugging Face Hub by generating embeddings for new data and concatenating it with the existing dataset."""
        try:
            # Step 1: Load the existing dataset from Hugging Face Hub
            existing_df = await self.read_dataset(dataset_name)

            # Step 2: Convert the new updates into a DataFrame
            logger.info("Converting updates to DataFrame...")
            new_df = pd.DataFrame(updates)

            # Step 3: Generate embeddings for the new data
            logger.info("Generating embeddings for the new data...")
            embedding_service = EmbeddingService(
                openai_api_key=os.getenv("OPENAI_API_KEY")
            )  # Get the embedding service
            new_df = await embedding_service.create_embeddings(
                new_df, target_column, output_column
            )

            # Step 4: Concatenate the existing DataFrame with the new DataFrame
            logger.info("Concatenating existing dataset with new data...")
            updated_df = pd.concat([existing_df, new_df], ignore_index=True)

            # Step 5: Push the updated dataset back to Hugging Face Hub
            logger.info(
                f"Pushing updated dataset to Hugging Face Hub: {dataset_name}..."
            )
            await self.push_to_hub(updated_df, dataset_name)

            return updated_df
        except Exception as e:
            logger.error(f"Failed to update dataset: {e}")
            raise DatasetPushError(f"Failed to update dataset: {e}")

    async def delete_dataset(self, dataset_name: str) -> None:
        """Delete a dataset from Hugging Face Hub."""
        try:
            logger.info(f"Deleting dataset from Hugging Face Hub: {dataset_name}...")
            self.hf_api.delete_repo(repo_id=dataset_name, repo_type="dataset")
            logger.info(f"Dataset deleted from Hugging Face Hub: {dataset_name}")
        except Exception as e:
            logger.error(f"Failed to delete dataset: {e}")
            raise DatasetDeleteError(f"Failed to delete dataset: {e}")
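

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only, not part of the original module).
    # Assumes HF_TOKEN and OPENAI_API_KEY are set in the environment and that
    # "username/demo-dataset" is a hypothetical dataset repo the token can write to.
    import asyncio

    async def _demo() -> None:
        service = HuggingFaceService(hf_token=os.getenv("HF_TOKEN"))
        df = pd.DataFrame({"text": ["hello", "world"]})
        await service.push_to_hub(df, "username/demo-dataset")
        loaded = await service.read_dataset("username/demo-dataset")
        print(loaded.head())

    asyncio.run(_demo())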