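"""Service layer for managing datasets on the Hugging Face Hub.

Supports pushing, reading, updating (generating embeddings for new rows
before concatenation), and deleting dataset repos.
"""
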
from datasets import Dataset, load_dataset, concatenate_datasets
from huggingface_hub import HfApi, login
import logging
import os
from typing import Optional, Dict, List
from src.api.services.embedding_service import EmbeddingService
from src.api.exceptions import (
    DatasetNotFoundError,
    DatasetPushError,
    DatasetDeleteError,
)

# Set up structured logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


class HuggingFaceService:
    def __init__(self, hf_token: Optional[str] = None):
        """Initialize the HuggingFaceService with an optional token."""
        self.hf_api = HfApi()
        if hf_token:
            login(token=hf_token)  # Authenticate and cache the token (HfFolder.save_token is deprecated)

    async def push_to_hub(self, dataset: Dataset, dataset_name: str) -> None:
        """Push the dataset to Hugging Face Hub."""
        try:
            logger.info(f"Pushing dataset to Hugging Face Hub: {dataset_name}...")
            dataset.push_to_hub(dataset_name)
            logger.info(f"Dataset pushed to Hugging Face Hub: {dataset_name}")
        except Exception as e:
            logger.error(f"Failed to push dataset to Hugging Face Hub: {e}")
            raise DatasetPushError(f"Failed to push dataset: {e}") from e

    async def read_dataset(self, dataset_name: str) -> Dataset:
        """Read a dataset from Hugging Face Hub."""
        try:
            logger.info(f"Loading dataset from Hugging Face Hub: {dataset_name}...")
            dataset = load_dataset(dataset_name, split="train")
            return dataset
        except Exception as e:
            logger.error(f"Failed to read dataset: {e}")
            raise DatasetNotFoundError(f"Dataset not found: {e}") from e

    async def update_dataset(
        self,
        dataset_name: str,
        updates: Dict[str, List],
        target_column: str,
        output_column: str = "embeddings",
    ) -> Dataset:
        """Update a dataset on Hugging Face Hub by generating embeddings for new data and concatenating it with the existing dataset."""
        try:
            # Step 1: Load the existing dataset from Hugging Face Hub
            logger.info(
                f"Loading existing dataset from Hugging Face Hub: {dataset_name}..."
            )
            existing_dataset = await self.read_dataset(dataset_name)

            # Step 2: Convert the new updates into a Dataset
            logger.info("Converting updates to Dataset...")
            new_dataset = Dataset.from_dict(updates)

            # Step 3: Generate embeddings for the new data
            logger.info("Generating embeddings for the new data...")
            embedding_service = EmbeddingService(
                openai_api_key=os.getenv("OPENAI_API_KEY")
            )  # Get the embedding service
            new_dataset = await embedding_service.create_embeddings(
                new_dataset, target_column, output_column
            )

            # Step 4: Concatenate the existing Dataset with the new Dataset
            logger.info("Concatenating existing dataset with new data...")
            updated_dataset = concatenate_datasets([existing_dataset, new_dataset])

            # Step 5: Push the updated dataset back to Hugging Face Hub
            logger.info(
                f"Pushing updated dataset to Hugging Face Hub: {dataset_name}..."
            )
            await self.push_to_hub(updated_dataset, dataset_name)

            return updated_dataset
        except DatasetNotFoundError:
            # Re-raise lookup failures from read_dataset unchanged instead of
            # masking them as push errors.
            raise
        except Exception as e:
            logger.error(f"Failed to update dataset: {e}")
            raise DatasetPushError(f"Failed to update dataset: {e}") from e

    async def delete_dataset(self, dataset_name: str) -> None:
        """Delete a dataset from Hugging Face Hub."""
        try:
            logger.info(f"Deleting dataset from Hugging Face Hub: {dataset_name}...")
            self.hf_api.delete_repo(repo_id=dataset_name, repo_type="dataset")
            logger.info(f"Dataset deleted from Hugging Face Hub: {dataset_name}")
        except Exception as e:
            logger.error(f"Failed to delete dataset: {e}")
            raise DatasetDeleteError(f"Failed to delete dataset: {e}") from e
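

# Usage sketch (illustrative only): it assumes HF_TOKEN and OPENAI_API_KEY
# environment variables and a hypothetical "username/demo-dataset" repo; swap
# in your own names. Note that although the methods are declared async, the
# underlying datasets/huggingface_hub calls are blocking, so long transfers
# will hold up the event loop.
if __name__ == "__main__":
    import asyncio

    async def main() -> None:
        service = HuggingFaceService(hf_token=os.getenv("HF_TOKEN"))

        # Seed a tiny dataset and push it to the Hub.
        seed = Dataset.from_dict({"text": ["hello", "world"]})
        await service.push_to_hub(seed, "username/demo-dataset")

        # Append two rows; embeddings for "text" are generated before the
        # concatenated dataset is pushed back.
        updated = await service.update_dataset(
            dataset_name="username/demo-dataset",
            updates={"text": ["foo", "bar"]},
            target_column="text",
        )
        print(updated)

    asyncio.run(main())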