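"""Service layer for creating, reading, updating, and deleting datasets on the Hugging Face Hub."""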
from datasets import Dataset, load_dataset, concatenate_datasets
from huggingface_hub import HfApi, login
import logging
import os
from typing import Optional, Dict, List
from src.api.services.embedding_service import EmbeddingService
from src.api.exceptions import (
DatasetNotFoundError,
DatasetPushError,
DatasetDeleteError,
)

# Set up structured logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


class HuggingFaceService:
    def __init__(self, hf_token: Optional[str] = None):
        """Initialize the HuggingFaceService with an optional token."""
        self.hf_api = HfApi()
        if hf_token:
            login(token=hf_token)  # Authenticate with the Hub for subsequent calls

    async def push_to_hub(self, dataset: Dataset, dataset_name: str) -> None:
        """Push a dataset to the Hugging Face Hub."""
        try:
            logger.info(f"Pushing dataset to Hugging Face Hub: {dataset_name}...")
dataset.push_to_hub(dataset_name)
logger.info(f"Dataset pushed to Hugging Face Hub: {dataset_name}")
except Exception as e:
logger.error(f"Failed to push dataset to Hugging Face Hub: {e}")
raise DatasetPushError(f"Failed to push dataset: {e}")

    async def read_dataset(self, dataset_name: str) -> Dataset:
        """Read the train split of a dataset from the Hugging Face Hub."""
        try:
            logger.info(f"Loading dataset from Hugging Face Hub: {dataset_name}...")
            dataset = load_dataset(dataset_name, split="train")
            return dataset
except Exception as e:
logger.error(f"Failed to read dataset: {e}")
raise DatasetNotFoundError(f"Dataset not found: {e}")

    async def update_dataset(
        self,
        dataset_name: str,
        updates: Dict[str, List],
        target_column: str,
        output_column: str = "embeddings",
    ) -> Dataset:
        """Update a dataset on the Hub by generating embeddings for the new
        data and concatenating it with the existing dataset."""
try:
# Step 1: Load the existing dataset from Hugging Face Hub
logger.info(
f"Loading existing dataset from Hugging Face Hub: {dataset_name}..."
)
existing_dataset = await self.read_dataset(dataset_name)
# Step 2: Convert the new updates into a Dataset
logger.info("Converting updates to Dataset...")
new_dataset = Dataset.from_dict(updates)
# Step 3: Generate embeddings for the new data
logger.info("Generating embeddings for the new data...")
            embedding_service = EmbeddingService(
                openai_api_key=os.getenv("OPENAI_API_KEY")
            )  # Builds the embedding service from the OpenAI key in the environment
new_dataset = await embedding_service.create_embeddings(
new_dataset, target_column, output_column
)
# Step 4: Concatenate the existing Dataset with the new Dataset
logger.info("Concatenating existing dataset with new data...")
updated_dataset = concatenate_datasets([existing_dataset, new_dataset])
# Step 5: Push the updated dataset back to Hugging Face Hub
logger.info(
f"Pushing updated dataset to Hugging Face Hub: {dataset_name}..."
)
await self.push_to_hub(updated_dataset, dataset_name)
return updated_dataset
except Exception as e:
logger.error(f"Failed to update dataset: {e}")
raise DatasetPushError(f"Failed to update dataset: {e}")

    async def delete_dataset(self, dataset_name: str) -> None:
"""Delete a dataset from Hugging Face Hub."""
try:
logger.info(f"Deleting dataset from Hugging Face Hub: {dataset_name}...")
self.hf_api.delete_repo(repo_id=dataset_name, repo_type="dataset")
logger.info(f"Dataset deleted from Hugging Face Hub: {dataset_name}")
except Exception as e:
logger.error(f"Failed to delete dataset: {e}")
raise DatasetDeleteError(f"Failed to delete dataset: {e}")
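

# Example usage: a minimal sketch of how this service might be driven.
# The repo id ("username/demo-dataset"), the HF_TOKEN environment variable,
# and the update payload below are illustrative placeholders, not part of
# this module's API.
if __name__ == "__main__":
    import asyncio

    async def main() -> None:
        service = HuggingFaceService(hf_token=os.getenv("HF_TOKEN"))
        updated = await service.update_dataset(
            dataset_name="username/demo-dataset",  # hypothetical repo id
            updates={"text": ["a new document to embed"]},
            target_column="text",
        )
        logger.info(f"Updated dataset now has {updated.num_rows} rows")

    asyncio.run(main())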