amaye15 committed on
Commit
a106258
·
1 Parent(s): 0ba3d8c

Feat - Delete Embeddings - Updated

Browse files
src/api/models/embedding_models.py CHANGED
@@ -20,4 +20,3 @@ class UpdateEmbeddingRequest(BaseModel):
20
 
21
  class DeleteEmbeddingRequest(BaseModel):
22
  dataset_name: str
23
- columns: List[str] # List of columns to delete
 
20
 
21
  class DeleteEmbeddingRequest(BaseModel):
      """Request payload that identifies a dataset by name for deletion.

      NOTE(review): presumably the body of the delete-embeddings endpoint,
      which forwards ``dataset_name`` to the Hugging Face service - confirm
      against the route definition.
      """

      # Name of the target dataset.
      dataset_name: str
 
src/api/services/huggingface_service.py CHANGED
@@ -1,8 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from datasets import Dataset, load_dataset
 
2
  import logging
3
  from typing import Optional, Dict, List
4
  import pandas as pd
5
- from src.api.exceptions import DatasetNotFoundError, DatasetPushError
 
 
 
 
6
 
7
  # Set up structured logging
8
  logging.basicConfig(
@@ -12,11 +87,17 @@ logger = logging.getLogger(__name__)
12
 
13
 
14
  class HuggingFaceService:
 
 
 
 
 
 
15
  async def push_to_hub(self, df: pd.DataFrame, dataset_name: str) -> None:
16
  """Push the dataset to Hugging Face Hub."""
17
  try:
18
  logger.info(f"Creating Hugging Face Dataset: {dataset_name}...")
19
- ds = Dataset.from_pandas(df) # .remove_columns("__index_level_0__")
20
  ds.push_to_hub(dataset_name)
21
  logger.info(f"Dataset pushed to Hugging Face Hub: {dataset_name}")
22
  except Exception as e:
@@ -51,19 +132,12 @@ class HuggingFaceService:
51
  logger.error(f"Failed to update dataset: {e}")
52
  raise DatasetPushError(f"Failed to update dataset: {e}")
53
 
54
- async def delete_columns(
55
- self, dataset_name: str, columns: List[str]
56
- ) -> Optional[pd.DataFrame]:
57
- """Delete columns from a dataset on Hugging Face Hub."""
58
  try:
59
- df = await self.read_dataset(dataset_name)
60
- for column in columns:
61
- if column in df.columns:
62
- df.drop(column, axis=1, inplace=True)
63
- else:
64
- logger.warning(f"Column '{column}' not found in dataset.")
65
- await self.push_to_hub(df, dataset_name)
66
- return df
67
  except Exception as e:
68
- logger.error(f"Failed to delete columns: {e}")
69
- raise DatasetPushError(f"Failed to delete columns: {e}")
 
1
+ # from datasets import Dataset, load_dataset
2
+ # import logging
3
+ # from typing import Optional, Dict, List
4
+ # import pandas as pd
5
+ # from src.api.exceptions import DatasetNotFoundError, DatasetPushError
6
+
7
+ # # Set up structured logging
8
+ # logging.basicConfig(
9
+ # level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
10
+ # )
11
+ # logger = logging.getLogger(__name__)
12
+
13
+
14
+ # class HuggingFaceService:
15
+ # async def push_to_hub(self, df: pd.DataFrame, dataset_name: str) -> None:
16
+ # """Push the dataset to Hugging Face Hub."""
17
+ # try:
18
+ # logger.info(f"Creating Hugging Face Dataset: {dataset_name}...")
19
+ # ds = Dataset.from_pandas(df)
20
+ # ds.push_to_hub(dataset_name)
21
+ # logger.info(f"Dataset pushed to Hugging Face Hub: {dataset_name}")
22
+ # except Exception as e:
23
+ # logger.error(f"Failed to push dataset to Hugging Face Hub: {e}")
24
+ # raise DatasetPushError(f"Failed to push dataset: {e}")
25
+
26
+ # async def read_dataset(self, dataset_name: str) -> Optional[pd.DataFrame]:
27
+ # """Read a dataset from Hugging Face Hub."""
28
+ # try:
29
+ # logger.info(f"Loading dataset from Hugging Face Hub: {dataset_name}...")
30
+ # ds = load_dataset(dataset_name)
31
+ # df = ds["train"].to_pandas()
32
+ # return df
33
+ # except Exception as e:
34
+ # logger.error(f"Failed to read dataset: {e}")
35
+ # raise DatasetNotFoundError(f"Dataset not found: {e}")
36
+
37
+ # async def update_dataset(
38
+ # self, dataset_name: str, updates: Dict[str, List]
39
+ # ) -> Optional[pd.DataFrame]:
40
+ # """Update a dataset on Hugging Face Hub."""
41
+ # try:
42
+ # df = await self.read_dataset(dataset_name)
43
+ # for column, values in updates.items():
44
+ # if column in df.columns:
45
+ # df[column] = values
46
+ # else:
47
+ # logger.warning(f"Column '{column}' not found in dataset.")
48
+ # await self.push_to_hub(df, dataset_name)
49
+ # return df
50
+ # except Exception as e:
51
+ # logger.error(f"Failed to update dataset: {e}")
52
+ # raise DatasetPushError(f"Failed to update dataset: {e}")
53
+
54
+ # async def delete_columns(
55
+ # self, dataset_name: str, columns: List[str]
56
+ # ) -> Optional[pd.DataFrame]:
57
+ # """Delete columns from a dataset on Hugging Face Hub."""
58
+ # try:
59
+ # df = await self.read_dataset(dataset_name)
60
+ # for column in columns:
61
+ # if column in df.columns:
62
+ # df.drop(column, axis=1, inplace=True)
63
+ # else:
64
+ # logger.warning(f"Column '{column}' not found in dataset.")
65
+ # await self.push_to_hub(df, dataset_name)
66
+ # return df
67
+ # except Exception as e:
68
+ # logger.error(f"Failed to delete columns: {e}")
69
+ # raise DatasetPushError(f"Failed to delete columns: {e}")
70
+
71
  from datasets import Dataset, load_dataset
72
+ from huggingface_hub import HfApi, HfFolder
73
  import logging
74
  from typing import Optional, Dict, List
75
  import pandas as pd
76
+ from src.api.exceptions import (
77
+ DatasetNotFoundError,
78
+ DatasetPushError,
79
+ DatasetDeleteError,
80
+ )
81
 
82
  # Set up structured logging
83
  logging.basicConfig(
 
87
 
88
 
89
  class HuggingFaceService:
90
def __init__(self, hf_token: Optional[str] = None):
    """Initialize the service and its Hugging Face Hub API client.

    Args:
        hf_token: Optional Hugging Face access token. When given, it is
            passed to the ``HfApi`` client and also saved through
            ``HfFolder.save_token``. When omitted or empty, the client is
            created without an explicit token.
    """
    # Hand the token to the client directly so hf_api calls authenticate
    # explicitly instead of depending only on the token saved below.
    self.hf_api = HfApi(token=hf_token or None)
    if hf_token:
        # Still save the token: the other service methods call
        # push_to_hub / load_dataset without passing a token themselves.
        HfFolder.save_token(hf_token)
95
+
96
  async def push_to_hub(self, df: pd.DataFrame, dataset_name: str) -> None:
97
  """Push the dataset to Hugging Face Hub."""
98
  try:
99
  logger.info(f"Creating Hugging Face Dataset: {dataset_name}...")
100
+ ds = Dataset.from_pandas(df)
101
  ds.push_to_hub(dataset_name)
102
  logger.info(f"Dataset pushed to Hugging Face Hub: {dataset_name}")
103
  except Exception as e:
 
132
  logger.error(f"Failed to update dataset: {e}")
133
  raise DatasetPushError(f"Failed to update dataset: {e}")
134
 
135
async def delete_dataset(self, dataset_name: str) -> None:
    """Delete a dataset repository from Hugging Face Hub.

    This removes the whole dataset repository named ``dataset_name``; it
    does not drop individual columns.

    Args:
        dataset_name: Repository id of the dataset, passed to
            ``HfApi.delete_repo`` as ``repo_id``.

    Raises:
        DatasetDeleteError: If the Hub call fails for any reason. The
            original exception is attached as the cause.
    """
    # Local imports keep this method independent of the file-level imports.
    import asyncio
    import functools

    try:
        logger.info(f"Deleting dataset from Hugging Face Hub: {dataset_name}...")
        # delete_repo performs a blocking HTTP request; run it in the
        # default executor so the event loop is not stalled meanwhile.
        remove_repo = functools.partial(
            self.hf_api.delete_repo, repo_id=dataset_name, repo_type="dataset"
        )
        await asyncio.get_running_loop().run_in_executor(None, remove_repo)
        logger.info(f"Dataset deleted from Hugging Face Hub: {dataset_name}")
    except Exception as e:
        logger.error(f"Failed to delete dataset: {e}")
        # Chain the cause so the underlying Hub error stays in the traceback.
        raise DatasetDeleteError(f"Failed to delete dataset: {e}") from e
src/main.py CHANGED
@@ -168,8 +168,8 @@ async def delete_embeddings(
168
  Delete embeddings from a Hugging Face dataset.
169
  """
170
  try:
171
- df = await huggingface_service.delete_columns(
172
- request.dataset_name, request.columns
173
  )
174
  return {
175
  "message": "Embeddings deleted successfully.",
 
168
  Delete embeddings from a Hugging Face dataset.
169
  """
170
  try:
171
+ await huggingface_service.delete_dataset(
172
+ request.dataset_name
173
  )
174
  return {
175
  "message": "Embeddings deleted successfully.",