Spaces:
Running
Running
Commit
·
fdc0b3d
1
Parent(s):
65997b1
generalization of deleting rows for each dataset (marketplace, manufacturer, seller)
Browse files
src/api/models/embedding_models.py
CHANGED
@@ -42,9 +42,10 @@ class DeleteEmbeddingRequest(BaseModel):
|
|
42 |
dataset_name: str
|
43 |
|
44 |
|
45 |
-
class
|
46 |
dataset_name: str
|
47 |
-
|
|
|
48 |
|
49 |
|
50 |
# Request model for the /embed endpoint
|
|
|
42 |
dataset_name: str
|
43 |
|
44 |
|
45 |
+
class DeleteByColumnRequest(BaseModel):
|
46 |
dataset_name: str
|
47 |
+
key_column: str
|
48 |
+
keys_to_delete: List[str]
|
49 |
|
50 |
|
51 |
# Request model for the /embed endpoint
|
src/api/services/huggingface_service.py
CHANGED
@@ -102,11 +102,11 @@ class HuggingFaceService:
|
|
102 |
raise DatasetDeleteError(f"Failed to delete dataset: {e}")
|
103 |
|
104 |
|
105 |
-
async def delete_rows_from_dataset(self, dataset_name: str,
|
106 |
"""
|
107 |
-
Loads a dataset, filters out rows based on a list of
|
108 |
"""
|
109 |
-
if not
|
110 |
return
|
111 |
|
112 |
# Step 1: Load the existing dataset
|
@@ -114,10 +114,10 @@ class HuggingFaceService:
|
|
114 |
dataset = await self.read_dataset(dataset_name)
|
115 |
|
116 |
# Step 2 : Filter the dataset to EXCLUDE the rows with the given product_types
|
117 |
-
logger.info(f"Filtering out
|
118 |
initial_row_count = len(dataset)
|
119 |
|
120 |
-
filtered_dataset = dataset.filter(lambda
|
121 |
|
122 |
final_row_count = len(filtered_dataset)
|
123 |
logger.info(f"{initial_row_count - final_row_count} rows deleted.")
|
|
|
102 |
raise DatasetDeleteError(f"Failed to delete dataset: {e}")
|
103 |
|
104 |
|
105 |
+
async def delete_rows_from_dataset(self, dataset_name: str, key_column: str, keys_to_delete: List[str]):
|
106 |
"""
|
107 |
+
Loads a dataset, filters out rows based on a list of keys in a specific column, and pushes it back.
|
108 |
"""
|
109 |
+
if not keys_to_delete:
|
110 |
return
|
111 |
|
112 |
# Step 1: Load the existing dataset
|
|
|
114 |
dataset = await self.read_dataset(dataset_name)
|
115 |
|
116 |
# Step 2 : Filter the dataset to EXCLUDE the rows whose key_column value is in keys_to_delete
|
117 |
+
logger.info(f"Filtering out rows where column {key_column} is in {keys_to_delete}")
|
118 |
initial_row_count = len(dataset)
|
119 |
|
120 |
+
filtered_dataset = dataset.filter(lambda element: element[key_column] not in keys_to_delete)
|
121 |
|
122 |
final_row_count = len(filtered_dataset)
|
123 |
logger.info(f"{initial_row_count - final_row_count} rows deleted.")
|
src/main.py
CHANGED
@@ -7,9 +7,9 @@ from typing import List, Dict
|
|
7 |
from datasets import Dataset
|
8 |
from src.api.models.embedding_models import (
|
9 |
CreateEmbeddingRequest,
|
|
|
10 |
ReadEmbeddingRequest,
|
11 |
UpdateEmbeddingRequest,
|
12 |
-
DeleteRowsRequest,
|
13 |
DeleteEmbeddingRequest,
|
14 |
EmbedRequest,
|
15 |
SearchEmbeddingRequest,
|
@@ -231,21 +231,22 @@ async def delete_embeddings(
|
|
231 |
raise HTTPException(status_code=500, detail=f"An error occurred: {e}")
|
232 |
|
233 |
|
234 |
-
@app.post("/
|
235 |
-
async def
|
236 |
-
request:
|
237 |
huggingface_service: HuggingFaceService = Depends(get_huggingface_service)
|
238 |
):
|
239 |
"""
|
240 |
-
Deletes specific rows from a Hugging Face dataset based on
|
241 |
try:
|
242 |
await huggingface_service.delete_rows_from_dataset(
|
243 |
-
request.dataset_name, request.
|
244 |
)
|
245 |
return {
|
246 |
"message": "Rows deleted succesfully from dataset.",
|
247 |
"dataset_name": request.dataset_name,
|
248 |
-
"
|
|
|
249 |
}
|
250 |
except DatasetNotFoundError as e:
|
251 |
raise HTTPException(status_code=404, detail=str(e))
|
|
|
7 |
from datasets import Dataset
|
8 |
from src.api.models.embedding_models import (
|
9 |
CreateEmbeddingRequest,
|
10 |
+
DeleteByColumnRequest,
|
11 |
ReadEmbeddingRequest,
|
12 |
UpdateEmbeddingRequest,
|
|
|
13 |
DeleteEmbeddingRequest,
|
14 |
EmbedRequest,
|
15 |
SearchEmbeddingRequest,
|
|
|
231 |
raise HTTPException(status_code=500, detail=f"An error occurred: {e}")
|
232 |
|
233 |
|
234 |
+
@app.post("/delete_rows_by_key")
|
235 |
+
async def delete_rows_by_key(
|
236 |
+
request: DeleteByColumnRequest,
|
237 |
huggingface_service: HuggingFaceService = Depends(get_huggingface_service)
|
238 |
):
|
239 |
"""
|
240 |
+
Deletes specific rows from a Hugging Face dataset based on a key column and values."""
|
241 |
try:
|
242 |
await huggingface_service.delete_rows_from_dataset(
|
243 |
+
request.dataset_name, request.key_column, request.keys_to_delete
|
244 |
)
|
245 |
return {
|
246 |
"message": "Rows deleted succesfully from dataset.",
|
247 |
"dataset_name": request.dataset_name,
|
248 |
+
"key_column": request.key_column,
|
249 |
+
"deleted_keys": request.keys_to_delete,
|
250 |
}
|
251 |
except DatasetNotFoundError as e:
|
252 |
raise HTTPException(status_code=404, detail=str(e))
|