Spaces:
Sleeping
Sleeping
Commit
·
fdc0b3d
1
Parent(s):
65997b1
Generalize row deletion so it works for each dataset (marketplace, manufacturer, seller)
Browse files
src/api/models/embedding_models.py
CHANGED
|
@@ -42,9 +42,10 @@ class DeleteEmbeddingRequest(BaseModel):
|
|
| 42 |
dataset_name: str
|
| 43 |
|
| 44 |
|
| 45 |
-
class
|
| 46 |
dataset_name: str
|
| 47 |
-
|
|
|
|
| 48 |
|
| 49 |
|
| 50 |
# Request model for the /embed endpoint
|
|
|
|
| 42 |
dataset_name: str
|
| 43 |
|
| 44 |
|
| 45 |
+
class DeleteByColumnRequest(BaseModel):
|
| 46 |
dataset_name: str
|
| 47 |
+
key_column: str
|
| 48 |
+
keys_to_delete: List[str]
|
| 49 |
|
| 50 |
|
| 51 |
# Request model for the /embed endpoint
|
src/api/services/huggingface_service.py
CHANGED
|
@@ -102,11 +102,11 @@ class HuggingFaceService:
|
|
| 102 |
raise DatasetDeleteError(f"Failed to delete dataset: {e}")
|
| 103 |
|
| 104 |
|
| 105 |
-
async def delete_rows_from_dataset(self, dataset_name: str,
|
| 106 |
"""
|
| 107 |
-
Loads a dataset, filters out rows based on a list of
|
| 108 |
"""
|
| 109 |
-
if not
|
| 110 |
return
|
| 111 |
|
| 112 |
# Step 1: Load the existing dataset
|
|
@@ -114,10 +114,10 @@ class HuggingFaceService:
|
|
| 114 |
dataset = await self.read_dataset(dataset_name)
|
| 115 |
|
| 116 |
# Step 2 : Filter the dataset to EXCLUDE the rows with the given product_types
|
| 117 |
-
logger.info(f"Filtering out
|
| 118 |
initial_row_count = len(dataset)
|
| 119 |
|
| 120 |
-
filtered_dataset = dataset.filter(lambda
|
| 121 |
|
| 122 |
final_row_count = len(filtered_dataset)
|
| 123 |
logger.info(f"{initial_row_count - final_row_count} rows deleted.")
|
|
|
|
| 102 |
raise DatasetDeleteError(f"Failed to delete dataset: {e}")
|
| 103 |
|
| 104 |
|
| 105 |
+
async def delete_rows_from_dataset(self, dataset_name: str, key_column: str, keys_to_delete: List[str]):
|
| 106 |
"""
|
| 107 |
+
Loads a dataset, filters out rows based on a list of keys in a specific column, and pushes it back.
|
| 108 |
"""
|
| 109 |
+
if not keys_to_delete:
|
| 110 |
return
|
| 111 |
|
| 112 |
# Step 1: Load the existing dataset
|
|
|
|
| 114 |
dataset = await self.read_dataset(dataset_name)
|
| 115 |
|
| 116 |
# Step 2: Filter the dataset to EXCLUDE rows whose key_column value is in keys_to_delete
|
| 117 |
+
logger.info(f"Filtering out rows where column {key_column} is in {keys_to_delete}")
|
| 118 |
initial_row_count = len(dataset)
|
| 119 |
|
| 120 |
+
filtered_dataset = dataset.filter(lambda element: element[key_column] not in keys_to_delete)
|
| 121 |
|
| 122 |
final_row_count = len(filtered_dataset)
|
| 123 |
logger.info(f"{initial_row_count - final_row_count} rows deleted.")
|
src/main.py
CHANGED
|
@@ -7,9 +7,9 @@ from typing import List, Dict
|
|
| 7 |
from datasets import Dataset
|
| 8 |
from src.api.models.embedding_models import (
|
| 9 |
CreateEmbeddingRequest,
|
|
|
|
| 10 |
ReadEmbeddingRequest,
|
| 11 |
UpdateEmbeddingRequest,
|
| 12 |
-
DeleteRowsRequest,
|
| 13 |
DeleteEmbeddingRequest,
|
| 14 |
EmbedRequest,
|
| 15 |
SearchEmbeddingRequest,
|
|
@@ -231,21 +231,22 @@ async def delete_embeddings(
|
|
| 231 |
raise HTTPException(status_code=500, detail=f"An error occurred: {e}")
|
| 232 |
|
| 233 |
|
| 234 |
-
@app.post("/
|
| 235 |
-
async def
|
| 236 |
-
request:
|
| 237 |
huggingface_service: HuggingFaceService = Depends(get_huggingface_service)
|
| 238 |
):
|
| 239 |
"""
|
| 240 |
-
Deletes specific rows from a Hugging Face dataset based on
|
| 241 |
try:
|
| 242 |
await huggingface_service.delete_rows_from_dataset(
|
| 243 |
-
request.dataset_name, request.
|
| 244 |
)
|
| 245 |
return {
|
| 246 |
"message": "Rows deleted succesfully from dataset.",
|
| 247 |
"dataset_name": request.dataset_name,
|
| 248 |
-
"
|
|
|
|
| 249 |
}
|
| 250 |
except DatasetNotFoundError as e:
|
| 251 |
raise HTTPException(status_code=404, detail=str(e))
|
|
|
|
| 7 |
from datasets import Dataset
|
| 8 |
from src.api.models.embedding_models import (
|
| 9 |
CreateEmbeddingRequest,
|
| 10 |
+
DeleteByColumnRequest,
|
| 11 |
ReadEmbeddingRequest,
|
| 12 |
UpdateEmbeddingRequest,
|
|
|
|
| 13 |
DeleteEmbeddingRequest,
|
| 14 |
EmbedRequest,
|
| 15 |
SearchEmbeddingRequest,
|
|
|
|
| 231 |
raise HTTPException(status_code=500, detail=f"An error occurred: {e}")
|
| 232 |
|
| 233 |
|
| 234 |
+
@app.post("/delete_rows_by_key")
|
| 235 |
+
async def delete_rows_by_key(
|
| 236 |
+
request: DeleteByColumnRequest,
|
| 237 |
huggingface_service: HuggingFaceService = Depends(get_huggingface_service)
|
| 238 |
):
|
| 239 |
"""
|
| 240 |
+
Deletes specific rows from a Hugging Face dataset based on a key column and values."""
|
| 241 |
try:
|
| 242 |
await huggingface_service.delete_rows_from_dataset(
|
| 243 |
+
request.dataset_name, request.key_column, request.keys_to_delete
|
| 244 |
)
|
| 245 |
return {
|
| 246 |
"message": "Rows deleted succesfully from dataset.",
|
| 247 |
"dataset_name": request.dataset_name,
|
| 248 |
+
"key_column": request.key_column,
|
| 249 |
+
"deleted_keys": request.keys_to_delete,
|
| 250 |
}
|
| 251 |
except DatasetNotFoundError as e:
|
| 252 |
raise HTTPException(status_code=404, detail=str(e))
|