amaye15 committed on
Commit
abfb1fb
·
1 Parent(s): 9bda6d8
Dockerfile CHANGED
@@ -1,55 +1,3 @@
1
- # # Stage 1: Build stage
2
- # FROM python:3.12-slim as builder
3
-
4
- # # Set environment variables
5
- # ENV PYTHONDONTWRITEBYTECODE=1
6
- # ENV PYTHONUNBUFFERED=1
7
-
8
- # # Create a non-root user
9
- # RUN useradd -m -u 1000 user
10
-
11
- # # Set the working directory
12
- # WORKDIR /app
13
-
14
- # # Copy only the requirements file first to leverage Docker cache
15
- # COPY --chown=user ./requirements.txt /app/requirements.txt
16
-
17
- # # Install dependencies in a virtual environment
18
- # RUN python -m venv /opt/venv
19
- # ENV PATH="/opt/venv/bin:$PATH"
20
- # RUN pip install --no-cache-dir --upgrade pip && \
21
- # pip install --no-cache-dir -r requirements.txt
22
-
23
- # # Copy the rest of the application code
24
- # COPY --chown=user . /app
25
-
26
- # # Stage 2: Runtime stage
27
- # FROM python:3.12-slim
28
-
29
- # # Create a non-root user
30
- # RUN useradd -m -u 1000 user
31
- # USER user
32
-
33
- # # Copy the virtual environment from the builder stage
34
- # COPY --from=builder /opt/venv /opt/venv
35
- # ENV PATH="/opt/venv/bin:$PATH"
36
-
37
- # # Set the working directory
38
- # WORKDIR /app
39
-
40
- # # Copy only the necessary files from the builder stage
41
- # COPY --from=builder --chown=user /app /app
42
-
43
- # # Expose the port the app runs on
44
- # EXPOSE 7860
45
-
46
- # # Health check to ensure the application is running
47
- # HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
48
- # CMD curl -f http://localhost:7860/health || exit 1
49
-
50
- # # Command to run the application with hot reloading
51
- # CMD ["uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "7860", "--reload"]
52
-
53
  # Stage 1: Build stage
54
  FROM python:3.12-slim as builder
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # Stage 1: Build stage
2
  FROM python:3.12-slim as builder
3
 
docker-compose.yml CHANGED
@@ -16,6 +16,8 @@ services:
16
  interval: 30s
17
  timeout: 10s
18
  retries: 3
 
 
19
  # depends_on:
20
  # - db # If you have a database service, add it here
21
 
 
16
  interval: 30s
17
  timeout: 10s
18
  retries: 3
19
+
20
+ # Could be useful later on
21
  # depends_on:
22
  # - db # If you have a database service, add it here
23
 
src/api/models/embedding_models.py CHANGED
@@ -26,9 +26,6 @@ class ReadEmbeddingRequest(BaseModel):
26
  # max_concurrent_requests: int = 10
27
  # dataset_name: str = "re-mind/product_type_embedding"
28
 
29
- from pydantic import BaseModel
30
- from typing import Dict, List
31
-
32
 
33
  class UpdateEmbeddingRequest(BaseModel):
34
  dataset_name: str = "re-mind/product_type_embedding"
 
26
  # max_concurrent_requests: int = 10
27
  # dataset_name: str = "re-mind/product_type_embedding"
28
 
 
 
 
29
 
30
  class UpdateEmbeddingRequest(BaseModel):
31
  dataset_name: str = "re-mind/product_type_embedding"
src/api/services/huggingface_service.py CHANGED
@@ -47,31 +47,6 @@ class HuggingFaceService:
47
  logger.error(f"Failed to read dataset: {e}")
48
  raise DatasetNotFoundError(f"Dataset not found: {e}")
49
 
50
- # async def update_dataset(
51
- # self, dataset_name: str, updates: Dict[str, List]
52
- # ) -> Optional[pd.DataFrame]:
53
- # """Update a dataset on Hugging Face Hub."""
54
-
55
- # embedding_service = get_embedding_service()
56
-
57
- # try:
58
- # df_src = await self.read_dataset(dataset_name)
59
- # df_src = Dataset.from_dict(df_src)
60
- # df_update = Dataset.from_dict(updates)
61
-
62
- # df = concatenate_datasets(df_src, df_update)
63
-
64
- # # for column, values in updates.items():
65
- # # if column in df.columns:
66
- # # df[column] = values
67
- # # else:
68
- # # logger.warning(f"Column '{column}' not found in dataset.")
69
- # # await self.push_to_hub(df, dataset_name)
70
- # # return df
71
- # except Exception as e:
72
- # logger.error(f"Failed to update dataset: {e}")
73
- # raise DatasetPushError(f"Failed to update dataset: {e}")
74
-
75
  async def update_dataset(
76
  self,
77
  dataset_name: str,
 
47
  logger.error(f"Failed to read dataset: {e}")
48
  raise DatasetNotFoundError(f"Dataset not found: {e}")
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  async def update_dataset(
51
  self,
52
  dataset_name: str,
src/main.py CHANGED
@@ -1,191 +1,3 @@
1
- # import os
2
- # from fastapi import FastAPI, Depends, HTTPException
3
- # from fastapi.responses import JSONResponse, RedirectResponse
4
- # from fastapi.middleware.gzip import GZipMiddleware
5
- # from pydantic import BaseModel
6
- # from typing import List, Dict
7
- # from src.api.models.embedding_models import (
8
- # CreateEmbeddingRequest,
9
- # ReadEmbeddingRequest,
10
- # UpdateEmbeddingRequest,
11
- # DeleteEmbeddingRequest,
12
- # )
13
- # from src.api.database import get_db, Database, QueryExecutionError, HealthCheckError
14
- # from src.api.services.embedding_service import EmbeddingService
15
- # from src.api.services.huggingface_service import HuggingFaceService
16
- # from src.api.exceptions import DatasetNotFoundError, DatasetPushError, OpenAIError
17
- # import pandas as pd
18
- # import logging
19
- # from dotenv import load_dotenv
20
-
21
- # # Load environment variables
22
- # load_dotenv()
23
-
24
- # # Set up structured logging
25
- # logging.basicConfig(
26
- # level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
27
- # )
28
- # logger = logging.getLogger(__name__)
29
-
30
- # description = """A FastAPI application for similarity search with PostgreSQL and OpenAI embeddings.
31
-
32
- # Direct/API URL:
33
- # https://re-mind-similarity-search.hf.space
34
- # """
35
-
36
- # # Initialize FastAPI app
37
- # app = FastAPI(
38
- # title="Similarity Search API",
39
- # description=description,
40
- # version="1.0.0",
41
- # )
42
-
43
- # app.add_middleware(GZipMiddleware, minimum_size=1000)
44
-
45
-
46
- # # Root endpoint redirects to /docs
47
- # @app.get("/")
48
- # async def root():
49
- # return RedirectResponse(url="/docs")
50
-
51
-
52
- # # Health check endpoint
53
- # @app.get("/health")
54
- # async def health_check(db: Database = Depends(get_db)):
55
- # try:
56
- # is_healthy = await db.health_check()
57
- # if not is_healthy:
58
- # raise HTTPException(status_code=500, detail="Database is unhealthy")
59
- # return {"status": "healthy"}
60
- # except HealthCheckError as e:
61
- # raise HTTPException(status_code=500, detail=str(e))
62
-
63
-
64
- # # Dependency to get EmbeddingService
65
- # def get_embedding_service() -> EmbeddingService:
66
- # return EmbeddingService(openai_api_key=os.getenv("OPENAI_API_KEY"))
67
-
68
-
69
- # # Dependency to get HuggingFaceService
70
- # def get_huggingface_service() -> HuggingFaceService:
71
- # return HuggingFaceService()
72
-
73
-
74
- # # Endpoint to create embeddings
75
- # @app.post("/create_embedding")
76
- # async def create_embedding(
77
- # request: CreateEmbeddingRequest,
78
- # db: Database = Depends(get_db),
79
- # embedding_service: EmbeddingService = Depends(get_embedding_service),
80
- # huggingface_service: HuggingFaceService = Depends(get_huggingface_service),
81
- # ):
82
- # """
83
- # Create embeddings for the target column in the dataset.
84
- # """
85
- # try:
86
- # # Step 1: Query the database
87
- # logger.info("Fetching data from the database...")
88
- # result = await db.fetch(request.query)
89
- # df = pd.DataFrame(result)
90
-
91
- # # Step 2: Generate embeddings
92
- # df = await embedding_service.create_embeddings(
93
- # df, request.target_column, request.output_column
94
- # )
95
-
96
- # # Step 3: Push to Hugging Face Hub
97
- # await huggingface_service.push_to_hub(df, request.dataset_name)
98
-
99
- # return JSONResponse(
100
- # content={
101
- # "message": "Embeddings created and pushed to Hugging Face Hub.",
102
- # "dataset_name": request.dataset_name,
103
- # "num_rows": len(df),
104
- # }
105
- # )
106
- # except QueryExecutionError as e:
107
- # logger.error(f"Database query failed: {e}")
108
- # raise HTTPException(status_code=500, detail=f"Database query failed: {e}")
109
- # except OpenAIError as e:
110
- # logger.error(f"OpenAI API error: {e}")
111
- # raise HTTPException(status_code=500, detail=f"OpenAI API error: {e}")
112
- # except DatasetPushError as e:
113
- # logger.error(f"Failed to push dataset: {e}")
114
- # raise HTTPException(status_code=500, detail=f"Failed to push dataset: {e}")
115
- # except Exception as e:
116
- # logger.error(f"An error occurred: {e}")
117
- # raise HTTPException(status_code=500, detail=f"An error occurred: {e}")
118
-
119
-
120
- # # Endpoint to read embeddings
121
- # # @app.get("/read_embeddings/{dataset_name}")
122
- # @app.post("/read_embeddings")
123
- # async def read_embeddings(
124
- # request: ReadEmbeddingRequest,
125
- # huggingface_service: HuggingFaceService = Depends(get_huggingface_service),
126
- # ):
127
- # """
128
- # Read embeddings from a Hugging Face dataset.
129
- # """
130
- # try:
131
- # df = await huggingface_service.read_dataset(request.dataset_name)
132
- # return df
133
- # except DatasetNotFoundError as e:
134
- # logger.error(f"Dataset not found: {e}")
135
- # raise HTTPException(status_code=404, detail=f"Dataset not found: {e}")
136
- # except Exception as e:
137
- # logger.error(f"An error occurred: {e}")
138
- # raise HTTPException(status_code=500, detail=f"An error occurred: {e}")
139
-
140
-
141
- # # Endpoint to update embeddings
142
- # @app.post("/update_embeddings")
143
- # async def update_embeddings(
144
- # request: UpdateEmbeddingRequest,
145
- # huggingface_service: HuggingFaceService = Depends(get_huggingface_service),
146
- # ):
147
- # """
148
- # Update embeddings in a Hugging Face dataset.
149
- # """
150
- # try:
151
- # df = await huggingface_service.update_dataset(
152
- # request.dataset_name, request.updates
153
- # )
154
- # return {
155
- # "message": "Embeddings updated successfully.",
156
- # "dataset_name": request.dataset_name,
157
- # }
158
- # except DatasetPushError as e:
159
- # logger.error(f"Failed to update dataset: {e}")
160
- # raise HTTPException(status_code=500, detail=f"Failed to update dataset: {e}")
161
- # except Exception as e:
162
- # logger.error(f"An error occurred: {e}")
163
- # raise HTTPException(status_code=500, detail=f"An error occurred: {e}")
164
-
165
-
166
- # # Endpoint to delete embeddings
167
- # @app.post("/delete_embeddings")
168
- # async def delete_embeddings(
169
- # request: DeleteEmbeddingRequest,
170
- # huggingface_service: HuggingFaceService = Depends(get_huggingface_service),
171
- # ):
172
- # """
173
- # Delete embeddings from a Hugging Face dataset.
174
- # """
175
- # try:
176
- # await huggingface_service.delete_dataset(request.dataset_name)
177
- # return {
178
- # "message": "Embeddings deleted successfully.",
179
- # "dataset_name": request.dataset_name,
180
- # }
181
- # except DatasetPushError as e:
182
- # logger.error(f"Failed to delete columns: {e}")
183
- # raise HTTPException(status_code=500, detail=f"Failed to delete columns: {e}")
184
- # except Exception as e:
185
- # logger.error(f"An error occurred: {e}")
186
- # raise HTTPException(status_code=500, detail=f"An error occurred: {e}")
187
-
188
-
189
  import os
190
  from fastapi import FastAPI, Depends, HTTPException
191
  from fastapi.responses import JSONResponse, RedirectResponse
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  from fastapi import FastAPI, Depends, HTTPException
3
  from fastapi.responses import JSONResponse, RedirectResponse