bardd committed
Commit 16d282e · verified · 1 Parent(s): 106d63e

first commit
Dockerfile ADDED
@@ -0,0 +1,31 @@
+ FROM python:3.11-slim
+
+ # Set environment variables
+ ENV DEBIAN_FRONTEND=noninteractive
+ ENV TZ=UTC
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     libsqlite3-dev \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Set working directory
+ WORKDIR /app
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir --upgrade pip setuptools && \
+     pip install --no-cache-dir \
+     fastapi==0.113.0 \
+     uvicorn==0.30.6 \
+     joblib==1.4.2 \
+     chromadb==0.5.5 \
+     openai==0.28.0 \
+     numpy==1.26.4
+
+ # Copy the application files
+ COPY . .
+
+ # Command to run the application
+ CMD ["python3", "main.py"]
+
README ADDED
@@ -0,0 +1,41 @@
+ ## README for the `Content-Based Filtering` Feature
+
+ ### Overview
+ This FastAPI application performs content-based filtering using embeddings from a serialized dataset.
+
+ ### Requirements
+ To run this application, install the dependencies listed in `requirements.txt`:
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ ### Data
+ The application uses a serialized dataset stored in `data/data_compressed.pkl` (tracked with Git LFS). The file contains listing IDs and their embeddings. It is deserialized with the `joblib` library in `data/data_loader.py`:
+ ```python
+ embd_id = joblib.load('data/data_compressed.pkl')
+ ```
+
+ ### API
+ The application exposes a single endpoint, `/search`, that accepts a JSON body with a `user_search_query` field.
+
+ **Input:**
+ ```json
+ {
+     "user_search_query": "user search query here"
+ }
+ ```
+
+ **Output:**
+ A list of the original IDs (`PTFS...`) of the top 10 listings that best match the search query.
+
+ ### Running the Application
+ To run the application, use the following command (the Docker image does the equivalent via `python3 main.py`):
+ ```bash
+ uvicorn main:app --host 0.0.0.0 --port 8000
+ ```
+
+ ### Usage
+ 1. Send a POST request to `http://localhost:8000/search` with a JSON body containing your search query (see the client sketch after this file's diff).
+ 2. The application returns a list of original IDs of the listings that match the search query.
+
+ Note: The embedding calls go through DeepInfra's OpenAI-compatible endpoint; replace `DEEPINFRA_API_KEY` in `config/constants.py` (assigned to `openai.api_key` at search time) with your own credentials.
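For reference, here is a minimal client sketch for the `/search` endpoint using only the Python standard library. It assumes the API is already running locally on port 8000 (via `python3 main.py` or the Docker image); the query string is just an illustrative placeholder.

```python
import json
import urllib.request

# Build the JSON body expected by the /search endpoint.
payload = json.dumps(
    {"user_search_query": "two-bedroom apartment near the city centre"}
).encode("utf-8")

request = urllib.request.Request(
    "http://localhost:8000/search",
    data=payload,
    headers={"Content-Type": "application/json"},
    method="POST",
)

# The endpoint responds with a JSON list of original listing IDs (e.g. "PTFS...").
with urllib.request.urlopen(request) as response:
    original_ids = json.loads(response.read())

print(original_ids)
```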
__init__.py ADDED
File without changes
__pycache__/constants.cpython-39.pyc ADDED
Binary file (303 Bytes)
__pycache__/content_base_filtering.cpython-39.pyc ADDED
Binary file (2.26 kB)
__pycache__/services.cpython-39.pyc ADDED
Binary file (2.26 kB)
config/__pycache__/constants.cpython-39.pyc ADDED
Binary file (310 Bytes)
config/constants.py ADDED
@@ -0,0 +1,3 @@
+ DEEPINFRA_API_KEY = "XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL"
+ DEEPINFRA_ENDPOINT_URL = "https://api.deepinfra.com/v1/openai"
+ DEEPINFRA_MODEL_TAG = "BAAI/bge-base-en-v1.5"
data/__pycache__/data_loader.cpython-39.pyc ADDED
Binary file (355 Bytes)
data/data_compressed.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9fdd0aa7863ef419228d8527f79acc295f2ccb3841e75ea2c52abf959b77d60a
+ size 89663942
data/data_loader.py ADDED
@@ -0,0 +1,6 @@
+ import joblib
+
+ def load_data():
+     embd_id = joblib.load('data/data_compressed.pkl')
+     return embd_id
+
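A quick sanity-check sketch for the loaded data. The layout is not documented in this commit, so the assumptions below (a 2-D NumPy array whose first column is the numeric listing ID and whose remaining columns are the embedding vector) are inferred from how `SearchService.ingest_data` slices `embd_id` in `services.py`.

```python
from data.data_loader import load_data

# Assumed layout (inferred from services.py): column 0 = numeric listing ID,
# columns 1..n = embedding values for that listing.
embd_id = load_data()

print(embd_id.shape)         # (number_of_listings, 1 + embedding_dimension)
print(embd_id[0, 0])         # numeric ID of the first listing (prefixed with "PTFS" at ingest time)
print(embd_id[0, 1:].shape)  # embedding vector of the first listing
```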
data_models/__pycache__/models.cpython-39.pyc ADDED
Binary file (394 Bytes)
data_models/models.py ADDED
@@ -0,0 +1,3 @@
+ from pydantic import BaseModel
+ class SearchQuery(BaseModel):
+     user_search_query: str
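A small, hypothetical sketch of the request model on its own, showing the validation FastAPI applies to the request body; the example query string is made up.

```python
from pydantic import ValidationError

from data_models.models import SearchQuery

# A well-formed body parses into the model.
query = SearchQuery(user_search_query="family home with a garden")
print(query.user_search_query)

# A body without user_search_query fails validation;
# FastAPI turns this into a 422 Unprocessable Entity response.
try:
    SearchQuery()
except ValidationError as error:
    print(error)
```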
docker_delete.sh ADDED
@@ -0,0 +1,21 @@
+ #!/bin/bash
+
+ echo "Stopping all running containers..."
+ docker stop $(docker ps -q)
+
+ echo "Removing all containers..."
+ docker rm $(docker ps -a -q)
+
+ echo "Removing all Docker images..."
+ docker rmi $(docker images -q)
+
+ echo "Removing all Docker volumes..."
+ docker volume rm $(docker volume ls -q)
+
+ echo "Removing all Docker networks..."
+ docker network rm $(docker network ls -q | grep -v "bridge\|host\|none")
+
+ echo "Pruning all unused Docker objects (dangling images, stopped containers, unused networks)..."
+ docker system prune -f --volumes
+
+ echo "Docker cleanup complete!"
main.py ADDED
@@ -0,0 +1,25 @@
+ # main.py
+ from fastapi import FastAPI
+ from data_models.models import SearchQuery
+ from services import SearchService
+ from data.data_loader import load_data
+
+ # Initializing FastAPI
+ app = FastAPI()
+
+ # Initializing SearchService
+ search_service = SearchService()
+
+ # Loading data from the serialized file
+ embd_id = load_data()
+
+ # Ingesting data into the vector DB
+ search_service.ingest_data(embd_id)
+
+ @app.post("/search")
+ async def search(query: SearchQuery):
+     return search_service.search(query.user_search_query)
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=8000)
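An in-process smoke-test sketch using FastAPI's `TestClient`. Assumptions: `httpx` is installed (the test client needs it and it is not listed in `requirements.txt`), `data/data_compressed.pkl` is present so the module-level ingestion in `main.py` can run, and a valid DeepInfra key is configured; the query text is illustrative only.

```python
from fastapi.testclient import TestClient

# Importing main triggers load_data() and ingest_data(), so this takes a while
# and requires the data file; the search call also needs network access for the
# embedding request.
from main import app

client = TestClient(app)
response = client.post(
    "/search", json={"user_search_query": "studio flat close to public transport"}
)

assert response.status_code == 200
print(response.json())  # a list of original listing IDs such as "PTFS..."
```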
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ fastapi==0.113.0
+ uvicorn==0.30.6
+ joblib==1.4.2
+ chromadb==0.5.5
+ openai==0.28.0
+ numpy==1.26.4
services.py ADDED
@@ -0,0 +1,47 @@
+ # services.py
+ from typing import List
+ import numpy as np
+ from chromadb import Client
+ import openai
+ from config.constants import DEEPINFRA_MODEL_TAG, DEEPINFRA_ENDPOINT_URL, DEEPINFRA_API_KEY
+
+ class SearchService:
+     def __init__(self):
+         self.client = Client()
+         self.collection_name = "listing_collection"
+         self.collection = self.client.create_collection(
+             name=self.collection_name,
+             metadata={
+                 'description': 'real_estate_listing',
+                 "hnsw:construction_ef": 64,
+                 "hnsw:M": 32,
+                 "hnsw:search_ef": 32,
+             },
+             embedding_function=None,
+         )
+
+     def ingest_data(self, embd_id):
+         # Add embeddings to the collection with original IDs as metadata
+         embeddings = embd_id[:, 1:].astype(float)
+         original_ids = [f"PTFS{num}" for num in embd_id[:, 0].astype('int64')]
+         ids = [str(i) for i in range(len(original_ids))]
+         self.collection.add(
+             ids=ids,
+             embeddings=embeddings,
+             metadatas=[{"original_id": id} for id in original_ids],
+         )
+
+     def search(self, query: str) -> List[str]:
+         # Create an OpenAI client with DeepInfra
+         openai.api_key = DEEPINFRA_API_KEY
+         openai.api_base = DEEPINFRA_ENDPOINT_URL
+         # Convert the search query to embeddings
+         embeddings = openai.Embedding.create(input=query, model=DEEPINFRA_MODEL_TAG, encoding_format="float")
+         query_embedding = embeddings.data[0].embedding
+
+         # Search for similar embeddings
+         results = self.collection.query(np.array([query_embedding]), n_results=10)
+
+         # Extract the original IDs from the results
+         original_ids = [metadata["original_id"] for metadata in results["metadatas"][0]]
+         return original_ids
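To make the ID round-trip in `ingest_data`/`search` concrete, here is a self-contained toy sketch (made-up two-dimensional vectors and listing IDs, no DeepInfra call): Chroma stores the real listing ID under the `original_id` metadata key and returns metadatas grouped per query, which is why `search` reads `results["metadatas"][0]`.

```python
from chromadb import Client

client = Client()
collection = client.create_collection(name="toy_collection")

# Positional ids ("0", "1", ...) as in ingest_data; the real listing ID lives in metadata.
collection.add(
    ids=["0", "1"],
    embeddings=[[0.1, 0.2], [0.9, 0.8]],
    metadatas=[{"original_id": "PTFS101"}, {"original_id": "PTFS102"}],
)

# One query embedding -> results are lists of lists, one inner list per query.
results = collection.query(query_embeddings=[[0.1, 0.2]], n_results=1)
print([m["original_id"] for m in results["metadatas"][0]])  # ['PTFS101']
```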