first commit
- Dockerfile +31 -0
- README +41 -0
- __init__.py +0 -0
- __pycache__/constants.cpython-39.pyc +0 -0
- __pycache__/content_base_filtering.cpython-39.pyc +0 -0
- __pycache__/services.cpython-39.pyc +0 -0
- config/__pycache__/constants.cpython-39.pyc +0 -0
- config/constants.py +3 -0
- data/__pycache__/data_loader.cpython-39.pyc +0 -0
- data/data_compressed.pkl +3 -0
- data/data_loader.py +6 -0
- data_models/__pycache__/models.cpython-39.pyc +0 -0
- data_models/models.py +3 -0
- docker_delete.sh +21 -0
- main.py +25 -0
- requirements.txt +6 -0
- services.py +47 -0
Dockerfile
ADDED
@@ -0,0 +1,31 @@
FROM python:3.11-slim

# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=UTC

# Install system dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    libsqlite3-dev \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Install Python dependencies
RUN pip install --no-cache-dir --upgrade pip setuptools && \
    pip install --no-cache-dir \
    fastapi==0.113.0 \
    uvicorn==0.30.6 \
    joblib==1.4.2 \
    chromadb==0.5.5 \
    openai==0.28.0 \
    numpy==1.26.4

# Copy your application files
COPY . .

# Command to run your application
CMD ["python3", "main.py"]
README
ADDED
@@ -0,0 +1,41 @@
## README for `Content Base Filtering` Feature

### Overview
This FastAPI application performs content-based filtering using embeddings from a serialized dataset.

### Requirements
To run this application, install the dependencies listed in `requirements.txt`:
```bash
pip install -r requirements.txt
```

### Data
The application uses a serialized dataset stored in `data/data_compressed.pkl`. This file contains embeddings and IDs. To deserialize it, use the `joblib` library as shown in the code:
```python
embd_id = joblib.load('data/data_compressed.pkl')
```
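The layout of the loaded array is not documented here, but judging from `services.py` it is a 2-D NumPy array whose first column holds the numeric listing ID and whose remaining columns hold the embedding vector. A minimal inspection sketch under that assumption:
```python
import joblib

# Layout inferred from services.py (not documented in the data file itself):
# one row per listing, column 0 = numeric listing ID, columns 1.. = embedding values.
embd_id = joblib.load('data/data_compressed.pkl')
ids = embd_id[:, 0].astype('int64')        # original listing IDs
embeddings = embd_id[:, 1:].astype(float)  # embedding vectors
print(embd_id.shape, ids[:3])
```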

### API
The application exposes a single endpoint `/search` that accepts a JSON body with a `user_search_query` field.

**Input:**
```json
{
    "user_search_query": "user search query here"
}
```

**Output:**
A list of original IDs of the listings that match the search query.

### Running the Application
To run the application, use the following command:
```bash
uvicorn main:app --host 0.0.0.0 --port 8000
```

### Usage
1. Send a POST request to `http://localhost:8000/search` with a JSON body containing your search query (see the example request below).
2. The application will return a list of original IDs of the listings that match the search query.
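As an illustration, a request can be sent using only the Python standard library; the query text and the printed IDs below are hypothetical:
```python
import json
from urllib.request import Request, urlopen

# Example call against the /search endpoint, assuming the app runs on localhost:8000.
payload = json.dumps({"user_search_query": "two bedroom apartment near the city centre"}).encode()
req = Request(
    "http://localhost:8000/search",
    data=payload,
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urlopen(req) as resp:
    print(json.loads(resp.read()))  # e.g. ["PTFS12345", "PTFS67890", ...]
```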

Note: Make sure to replace the `openai.api_key` and `openai.api_base` values (set from `config/constants.py`) with your own credentials for the OpenAI-compatible embeddings endpoint (DeepInfra in this project).
__init__.py
ADDED
File without changes
__pycache__/constants.cpython-39.pyc
ADDED
Binary file (303 Bytes).
__pycache__/content_base_filtering.cpython-39.pyc
ADDED
Binary file (2.26 kB).
__pycache__/services.cpython-39.pyc
ADDED
Binary file (2.26 kB).
config/__pycache__/constants.cpython-39.pyc
ADDED
Binary file (310 Bytes).
config/constants.py
ADDED
@@ -0,0 +1,3 @@
DEEPINFRA_API_KEY = "XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL"
DEEPINFRA_ENDPOINT_URL = "https://api.deepinfra.com/v1/openai"
DEEPINFRA_MODEL_TAG = "BAAI/bge-base-en-v1.5"
data/__pycache__/data_loader.cpython-39.pyc
ADDED
Binary file (355 Bytes).
data/data_compressed.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9fdd0aa7863ef419228d8527f79acc295f2ccb3841e75ea2c52abf959b77d60a
size 89663942
data/data_loader.py
ADDED
@@ -0,0 +1,6 @@
import joblib

def load_data():
    embd_id = joblib.load('data/data_compressed.pkl')
    return embd_id
data_models/__pycache__/models.cpython-39.pyc
ADDED
Binary file (394 Bytes).
data_models/models.py
ADDED
@@ -0,0 +1,3 @@
from pydantic import BaseModel

class SearchQuery(BaseModel):
    user_search_query: str
docker_delete.sh
ADDED
@@ -0,0 +1,21 @@
#!/bin/bash

echo "Stopping all running containers..."
docker stop $(docker ps -q)

echo "Removing all containers..."
docker rm $(docker ps -a -q)

echo "Removing all Docker images..."
docker rmi $(docker images -q)

echo "Removing all Docker volumes..."
docker volume rm $(docker volume ls -q)

echo "Removing all Docker networks..."
docker network rm $(docker network ls -q | grep -v "bridge\|host\|none")

echo "Pruning all unused Docker objects (dangling images, stopped containers, unused networks)..."
docker system prune -f --volumes

echo "Docker cleanup complete!"
main.py
ADDED
@@ -0,0 +1,25 @@
# main.py
from fastapi import FastAPI
from data_models.models import SearchQuery
from services import SearchService
from data.data_loader import load_data

# Initiating FastAPI
app = FastAPI()

# Initiating SearchService
search_service = SearchService()

# Loading data from serialized file
embd_id = load_data()

# Ingesting data into VectorDB
search_service.ingest_data(embd_id)

@app.post("/search")
async def search(query: SearchQuery):
    return search_service.search(query.user_search_query)

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
requirements.txt
ADDED
@@ -0,0 +1,6 @@
fastapi==0.113.0
uvicorn==0.30.6
joblib==1.4.2
chromadb==0.5.5
openai==0.28.0
numpy==1.26.4
services.py
ADDED
@@ -0,0 +1,47 @@
# services.py
from typing import List
import numpy as np
from chromadb import Client
import openai
from config.constants import DEEPINFRA_MODEL_TAG, DEEPINFRA_ENDPOINT_URL, DEEPINFRA_API_KEY

class SearchService:
    def __init__(self):
        self.client = Client()
        self.collection_name = "listing_collection"
        self.collection = self.client.create_collection(
            name=self.collection_name,
            metadata={
                'description': 'real_estate_listing',
                "hnsw:construction_ef": 64,
                "hnsw:M": 32,
                "hnsw:search_ef": 32,
            },
            embedding_function=None,
        )

    def ingest_data(self, embd_id):
        # Add embeddings to the collection with original IDs as metadata
        embeddings = embd_id[:, 1:].astype(float)
        original_ids = [f"PTFS{num}" for num in embd_id[:, 0].astype('int64')]
        ids = [str(i) for i in range(len(original_ids))]
        self.collection.add(
            ids=ids,
            embeddings=embeddings,
            metadatas=[{"original_id": id} for id in original_ids],
        )

    def search(self, query: str) -> List[str]:
        # Create an OpenAI client with DeepInfra
        openai.api_key = DEEPINFRA_API_KEY
        openai.api_base = DEEPINFRA_ENDPOINT_URL
        # Convert the search query to embeddings
        embeddings = openai.Embedding.create(input=query, model=DEEPINFRA_MODEL_TAG, encoding_format="float")
        query_embedding = embeddings.data[0].embedding

        # Search for similar embeddings
        results = self.collection.query(np.array([query_embedding]), n_results=10)

        # Extract the original IDs from the results
        original_ids = [metadata["original_id"] for metadata in results["metadatas"][0]]
        return original_ids