bardd committed
Commit 16d282e · verified · 1 Parent(s): 106d63e

first commit
Dockerfile ADDED
@@ -0,0 +1,31 @@
+ FROM python:3.11-slim
+
+ # Set environment variables
+ ENV DEBIAN_FRONTEND=noninteractive
+ ENV TZ=UTC
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     libsqlite3-dev \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Set working directory
+ WORKDIR /app
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir --upgrade pip setuptools && \
+     pip install --no-cache-dir \
+     fastapi==0.113.0 \
+     uvicorn==0.30.6 \
+     joblib==1.4.2 \
+     chromadb==0.5.5 \
+     openai==0.28.0 \
+     numpy==1.26.4
+
+ # Copy the application files
+ COPY . .
+
+ # Command to run the application
+ CMD ["python3", "main.py"]
+
README ADDED
@@ -0,0 +1,41 @@
+ ## README for the `Content-Based Filtering` Feature
+
+ ### Overview
+ This FastAPI application performs content-based filtering using embeddings from a serialized dataset.
+
+ ### Requirements
+ To run this application, install the dependencies listed in `requirements.txt`:
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ ### Data
+ The application uses a serialized dataset stored in `data/data_compressed.pkl` (tracked with Git LFS). The file contains listing IDs and their embeddings. It is deserialized with the `joblib` library in `data/data_loader.py`:
+ ```python
+ embd_id = joblib.load('data/data_compressed.pkl')
+ ```
+
+ ### API
+ The application exposes a single endpoint, `/search`, that accepts a JSON body with a `user_search_query` field.
+
+ **Input:**
+ ```json
+ {
+     "user_search_query": "user search query here"
+ }
+ ```
+
+ **Output:**
+ A list of the original IDs (`PTFS...`) of the top 10 listings that best match the search query.
+
+ ### Running the Application
+ To run the application, use the following command (the Docker image does the equivalent via `python3 main.py`):
+ ```bash
+ uvicorn main:app --host 0.0.0.0 --port 8000
+ ```
+
+ ### Usage
+ 1. Send a POST request to `http://localhost:8000/search` with a JSON body containing your search query (see the client sketch after this file's diff).
+ 2. The application returns a list of original IDs of the listings that match the search query.
+
+ Note: The embedding calls go through DeepInfra's OpenAI-compatible endpoint; replace `DEEPINFRA_API_KEY` in `config/constants.py` (assigned to `openai.api_key` at search time) with your own credentials.
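For reference, here is a minimal client sketch for the `/search` endpoint using only the Python standard library. It assumes the API is already running locally on port 8000 (via `python3 main.py` or the Docker image); the query string is just an illustrative placeholder.

```python
import json
import urllib.request

# Build the JSON body expected by the /search endpoint.
payload = json.dumps(
    {"user_search_query": "two-bedroom apartment near the city centre"}
).encode("utf-8")

request = urllib.request.Request(
    "http://localhost:8000/search",
    data=payload,
    headers={"Content-Type": "application/json"},
    method="POST",
)

# The endpoint responds with a JSON list of original listing IDs (e.g. "PTFS...").
with urllib.request.urlopen(request) as response:
    original_ids = json.loads(response.read())

print(original_ids)
```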
__init__.py ADDED
File without changes
__pycache__/constants.cpython-39.pyc ADDED
Binary file (303 Bytes)
__pycache__/content_base_filtering.cpython-39.pyc ADDED
Binary file (2.26 kB)
__pycache__/services.cpython-39.pyc ADDED
Binary file (2.26 kB)
config/__pycache__/constants.cpython-39.pyc ADDED
Binary file (310 Bytes)
config/constants.py ADDED
@@ -0,0 +1,3 @@
+ DEEPINFRA_API_KEY = "XQsQHnO39Fcx27k8Jke5UqmDcUWUQRBL"
+ DEEPINFRA_ENDPOINT_URL = "https://api.deepinfra.com/v1/openai"
+ DEEPINFRA_MODEL_TAG = "BAAI/bge-base-en-v1.5"
data/__pycache__/data_loader.cpython-39.pyc ADDED
Binary file (355 Bytes)
data/data_compressed.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9fdd0aa7863ef419228d8527f79acc295f2ccb3841e75ea2c52abf959b77d60a
+ size 89663942
data/data_loader.py ADDED
@@ -0,0 +1,6 @@
+ import joblib
+
+ def load_data():
+     embd_id = joblib.load('data/data_compressed.pkl')
+     return embd_id
+
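A quick sanity-check sketch for the loaded data. The layout is not documented in this commit, so the assumptions below (a 2-D NumPy array whose first column is the numeric listing ID and whose remaining columns are the embedding vector) are inferred from how `SearchService.ingest_data` slices `embd_id` in `services.py`.

```python
from data.data_loader import load_data

# Assumed layout (inferred from services.py): column 0 = numeric listing ID,
# columns 1..n = embedding values for that listing.
embd_id = load_data()

print(embd_id.shape)         # (number_of_listings, 1 + embedding_dimension)
print(embd_id[0, 0])         # numeric ID of the first listing (prefixed with "PTFS" at ingest time)
print(embd_id[0, 1:].shape)  # embedding vector of the first listing
```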
data_models/__pycache__/models.cpython-39.pyc ADDED
Binary file (394 Bytes)
data_models/models.py ADDED
@@ -0,0 +1,3 @@
+ from pydantic import BaseModel
+ class SearchQuery(BaseModel):
+     user_search_query: str
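A small, hypothetical sketch of the request model on its own, showing the validation FastAPI applies to the request body; the example query string is made up.

```python
from pydantic import ValidationError

from data_models.models import SearchQuery

# A well-formed body parses into the model.
query = SearchQuery(user_search_query="family home with a garden")
print(query.user_search_query)

# A body without user_search_query fails validation;
# FastAPI turns this into a 422 Unprocessable Entity response.
try:
    SearchQuery()
except ValidationError as error:
    print(error)
```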
docker_delete.sh ADDED
@@ -0,0 +1,21 @@
+ #!/bin/bash
+
+ echo "Stopping all running containers..."
+ docker stop $(docker ps -q)
+
+ echo "Removing all containers..."
+ docker rm $(docker ps -a -q)
+
+ echo "Removing all Docker images..."
+ docker rmi $(docker images -q)
+
+ echo "Removing all Docker volumes..."
+ docker volume rm $(docker volume ls -q)
+
+ echo "Removing all Docker networks..."
+ docker network rm $(docker network ls -q | grep -v "bridge\|host\|none")
+
+ echo "Pruning all unused Docker objects (dangling images, stopped containers, unused networks)..."
+ docker system prune -f --volumes
+
+ echo "Docker cleanup complete!"
main.py ADDED
@@ -0,0 +1,25 @@
+ # main.py
+ from fastapi import FastAPI
+ from data_models.models import SearchQuery
+ from services import SearchService
+ from data.data_loader import load_data
+
+ # Initializing FastAPI
+ app = FastAPI()
+
+ # Initializing SearchService
+ search_service = SearchService()
+
+ # Loading data from the serialized file
+ embd_id = load_data()
+
+ # Ingesting data into the vector DB
+ search_service.ingest_data(embd_id)
+
+ @app.post("/search")
+ async def search(query: SearchQuery):
+     return search_service.search(query.user_search_query)
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=8000)
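An in-process smoke-test sketch using FastAPI's `TestClient`. Assumptions: `httpx` is installed (the test client needs it and it is not listed in `requirements.txt`), `data/data_compressed.pkl` is present so the module-level ingestion in `main.py` can run, and a valid DeepInfra key is configured; the query text is illustrative only.

```python
from fastapi.testclient import TestClient

# Importing main triggers load_data() and ingest_data(), so this takes a while
# and requires the data file; the search call also needs network access for the
# embedding request.
from main import app

client = TestClient(app)
response = client.post(
    "/search", json={"user_search_query": "studio flat close to public transport"}
)

assert response.status_code == 200
print(response.json())  # a list of original listing IDs such as "PTFS..."
```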
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ fastapi==0.113.0
+ uvicorn==0.30.6
+ joblib==1.4.2
+ chromadb==0.5.5
+ openai==0.28.0
+ numpy==1.26.4
services.py ADDED
@@ -0,0 +1,47 @@
+ # services.py
+ from typing import List
+ import numpy as np
+ from chromadb import Client
+ import openai
+ from config.constants import DEEPINFRA_MODEL_TAG, DEEPINFRA_ENDPOINT_URL, DEEPINFRA_API_KEY
+
+ class SearchService:
+     def __init__(self):
+         self.client = Client()
+         self.collection_name = "listing_collection"
+         self.collection = self.client.create_collection(
+             name=self.collection_name,
+             metadata={
+                 'description': 'real_estate_listing',
+                 "hnsw:construction_ef": 64,
+                 "hnsw:M": 32,
+                 "hnsw:search_ef": 32,
+             },
+             embedding_function=None,
+         )
+
+     def ingest_data(self, embd_id):
+         # Add embeddings to the collection with original IDs as metadata
+         embeddings = embd_id[:, 1:].astype(float)
+         original_ids = [f"PTFS{num}" for num in embd_id[:, 0].astype('int64')]
+         ids = [str(i) for i in range(len(original_ids))]
+         self.collection.add(
+             ids=ids,
+             embeddings=embeddings,
+             metadatas=[{"original_id": id} for id in original_ids],
+         )
+
+     def search(self, query: str) -> List[str]:
+         # Create an OpenAI client with DeepInfra
+         openai.api_key = DEEPINFRA_API_KEY
+         openai.api_base = DEEPINFRA_ENDPOINT_URL
+         # Convert the search query to embeddings
+         embeddings = openai.Embedding.create(input=query, model=DEEPINFRA_MODEL_TAG, encoding_format="float")
+         query_embedding = embeddings.data[0].embedding
+
+         # Search for similar embeddings
+         results = self.collection.query(np.array([query_embedding]), n_results=10)
+
+         # Extract the original IDs from the results
+         original_ids = [metadata["original_id"] for metadata in results["metadatas"][0]]
+         return original_ids
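To make the ID round-trip in `ingest_data`/`search` concrete, here is a self-contained toy sketch (made-up two-dimensional vectors and listing IDs, no DeepInfra call): Chroma stores the real listing ID under the `original_id` metadata key and returns metadatas grouped per query, which is why `search` reads `results["metadatas"][0]`.

```python
from chromadb import Client

client = Client()
collection = client.create_collection(name="toy_collection")

# Positional ids ("0", "1", ...) as in ingest_data; the real listing ID lives in metadata.
collection.add(
    ids=["0", "1"],
    embeddings=[[0.1, 0.2], [0.9, 0.8]],
    metadatas=[{"original_id": "PTFS101"}, {"original_id": "PTFS102"}],
)

# One query embedding -> results are lists of lists, one inner list per query.
results = collection.query(query_embeddings=[[0.1, 0.2]], n_results=1)
print([m["original_id"] for m in results["metadatas"][0]])  # ['PTFS101']
```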