File size: 1,866 Bytes
16d282e
 
 
 
 
3ad3642
 
 
16d282e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# services.py
from typing import List
import numpy as np
from chromadb import Client
import openai
from config.constants import DEEPINFRA_MODEL_TAG, DEEPINFRA_ENDPOINT_URL
import os
DEEPINFRA_API_KEY = os.getenv('DEEPINFRA_API_KEY')

class SearchService:
    """Embedding-based search over listings stored in a Chroma collection.

    The service owns a chromadb client and a single collection named
    ``listing_collection``. Embeddings are supplied by the caller at ingest
    time (the collection is created with ``embedding_function=None``), and
    query text is embedded at search time through the OpenAI-compatible
    endpoint configured by ``DEEPINFRA_ENDPOINT_URL``.
    """

    def __init__(self):
        """Create the chromadb client and open the listing collection."""
        self.client = Client()
        self.collection_name = "listing_collection"
        # get_or_create_collection (rather than create_collection) so that
        # building a second service while the collection already exists
        # reuses it instead of raising.
        self.collection = self.client.get_or_create_collection(
            name=self.collection_name,
            metadata={
                'description': 'real_estate_listing',
                "hnsw:construction_ef": 64,
                "hnsw:M": 32,
                "hnsw:search_ef": 32,
            },
            embedding_function=None,
        )

    def ingest_data(self, embd_id) -> None:
        """Add a batch of listing embeddings to the collection.

        Args:
            embd_id: 2-D array-like. Column 0 holds the numeric listing ID
                and the remaining columns hold that listing's embedding
                vector.

        Each row is stored under a positional string ID (``"0"``, ``"1"``,
        ...) with the listing ID kept in metadata as ``original_id`` in the
        form ``PTFS<number>``. An empty batch is a no-op.
        """
        rows = np.asarray(embd_id)
        if rows.size == 0:
            # Nothing to store; avoid handing an empty batch to the collection.
            return

        vectors = rows[:, 1:].astype(float)
        listing_ids = [f"PTFS{num}" for num in rows[:, 0].astype('int64')]
        # NOTE(review): record IDs are positions within this batch, so a
        # second call reuses "0".."n-1" - confirm ingest is only run once
        # per collection, or how chromadb treats the repeated IDs.
        record_ids = [str(position) for position in range(len(listing_ids))]
        self.collection.add(
            ids=record_ids,
            embeddings=vectors,
            metadatas=[{"original_id": listing_id} for listing_id in listing_ids],
        )

    def search(self, query: str, n_results: int = 10) -> List[str]:
        """Return the listing IDs whose embeddings best match ``query``.

        Args:
            query: Free-text search query.
            n_results: Maximum number of matches to request from the
                collection. Defaults to 10.

        Returns:
            The ``original_id`` metadata values (``PTFS<number>``) of the
            matches, in the order the collection returns them. A blank
            query returns an empty list without calling the embedding API.
        """
        if not query or not query.strip():
            return []

        # Point the openai module at the DeepInfra endpoint.
        openai.api_key = DEEPINFRA_API_KEY
        openai.api_base = DEEPINFRA_ENDPOINT_URL
        # Convert the search query to an embedding vector.
        response = openai.Embedding.create(
            input=query,
            model=DEEPINFRA_MODEL_TAG,
            encoding_format="float",
        )
        query_embedding = response.data[0].embedding

        # Search for similar embeddings; one query vector, so one result row.
        results = self.collection.query(
            query_embeddings=[list(query_embedding)],
            n_results=n_results,
        )

        # Extract the original listing IDs from the single result row.
        return [metadata["original_id"] for metadata in results["metadatas"][0]]