# File size: 2,498 Bytes
# Revision: fe51e27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import numpy as np
from typing import List, Tuple

from .similarity import cosine_similarity
from .vectorizer import Vectorizer
import logging

# Logger named after this module; every message in this file goes through it.
logger = logging.getLogger(__name__)
# Ask the root logger for INFO-and-above output. Per the stdlib contract,
# basicConfig does nothing if the root logger already has handlers.
logging.basicConfig(level=logging.INFO)


class PromptSearchEngine:
    """Rank stored prompts by similarity to a free-text query.

    The engine embeds the query with a ``Vectorizer`` and then either asks a
    Pinecone index for the nearest matches or ranks the locally held corpus
    vectors with ``cosine_similarity``. The local ranking also serves as the
    fallback whenever the Pinecone path raises.
    """

    def __init__(self):
        """Build the vectorizer and pre-compute vectors for every prompt."""
        self.vectorizer = Vectorizer(init_pinecone=False)
        # NOTE(review): presumably marks the vectorizer's data as already
        # loaded so it is not fetched again -- confirm against Vectorizer.
        self.vectorizer._data_loaded = True
        self.prompts = self.vectorizer.prompts
        # One vector per prompt, in the same order as ``self.prompts``; the
        # local search relies on that index alignment.
        self.corpus_vectors = self.vectorizer.transform(self.prompts)
        self.index_name = self.vectorizer.pinecone_index_name

    def most_similar(self, query: str, n: int = 5, use_pinecone=True) -> List[Tuple[float, str]]:
        """Return up to ``n`` ``(score, prompt_text)`` pairs for ``query``.

        Args:
            query: Text to search for.
            n: Maximum number of results. Values ``<= 0`` yield an empty list.
            use_pinecone: When true, query ``self.vectorizer.index`` first and
                fall back to the local cosine-similarity ranking if that
                raises. When false, use the local ranking directly.

        Returns:
            A list of ``(score, text)`` tuples. The local ranking orders them
            from most to least similar; the Pinecone path keeps the order of
            the ``matches`` in the response.
        """
        if n <= 0:
            # Without this guard ``argsort(...)[-0:]`` selects the *entire*
            # array, so asking for zero results would return every prompt
            # (and a negative ``n`` would slice from the front instead).
            return []

        logger.info(f"Encoding query: {query}")
        query_vector = self.vectorizer.transform([query])[0]
        logger.info(f"Encoded query vector: {query_vector}")

        if not use_pinecone:
            logger.info(f"I'm cosine similarity search because the use_pinecone is: {use_pinecone}")
            logger.info("Using cosine similarity for search")
            return self._cosine_search(query_vector, n)

        logger.info(f"I'm doing pinecone vector search because the use_pinecone is: {use_pinecone}")
        try:
            return self._pinecone_search(query_vector, n)
        except Exception as e:
            # Deliberately broad: any failure on the remote path (missing
            # index attribute, network error, unexpected response shape)
            # degrades to the local search instead of failing the request.
            # NOTE(review): __init__ builds the Vectorizer with
            # init_pinecone=False, so ``index`` may not exist -- confirm.
            logger.error(f"Pinecone query failed: {e}")
            logger.info("Falling back to cosine similarity search.")
            return self._cosine_search(query_vector, n)

    def _pinecone_search(self, query_vector, n: int) -> List[Tuple[float, str]]:
        """Query the Pinecone index and format matches as (score, text) pairs."""
        # Convert numpy array to list of native Python floats
        query_vector_list = query_vector.tolist()
        search_result = self.vectorizer.index.query(
            vector=query_vector_list,
            top_k=n,
            include_metadata=True,
        )
        logger.info(f"Search result: {search_result}")

        # Matches whose metadata carries no 'text' entry are skipped.
        return [
            (match['score'], match['metadata']['text'])
            for match in search_result['matches']
            if 'text' in match['metadata']
        ]

    def _cosine_search(self, query_vector, n: int) -> List[Tuple[float, str]]:
        """Rank the local corpus by cosine similarity and keep the top ``n``."""
        similarities = cosine_similarity(query_vector, self.corpus_vectors)
        # argsort is ascending: take the last ``n`` indices, then reverse so
        # the best match comes first. Callers guarantee ``n > 0``.
        top_n_indices = np.argsort(similarities)[-n:][::-1]
        return [(float(similarities[i]), self.prompts[i]) for i in top_n_indices]