supertskone committed on
Commit
f63ca0c
·
verified ·
1 Parent(s): 93e9fef

Upload search_engine.py

Browse files
Files changed (1) hide show
  1. app/search_engine.py +54 -0
app/search_engine.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from typing import List, Tuple
3
+
4
+ from .similarity import cosine_similarity
5
+ from .vectorizer import Vectorizer
6
+ import logging
7
+
8
+ # Configure root logging at INFO level and create this module's logger
9
+ logging.basicConfig(level=logging.INFO)
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class PromptSearchEngine:
    """Find the stored prompts that are most similar to a text query.

    The query is encoded with the project's ``Vectorizer``. Results come
    either from the vectorizer's Pinecone index or from an in-memory
    cosine-similarity ranking over the pre-encoded prompt corpus. The
    cosine ranking is also used as a fallback whenever the Pinecone query
    raises.
    """

    def __init__(self):
        # The vectorizer is built with Pinecone initialisation switched off.
        # NOTE(review): presumably ``self.vectorizer.index`` is then unset and
        # Pinecone queries always hit the fallback -- confirm in Vectorizer.
        self.vectorizer = Vectorizer(init_pinecone=False)
        # NOTE(review): pokes a private flag on the vectorizer; presumably it
        # marks the prompt data as already loaded -- confirm in Vectorizer.
        self.vectorizer._data_loaded = True
        self.prompts = self.vectorizer.prompts
        # Encode the whole corpus once so every cosine search can reuse it.
        self.corpus_vectors = self.vectorizer.transform(self.prompts)
        self.index_name = self.vectorizer.pinecone_index_name

    def most_similar(self, query: str, n: int = 5, use_pinecone=True) -> List[Tuple[float, str]]:
        """Return up to ``n`` ``(score, prompt)`` pairs, best match first.

        Args:
            query: Text to search for.
            n: Maximum number of results. Values ``<= 0`` yield an empty list.
            use_pinecone: When truthy, query the Pinecone index first and fall
                back to the local cosine ranking if that query raises. When
                falsy, use the local cosine ranking directly.

        Returns:
            A list of ``(similarity score, prompt text)`` tuples.
        """
        # Without this guard ``[-n:]`` with n == 0 is ``[0:]`` and the cosine
        # path would return the *entire* corpus; a negative n would silently
        # drop the lowest-scoring items instead of limiting the result size.
        if n <= 0:
            return []

        logger.info(f"Encoding query: {query}")
        query_vector = self.vectorizer.transform([query])[0]
        logger.info(f"Encoded query vector: {query_vector}")

        if not use_pinecone:
            logger.info(f"I'm cosine similarity search because the use_pinecone is: {use_pinecone}")
            logger.info("Using cosine similarity for search")
            return self._cosine_search(query_vector, n)

        logger.info(f"I'm doing pinecone vector search because the use_pinecone is: {use_pinecone}")
        try:
            # Convert numpy array to list of native Python floats
            query_vector_list = query_vector.tolist()
            search_result = self.vectorizer.index.query(
                vector=query_vector_list,
                top_k=n,
                include_metadata=True,
            )
            logger.info(f"Search result: {search_result}")

            # Keep only matches whose metadata carries the prompt text.
            return [
                (match['score'], match['metadata']['text'])
                for match in search_result['matches']
                if 'text' in match['metadata']
            ]
        except Exception as e:
            # Deliberate best-effort: any Pinecone failure (including a
            # malformed response) degrades to the local ranking.
            logger.error(f"Pinecone query failed: {e}")
            logger.info("Falling back to cosine similarity search.")
            return self._cosine_search(query_vector, n)

    def _cosine_search(self, query_vector, n: int) -> List[Tuple[float, str]]:
        """Rank the in-memory corpus by cosine similarity to ``query_vector``."""
        similarities = cosine_similarity(query_vector, self.corpus_vectors)
        # argsort is ascending: take the last n indices and reverse them so
        # the highest similarity comes first.
        top_n_indices = np.argsort(similarities)[-n:][::-1]
        return [(float(similarities[i]), self.prompts[i]) for i in top_n_indices]