File size: 9,340 Bytes
7f683f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
# retriever_pinecone.py

import os
import time
import traceback
import urllib.parse  # Keep for potential future ID decoding if needed
from pinecone import Pinecone
import openai  # For generating query embeddings
from typing import List, Dict  # <<< --- ADD THIS IMPORT ---

# --- Configuration ---
# Credentials are read from the environment at import time. Missing keys do
# not raise here; they leave the clients as None and are surfaced later via
# check_retriever_status().
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
# PINECONE_ENVIRONMENT is deprecated for serverless/starter, use index host or name directly
INDEX_NAME = "chassidus-index"  # Match the index used in upload script
EMBEDDING_MODEL = "text-embedding-3-large"  # Ensure this matches your embedding model

print(f"Retriever using Pinecone Index: {INDEX_NAME}")
# Removed Environment print, less relevant for v3 client usage
print(f"Retriever using OpenAI Embedding Model: {EMBEDDING_MODEL}")
# --- End Configuration ---

# --- Initialize OpenAI Client ---
# On any failure, openai_client stays None; get_embedding() raises on that.
openai_client = None
if OPENAI_API_KEY:
    try:
        openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)
        print("OpenAI client initialized for retriever.")
    except Exception as e:
        print(f"Error initializing OpenAI client for retriever: {e}")
        traceback.print_exc()
else:
    print(
        "Warning: OPENAI_API_KEY not found. Retriever requires it for query embeddings."
    )

# --- Initialize Pinecone Client and Index ---
# NOTE: this performs network calls at import time (list_indexes,
# describe_index_stats). On any failure, pc/index stay None and
# find_similar_paragraphs() returns [] instead of querying.
pc = None
index = None
if PINECONE_API_KEY:
    try:
        print("Initializing Pinecone client for retriever...")
        pc = Pinecone(api_key=PINECONE_API_KEY)
        print(f"Connecting to index '{INDEX_NAME}'...")

        # Check if index exists before connecting
        if INDEX_NAME not in [idx.name for idx in pc.list_indexes().indexes]:
            print(
                f"Error: Index '{INDEX_NAME}' does not exist. Cannot connect retriever."
            )
        else:
            index = pc.Index(INDEX_NAME)
            print("Connected to Pinecone index for retriever.")
            # Verify connection with stats
            stats = index.describe_index_stats()
            print(f"Index stats: {stats}")
            if stats.total_vector_count == 0:
                print(f"Warning: Pinecone index '{INDEX_NAME}' is empty.")

    except Exception as e:
        print(
            f"Error initializing Pinecone or connecting to index for retriever: {e}"
        )
        traceback.print_exc()
else:
    print(
        "Error: PINECONE_API_KEY not found. Cannot initialize Pinecone client."
    )

# --- Status Check ---
def check_retriever_status():
    """Report whether the retriever's module-level clients are usable.

    Inspects the module globals set at import time (API keys, OpenAI client,
    Pinecone client and index) and, if the index is connected, fetches its
    stats to warn about an empty index.

    Returns:
        tuple: (ready, message) where ready is a bool and message is a
        space-joined summary of every problem found (or a ready notice).
    """
    ok = True
    notes = []

    # Each missing prerequisite is recorded; we keep going so the caller
    # sees every problem at once rather than just the first.
    if not OPENAI_API_KEY:
        ok = False
        notes.append("OpenAI API Key missing.")
    if not openai_client:
        ok = False
        notes.append("OpenAI client initialization failed.")
    if not PINECONE_API_KEY:
        ok = False
        notes.append("Pinecone API Key missing.")
    if not pc:
        ok = False
        notes.append("Pinecone client failed to initialize.")

    if not index:  # index object never got created at import time
        ok = False
        notes.append(
            f"Pinecone index '{INDEX_NAME}' could not be connected to or doesn't exist."
        )
    else:
        # Index is connected: probe it live so stale connections are caught.
        try:
            index_stats = index.describe_index_stats()
            if index_stats.total_vector_count == 0:
                # Empty index is a warning only; the retriever still works.
                notes.append(
                    f"Retriever ready, but Pinecone index '{INDEX_NAME}' is empty."
                )
        except Exception as stats_err:
            ok = False
            notes.append(
                f"Failed to get stats for index '{INDEX_NAME}': {stats_err}")

    if ok and not notes:
        notes.append("Retriever ready.")

    return ok, " ".join(notes)


# --- Retrieval Function ---
def get_embedding(text, model=EMBEDDING_MODEL):
    """Return the OpenAI embedding vector for *text*, or None on failure.

    Newlines are flattened to spaces before embedding. Errors from the API
    call are logged (with traceback) rather than raised, so callers must
    check for a None result.

    Raises:
        ValueError: if the module-level OpenAI client was never initialized.
    """
    if not openai_client:
        raise ValueError("OpenAI client not initialized.")
    try:
        # Flatten newlines; rebinding keeps the error message below in sync.
        text = text.replace("\n", " ")
        api_response = openai_client.embeddings.create(input=[text], model=model)
        return api_response.data[0].embedding
    except Exception:
        print(f"Error getting embedding for text: '{text[:100]}...'")
        traceback.print_exc()
        return None


# This is line 114 where the error occurred, now List and Dict are defined via import
def find_similar_paragraphs(query_text: str,
                            n_results: int = 10) -> List[Dict]:
    """
    Retrieves similar paragraphs from Pinecone based on the query text.
    Searches against combined Hebrew+English embeddings.
    Retrieves metadata including separate hebrew_text and english_text.
    """
    ready, message = check_retriever_status()
    if not ready or index is None:  # Check index specifically
        print(f"Retriever not ready: {message}")
        return []

    print(f"\nRetrieving similar paragraphs for: '{query_text[:100]}...'")
    start_time = time.time()

    try:
        # 1. Get query embedding
        print("Generating query embedding...")
        query_embedding = get_embedding(query_text)
        if query_embedding is None:
            print("Failed to generate query embedding.")
            return []
        embed_time = time.time() - start_time
        print(f"Query embedding generated in {embed_time:.4f} seconds.")

        # 2. Query Pinecone
        print(
            f"Querying Pinecone index '{INDEX_NAME}' for top {n_results} results..."
        )
        query_start_time = time.time()
        response = index.query(
            vector=query_embedding,
            top_k=n_results,
            include_metadata=True  # Essential to get the text back
        )
        query_time = time.time() - query_start_time
        print(f"Pinecone query completed in {query_time:.4f} seconds.")

        # 3. Process results
        formatted_results = []
        if not response or not response.matches:
            print("No results found by Pinecone for this query.")
            return []

        print(
            f"Processing {len(response.matches)} raw results from Pinecone...")
        for match in response.matches:
            score = match.score  # Cosine similarity score (higher is better)
            vector_id = match.id  # The ID stored in Pinecone (should be original_id)
            metadata = match.metadata if match.metadata else {}

            # --- Extract data from metadata ---
            # Use .get() with defaults for robustness
            original_id = metadata.get(
                'original_id', vector_id)  # Fallback to vector_id if missing
            hebrew_text = metadata.get('hebrew_text', '')
            english_text = metadata.get('english_text', '')
            source_name = metadata.get('source_name', 'Unknown Source')

            # Calculate distance from similarity score (for consistency if needed)
            # Distance = 1 - Cosine Similarity
            distance = 1.0 - score

            doc_data = {
                "vector_id": vector_id,  # The ID used in Pinecone
                "original_id":
                original_id,  # The original ID from the source JSON
                "source_name": source_name,
                "hebrew_text": hebrew_text,
                "english_text": english_text,  # Include English text
                "distance": distance,  # Calculated distance (lower is better)
                "similarity_score":
                score,  # Direct score from Pinecone (higher is better)
            }
            formatted_results.append(doc_data)

        # Pinecone results are already sorted by score (descending),
        # which means distance is ascending (most similar first).

        total_retrieval_time = time.time() - start_time
        print(
            f"Retrieved and processed {len(formatted_results)} paragraphs from Pinecone in {total_retrieval_time:.2f} seconds."
        )
        return formatted_results

    except Exception as e:
        print(f"Error during Pinecone query or processing: {e}")
        traceback.print_exc()
        return []


# --- Main Test Block ---
# --- Main Test Block ---
# Smoke test: verifies readiness, runs one English query, and prints the
# top matches. Run directly: `python retriever_pinecone.py`.
if __name__ == "__main__":
    ready, msg = check_retriever_status()
    print(f"\nRetriever Status: {ready} - {msg}")
    if ready:
        print("\n--- Running Retriever Test ---")
        test_query = "role of joy in divine service"  # Test query in English
        # test_query_he = "תפקיד השמחה בעבודת ה'" # Test query in Hebrew (optional)

        retrieved_docs = find_similar_paragraphs(test_query, n_results=5)

        if retrieved_docs:
            print("\n--- Top Test Results ---")
            # Show score/distance, provenance IDs, and text previews per hit.
            for i, doc in enumerate(retrieved_docs):
                print(
                    f"\n{i+1}. Score: {doc['similarity_score']:.4f} (Distance: {doc['distance']:.4f})"
                )
                print(
                    f"   Source: {doc['source_name']} (Orig ID: {doc['original_id']}, VecID: {doc['vector_id']})"
                )
                # Previews are truncated to 150 chars to keep output readable.
                print(f"   Hebrew: {doc['hebrew_text'][:150]}...")
                print(f"   English: {doc['english_text'][:150]}...")
        else:
            print("No documents retrieved for the test query.")
    else:
        print(f"Cannot run test because retriever is not ready.")