# retriever_pinecone.py
import os
import time
import traceback
import urllib.parse  # Keep for potential future ID decoding if needed

from pinecone import Pinecone
import openai  # For generating query embeddings
from typing import List, Dict  # <<< --- ADD THIS IMPORT ---

# --- Configuration ---
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
# PINECONE_ENVIRONMENT is deprecated for serverless/starter; use the index host or name directly
INDEX_NAME = "chassidus-index"  # Match the index used in the upload script
EMBEDDING_MODEL = "text-embedding-3-large"  # Ensure this matches your embedding model

print(f"Retriever using Pinecone Index: {INDEX_NAME}")
# Removed Environment print; less relevant for v3 client usage
print(f"Retriever using OpenAI Embedding Model: {EMBEDDING_MODEL}")
# --- End Configuration ---

# --- Initialize OpenAI Client ---
openai_client = None
if OPENAI_API_KEY:
    try:
        openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)
        print("OpenAI client initialized for retriever.")
    except Exception as e:
        print(f"Error initializing OpenAI client for retriever: {e}")
        traceback.print_exc()
else:
    print(
        "Warning: OPENAI_API_KEY not found. Retriever requires it for query embeddings."
    )

# --- Initialize Pinecone Client and Index ---
pc = None
index = None
if PINECONE_API_KEY:
    try:
        print("Initializing Pinecone client for retriever...")
        pc = Pinecone(api_key=PINECONE_API_KEY)
        print(f"Connecting to index '{INDEX_NAME}'...")
        # Check if index exists before connecting
        if INDEX_NAME not in [idx.name for idx in pc.list_indexes().indexes]:
            print(
                f"Error: Index '{INDEX_NAME}' does not exist. Cannot connect retriever."
            )
        else:
            index = pc.Index(INDEX_NAME)
            print("Connected to Pinecone index for retriever.")
            # Verify connection with stats
            stats = index.describe_index_stats()
            print(f"Index stats: {stats}")
            if stats.total_vector_count == 0:
                print(f"Warning: Pinecone index '{INDEX_NAME}' is empty.")
    except Exception as e:
        print(
            f"Error initializing Pinecone or connecting to index for retriever: {e}"
        )
        traceback.print_exc()
else:
    print(
        "Error: PINECONE_API_KEY not found. Cannot initialize Pinecone client."
    )


# --- Status Check ---
def check_retriever_status():
    """Checks if the Pinecone retriever is ready."""
    status = True
    messages = []
    if not OPENAI_API_KEY:
        status = False
        messages.append("OpenAI API Key missing.")
    if not openai_client:
        status = False
        messages.append("OpenAI client initialization failed.")
    if not PINECONE_API_KEY:
        status = False
        messages.append("Pinecone API Key missing.")
    if not pc:
        status = False
        messages.append("Pinecone client failed to initialize.")
    if not index:  # Check if the index object was successfully created
        status = False
        messages.append(
            f"Pinecone index '{INDEX_NAME}' could not be connected to or doesn't exist."
        )
    elif index:
        try:
            stats = index.describe_index_stats()
            if stats.total_vector_count == 0:
                messages.append(
                    f"Retriever ready, but Pinecone index '{INDEX_NAME}' is empty."
                )
        except Exception as stats_err:
            status = False
            messages.append(
                f"Failed to get stats for index '{INDEX_NAME}': {stats_err}")

    if status and not messages:
        messages.append("Retriever ready.")
    return status, " ".join(messages)


# --- Retrieval Function ---
def get_embedding(text, model=EMBEDDING_MODEL):
    """Generates an embedding for the given text using OpenAI."""
    if not openai_client:
        raise ValueError("OpenAI client not initialized.")
    try:
        text = text.replace("\n", " ")
        response = openai_client.embeddings.create(input=[text], model=model)
        return response.data[0].embedding
    except Exception:
        print(f"Error getting embedding for text: '{text[:100]}...'")
        traceback.print_exc()
        return None


# List and Dict are available via the typing import above (this was the earlier NameError).
def find_similar_paragraphs(query_text: str, n_results: int = 10) -> List[Dict]:
    """
    Retrieves similar paragraphs from Pinecone based on the query text.
    Searches against combined Hebrew+English embeddings.
    Retrieves metadata including separate hebrew_text and english_text.
    """
    ready, message = check_retriever_status()
    if not ready or index is None:  # Check index specifically
        print(f"Retriever not ready: {message}")
        return []

    print(f"\nRetrieving similar paragraphs for: '{query_text[:100]}...'")
    start_time = time.time()

    try:
        # 1. Get query embedding
        print("Generating query embedding...")
        query_embedding = get_embedding(query_text)
        if query_embedding is None:
            print("Failed to generate query embedding.")
            return []
        embed_time = time.time() - start_time
        print(f"Query embedding generated in {embed_time:.4f} seconds.")

        # 2. Query Pinecone
        print(
            f"Querying Pinecone index '{INDEX_NAME}' for top {n_results} results..."
        )
        query_start_time = time.time()
        response = index.query(
            vector=query_embedding,
            top_k=n_results,
            include_metadata=True  # Essential to get the text back
        )
        query_time = time.time() - query_start_time
        print(f"Pinecone query completed in {query_time:.4f} seconds.")

        # 3. Process results
        formatted_results = []
        if not response or not response.matches:
            print("No results found by Pinecone for this query.")
            return []

        print(
            f"Processing {len(response.matches)} raw results from Pinecone...")
        for match in response.matches:
            score = match.score  # Cosine similarity score (higher is better)
            vector_id = match.id  # The ID stored in Pinecone (should be original_id)
            metadata = match.metadata if match.metadata else {}

            # --- Extract data from metadata ---
            # Use .get() with defaults for robustness
            original_id = metadata.get(
                'original_id', vector_id)  # Fall back to vector_id if missing
            hebrew_text = metadata.get('hebrew_text', '')
            english_text = metadata.get('english_text', '')
            source_name = metadata.get('source_name', 'Unknown Source')

            # Calculate distance from the similarity score (for consistency if needed)
            # Distance = 1 - Cosine Similarity
            distance = 1.0 - score

            doc_data = {
                "vector_id": vector_id,  # The ID used in Pinecone
                "original_id": original_id,  # The original ID from the source JSON
                "source_name": source_name,
                "hebrew_text": hebrew_text,
                "english_text": english_text,  # Include English text
                "distance": distance,  # Calculated distance (lower is better)
                "similarity_score": score,  # Direct score from Pinecone (higher is better)
            }
            formatted_results.append(doc_data)

        # Pinecone results are already sorted by score (descending),
        # which means distance is ascending (most similar first).
        total_retrieval_time = time.time() - start_time
        print(
            f"Retrieved and processed {len(formatted_results)} paragraphs from Pinecone in {total_retrieval_time:.2f} seconds."
        )
        return formatted_results

    except Exception as e:
        print(f"Error during Pinecone query or processing: {e}")
        traceback.print_exc()
        return []


# --- Main Test Block ---
if __name__ == "__main__":
    ready, msg = check_retriever_status()
    print(f"\nRetriever Status: {ready} - {msg}")

    if ready:
        print("\n--- Running Retriever Test ---")
        test_query = "role of joy in divine service"  # Test query in English
        # test_query_he = "תפקיד השמחה בעבודת ה'"  # Test query in Hebrew (optional)

        retrieved_docs = find_similar_paragraphs(test_query, n_results=5)

        if retrieved_docs:
            print("\n--- Top Test Results ---")
            for i, doc in enumerate(retrieved_docs):
                print(
                    f"\n{i+1}. Score: {doc['similarity_score']:.4f} (Distance: {doc['distance']:.4f})"
                )
                print(
                    f"   Source: {doc['source_name']} (Orig ID: {doc['original_id']}, VecID: {doc['vector_id']})"
                )
                print(f"   Hebrew: {doc['hebrew_text'][:150]}...")
                print(f"   English: {doc['english_text'][:150]}...")
        else:
            print("No documents retrieved for the test query.")
    else:
        print("Cannot run test because retriever is not ready.")
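
# --- Usage sketch (illustrative only, kept commented out) ---
# A minimal sketch of how a calling module might consume find_similar_paragraphs,
# e.g. to assemble a bilingual context block for a downstream prompt. The helper
# name build_context and the 2000-character budget are assumptions for this sketch,
# not part of the original script or any library API.
#
# def build_context(query: str, max_chars: int = 2000) -> str:
#     docs = find_similar_paragraphs(query, n_results=5)
#     parts, used = [], 0
#     for doc in docs:
#         snippet = f"[{doc['source_name']}]\n{doc['hebrew_text']}\n{doc['english_text']}"
#         if used + len(snippet) > max_chars:
#             break  # stay within the assumed character budget
#         parts.append(snippet)
#         used += len(snippet)
#     return "\n\n".join(parts)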