# retriever_pinecone.py
import os
import time
import traceback
import urllib.parse # Keep for potential future ID decoding if needed
from pinecone import Pinecone
import openai # For generating query embeddings
from typing import List, Dict  # Used for the type hints on find_similar_paragraphs
# --- Configuration ---
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
# PINECONE_ENVIRONMENT is deprecated for serverless/starter, use index host or name directly
INDEX_NAME = "chassidus-index"  # Must match the index used by the upload script
EMBEDDING_MODEL = "text-embedding-3-large"  # Must match the model used to embed the corpus
print(f"Retriever using Pinecone Index: {INDEX_NAME}")
print(f"Retriever using OpenAI Embedding Model: {EMBEDDING_MODEL}")
# --- End Configuration ---
# --- Initialize OpenAI Client ---
openai_client = None
if OPENAI_API_KEY:
    try:
        openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)
        print("OpenAI client initialized for retriever.")
    except Exception as e:
        print(f"Error initializing OpenAI client for retriever: {e}")
        traceback.print_exc()
else:
    print("Warning: OPENAI_API_KEY not found. Retriever requires it for query embeddings.")

# --- Initialize Pinecone Client and Index ---
pc = None
index = None
if PINECONE_API_KEY:
    try:
        print("Initializing Pinecone client for retriever...")
        pc = Pinecone(api_key=PINECONE_API_KEY)
        print(f"Connecting to index '{INDEX_NAME}'...")
        # Check that the index exists before connecting
        if INDEX_NAME not in [idx.name for idx in pc.list_indexes().indexes]:
            print(f"Error: Index '{INDEX_NAME}' does not exist. Cannot connect retriever.")
        else:
            index = pc.Index(INDEX_NAME)
            print("Connected to Pinecone index for retriever.")
            # Verify the connection by fetching stats
            stats = index.describe_index_stats()
            print(f"Index stats: {stats}")
            if stats.total_vector_count == 0:
                print(f"Warning: Pinecone index '{INDEX_NAME}' is empty.")
    except Exception as e:
        print(f"Error initializing Pinecone or connecting to index for retriever: {e}")
        traceback.print_exc()
else:
    print("Error: PINECONE_API_KEY not found. Cannot initialize Pinecone client.")

# --- Status Check ---
def check_retriever_status():
    """Checks if the Pinecone retriever is ready."""
    status = True
    messages = []
    if not OPENAI_API_KEY:
        status = False
        messages.append("OpenAI API Key missing.")
    if not openai_client:
        status = False
        messages.append("OpenAI client initialization failed.")
    if not PINECONE_API_KEY:
        status = False
        messages.append("Pinecone API Key missing.")
    if not pc:
        status = False
        messages.append("Pinecone client failed to initialize.")
    if not index:  # The index object was never created
        status = False
        messages.append(
            f"Pinecone index '{INDEX_NAME}' could not be connected to or doesn't exist."
        )
    else:
        try:
            stats = index.describe_index_stats()
            if stats.total_vector_count == 0:
                messages.append(
                    f"Retriever ready, but Pinecone index '{INDEX_NAME}' is empty."
                )
        except Exception as stats_err:
            status = False
            messages.append(
                f"Failed to get stats for index '{INDEX_NAME}': {stats_err}"
            )
    if status and not messages:
        messages.append("Retriever ready.")
    return status, " ".join(messages)
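
# Illustrative usage sketch (an assumption about how callers consume this check;
# the actual application wiring may differ):
#
#   ready, status_msg = check_retriever_status()
#   if not ready:
#       print(status_msg)  # e.g. surface this in the UI instead of querying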

# --- Retrieval Function ---
def get_embedding(text, model=EMBEDDING_MODEL):
    """Generates an embedding for the given text using OpenAI."""
    if not openai_client:
        raise ValueError("OpenAI client not initialized.")
    try:
        text = text.replace("\n", " ")
        response = openai_client.embeddings.create(input=[text], model=model)
        return response.data[0].embedding
    except Exception as e:
        print(f"Error getting embedding for text '{text[:100]}...': {e}")
        traceback.print_exc()
        return None
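
# Illustrative usage sketch (assumes the OpenAI client initialized successfully):
#
#   vec = get_embedding("role of joy in divine service")
#   if vec is not None:
#       print(len(vec))  # text-embedding-3-large returns 3072-dimensional vectors by default
#
# The query must be embedded with the same model (and dimensionality) used when
# uploading the corpus, otherwise the Pinecone similarity scores are not meaningful.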

def find_similar_paragraphs(query_text: str, n_results: int = 10) -> List[Dict]:
    """
    Retrieves similar paragraphs from Pinecone based on the query text.
    Searches against combined Hebrew+English embeddings and returns metadata
    including the separate hebrew_text and english_text fields.
    """
    ready, message = check_retriever_status()
    if not ready or index is None:  # Check the index object specifically
        print(f"Retriever not ready: {message}")
        return []

    print(f"\nRetrieving similar paragraphs for: '{query_text[:100]}...'")
    start_time = time.time()
    try:
        # 1. Get the query embedding
        print("Generating query embedding...")
        query_embedding = get_embedding(query_text)
        if query_embedding is None:
            print("Failed to generate query embedding.")
            return []
        embed_time = time.time() - start_time
        print(f"Query embedding generated in {embed_time:.4f} seconds.")

        # 2. Query Pinecone
        print(f"Querying Pinecone index '{INDEX_NAME}' for top {n_results} results...")
        query_start_time = time.time()
        response = index.query(
            vector=query_embedding,
            top_k=n_results,
            include_metadata=True,  # Essential to get the text back
        )
        query_time = time.time() - query_start_time
        print(f"Pinecone query completed in {query_time:.4f} seconds.")

        # 3. Process the results
        formatted_results = []
        if not response or not response.matches:
            print("No results found by Pinecone for this query.")
            return []
        print(f"Processing {len(response.matches)} raw results from Pinecone...")
        for match in response.matches:
            score = match.score  # Cosine similarity score (higher is better)
            vector_id = match.id  # The ID stored in Pinecone (should be original_id)
            metadata = match.metadata if match.metadata else {}

            # Extract data from metadata, using .get() with defaults for robustness
            original_id = metadata.get("original_id", vector_id)  # Fall back to vector_id if missing
            hebrew_text = metadata.get("hebrew_text", "")
            english_text = metadata.get("english_text", "")
            source_name = metadata.get("source_name", "Unknown Source")

            # Distance = 1 - cosine similarity (kept for consistency with other retrievers)
            distance = 1.0 - score

            formatted_results.append({
                "vector_id": vector_id,        # The ID used in Pinecone
                "original_id": original_id,    # The original ID from the source JSON
                "source_name": source_name,
                "hebrew_text": hebrew_text,
                "english_text": english_text,
                "distance": distance,          # Lower is better
                "similarity_score": score,     # Direct score from Pinecone (higher is better)
            })

        # Pinecone results are already sorted by score (descending),
        # i.e. by distance ascending (most similar first).
        total_retrieval_time = time.time() - start_time
        print(
            f"Retrieved and processed {len(formatted_results)} paragraphs from Pinecone "
            f"in {total_retrieval_time:.2f} seconds."
        )
        return formatted_results
    except Exception as e:
        print(f"Error during Pinecone query or processing: {e}")
        traceback.print_exc()
        return []
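
# Illustrative sketch only (not part of the original module): one way a caller could
# flatten the retrieved paragraphs into a context block for a RAG prompt. The label
# format and the preference for hebrew_text over english_text are assumptions here,
# not the app's actual prompt-building logic.
def format_results_as_context(results: List[Dict], max_chars: int = 8000) -> str:
    """Joins retrieved paragraphs into a single, length-capped context string."""
    parts = []
    used = 0
    for i, doc in enumerate(results, start=1):
        text = doc.get("hebrew_text") or doc.get("english_text") or ""
        entry = f"[{i}] {doc.get('source_name', 'Unknown Source')}\n{text}"
        if used + len(entry) > max_chars:
            break
        parts.append(entry)
        used += len(entry)
    return "\n\n".join(parts)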

# --- Main Test Block ---
if __name__ == "__main__":
    ready, msg = check_retriever_status()
    print(f"\nRetriever Status: {ready} - {msg}")
    if ready:
        print("\n--- Running Retriever Test ---")
        test_query = "role of joy in divine service"  # Test query in English
        # test_query_he = "תפקיד השמחה בעבודת ה'"  # Test query in Hebrew (optional)
        retrieved_docs = find_similar_paragraphs(test_query, n_results=5)
        if retrieved_docs:
            print("\n--- Top Test Results ---")
            for i, doc in enumerate(retrieved_docs):
                print(f"\n{i+1}. Score: {doc['similarity_score']:.4f} (Distance: {doc['distance']:.4f})")
                print(f"   Source: {doc['source_name']} (Orig ID: {doc['original_id']}, VecID: {doc['vector_id']})")
                print(f"   Hebrew: {doc['hebrew_text'][:150]}...")
                print(f"   English: {doc['english_text'][:150]}...")
        else:
            print("No documents retrieved for the test query.")
    else:
        print("Cannot run test because retriever is not ready.")