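# NOTE(review): the lines "Spaces: / Sleeping / Sleeping" that stood here look like web-page residue rather than part of the module - confirm and remove.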
# retriever_pinecone.py
import os
import time
import traceback
import urllib.parse  # Keep for potential future ID decoding if needed
from typing import Any, Dict, List

import openai  # For generating query embeddings
from pinecone import Pinecone
# --- Configuration ---
# API keys are read from the environment. A missing key leaves the value as
# None; the client-initialization code below prints a warning in that case.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
# PINECONE_ENVIRONMENT is deprecated for serverless/starter, use index host or name directly
INDEX_NAME = "chassidus-index"  # Match the index used in upload script
EMBEDDING_MODEL = "text-embedding-3-large"  # Ensure this matches your embedding model

# Announce the effective settings once, at import time.
print(f"Retriever using Pinecone Index: {INDEX_NAME}")
print(f"Retriever using OpenAI Embedding Model: {EMBEDDING_MODEL}")
# --- End Configuration ---
# --- Initialize OpenAI Client ---
# openai_client stays None when the key is missing or construction raises,
# so importing this module never fails because of OpenAI setup.
openai_client = None
if OPENAI_API_KEY:
    try:
        openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)
        print("OpenAI client initialized for retriever.")
    except Exception as e:
        # Best-effort: log the failure and leave openai_client as None.
        print(f"Error initializing OpenAI client for retriever: {e}")
        traceback.print_exc()
else:
    print(
        "Warning: OPENAI_API_KEY not found. Retriever requires it for query embeddings."
    )
# --- Initialize Pinecone Client and Index ---
# Both handles stay None on any failure; check_retriever_status() reads them
# to decide whether the retriever is usable.
pc = None
index = None
if PINECONE_API_KEY:
    try:
        print("Initializing Pinecone client for retriever...")
        pc = Pinecone(api_key=PINECONE_API_KEY)
        print(f"Connecting to index '{INDEX_NAME}'...")
        # Check if index exists before connecting
        existing_index_names = {idx.name for idx in pc.list_indexes().indexes}
        if INDEX_NAME not in existing_index_names:
            print(
                f"Error: Index '{INDEX_NAME}' does not exist. Cannot connect retriever."
            )
        else:
            index = pc.Index(INDEX_NAME)
            print("Connected to Pinecone index for retriever.")
            # Verify connection with stats
            stats = index.describe_index_stats()
            print(f"Index stats: {stats}")
            if stats.total_vector_count == 0:
                print(f"Warning: Pinecone index '{INDEX_NAME}' is empty.")
    except Exception as e:
        # Best-effort: log and continue. Whatever was assigned before the
        # failure (possibly pc, possibly index) keeps its value.
        print(
            f"Error initializing Pinecone or connecting to index for retriever: {e}"
        )
        traceback.print_exc()
else:
    print(
        "Error: PINECONE_API_KEY not found. Cannot initialize Pinecone client."
    )
# --- Status Check ---
def check_retriever_status():
    """Report whether the Pinecone retriever is usable.

    Inspects the module-level API keys, the OpenAI client, the Pinecone
    client and the index handle, then asks the index for its stats.

    Returns:
        A ``(ready, message)`` tuple. ``ready`` is False when a key or client
        is missing, the index handle was never created, or the stats call
        raises. An empty index only adds a note to ``message``; it does not
        clear ``ready``.
    """
    status = True
    messages = []

    # Report only the root cause on each side: with a missing key the client
    # was never attempted, so "initialization failed" would be misleading.
    if not OPENAI_API_KEY:
        status = False
        messages.append("OpenAI API Key missing.")
    elif openai_client is None:
        status = False
        messages.append("OpenAI client initialization failed.")

    if not PINECONE_API_KEY:
        status = False
        messages.append("Pinecone API Key missing.")
    elif pc is None:
        status = False
        messages.append("Pinecone client failed to initialize.")
    elif index is None:  # Index object was never created
        status = False
        messages.append(
            f"Pinecone index '{INDEX_NAME}' could not be connected to or doesn't exist."
        )
    else:
        try:
            # Live call to the index; a failure here means it is unreachable.
            stats = index.describe_index_stats()
            if stats.total_vector_count == 0:
                messages.append(
                    f"Retriever ready, but Pinecone index '{INDEX_NAME}' is empty."
                )
        except Exception as stats_err:
            status = False
            messages.append(
                f"Failed to get stats for index '{INDEX_NAME}': {stats_err}")

    if status and not messages:
        messages.append("Retriever ready.")
    return status, " ".join(messages)
# --- Retrieval Function ---
def get_embedding(text, model=EMBEDDING_MODEL):
    """Return the OpenAI embedding for ``text``, or None on failure.

    Args:
        text: String to embed. Newlines are replaced with spaces before the
            request is sent.
        model: Embedding model name passed to the OpenAI embeddings call.

    Returns:
        The ``embedding`` of the first item in the response, or None when
        ``text`` is not a non-blank string or the request raises.

    Raises:
        ValueError: If the module-level OpenAI client was never initialized.
    """
    if openai_client is None:
        raise ValueError("OpenAI client not initialized.")

    # Validate before the try block so the error handler below can safely
    # slice the text for logging (slicing a non-string would itself raise).
    if not isinstance(text, str) or not text.strip():
        print("Error getting embedding: text must be a non-empty string.")
        return None

    cleaned_text = text.replace("\n", " ")
    try:
        response = openai_client.embeddings.create(input=[cleaned_text],
                                                   model=model)
        return response.data[0].embedding
    except Exception:
        # Best-effort: callers treat None as "no embedding available".
        print(f"Error getting embedding for text: '{cleaned_text[:100]}...'")
        traceback.print_exc()
        return None
# The return annotation below relies on the typing names imported at the top of the file.
def find_similar_paragraphs(query_text: str,
                            n_results: int = 10) -> List[Dict[str, Any]]:
    """Return the paragraphs in the Pinecone index most similar to a query.

    The query is embedded with ``get_embedding`` and sent to the module-level
    Pinecone index with metadata included. Per the original author's notes,
    the index holds combined Hebrew+English embeddings and stores
    ``hebrew_text`` / ``english_text`` separately in metadata.

    Args:
        query_text: Free-text query. Blank or non-string input returns ``[]``.
        n_results: Number of matches to request (``top_k``). Values below 1
            return ``[]`` without contacting any service.

    Returns:
        One dict per match, in the order Pinecone returned them, with keys
        ``vector_id``, ``original_id``, ``source_name``, ``hebrew_text``,
        ``english_text``, ``distance`` and ``similarity_score``. Returns an
        empty list when the retriever is not ready, the embedding fails,
        nothing matches, or any error occurs (errors are printed, not raised).
    """
    # Cheap input checks first: no point in a status round-trip or an
    # embedding request for a query that cannot produce results.
    if not isinstance(query_text, str) or not query_text.strip():
        print("Retriever received an empty query; nothing to search for.")
        return []
    if n_results < 1:
        print(
            f"n_results must be at least 1 (got {n_results}); nothing to retrieve."
        )
        return []

    ready, message = check_retriever_status()
    if not ready or index is None:  # Check index specifically
        print(f"Retriever not ready: {message}")
        return []

    print(f"\nRetrieving similar paragraphs for: '{query_text[:100]}...'")
    start_time = time.time()
    try:
        # 1. Get query embedding
        print("Generating query embedding...")
        query_embedding = get_embedding(query_text)
        if query_embedding is None:
            print("Failed to generate query embedding.")
            return []
        embed_time = time.time() - start_time
        print(f"Query embedding generated in {embed_time:.4f} seconds.")

        # 2. Query Pinecone
        print(
            f"Querying Pinecone index '{INDEX_NAME}' for top {n_results} results..."
        )
        query_start_time = time.time()
        response = index.query(
            vector=query_embedding,
            top_k=n_results,
            include_metadata=True,  # Essential to get the text back
        )
        query_time = time.time() - query_start_time
        print(f"Pinecone query completed in {query_time:.4f} seconds.")

        # 3. Process results
        if not response or not response.matches:
            print("No results found by Pinecone for this query.")
            return []
        print(
            f"Processing {len(response.matches)} raw results from Pinecone...")
        # Order is kept exactly as Pinecone returned it.
        formatted_results = [_format_match(match) for match in response.matches]

        total_retrieval_time = time.time() - start_time
        print(
            f"Retrieved and processed {len(formatted_results)} paragraphs from Pinecone in {total_retrieval_time:.2f} seconds."
        )
        return formatted_results
    except Exception as e:
        # Best-effort API: report the failure and hand back no results.
        print(f"Error during Pinecone query or processing: {e}")
        traceback.print_exc()
        return []


def _format_match(match) -> Dict[str, Any]:
    """Convert one Pinecone query match into the retriever's result dict."""
    score = match.score
    vector_id = match.id  # The ID stored in Pinecone
    metadata = match.metadata or {}
    return {
        "vector_id": vector_id,
        # Fall back to the vector ID when metadata has no original_id.
        "original_id": metadata.get('original_id', vector_id),
        "source_name": metadata.get('source_name', 'Unknown Source'),
        "hebrew_text": metadata.get('hebrew_text', ''),
        "english_text": metadata.get('english_text', ''),
        # distance = 1 - score. NOTE(review): this is only a cosine distance
        # if the index uses the cosine metric - confirm against index config.
        "distance": 1.0 - score,
        "similarity_score": score,
    }
# --- Main Test Block ---
if __name__ == "__main__":
    # Manual smoke test: report readiness, then run one sample query and
    # print a truncated view of each hit.
    ready, msg = check_retriever_status()
    print(f"\nRetriever Status: {ready} - {msg}")
    if not ready:
        print("Cannot run test because retriever is not ready.")
    else:
        print("\n--- Running Retriever Test ---")
        test_query = "role of joy in divine service"  # Test query in English
        # test_query_he = "תפקיד השמחה בעבודת ה'" # Test query in Hebrew (optional)
        retrieved_docs = find_similar_paragraphs(test_query, n_results=5)
        if not retrieved_docs:
            print("No documents retrieved for the test query.")
        else:
            print("\n--- Top Test Results ---")
            for rank, doc in enumerate(retrieved_docs, start=1):
                print(
                    f"\n{rank}. Score: {doc['similarity_score']:.4f} (Distance: {doc['distance']:.4f})"
                )
                print(
                    f" Source: {doc['source_name']} (Orig ID: {doc['original_id']}, VecID: {doc['vector_id']})"
                )
                # Text is cut to 150 characters to keep the console readable.
                print(f" Hebrew: {doc['hebrew_text'][:150]}...")
                print(f" English: {doc['english_text'][:150]}...")