Spaces:
Sleeping
Sleeping
File size: 9,340 Bytes
7f683f9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 |
# retriever_pinecone.py
import os
import time
import traceback
import urllib.parse # Keep for potential future ID decoding if needed
from pinecone import Pinecone
import openai # For generating query embeddings
from typing import List, Dict # <<< --- ADD THIS IMPORT ---
# --- Configuration ---
# Credentials come from the environment; a missing key is reported later by
# check_retriever_status() rather than crashing at import time.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
# NOTE: PINECONE_ENVIRONMENT is deprecated for serverless/starter plans; the
# v3 client connects by index name (or host) directly.
INDEX_NAME = "chassidus-index"  # must match the index used by the upload script
EMBEDDING_MODEL = "text-embedding-3-large"  # must match the model used at upload time
print(f"Retriever using Pinecone Index: {INDEX_NAME}")
print(f"Retriever using OpenAI Embedding Model: {EMBEDDING_MODEL}")
# --- End Configuration ---
# --- Initialize OpenAI Client ---
# openai_client stays None when the key is absent or construction fails, so
# downstream code can detect the problem instead of raising here.
openai_client = None
if not OPENAI_API_KEY:
    print(
        "Warning: OPENAI_API_KEY not found. Retriever requires it for query embeddings."
    )
else:
    try:
        openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)
        print("OpenAI client initialized for retriever.")
    except Exception as e:
        print(f"Error initializing OpenAI client for retriever: {e}")
        traceback.print_exc()
# --- Initialize Pinecone Client and Index ---
# `index` remains None unless every step below succeeds; callers treat a None
# index as "retriever not ready".
pc = None
index = None
if not PINECONE_API_KEY:
    print(
        "Error: PINECONE_API_KEY not found. Cannot initialize Pinecone client."
    )
else:
    try:
        print("Initializing Pinecone client for retriever...")
        pc = Pinecone(api_key=PINECONE_API_KEY)
        print(f"Connecting to index '{INDEX_NAME}'...")
        # Verify the index exists before handing out a handle to it.
        existing_names = {idx.name for idx in pc.list_indexes().indexes}
        if INDEX_NAME not in existing_names:
            print(
                f"Error: Index '{INDEX_NAME}' does not exist. Cannot connect retriever."
            )
        else:
            index = pc.Index(INDEX_NAME)
            print("Connected to Pinecone index for retriever.")
            # Pull stats both as a connectivity check and to warn on emptiness.
            stats = index.describe_index_stats()
            print(f"Index stats: {stats}")
            if stats.total_vector_count == 0:
                print(f"Warning: Pinecone index '{INDEX_NAME}' is empty.")
    except Exception as e:
        print(
            f"Error initializing Pinecone or connecting to index for retriever: {e}"
        )
        traceback.print_exc()
# --- Status Check ---
def check_retriever_status():
    """Checks if the Pinecone retriever is ready.

    Returns a ``(ready, message)`` tuple: ``ready`` is False when any required
    credential/client is missing or the index is unreachable; ``message``
    joins all findings with spaces.
    """
    messages = []
    status = True

    # Hard requirements — any missing one makes the retriever not-ready.
    required = [
        (OPENAI_API_KEY, "OpenAI API Key missing."),
        (openai_client, "OpenAI client initialization failed."),
        (PINECONE_API_KEY, "Pinecone API Key missing."),
        (pc, "Pinecone client failed to initialize."),
    ]
    for present, failure_note in required:
        if not present:
            status = False
            messages.append(failure_note)

    if not index:  # index handle was never created at import time
        status = False
        messages.append(
            f"Pinecone index '{INDEX_NAME}' could not be connected to or doesn't exist."
        )
    else:
        # Probe the index; an empty index is only a warning, but a failed
        # stats call means the connection is unusable.
        try:
            if index.describe_index_stats().total_vector_count == 0:
                messages.append(
                    f"Retriever ready, but Pinecone index '{INDEX_NAME}' is empty."
                )
        except Exception as stats_err:
            status = False
            messages.append(
                f"Failed to get stats for index '{INDEX_NAME}': {stats_err}")

    if status and not messages:
        messages.append("Retriever ready.")
    return status, " ".join(messages)
# --- Retrieval Function ---
def get_embedding(text, model=EMBEDDING_MODEL):
    """Generates embedding for the given text using OpenAI.

    Returns the embedding vector on success, or None if the API call fails.
    Raises ValueError if the OpenAI client was never initialized.
    """
    if not openai_client:
        raise ValueError("OpenAI client not initialized.")
    try:
        # Newlines are flattened to spaces before embedding.
        text = text.replace("\n", " ")
        response = openai_client.embeddings.create(input=[text], model=model)
        return response.data[0].embedding
    except Exception:
        print(f"Error getting embedding for text: '{text[:100]}...'")
        traceback.print_exc()
        return None
# List/Dict come from the `from typing import List, Dict` at the top of the
# file; without that import this annotation raised a NameError at import time.
def find_similar_paragraphs(query_text: str,
                            n_results: int = 10) -> List[Dict]:
    """
    Retrieves similar paragraphs from Pinecone based on the query text.
    Searches against combined Hebrew+English embeddings.
    Retrieves metadata including separate hebrew_text and english_text.

    Args:
        query_text: Free-text query; embedded with EMBEDDING_MODEL.
        n_results: Maximum number of matches to request from Pinecone.

    Returns:
        List of dicts (most similar first) with keys: vector_id, original_id,
        source_name, hebrew_text, english_text, distance, similarity_score.
        Returns an empty list on any failure (not ready, embedding error,
        query error, or no matches).
    """
    ready, message = check_retriever_status()
    if not ready or index is None:  # Check index specifically
        print(f"Retriever not ready: {message}")
        return []
    print(f"\nRetrieving similar paragraphs for: '{query_text[:100]}...'")
    start_time = time.time()
    try:
        # 1. Get query embedding
        print("Generating query embedding...")
        query_embedding = get_embedding(query_text)
        if query_embedding is None:
            print("Failed to generate query embedding.")
            return []
        embed_time = time.time() - start_time
        print(f"Query embedding generated in {embed_time:.4f} seconds.")
        # 2. Query Pinecone
        print(
            f"Querying Pinecone index '{INDEX_NAME}' for top {n_results} results..."
        )
        query_start_time = time.time()
        response = index.query(
            vector=query_embedding,
            top_k=n_results,
            include_metadata=True  # Essential to get the text back
        )
        query_time = time.time() - query_start_time
        print(f"Pinecone query completed in {query_time:.4f} seconds.")
        # 3. Process results
        formatted_results = []
        if not response or not response.matches:
            print("No results found by Pinecone for this query.")
            return []
        print(
            f"Processing {len(response.matches)} raw results from Pinecone...")
        for match in response.matches:
            score = match.score  # Cosine similarity score (higher is better)
            vector_id = match.id  # The ID stored in Pinecone (should be original_id)
            metadata = match.metadata if match.metadata else {}
            # --- Extract data from metadata ---
            # Use .get() with defaults for robustness — uploaded records are
            # presumably guaranteed these keys, but don't depend on it.
            original_id = metadata.get(
                'original_id', vector_id)  # Fallback to vector_id if missing
            hebrew_text = metadata.get('hebrew_text', '')
            english_text = metadata.get('english_text', '')
            source_name = metadata.get('source_name', 'Unknown Source')
            # Calculate distance from similarity score (for consistency if needed)
            # Distance = 1 - Cosine Similarity
            distance = 1.0 - score
            doc_data = {
                "vector_id": vector_id,  # The ID used in Pinecone
                "original_id":
                original_id,  # The original ID from the source JSON
                "source_name": source_name,
                "hebrew_text": hebrew_text,
                "english_text": english_text,  # Include English text
                "distance": distance,  # Calculated distance (lower is better)
                "similarity_score":
                score,  # Direct score from Pinecone (higher is better)
            }
            formatted_results.append(doc_data)
        # Pinecone results are already sorted by score (descending),
        # which means distance is ascending (most similar first).
        total_retrieval_time = time.time() - start_time
        print(
            f"Retrieved and processed {len(formatted_results)} paragraphs from Pinecone in {total_retrieval_time:.2f} seconds."
        )
        return formatted_results
    except Exception as e:
        print(f"Error during Pinecone query or processing: {e}")
        traceback.print_exc()
        return []
# --- Main Test Block ---
if __name__ == "__main__":
    ready, msg = check_retriever_status()
    print(f"\nRetriever Status: {ready} - {msg}")
    if not ready:
        print(f"Cannot run test because retriever is not ready.")
    else:
        print("\n--- Running Retriever Test ---")
        test_query = "role of joy in divine service"  # Test query in English
        # test_query_he = "תפקיד השמחה בעבודת ה'" # Test query in Hebrew (optional)
        retrieved_docs = find_similar_paragraphs(test_query, n_results=5)
        if not retrieved_docs:
            print("No documents retrieved for the test query.")
        else:
            print("\n--- Top Test Results ---")
            for rank, doc in enumerate(retrieved_docs, start=1):
                print(
                    f"\n{rank}. Score: {doc['similarity_score']:.4f} (Distance: {doc['distance']:.4f})"
                )
                print(
                    f" Source: {doc['source_name']} (Orig ID: {doc['original_id']}, VecID: {doc['vector_id']})"
                )
                print(f" Hebrew: {doc['hebrew_text'][:150]}...")
                print(f" English: {doc['english_text'][:150]}...")
|