# ingestion_service.py
import os
import json
import openai
import pinecone
from pinecone import ServerlessSpec, PodSpec # Import spec classes
from typing import List, Dict, Optional
import time
import traceback
import urllib.parse # Keep for potential future ID encoding if needed
# --- Configuration ---
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
# PINECONE_ENVIRONMENT is deprecated for serverless/starter, use cloud/region
PINECONE_CLOUD = os.environ.get("PINECONE_CLOUD", "aws") # Default cloud
PINECONE_REGION = os.environ.get("PINECONE_REGION", "us-east-1") # Default region
INDEX_NAME = "chassidus-index" # Ensure this matches your index name
EMBEDDING_MODEL = "text-embedding-3-large" # Ensure this matches your embedding model
EMBEDDING_DIMENSIONS = 3072 # Dimension for text-embedding-3-large
print(f"Using Pinecone Index: {INDEX_NAME}")
print(f"Using Pinecone Cloud: {PINECONE_CLOUD}")
print(f"Using Pinecone Region: {PINECONE_REGION}")
print(f"Using OpenAI Embedding Model: {EMBEDDING_MODEL} (Dimensions: {EMBEDDING_DIMENSIONS})")
# --- End Configuration ---
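# Optional sanity check (a sketch; the mapping below covers only the two
# current text-embedding-3 models): the index dimension must match the
# embedding model's output dimension, or upserts will fail.
_KNOWN_DIMENSIONS = {"text-embedding-3-large": 3072, "text-embedding-3-small": 1536}
if EMBEDDING_MODEL in _KNOWN_DIMENSIONS and EMBEDDING_DIMENSIONS != _KNOWN_DIMENSIONS[EMBEDDING_MODEL]:
    print(f"WARNING: EMBEDDING_DIMENSIONS={EMBEDDING_DIMENSIONS} does not match "
          f"{EMBEDDING_MODEL} ({_KNOWN_DIMENSIONS[EMBEDDING_MODEL]}).")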
# --- Initialize OpenAI Client ---
openai_client = None
if OPENAI_API_KEY:
    try:
        openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)
        print("OpenAI client initialized.")
    except Exception as e:
        print(f"Error initializing OpenAI client: {e}")
        traceback.print_exc()
else:
    print("ERROR: OPENAI_API_KEY not found. Ingestion requires it for embeddings.")
# --- Initialize Pinecone Client and Index ---
pc = None
index = None
if PINECONE_API_KEY and PINECONE_CLOUD and PINECONE_REGION:
    try:
        print("Initializing Pinecone client...")
        pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)
        # Check whether the index exists
        if INDEX_NAME not in [idx.name for idx in pc.list_indexes().indexes]:
            print(f"Index '{INDEX_NAME}' does not exist. Creating it now...")
            # --- Create Index (choose ONE spec type) ---
            # Option A: Serverless (recommended for new projects, pay-as-you-go)
            try:
                pc.create_index(
                    name=INDEX_NAME,
                    dimension=EMBEDDING_DIMENSIONS,
                    metric="cosine",  # or 'dotproduct', 'euclidean'
                    spec=ServerlessSpec(
                        cloud=PINECONE_CLOUD,
                        region=PINECONE_REGION
                    )
                )
                print(f"Serverless index '{INDEX_NAME}' created. Waiting for initialization...")
                while not pc.describe_index(INDEX_NAME).status['ready']:
                    time.sleep(1)
                print("Index is ready.")
            except Exception as create_err:
                print(f"Error creating Serverless index '{INDEX_NAME}': {create_err}")
                traceback.print_exc()
                # Fallback or specific error handling needed here
            # Option B: Pod-based (older style, requires specifying pod type/size)
            # Uncomment below and comment out ServerlessSpec if you need Pod-based:
            # try:
            #     # Example: using a p1.x1 pod - adjust as needed
            #     # Note: PINECONE_ENVIRONMENT is required for pod-based index creation
            #     pinecone_environment = os.environ.get("PINECONE_ENVIRONMENT")
            #     if not pinecone_environment:
            #         raise ValueError("PINECONE_ENVIRONMENT is required for pod-based index creation.")
            #     pc.create_index(
            #         name=INDEX_NAME,
            #         dimension=EMBEDDING_DIMENSIONS,
            #         metric="cosine",
            #         spec=PodSpec(
            #             environment=pinecone_environment,  # Use environment here
            #             pod_type="p1.x1",  # Example pod type, check Pinecone docs
            #             pods=1
            #         )
            #     )
            #     print(f"Pod-based index '{INDEX_NAME}' created in environment '{pinecone_environment}'. Waiting...")
            #     while not pc.describe_index(INDEX_NAME).status['ready']:
            #         time.sleep(1)
            #     print("Index is ready.")
            # except Exception as create_err:
            #     print(f"Error creating Pod-based index '{INDEX_NAME}': {create_err}")
            #     traceback.print_exc()
            #     # Fallback or specific error handling needed here
        else:
            print(f"Index '{INDEX_NAME}' already exists.")
        # Connect to the index
        print(f"Connecting to index '{INDEX_NAME}'...")
        index = pc.Index(INDEX_NAME)
        print("Connected to Pinecone index.")
        stats = index.describe_index_stats()
        print(f"Initial index stats: {stats}")
    except Exception as e:
        print(f"Error initializing Pinecone or connecting to index: {e}")
        traceback.print_exc()
else:
    print("ERROR: Pinecone API key, cloud, or region not found. Cannot connect to Pinecone.")
# --- Helper Functions ---
def get_embedding(text: str, model=EMBEDDING_MODEL) -> Optional[List[float]]:
    """Generate an embedding for text using the OpenAI API."""
    if not openai_client:
        print("Error: OpenAI client not initialized, cannot generate embedding.")
        return None
    try:
        text = text.replace("\n", " ")  # OpenAI recommends replacing newlines
        if not text.strip():  # Handle empty strings
            print("Warning: Attempted to embed empty string.")
            return None
        response = openai_client.embeddings.create(input=[text], model=model)
        return response.data[0].embedding
    except openai.APIError as e:
        print(f"OpenAI API Error getting embedding: {e}")
    except Exception as e:
        print(f"Error getting embedding for text snippet: '{text[:100]}...'")
        traceback.print_exc()
    return None
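# Optional batched variant (a sketch; the helper name and batch size are
# illustrative): the OpenAI embeddings endpoint accepts a list of inputs,
# so embedding many texts per call cuts round trips during bulk ingestion.
def get_embeddings_batch(texts: List[str], model=EMBEDDING_MODEL, batch_size: int = 64) -> List[Optional[List[float]]]:
    """Sketch: embed many texts in batches; returns None for items in failed batches."""
    if not openai_client:
        print("Error: OpenAI client not initialized, cannot generate embeddings.")
        return [None] * len(texts)
    results: List[Optional[List[float]]] = []
    for start in range(0, len(texts), batch_size):
        # Replace newlines as above; substitute a single space for empty
        # strings, since the API rejects empty input
        chunk = [t.replace("\n", " ") or " " for t in texts[start:start + batch_size]]
        try:
            response = openai_client.embeddings.create(input=chunk, model=model)
            results.extend(item.embedding for item in response.data)  # data preserves input order
        except Exception as batch_err:
            print(f"Error embedding batch starting at index {start}: {batch_err}")
            results.extend([None] * len(chunk))
    return results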
def process_json_file(file_path: str) -> List[Dict]:
    """
    Process a JSON file containing documents in the expected format:
    a list of objects with "id", "hebrew", and "english" keys.
    """
    documents = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if not isinstance(data, list):
            print(f"Warning: Expected a list of objects in JSON file '{file_path}', found {type(data)}. Skipping.")
            return []
        for i, item in enumerate(data):
            if isinstance(item, dict):
                original_id = item.get("id")
                hebrew_text = item.get("hebrew")
                english_text = item.get("english")
                if not original_id:
                    print(f"Warning: Missing 'id' in item {i} of file '{file_path}'. Skipping.")
                    continue
                if not hebrew_text and not english_text:
                    print(f"Warning: Missing both 'hebrew' and 'english' text in item {i} (ID: {original_id}) of file '{file_path}'. Skipping.")
                    continue
                # Ensure texts are strings, defaulting to empty if one language is missing
                hebrew_text = hebrew_text or ""
                english_text = english_text or ""
                doc = {
                    "original_id": str(original_id),  # Ensure the ID is a string
                    "hebrew_text": hebrew_text.strip(),
                    "english_text": english_text.strip(),
                    "source_name": os.path.basename(file_path)  # Record the source filename
                }
                documents.append(doc)
            else:
                print(f"Warning: Item {i} in file '{file_path}' is not a dictionary. Skipping.")
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from file '{file_path}': {e}")
        return []
    except Exception as e:
        print(f"Error processing file '{file_path}': {e}")
        traceback.print_exc()
        return []
    print(f"Processed {len(documents)} documents from '{file_path}'")
    return documents
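# For reference, process_json_file expects the file to contain a list shaped
# like this (IDs and texts below are illustrative placeholders):
# [
#     {"id": "siman-1", "hebrew": "<Hebrew passage>", "english": "<English translation>"},
#     {"id": "siman-2", "hebrew": "<Hebrew passage>", "english": ""}
# ]
# Items missing "id", or missing both texts, are skipped with a warning.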
def upload_documents(documents: List[Dict], batch_size: int = 100) -> bool:
    """
    Embeds combined Hebrew+English text and uploads vectors and metadata to Pinecone.
    Metadata includes separate hebrew_text and english_text fields.
    """
    if not index:
        print("Error: Pinecone index not initialized. Cannot upload.")
        return False
    if not documents:
        print("No documents provided to upload.")
        return True  # Technically successful, as there is nothing to do
    total_uploaded = 0
    try:
        num_batches = (len(documents) + batch_size - 1) // batch_size
        print(f"Preparing to upload {len(documents)} documents in {num_batches} batches of size {batch_size}...")
        for i in range(0, len(documents), batch_size):
            batch_start_time = time.time()
            batch = documents[i : i + batch_size]
            vectors_to_upload = []
            ids_in_batch = set()
            print(f"Processing batch {i//batch_size + 1}/{num_batches}...")
            for doc in batch:
                original_id = doc["original_id"]
                if original_id in ids_in_batch:
                    print(f"Warning: Duplicate ID '{original_id}' detected within the same batch. Skipping duplicate.")
                    continue
                ids_in_batch.add(original_id)
                hebrew = doc["hebrew_text"]
                english = doc["english_text"]
                # --- Create combined text for embedding ---
                # Language labels may help the model distinguish the two texts
                combined_text = f"Hebrew:\n{hebrew}\n\nEnglish:\n{english}"
                # Alternative: plain concatenation if the labels don't help
                # combined_text = hebrew + "\n\n" + english
                if not combined_text.strip():
                    print(f"Warning: Skipping document ID '{original_id}' due to empty combined text.")
                    continue
                # --- Get Embedding ---
                embedding = get_embedding(combined_text)
                if embedding is None:
                    print(f"Warning: Failed to get embedding for document ID '{original_id}'. Skipping.")
                    continue
                # --- Prepare Metadata ---
                # Metadata values must be strings, numbers, booleans, or lists
                # of strings; handle None/empty
                metadata_payload = {
                    "hebrew_text": hebrew if hebrew else "N/A",
                    "english_text": english if english else "N/A",
                    "source_name": doc.get("source_name", "Unknown"),
                    "original_id": original_id  # Store the original ID in metadata too
                }
                # Optional: clean metadata further if needed (e.g., truncate long texts)
                vectors_to_upload.append({
                    "id": original_id,  # Use the original document ID as the Pinecone vector ID
                    "values": embedding,
                    "metadata": metadata_payload
                })
            if not vectors_to_upload:
                print(f"Batch {i//batch_size + 1} resulted in no vectors to upload. Skipping API call.")
                continue
            # --- Upsert to Pinecone ---
            try:
                print(f"Upserting {len(vectors_to_upload)} vectors for batch {i//batch_size + 1}...")
                upsert_response = index.upsert(vectors=vectors_to_upload)
                print(f"  Upsert response: {upsert_response}")
                total_uploaded += upsert_response.upserted_count
            except Exception as upsert_err:
                print(f"Error upserting batch {i//batch_size + 1}: {upsert_err}")
                traceback.print_exc()
                # Decide whether to continue with the next batch or stop
                # return False  # Uncomment to stop on the first batch error
            batch_time = time.time() - batch_start_time
            print(f"Batch {i//batch_size + 1} processed in {batch_time:.2f} seconds.")
            time.sleep(0.1)  # Small delay between batches
        print(f"\nFinished uploading. Total vectors successfully upserted: {total_uploaded}")
        # Verify with index stats
        try:
            final_stats = index.describe_index_stats()
            print(f"Final index stats: {final_stats}")
        except Exception as stats_err:
            print(f"Could not fetch final index stats: {stats_err}")
        return True
    except Exception as e:
        print(f"An unexpected error occurred during the upload process: {e}")
        traceback.print_exc()
        return False
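# Optional hardening (a sketch; the helper name and retry defaults are
# illustrative): the per-batch upsert above gives up on the first error,
# so a retry-with-exponential-backoff wrapper can ride out transient failures.
def upsert_with_retry(vectors: List[Dict], max_retries: int = 3, base_delay: float = 1.0):
    """Sketch: retry index.upsert with exponential backoff before giving up."""
    for attempt in range(1, max_retries + 1):
        try:
            return index.upsert(vectors=vectors)
        except Exception as err:
            if attempt == max_retries:
                raise
            delay = base_delay * (2 ** (attempt - 1))
            print(f"Upsert attempt {attempt} failed ({err}); retrying in {delay:.1f}s...")
            time.sleep(delay)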
def process_and_upload_file(file_path: str) -> bool:
    """Main function to process a JSON file and upload its documents."""
    if not os.path.exists(file_path):
        print(f"Error: File not found at '{file_path}'")
        return False
    if not file_path.lower().endswith(".json"):
        print(f"Error: This script currently only processes .json files. Found: '{file_path}'")
        return False
    if not openai_client or not index:
        print("Error: OpenAI client or Pinecone index not initialized. Cannot proceed.")
        return False
    print(f"\n--- Starting processing for file: {file_path} ---")
    start_time = time.time()
    # 1. Process the JSON file
    documents = process_json_file(file_path)
    if not documents:
        print(f"No valid documents found in '{file_path}'. Upload skipped.")
        return False  # Or True, if "empty file processed successfully" is the desired outcome
    # 2. Upload the documents
    success = upload_documents(documents)
    end_time = time.time()
    print(f"--- Finished processing file: {file_path} in {end_time - start_time:.2f} seconds ---")
    if success:
        print(f"Successfully processed and uploaded data from {file_path}")
    else:
        print(f"Failed to upload data from {file_path}")
    return success
# --- Main Execution Block ---
if __name__ == "__main__":
    # --- Configuration for script execution ---
    # Set the directory containing your JSON files
    data_directory = "data"  # CHANGE THIS to your data folder path
    # ---
    if not os.path.isdir(data_directory):
        print(f"Error: Data directory '{data_directory}' not found.")
        print("Please create the directory and place your JSON files inside, or update the 'data_directory' variable.")
    else:
        print(f"Looking for JSON files in directory: '{data_directory}'")
        json_files = [f for f in os.listdir(data_directory) if f.lower().endswith(".json")]
        if not json_files:
            print(f"No .json files found in '{data_directory}'.")
        else:
            print(f"Found {len(json_files)} JSON files: {json_files}")
            overall_success = True
            for filename in json_files:
                file_path = os.path.join(data_directory, filename)
                success = process_and_upload_file(file_path)
                if not success:
                    overall_success = False
                    print(f"Processing failed for {filename}. Check the logs above.")
                    # Optional: stop processing the remaining files on failure
                    # break
            if overall_success:
                print("\nAll files processed successfully.")
            else:
                print("\nSome files encountered errors during processing.")
    # Example for single-file upload:
    # file_to_upload = "path/to/your/single_file.json"
    # if os.path.exists(file_to_upload):
    #     process_and_upload_file(file_to_upload)
    # else:
    #     print(f"File {file_to_upload} not found")