# ingestion_service.py
import os
import json
import openai
import pinecone
from pinecone import ServerlessSpec, PodSpec  # Import spec classes
from typing import List, Dict, Optional
import time
import traceback
import urllib.parse  # Keep for potential future ID encoding if needed

# --- Configuration ---
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
# PINECONE_ENVIRONMENT is deprecated for serverless/starter indexes; use cloud/region instead
PINECONE_CLOUD = os.environ.get("PINECONE_CLOUD", "aws")  # Default cloud
PINECONE_REGION = os.environ.get("PINECONE_REGION", "us-east-1")  # Default region
INDEX_NAME = "chassidus-index"  # Ensure this matches your index name
EMBEDDING_MODEL = "text-embedding-3-large"  # Ensure this matches your embedding model
EMBEDDING_DIMENSIONS = 3072  # Dimension for text-embedding-3-large

print(f"Using Pinecone Index: {INDEX_NAME}")
print(f"Using Pinecone Cloud: {PINECONE_CLOUD}")
print(f"Using Pinecone Region: {PINECONE_REGION}")
print(f"Using OpenAI Embedding Model: {EMBEDDING_MODEL} (Dimensions: {EMBEDDING_DIMENSIONS})")
# --- End Configuration ---

# --- Initialize OpenAI Client ---
openai_client = None
if OPENAI_API_KEY:
    try:
        openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)
        print("OpenAI client initialized.")
    except Exception as e:
        print(f"Error initializing OpenAI client: {e}")
        traceback.print_exc()
else:
    print("ERROR: OPENAI_API_KEY not found. Ingestion requires it for embeddings.")

# --- Initialize Pinecone Client and Index ---
pc = None
index = None
if PINECONE_API_KEY and PINECONE_CLOUD and PINECONE_REGION:
    try:
        print("Initializing Pinecone client...")
        pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)

        # Check if the index exists
        if INDEX_NAME not in [idx.name for idx in pc.list_indexes().indexes]:
            print(f"Index '{INDEX_NAME}' does not exist. Creating it now...")
            # --- Create Index (choose ONE spec type) ---
            # Option A: Serverless (recommended for new projects, pay-as-you-go)
            try:
                pc.create_index(
                    name=INDEX_NAME,
                    dimension=EMBEDDING_DIMENSIONS,
                    metric="cosine",  # or 'dotproduct', 'euclidean'
                    spec=ServerlessSpec(
                        cloud=PINECONE_CLOUD,
                        region=PINECONE_REGION
                    )
                )
                print(f"Serverless index '{INDEX_NAME}' created. Waiting for initialization...")
                while not pc.describe_index(INDEX_NAME).status['ready']:
                    time.sleep(1)
                print("Index is ready.")
            except Exception as create_err:
                print(f"Error creating serverless index '{INDEX_NAME}': {create_err}")
                traceback.print_exc()
                # Fallback or specific error handling needed here

            # Option B: Pod-based (older style, requires specifying pod type/size)
            # Uncomment below and comment out ServerlessSpec if you need pod-based
            # try:
            #     # Example: using a p1.x1 pod - adjust type/size as needed
            #     # Note: PINECONE_ENVIRONMENT is required for pod-based index creation
            #     pinecone_environment = os.environ.get("PINECONE_ENVIRONMENT")
            #     if not pinecone_environment:
            #         raise ValueError("PINECONE_ENVIRONMENT is required for pod-based index creation.")
            #     pc.create_index(
            #         name=INDEX_NAME,
            #         dimension=EMBEDDING_DIMENSIONS,
            #         metric="cosine",
            #         spec=PodSpec(
            #             environment=pinecone_environment,  # Use environment here
            #             pod_type="p1.x1",  # Example pod type, check Pinecone docs
            #             pods=1
            #         )
            #     )
            #     print(f"Pod-based index '{INDEX_NAME}' created in environment '{pinecone_environment}'. Waiting...")
            #     while not pc.describe_index(INDEX_NAME).status['ready']:
            #         time.sleep(1)
            #     print("Index is ready.")
            # except Exception as create_err:
            #     print(f"Error creating pod-based index '{INDEX_NAME}': {create_err}")
            #     traceback.print_exc()
            #     # Fallback or specific error handling needed here
        else:
            print(f"Index '{INDEX_NAME}' already exists.")

        # Connect to the index
        print(f"Connecting to index '{INDEX_NAME}'...")
        index = pc.Index(INDEX_NAME)
        print("Connected to Pinecone index.")
        stats = index.describe_index_stats()
        print(f"Initial index stats: {stats}")
    except Exception as e:
        print(f"Error initializing Pinecone or connecting to index: {e}")
        traceback.print_exc()
else:
    print("ERROR: Pinecone API key, cloud, or region not found. Cannot connect to Pinecone.")
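
# --- Example environment setup (illustrative) ---
# The environment variables this script reads, shown with placeholder values:
#   export OPENAI_API_KEY="sk-..."
#   export PINECONE_API_KEY="..."
#   export PINECONE_CLOUD="aws"          # optional; defaults to "aws"
#   export PINECONE_REGION="us-east-1"   # optional; defaults to "us-east-1"
#   export PINECONE_ENVIRONMENT="..."    # only for the commented-out pod-based option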
Waiting...") # while not pc.describe_index(INDEX_NAME).status['ready']: # time.sleep(1) # print("Index is ready.") # except Exception as create_err: # print(f"Error creating Pod-based index '{INDEX_NAME}': {create_err}") # traceback.print_exc() # # Fallback or specific error handling needed here else: print(f"Index '{INDEX_NAME}' already exists.") # Connect to the index print(f"Connecting to index '{INDEX_NAME}'...") index = pc.Index(INDEX_NAME) print("Connected to Pinecone index.") stats = index.describe_index_stats() print(f"Initial index stats: {stats}") except Exception as e: print(f"Error initializing Pinecone or connecting to index: {e}") traceback.print_exc() else: print("ERROR: Pinecone API Key, Cloud, or Region not found. Cannot connect to Pinecone.") # --- Helper Functions --- def get_embedding(text: str, model=EMBEDDING_MODEL) -> Optional[List[float]]: """Generate embedding for text using OpenAI API.""" if not openai_client: print("Error: OpenAI client not initialized, cannot generate embedding.") return None try: text = text.replace("\n", " ") # OpenAI recommends replacing newlines if not text.strip(): # Handle empty strings print("Warning: Attempted to embed empty string.") return None response = openai_client.embeddings.create(input=[text], model=model) return response.data[0].embedding except openai.APIError as e: print(f"OpenAI API Error getting embedding: {e}") except Exception as e: print(f"Error getting embedding for text snippet: '{text[:100]}...'") traceback.print_exc() return None def process_json_file(file_path: str) -> List[Dict]: """ Process a JSON file containing documents in the specified format. Reads objects with "id", "hebrew", "english" keys. """ documents = [] try: with open(file_path, 'r', encoding='utf-8') as f: data = json.load(f) if not isinstance(data, list): print(f"Warning: Expected a list of objects in JSON file '{file_path}', found {type(data)}. Skipping.") return [] for i, item in enumerate(data): if isinstance(item, dict): original_id = item.get("id") hebrew_text = item.get("hebrew") english_text = item.get("english") if not original_id: print(f"Warning: Missing 'id' in item {i} of file '{file_path}'. Skipping.") continue if not hebrew_text and not english_text: print(f"Warning: Missing both 'hebrew' and 'english' text in item {i} (ID: {original_id}) of file '{file_path}'. Skipping.") continue # Ensure texts are strings, default to empty if missing but not skipping hebrew_text = hebrew_text or "" english_text = english_text or "" doc = { "original_id": str(original_id), # Ensure ID is string "hebrew_text": hebrew_text.strip(), "english_text": english_text.strip(), "source_name": os.path.basename(file_path) # Add source filename } documents.append(doc) else: print(f"Warning: Item {i} in file '{file_path}' is not a dictionary. Skipping.") except json.JSONDecodeError as e: print(f"Error decoding JSON from file '{file_path}': {e}") return [] except Exception as e: print(f"Error processing file '{file_path}': {e}") traceback.print_exc() return [] print(f"Processed {len(documents)} documents from '{file_path}'") return documents def upload_documents(documents: List[Dict], batch_size: int = 100) -> bool: """ Embeds combined Hebrew+English text and uploads vectors and metadata to Pinecone. Metadata includes separate hebrew_text and english_text. """ if not index: print("Error: Pinecone index not initialized. 
Cannot upload.") return False if not documents: print("No documents provided to upload.") return True # Technically successful as there's nothing to do total_uploaded = 0 try: num_batches = (len(documents) + batch_size - 1) // batch_size print(f"Preparing to upload {len(documents)} documents in {num_batches} batches of size {batch_size}...") for i in range(0, len(documents), batch_size): batch_start_time = time.time() batch = documents[i : i + batch_size] vectors_to_upload = [] ids_in_batch = set() print(f"Processing batch {i//batch_size + 1}/{num_batches}...") for doc in batch: original_id = doc["original_id"] if original_id in ids_in_batch: print(f"Warning: Duplicate ID '{original_id}' detected within the same batch. Skipping duplicate.") continue ids_in_batch.add(original_id) hebrew = doc["hebrew_text"] english = doc["english_text"] # --- Create combined text for embedding --- # Add separators to potentially help the model distinguish languages combined_text = f"Hebrew:\n{hebrew}\n\nEnglish:\n{english}" # Alternative: Just concatenate if separators don't help much # combined_text = hebrew + "\n\n" + english if not combined_text.strip(): print(f"Warning: Skipping document ID '{original_id}' due to empty combined text.") continue # --- Get Embedding --- embedding = get_embedding(combined_text) if embedding is None: print(f"Warning: Failed to get embedding for document ID '{original_id}'. Skipping.") continue # --- Prepare Metadata --- # Ensure metadata values are strings or numbers, handle None/empty metadata_payload = { "hebrew_text": hebrew if hebrew else "N/A", "english_text": english if english else "N/A", "source_name": doc.get("source_name", "Unknown"), "original_id": original_id # Store original ID in metadata too } # Optional: Clean metadata further if needed (e.g., truncate long texts) vectors_to_upload.append({ "id": original_id, # Use the original document ID as the Pinecone vector ID "values": embedding, "metadata": metadata_payload }) if not vectors_to_upload: print(f"Batch {i//batch_size + 1} resulted in no vectors to upload. Skipping API call.") continue # --- Upsert to Pinecone --- try: print(f"Upserting {len(vectors_to_upload)} vectors for batch {i//batch_size + 1}...") upsert_response = index.upsert(vectors=vectors_to_upload) print(f" Upsert response: {upsert_response}") total_uploaded += upsert_response.upserted_count except Exception as upsert_err: print(f"Error upserting batch {i//batch_size + 1}: {upsert_err}") traceback.print_exc() # Decide whether to continue with next batch or stop # return False # Stop on first batch error batch_time = time.time() - batch_start_time print(f"Batch {i//batch_size + 1} processed in {batch_time:.2f} seconds.") time.sleep(0.1) # Small delay between batches print(f"\nFinished uploading. Total vectors successfully upserted: {total_uploaded}") # Verify with index stats try: final_stats = index.describe_index_stats() print(f"Final index stats: {final_stats}") except Exception as stats_err: print(f"Could not fetch final index stats: {stats_err}") return True except Exception as e: print(f"An unexpected error occurred during the upload process: {e}") traceback.print_exc() return False def process_and_upload_file(file_path: str) -> bool: """Main function to process a JSON file and upload its documents.""" if not os.path.exists(file_path): print(f"Error: File not found at '{file_path}'") return False if not file_path.lower().endswith(".json"): print(f"Error: This script currently only processes .json files. 
Found: '{file_path}'") return False if not openai_client or not index: print("Error: OpenAI client or Pinecone index not initialized. Cannot proceed.") return False print(f"\n--- Starting processing for file: {file_path} ---") start_time = time.time() # 1. Process the JSON file documents = process_json_file(file_path) if not documents: print(f"No valid documents found in '{file_path}'. Upload skipped.") return False # Or True if "empty file processed successfully" is the desired outcome # 2. Upload the documents success = upload_documents(documents) end_time = time.time() print(f"--- Finished processing file: {file_path} in {end_time - start_time:.2f} seconds ---") if success: print(f"Successfully processed and uploaded data from {file_path}") else: print(f"Failed to upload data from {file_path}") return success # --- Main Execution Block --- if __name__ == "__main__": # --- Configuration for script execution --- # Set the directory containing your JSON files data_directory = "data" # CHANGE THIS to your data folder path # --- if not os.path.isdir(data_directory): print(f"Error: Data directory '{data_directory}' not found.") print("Please create the directory and place your JSON files inside, or update the 'data_directory' variable.") else: print(f"Looking for JSON files in directory: '{data_directory}'") json_files = [f for f in os.listdir(data_directory) if f.lower().endswith(".json")] if not json_files: print(f"No .json files found in '{data_directory}'.") else: print(f"Found {len(json_files)} JSON files: {json_files}") overall_success = True for filename in json_files: file_path = os.path.join(data_directory, filename) success = process_and_upload_file(file_path) if not success: overall_success = False print(f"Processing failed for {filename}. Check logs above.") # Optional: stop processing remaining files on failure # break if overall_success: print("\nAll files processed successfully.") else: print("\nSome files encountered errors during processing.") # Example for single file upload: # file_to_upload = "path/to/your/single_file.json" # if os.path.exists(file_to_upload): # process_and_upload_file(file_to_upload) # else: # print(f"File {file_to_upload} not found")