# NOTE: The lines below replace non-code residue from a Hugging Face Spaces
# file-viewer export (status banner, file size, commit hash, gutter numbers).
# Original file: 15,874 bytes, revision 7f683f9.
# ingestion_service.py
import os
import json
import openai
import pinecone
from pinecone import ServerlessSpec, PodSpec # Import spec classes
from typing import List, Dict, Optional
import time
import traceback
import urllib.parse # Keep for potential future ID encoding if needed
# --- Configuration ---
# Secrets and locations come from the environment; all resolved values are
# echoed at import time so a misconfiguration is immediately visible in logs.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
# PINECONE_ENVIRONMENT is deprecated for serverless/starter, use cloud/region
PINECONE_CLOUD = os.environ.get("PINECONE_CLOUD", "aws") # Default cloud
PINECONE_REGION = os.environ.get("PINECONE_REGION", "us-east-1") # Default region
INDEX_NAME = "chassidus-index" # Ensure this matches your index name
EMBEDDING_MODEL = "text-embedding-3-large" # Ensure this matches your embedding model
EMBEDDING_DIMENSIONS = 3072 # Dimension for text-embedding-3-large
print(f"Using Pinecone Index: {INDEX_NAME}")
print(f"Using Pinecone Cloud: {PINECONE_CLOUD}")
print(f"Using Pinecone Region: {PINECONE_REGION}")
print(f"Using OpenAI Embedding Model: {EMBEDDING_MODEL} (Dimensions: {EMBEDDING_DIMENSIONS})")
# --- End Configuration ---
# --- Initialize OpenAI Client ---
# Created once at import time and shared by all helpers below. Left as None
# (with an error logged) when the API key is missing or construction fails;
# downstream functions check for None before use.
openai_client = None
if OPENAI_API_KEY:
    try:
        openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)
        print("OpenAI client initialized.")
    except Exception as e:
        print(f"Error initializing OpenAI client: {e}")
        traceback.print_exc()
else:
    print("ERROR: OPENAI_API_KEY not found. Ingestion requires it for embeddings.")
# --- Initialize Pinecone Client and Index ---
# Runs at import time. On success `index` holds a live handle to INDEX_NAME,
# creating the index first if it does not exist; on any failure `index` stays
# None and downstream functions refuse to upload.
pc = None
index = None
if PINECONE_API_KEY and PINECONE_CLOUD and PINECONE_REGION:
    try:
        print("Initializing Pinecone client...")
        pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)
        # Check if index exists
        if INDEX_NAME not in [idx.name for idx in pc.list_indexes().indexes]:
            print(f"Index '{INDEX_NAME}' does not exist. Creating it now...")
            # --- Create Index (Choose ONE spec type) ---
            # Option A: Serverless (Recommended for new projects, pay-as-you-go)
            try:
                pc.create_index(
                    name=INDEX_NAME,
                    dimension=EMBEDDING_DIMENSIONS,
                    metric="cosine", # or 'dotproduct', 'euclidean'
                    spec=ServerlessSpec(
                        cloud=PINECONE_CLOUD,
                        region=PINECONE_REGION
                    )
                )
                print(f"Serverless index '{INDEX_NAME}' created. Waiting for initialization...")
                # Poll until the control plane reports the new index ready.
                while not pc.describe_index(INDEX_NAME).status['ready']:
                    time.sleep(1)
                print("Index is ready.")
            except Exception as create_err:
                print(f"Error creating Serverless index '{INDEX_NAME}': {create_err}")
                traceback.print_exc()
                # Fallback or specific error handling needed here
            # Option B: Pod-based (Older style, requires specifying pod type/size)
            # Uncomment below and comment out ServerlessSpec if you need Pod-based
            # try:
            #     # Example: Using a free tier pod (s1.x1) - adjust if needed
            #     # Note: PINECONE_ENVIRONMENT might be needed for older pod-based index creation
            #     pinecone_environment = os.environ.get("PINECONE_ENVIRONMENT") # Get environment if needed for pod
            #     if not pinecone_environment:
            #         raise ValueError("PINECONE_ENVIRONMENT is required for pod-based index creation.")
            #     pc.create_index(
            #         name=INDEX_NAME,
            #         dimension=EMBEDDING_DIMENSIONS,
            #         metric="cosine",
            #         spec=PodSpec(
            #             environment=pinecone_environment, # Use environment here
            #             pod_type="p1.x1", # Example pod type, check Pinecone docs
            #             pods=1
            #         )
            #     )
            #     print(f"Pod-based index '{INDEX_NAME}' created in environment '{pinecone_environment}'. Waiting...")
            #     while not pc.describe_index(INDEX_NAME).status['ready']:
            #         time.sleep(1)
            #     print("Index is ready.")
            # except Exception as create_err:
            #     print(f"Error creating Pod-based index '{INDEX_NAME}': {create_err}")
            #     traceback.print_exc()
            #     # Fallback or specific error handling needed here
        else:
            print(f"Index '{INDEX_NAME}' already exists.")
        # Connect to the index
        print(f"Connecting to index '{INDEX_NAME}'...")
        index = pc.Index(INDEX_NAME)
        print("Connected to Pinecone index.")
        stats = index.describe_index_stats()
        print(f"Initial index stats: {stats}")
    except Exception as e:
        print(f"Error initializing Pinecone or connecting to index: {e}")
        traceback.print_exc()
else:
    print("ERROR: Pinecone API Key, Cloud, or Region not found. Cannot connect to Pinecone.")
# --- Helper Functions ---
def get_embedding(text: str, model=EMBEDDING_MODEL) -> Optional[List[float]]:
    """Generate an embedding vector for *text* using the OpenAI API.

    Args:
        text: Text to embed. Newlines are replaced with spaces, per
            OpenAI's recommendation for embedding inputs.
        model: Embedding model name; defaults to the module-level setting.

    Returns:
        The embedding as a list of floats, or None when the client is not
        initialized, the text is empty/None, or the API call fails.
    """
    if not openai_client:
        print("Error: OpenAI client not initialized, cannot generate embedding.")
        return None
    # Bug fix: validate BEFORE transforming. Previously a None argument hit
    # .replace() first and surfaced as an AttributeError in the generic
    # handler instead of the intended empty-input warning.
    if not text or not text.strip():
        print("Warning: Attempted to embed empty string.")
        return None
    text = text.replace("\n", " ")  # OpenAI recommends replacing newlines
    try:
        response = openai_client.embeddings.create(input=[text], model=model)
        return response.data[0].embedding
    except openai.APIError as e:
        print(f"OpenAI API Error getting embedding: {e}")
    except Exception:
        print(f"Error getting embedding for text snippet: '{text[:100]}...'")
        traceback.print_exc()
    return None
def process_json_file(file_path: str) -> List[Dict]:
    """Read a JSON file of document objects and normalize them.

    The file must contain a JSON list of objects with "id", "hebrew",
    and "english" keys. Items missing an id, missing both texts, or not
    shaped as dicts are skipped with a warning.

    Args:
        file_path: Path to the .json file to read (UTF-8).

    Returns:
        A list of dicts with keys "original_id", "hebrew_text",
        "english_text", and "source_name". Returns an empty list on any
        file or parse error.
    """
    documents: List[Dict] = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if not isinstance(data, list):
            print(f"Warning: Expected a list of objects in JSON file '{file_path}', found {type(data)}. Skipping.")
            return []
        for i, item in enumerate(data):
            if not isinstance(item, dict):
                print(f"Warning: Item {i} in file '{file_path}' is not a dictionary. Skipping.")
                continue
            original_id = item.get("id")
            hebrew_text = item.get("hebrew")
            english_text = item.get("english")
            # Bug fix: explicit None/blank check so falsy-but-valid ids
            # such as 0 are no longer silently dropped.
            if original_id is None or not str(original_id).strip():
                print(f"Warning: Missing 'id' in item {i} of file '{file_path}'. Skipping.")
                continue
            if not hebrew_text and not english_text:
                print(f"Warning: Missing both 'hebrew' and 'english' text in item {i} (ID: {original_id}) of file '{file_path}'. Skipping.")
                continue
            # Coerce to str so non-string values (e.g. numbers) don't
            # crash the .strip() calls below.
            hebrew_text = str(hebrew_text) if hebrew_text else ""
            english_text = str(english_text) if english_text else ""
            documents.append({
                "original_id": str(original_id),  # Vector IDs must be strings
                "hebrew_text": hebrew_text.strip(),
                "english_text": english_text.strip(),
                "source_name": os.path.basename(file_path),  # Provenance for metadata
            })
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from file '{file_path}': {e}")
        return []
    except Exception as e:
        print(f"Error processing file '{file_path}': {e}")
        traceback.print_exc()
        return []
    print(f"Processed {len(documents)} documents from '{file_path}'")
    return documents
def upload_documents(documents: List[Dict], batch_size: int = 100) -> bool:
    """Embed combined Hebrew+English text and upsert vectors into Pinecone.

    Each document's Hebrew and English texts are joined (with language
    labels) into one string for embedding; the separate texts are stored
    in the vector's metadata.

    Args:
        documents: Dicts produced by process_json_file (keys:
            "original_id", "hebrew_text", "english_text", "source_name").
        batch_size: Documents embedded/upserted per Pinecone call.

    Returns:
        True if every batch upserted without error (the empty-input case
        counts as success); False if the index is unavailable, any batch
        failed, or an unexpected error aborted the run.
    """
    if not index:
        print("Error: Pinecone index not initialized. Cannot upload.")
        return False
    if not documents:
        print("No documents provided to upload.")
        return True  # Nothing to do counts as success.
    total_uploaded = 0
    # Bug fix: per-batch upsert errors were logged but the function still
    # returned True, reporting success after a partial failure.
    failed_batches = 0
    try:
        num_batches = (len(documents) + batch_size - 1) // batch_size
        print(f"Preparing to upload {len(documents)} documents in {num_batches} batches of size {batch_size}...")
        for i in range(0, len(documents), batch_size):
            batch_start_time = time.time()
            batch = documents[i : i + batch_size]
            vectors_to_upload = []
            ids_in_batch = set()  # Guard against duplicate IDs within one upsert call.
            print(f"Processing batch {i//batch_size + 1}/{num_batches}...")
            for doc in batch:
                original_id = doc["original_id"]
                if original_id in ids_in_batch:
                    print(f"Warning: Duplicate ID '{original_id}' detected within the same batch. Skipping duplicate.")
                    continue
                ids_in_batch.add(original_id)
                hebrew = doc["hebrew_text"]
                english = doc["english_text"]
                # Language labels help the embedding model keep the two
                # languages distinct in the combined input.
                combined_text = f"Hebrew:\n{hebrew}\n\nEnglish:\n{english}"
                if not combined_text.strip():
                    print(f"Warning: Skipping document ID '{original_id}' due to empty combined text.")
                    continue
                embedding = get_embedding(combined_text)
                if embedding is None:
                    print(f"Warning: Failed to get embedding for document ID '{original_id}'. Skipping.")
                    continue
                # Metadata values must be strings/numbers; substitute "N/A"
                # so both text fields are always present.
                metadata_payload = {
                    "hebrew_text": hebrew if hebrew else "N/A",
                    "english_text": english if english else "N/A",
                    "source_name": doc.get("source_name", "Unknown"),
                    "original_id": original_id,  # Duplicate of the vector ID for convenience
                }
                vectors_to_upload.append({
                    "id": original_id,  # Original document ID doubles as the Pinecone vector ID
                    "values": embedding,
                    "metadata": metadata_payload,
                })
            if not vectors_to_upload:
                print(f"Batch {i//batch_size + 1} resulted in no vectors to upload. Skipping API call.")
                continue
            try:
                print(f"Upserting {len(vectors_to_upload)} vectors for batch {i//batch_size + 1}...")
                upsert_response = index.upsert(vectors=vectors_to_upload)
                print(f" Upsert response: {upsert_response}")
                total_uploaded += upsert_response.upserted_count
            except Exception as upsert_err:
                print(f"Error upserting batch {i//batch_size + 1}: {upsert_err}")
                traceback.print_exc()
                failed_batches += 1  # Keep going, but remember the failure.
            batch_time = time.time() - batch_start_time
            print(f"Batch {i//batch_size + 1} processed in {batch_time:.2f} seconds.")
            time.sleep(0.1)  # Small delay between batches (gentle on rate limits)
        print(f"\nFinished uploading. Total vectors successfully upserted: {total_uploaded}")
        # Verify with index stats (best effort; failures here are non-fatal).
        try:
            final_stats = index.describe_index_stats()
            print(f"Final index stats: {final_stats}")
        except Exception as stats_err:
            print(f"Could not fetch final index stats: {stats_err}")
        if failed_batches:
            print(f"{failed_batches} batch(es) failed to upsert.")
            return False
        return True
    except Exception as e:
        print(f"An unexpected error occurred during the upload process: {e}")
        traceback.print_exc()
        return False
def process_and_upload_file(file_path: str) -> bool:
    """Process a single JSON file and upload its documents to Pinecone.

    Returns True when parsing yielded documents and the upload succeeded,
    False otherwise.
    """
    # Guard clauses: refuse anything we cannot work with up front.
    if not os.path.exists(file_path):
        print(f"Error: File not found at '{file_path}'")
        return False
    if not file_path.lower().endswith(".json"):
        print(f"Error: This script currently only processes .json files. Found: '{file_path}'")
        return False
    if not openai_client or not index:
        print("Error: OpenAI client or Pinecone index not initialized. Cannot proceed.")
        return False

    print(f"\n--- Starting processing for file: {file_path} ---")
    started = time.time()

    # Step 1: parse the JSON file into normalized document dicts.
    parsed_docs = process_json_file(file_path)
    if not parsed_docs:
        print(f"No valid documents found in '{file_path}'. Upload skipped.")
        return False  # Or True if "empty file processed successfully" is the desired outcome

    # Step 2: embed and upsert the documents.
    upload_ok = upload_documents(parsed_docs)

    elapsed = time.time() - started
    print(f"--- Finished processing file: {file_path} in {elapsed:.2f} seconds ---")
    outcome = (
        f"Successfully processed and uploaded data from {file_path}"
        if upload_ok
        else f"Failed to upload data from {file_path}"
    )
    print(outcome)
    return upload_ok
# --- Main Execution Block ---
if __name__ == "__main__":
    # --- Configuration for script execution ---
    # Set the directory containing your JSON files
    data_directory = "data"  # CHANGE THIS to your data folder path
    # ---
    if not os.path.isdir(data_directory):
        print(f"Error: Data directory '{data_directory}' not found.")
        print("Please create the directory and place your JSON files inside, or update the 'data_directory' variable.")
    else:
        print(f"Looking for JSON files in directory: '{data_directory}'")
        json_files = [f for f in os.listdir(data_directory) if f.lower().endswith(".json")]
        if not json_files:
            print(f"No .json files found in '{data_directory}'.")
        else:
            print(f"Found {len(json_files)} JSON files: {json_files}")
            overall_success = True
            for filename in json_files:
                file_path = os.path.join(data_directory, filename)
                success = process_and_upload_file(file_path)
                if not success:
                    overall_success = False
                    # Bug fix: this message previously printed the literal
                    # text "(unknown)" instead of naming the failing file.
                    print(f"Processing failed for {file_path}. Check logs above.")
                    # Optional: stop processing remaining files on failure
                    # break
            if overall_success:
                print("\nAll files processed successfully.")
            else:
                print("\nSome files encountered errors during processing.")
    # Example for single file upload:
    # file_to_upload = "path/to/your/single_file.json"
    # if os.path.exists(file_to_upload):
    #     process_and_upload_file(file_to_upload)
    # else:
    #     print(f"File {file_to_upload} not found")