# ingestion_service.py
import os
import json
import openai
import pinecone
from pinecone import ServerlessSpec, PodSpec  # Import spec classes
from typing import List, Dict, Optional
import time
import traceback
import urllib.parse  # Keep for potential future ID encoding if needed

# --- Configuration ---
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
# PINECONE_ENVIRONMENT is deprecated for serverless/starter; use cloud/region instead
PINECONE_CLOUD = os.environ.get("PINECONE_CLOUD", "aws")  # Default cloud
PINECONE_REGION = os.environ.get("PINECONE_REGION", "us-east-1")  # Default region
INDEX_NAME = "chassidus-index"  # Ensure this matches your index name
EMBEDDING_MODEL = "text-embedding-3-large"  # Ensure this matches your embedding model
EMBEDDING_DIMENSIONS = 3072  # Dimension for text-embedding-3-large

print(f"Using Pinecone Index: {INDEX_NAME}")
print(f"Using Pinecone Cloud: {PINECONE_CLOUD}")
print(f"Using Pinecone Region: {PINECONE_REGION}")
print(f"Using OpenAI Embedding Model: {EMBEDDING_MODEL} (Dimensions: {EMBEDDING_DIMENSIONS})")
# --- End Configuration ---
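# Note: the index dimension must match the embedding model's output size.
# text-embedding-3-large returns 3072-dimensional vectors by default; if you
# switch to text-embedding-3-small (1536 dimensions by default), adjust
# EMBEDDING_DIMENSIONS and recreate the index to match.
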
# --- Initialize OpenAI Client ---
openai_client = None
if OPENAI_API_KEY:
    try:
        openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)
        print("OpenAI client initialized.")
    except Exception as e:
        print(f"Error initializing OpenAI client: {e}")
        traceback.print_exc()
else:
    print("ERROR: OPENAI_API_KEY not found. Ingestion requires it for embeddings.")

# --- Initialize Pinecone Client and Index ---
pc = None
index = None
if PINECONE_API_KEY and PINECONE_CLOUD and PINECONE_REGION:
    try:
        print("Initializing Pinecone client...")
        pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)

        # Check if the index exists
        if INDEX_NAME not in [idx.name for idx in pc.list_indexes().indexes]:
            print(f"Index '{INDEX_NAME}' does not exist. Creating it now...")

            # --- Create Index (Choose ONE spec type) ---
            # Option A: Serverless (Recommended for new projects, pay-as-you-go)
            try:
                pc.create_index(
                    name=INDEX_NAME,
                    dimension=EMBEDDING_DIMENSIONS,
                    metric="cosine",  # or 'dotproduct', 'euclidean'
                    spec=ServerlessSpec(
                        cloud=PINECONE_CLOUD,
                        region=PINECONE_REGION
                    )
                )
                print(f"Serverless index '{INDEX_NAME}' created. Waiting for initialization...")
                while not pc.describe_index(INDEX_NAME).status['ready']:
                    time.sleep(1)
                print("Index is ready.")
            except Exception as create_err:
                print(f"Error creating Serverless index '{INDEX_NAME}': {create_err}")
                traceback.print_exc()
                # Fallback or specific error handling needed here

            # Option B: Pod-based (Older style, requires specifying pod type/size)
            # Uncomment below and comment out ServerlessSpec if you need Pod-based
            # try:
            #     # Example pod-based configuration - adjust pod_type/size for your plan
            #     # Note: PINECONE_ENVIRONMENT is required for pod-based index creation
            #     pinecone_environment = os.environ.get("PINECONE_ENVIRONMENT")
            #     if not pinecone_environment:
            #         raise ValueError("PINECONE_ENVIRONMENT is required for pod-based index creation.")
            #     pc.create_index(
            #         name=INDEX_NAME,
            #         dimension=EMBEDDING_DIMENSIONS,
            #         metric="cosine",
            #         spec=PodSpec(
            #             environment=pinecone_environment,  # Use environment here
            #             pod_type="p1.x1",  # Example pod type, check Pinecone docs
            #             pods=1
            #         )
            #     )
            #     print(f"Pod-based index '{INDEX_NAME}' created in environment '{pinecone_environment}'. Waiting...")
            #     while not pc.describe_index(INDEX_NAME).status['ready']:
            #         time.sleep(1)
            #     print("Index is ready.")
            # except Exception as create_err:
            #     print(f"Error creating Pod-based index '{INDEX_NAME}': {create_err}")
            #     traceback.print_exc()
            #     # Fallback or specific error handling needed here
        else:
            print(f"Index '{INDEX_NAME}' already exists.")

        # Connect to the index
        print(f"Connecting to index '{INDEX_NAME}'...")
        index = pc.Index(INDEX_NAME)
        print("Connected to Pinecone index.")
        stats = index.describe_index_stats()
        print(f"Initial index stats: {stats}")
    except Exception as e:
        print(f"Error initializing Pinecone or connecting to index: {e}")
        traceback.print_exc()
else:
    print("ERROR: Pinecone API Key, Cloud, or Region not found. Cannot connect to Pinecone.")
# --- Helper Functions ---
def get_embedding(text: str, model=EMBEDDING_MODEL) -> Optional[List[float]]:
    """Generate embedding for text using OpenAI API."""
    if not openai_client:
        print("Error: OpenAI client not initialized, cannot generate embedding.")
        return None
    try:
        text = text.replace("\n", " ")  # OpenAI recommends replacing newlines
        if not text.strip():  # Handle empty strings
            print("Warning: Attempted to embed empty string.")
            return None
        response = openai_client.embeddings.create(input=[text], model=model)
        return response.data[0].embedding
    except openai.APIError as e:
        print(f"OpenAI API Error getting embedding: {e}")
    except Exception as e:
        print(f"Error getting embedding for text snippet: '{text[:100]}...'")
        traceback.print_exc()
    return None
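
# Optional: a batched variant (illustrative sketch, not used by the ingestion flow
# below). The embeddings endpoint accepts a list of inputs, so embedding several
# documents per request can reduce round trips on large files. The helper name and
# behavior here are assumptions layered on the same client/model settings above.
def get_embeddings_batch(texts: List[str], model=EMBEDDING_MODEL) -> Optional[List[List[float]]]:
    """Generate embeddings for a list of texts in a single API call."""
    if not openai_client:
        print("Error: OpenAI client not initialized, cannot generate embeddings.")
        return None
    # Replace newlines as above; note the API rejects empty-string inputs,
    # so callers should filter those out first.
    cleaned = [t.replace("\n", " ") for t in texts]
    if not any(t.strip() for t in cleaned):
        print("Warning: Attempted to embed only empty strings.")
        return None
    try:
        response = openai_client.embeddings.create(input=cleaned, model=model)
        # Results are returned in the same order as the inputs
        return [item.embedding for item in response.data]
    except openai.APIError as e:
        print(f"OpenAI API Error getting batch embeddings: {e}")
    except Exception:
        traceback.print_exc()
    return None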

def process_json_file(file_path: str) -> List[Dict]:
    """
    Process a JSON file containing documents in the specified format.
    Reads objects with "id", "hebrew", "english" keys.
    """
    documents = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if not isinstance(data, list):
            print(f"Warning: Expected a list of objects in JSON file '{file_path}', found {type(data)}. Skipping.")
            return []

        for i, item in enumerate(data):
            if isinstance(item, dict):
                original_id = item.get("id")
                hebrew_text = item.get("hebrew")
                english_text = item.get("english")

                if not original_id:
                    print(f"Warning: Missing 'id' in item {i} of file '{file_path}'. Skipping.")
                    continue
                if not hebrew_text and not english_text:
                    print(f"Warning: Missing both 'hebrew' and 'english' text in item {i} (ID: {original_id}) of file '{file_path}'. Skipping.")
                    continue

                # Ensure texts are strings, default to empty if missing but not skipping
                hebrew_text = hebrew_text or ""
                english_text = english_text or ""

                doc = {
                    "original_id": str(original_id),  # Ensure ID is a string
                    "hebrew_text": hebrew_text.strip(),
                    "english_text": english_text.strip(),
                    "source_name": os.path.basename(file_path)  # Add source filename
                }
                documents.append(doc)
            else:
                print(f"Warning: Item {i} in file '{file_path}' is not a dictionary. Skipping.")
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from file '{file_path}': {e}")
        return []
    except Exception as e:
        print(f"Error processing file '{file_path}': {e}")
        traceback.print_exc()
        return []

    print(f"Processed {len(documents)} documents from '{file_path}'")
    return documents
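
# For reference, process_json_file expects input shaped like the following
# (illustrative example; the "id" values are made up):
# [
#   {"id": "source-001", "hebrew": "<Hebrew text>", "english": "<English text>"},
#   {"id": "source-002", "hebrew": "<Hebrew text>", "english": "<English text>"}
# ]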

def upload_documents(documents: List[Dict], batch_size: int = 100) -> bool:
    """
    Embeds combined Hebrew+English text and uploads vectors and metadata to Pinecone.
    Metadata includes separate hebrew_text and english_text.
    """
    if not index:
        print("Error: Pinecone index not initialized. Cannot upload.")
        return False
    if not documents:
        print("No documents provided to upload.")
        return True  # Technically successful as there's nothing to do

    total_uploaded = 0
    try:
        num_batches = (len(documents) + batch_size - 1) // batch_size
        print(f"Preparing to upload {len(documents)} documents in {num_batches} batches of size {batch_size}...")

        for i in range(0, len(documents), batch_size):
            batch_start_time = time.time()
            batch = documents[i : i + batch_size]
            vectors_to_upload = []
            ids_in_batch = set()
            print(f"Processing batch {i//batch_size + 1}/{num_batches}...")

            for doc in batch:
                original_id = doc["original_id"]
                if original_id in ids_in_batch:
                    print(f"Warning: Duplicate ID '{original_id}' detected within the same batch. Skipping duplicate.")
                    continue
                ids_in_batch.add(original_id)

                hebrew = doc["hebrew_text"]
                english = doc["english_text"]

                # --- Create combined text for embedding ---
                # Add separators to potentially help the model distinguish languages
                combined_text = f"Hebrew:\n{hebrew}\n\nEnglish:\n{english}"
                # Alternative: just concatenate if separators don't help much
                # combined_text = hebrew + "\n\n" + english

                if not combined_text.strip():
                    print(f"Warning: Skipping document ID '{original_id}' due to empty combined text.")
                    continue

                # --- Get Embedding ---
                embedding = get_embedding(combined_text)
                if embedding is None:
                    print(f"Warning: Failed to get embedding for document ID '{original_id}'. Skipping.")
                    continue

                # --- Prepare Metadata ---
                # Ensure metadata values are strings or numbers, handle None/empty
                metadata_payload = {
                    "hebrew_text": hebrew if hebrew else "N/A",
                    "english_text": english if english else "N/A",
                    "source_name": doc.get("source_name", "Unknown"),
                    "original_id": original_id  # Store original ID in metadata too
                }
                # Optional: Clean metadata further if needed (e.g., truncate long texts)

                vectors_to_upload.append({
                    "id": original_id,  # Use the original document ID as the Pinecone vector ID
                    "values": embedding,
                    "metadata": metadata_payload
                })

            if not vectors_to_upload:
                print(f"Batch {i//batch_size + 1} resulted in no vectors to upload. Skipping API call.")
                continue

            # --- Upsert to Pinecone ---
            try:
                print(f"Upserting {len(vectors_to_upload)} vectors for batch {i//batch_size + 1}...")
                upsert_response = index.upsert(vectors=vectors_to_upload)
                print(f"  Upsert response: {upsert_response}")
                total_uploaded += upsert_response.upserted_count
            except Exception as upsert_err:
                print(f"Error upserting batch {i//batch_size + 1}: {upsert_err}")
                traceback.print_exc()
                # Decide whether to continue with the next batch or stop
                # return False  # Stop on first batch error

            batch_time = time.time() - batch_start_time
            print(f"Batch {i//batch_size + 1} processed in {batch_time:.2f} seconds.")
            time.sleep(0.1)  # Small delay between batches

        print(f"\nFinished uploading. Total vectors successfully upserted: {total_uploaded}")

        # Verify with index stats
        try:
            final_stats = index.describe_index_stats()
            print(f"Final index stats: {final_stats}")
        except Exception as stats_err:
            print(f"Could not fetch final index stats: {stats_err}")

        return True
    except Exception as e:
        print(f"An unexpected error occurred during the upload process: {e}")
        traceback.print_exc()
        return False
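
# Optional sanity check (illustrative sketch, not called anywhere in this script):
# embed a short query and fetch the nearest stored vectors with their metadata,
# to confirm that upserted documents are retrievable. The function name and
# printed fields are assumptions; index.query() is the standard Pinecone call.
def sanity_check_query(query_text: str, top_k: int = 3) -> None:
    """Run a quick similarity query against the index and print the matches."""
    if not index or not openai_client:
        print("Error: Pinecone index or OpenAI client not initialized. Cannot run sanity check.")
        return
    query_embedding = get_embedding(query_text)
    if query_embedding is None:
        return
    results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)
    for match in results.matches:
        source = match.metadata.get("source_name", "Unknown") if match.metadata else "Unknown"
        print(f"id={match.id} score={match.score:.4f} source={source}")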

def process_and_upload_file(file_path: str) -> bool:
    """Main function to process a JSON file and upload its documents."""
    if not os.path.exists(file_path):
        print(f"Error: File not found at '{file_path}'")
        return False
    if not file_path.lower().endswith(".json"):
        print(f"Error: This script currently only processes .json files. Found: '{file_path}'")
        return False
    if not openai_client or not index:
        print("Error: OpenAI client or Pinecone index not initialized. Cannot proceed.")
        return False

    print(f"\n--- Starting processing for file: {file_path} ---")
    start_time = time.time()

    # 1. Process the JSON file
    documents = process_json_file(file_path)
    if not documents:
        print(f"No valid documents found in '{file_path}'. Upload skipped.")
        return False  # Or True if "empty file processed successfully" is the desired outcome

    # 2. Upload the documents
    success = upload_documents(documents)

    end_time = time.time()
    print(f"--- Finished processing file: {file_path} in {end_time - start_time:.2f} seconds ---")
    if success:
        print(f"Successfully processed and uploaded data from {file_path}")
    else:
        print(f"Failed to upload data from {file_path}")
    return success

# --- Main Execution Block ---
if __name__ == "__main__":
    # --- Configuration for script execution ---
    # Set the directory containing your JSON files
    data_directory = "data"  # CHANGE THIS to your data folder path
    # ---

    if not os.path.isdir(data_directory):
        print(f"Error: Data directory '{data_directory}' not found.")
        print("Please create the directory and place your JSON files inside, or update the 'data_directory' variable.")
    else:
        print(f"Looking for JSON files in directory: '{data_directory}'")
        json_files = [f for f in os.listdir(data_directory) if f.lower().endswith(".json")]

        if not json_files:
            print(f"No .json files found in '{data_directory}'.")
        else:
            print(f"Found {len(json_files)} JSON files: {json_files}")
            overall_success = True
            for filename in json_files:
                file_path = os.path.join(data_directory, filename)
                success = process_and_upload_file(file_path)
                if not success:
                    overall_success = False
                    print(f"Processing failed for {filename}. Check logs above.")
                    # Optional: stop processing remaining files on failure
                    # break

            if overall_success:
                print("\nAll files processed successfully.")
            else:
                print("\nSome files encountered errors during processing.")

    # Example for single file upload:
    # file_to_upload = "path/to/your/single_file.json"
    # if os.path.exists(file_to_upload):
    #     process_and_upload_file(file_to_upload)
    # else:
    #     print(f"File {file_to_upload} not found")