# ingestion_service.py
import os
import json
import openai
import pinecone
from pinecone import ServerlessSpec, PodSpec  # Import spec classes
from typing import List, Dict, Optional
import time
import traceback
import urllib.parse  # Keep for potential future ID encoding if needed

# --- Configuration ---
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
# PINECONE_ENVIRONMENT is deprecated for serverless/starter; use cloud/region instead
PINECONE_CLOUD = os.environ.get("PINECONE_CLOUD", "aws")  # Default cloud
PINECONE_REGION = os.environ.get("PINECONE_REGION", "us-east-1")  # Default region
INDEX_NAME = "chassidus-index"  # Ensure this matches your index name
EMBEDDING_MODEL = "text-embedding-3-large"  # Ensure this matches your embedding model
EMBEDDING_DIMENSIONS = 3072  # Dimension for text-embedding-3-large

print(f"Using Pinecone Index: {INDEX_NAME}")
print(f"Using Pinecone Cloud: {PINECONE_CLOUD}")
print(f"Using Pinecone Region: {PINECONE_REGION}")
print(f"Using OpenAI Embedding Model: {EMBEDDING_MODEL} (Dimensions: {EMBEDDING_DIMENSIONS})")
# --- End Configuration ---
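# Note: the index dimension must match the embedding model's output size.
# text-embedding-3-large returns 3072-dimensional vectors by default; if you
# switch to text-embedding-3-small (1536 dimensions by default), adjust
# EMBEDDING_DIMENSIONS and recreate the index to match.
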
# --- Initialize OpenAI Client ---
openai_client = None
if OPENAI_API_KEY:
    try:
        openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)
        print("OpenAI client initialized.")
    except Exception as e:
        print(f"Error initializing OpenAI client: {e}")
        traceback.print_exc()
else:
    print("ERROR: OPENAI_API_KEY not found. Ingestion requires it for embeddings.")

# --- Initialize Pinecone Client and Index ---
pc = None
index = None
if PINECONE_API_KEY and PINECONE_CLOUD and PINECONE_REGION:
    try:
        print("Initializing Pinecone client...")
        pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)

        # Check if the index exists
        if INDEX_NAME not in [idx.name for idx in pc.list_indexes().indexes]:
            print(f"Index '{INDEX_NAME}' does not exist. Creating it now...")

            # --- Create Index (Choose ONE spec type) ---
            # Option A: Serverless (Recommended for new projects, pay-as-you-go)
            try:
                pc.create_index(
                    name=INDEX_NAME,
                    dimension=EMBEDDING_DIMENSIONS,
                    metric="cosine",  # or 'dotproduct', 'euclidean'
                    spec=ServerlessSpec(
                        cloud=PINECONE_CLOUD,
                        region=PINECONE_REGION
                    )
                )
                print(f"Serverless index '{INDEX_NAME}' created. Waiting for initialization...")
                while not pc.describe_index(INDEX_NAME).status['ready']:
                    time.sleep(1)
                print("Index is ready.")
            except Exception as create_err:
                print(f"Error creating Serverless index '{INDEX_NAME}': {create_err}")
                traceback.print_exc()
                # Fallback or specific error handling needed here

            # Option B: Pod-based (Older style, requires specifying pod type/size)
            # Uncomment below and comment out ServerlessSpec if you need Pod-based
            # try:
            #     # Example pod-based configuration - adjust pod_type/size for your plan
            #     # Note: PINECONE_ENVIRONMENT is required for pod-based index creation
            #     pinecone_environment = os.environ.get("PINECONE_ENVIRONMENT")
            #     if not pinecone_environment:
            #         raise ValueError("PINECONE_ENVIRONMENT is required for pod-based index creation.")
            #     pc.create_index(
            #         name=INDEX_NAME,
            #         dimension=EMBEDDING_DIMENSIONS,
            #         metric="cosine",
            #         spec=PodSpec(
            #             environment=pinecone_environment,  # Use environment here
            #             pod_type="p1.x1",  # Example pod type, check Pinecone docs
            #             pods=1
            #         )
            #     )
            #     print(f"Pod-based index '{INDEX_NAME}' created in environment '{pinecone_environment}'. Waiting...")
            #     while not pc.describe_index(INDEX_NAME).status['ready']:
            #         time.sleep(1)
            #     print("Index is ready.")
            # except Exception as create_err:
            #     print(f"Error creating Pod-based index '{INDEX_NAME}': {create_err}")
            #     traceback.print_exc()
            #     # Fallback or specific error handling needed here
        else:
            print(f"Index '{INDEX_NAME}' already exists.")

        # Connect to the index
        print(f"Connecting to index '{INDEX_NAME}'...")
        index = pc.Index(INDEX_NAME)
        print("Connected to Pinecone index.")
        stats = index.describe_index_stats()
        print(f"Initial index stats: {stats}")
    except Exception as e:
        print(f"Error initializing Pinecone or connecting to index: {e}")
        traceback.print_exc()
else:
    print("ERROR: Pinecone API Key, Cloud, or Region not found. Cannot connect to Pinecone.")
# --- Helper Functions ---
def get_embedding(text: str, model=EMBEDDING_MODEL) -> Optional[List[float]]:
    """Generate embedding for text using OpenAI API."""
    if not openai_client:
        print("Error: OpenAI client not initialized, cannot generate embedding.")
        return None
    try:
        text = text.replace("\n", " ")  # OpenAI recommends replacing newlines
        if not text.strip():  # Handle empty strings
            print("Warning: Attempted to embed empty string.")
            return None
        response = openai_client.embeddings.create(input=[text], model=model)
        return response.data[0].embedding
    except openai.APIError as e:
        print(f"OpenAI API Error getting embedding: {e}")
    except Exception as e:
        print(f"Error getting embedding for text snippet: '{text[:100]}...'")
        traceback.print_exc()
    return None
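
# Optional: a batched variant (illustrative sketch, not used by the ingestion flow
# below). The embeddings endpoint accepts a list of inputs, so embedding several
# documents per request can reduce round trips on large files. The helper name and
# behavior here are assumptions layered on the same client/model settings above.
def get_embeddings_batch(texts: List[str], model=EMBEDDING_MODEL) -> Optional[List[List[float]]]:
    """Generate embeddings for a list of texts in a single API call."""
    if not openai_client:
        print("Error: OpenAI client not initialized, cannot generate embeddings.")
        return None
    # Replace newlines as above; note the API rejects empty-string inputs,
    # so callers should filter those out first.
    cleaned = [t.replace("\n", " ") for t in texts]
    if not any(t.strip() for t in cleaned):
        print("Warning: Attempted to embed only empty strings.")
        return None
    try:
        response = openai_client.embeddings.create(input=cleaned, model=model)
        # Results are returned in the same order as the inputs
        return [item.embedding for item in response.data]
    except openai.APIError as e:
        print(f"OpenAI API Error getting batch embeddings: {e}")
    except Exception:
        traceback.print_exc()
    return None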

def process_json_file(file_path: str) -> List[Dict]:
    """
    Process a JSON file containing documents in the specified format.
    Reads objects with "id", "hebrew", "english" keys.
    """
    documents = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if not isinstance(data, list):
            print(f"Warning: Expected a list of objects in JSON file '{file_path}', found {type(data)}. Skipping.")
            return []

        for i, item in enumerate(data):
            if isinstance(item, dict):
                original_id = item.get("id")
                hebrew_text = item.get("hebrew")
                english_text = item.get("english")

                if not original_id:
                    print(f"Warning: Missing 'id' in item {i} of file '{file_path}'. Skipping.")
                    continue
                if not hebrew_text and not english_text:
                    print(f"Warning: Missing both 'hebrew' and 'english' text in item {i} (ID: {original_id}) of file '{file_path}'. Skipping.")
                    continue

                # Ensure texts are strings, default to empty if missing but not skipping
                hebrew_text = hebrew_text or ""
                english_text = english_text or ""

                doc = {
                    "original_id": str(original_id),  # Ensure ID is a string
                    "hebrew_text": hebrew_text.strip(),
                    "english_text": english_text.strip(),
                    "source_name": os.path.basename(file_path)  # Add source filename
                }
                documents.append(doc)
            else:
                print(f"Warning: Item {i} in file '{file_path}' is not a dictionary. Skipping.")
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from file '{file_path}': {e}")
        return []
    except Exception as e:
        print(f"Error processing file '{file_path}': {e}")
        traceback.print_exc()
        return []

    print(f"Processed {len(documents)} documents from '{file_path}'")
    return documents
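
# For reference, process_json_file expects input shaped like the following
# (illustrative example; the "id" values are made up):
# [
#   {"id": "source-001", "hebrew": "<Hebrew text>", "english": "<English text>"},
#   {"id": "source-002", "hebrew": "<Hebrew text>", "english": "<English text>"}
# ]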

def upload_documents(documents: List[Dict], batch_size: int = 100) -> bool:
    """
    Embeds combined Hebrew+English text and uploads vectors and metadata to Pinecone.
    Metadata includes separate hebrew_text and english_text.
    """
    if not index:
        print("Error: Pinecone index not initialized. Cannot upload.")
        return False
    if not documents:
        print("No documents provided to upload.")
        return True  # Technically successful as there's nothing to do

    total_uploaded = 0
    try:
        num_batches = (len(documents) + batch_size - 1) // batch_size
        print(f"Preparing to upload {len(documents)} documents in {num_batches} batches of size {batch_size}...")

        for i in range(0, len(documents), batch_size):
            batch_start_time = time.time()
            batch = documents[i : i + batch_size]
            vectors_to_upload = []
            ids_in_batch = set()
            print(f"Processing batch {i//batch_size + 1}/{num_batches}...")

            for doc in batch:
                original_id = doc["original_id"]
                if original_id in ids_in_batch:
                    print(f"Warning: Duplicate ID '{original_id}' detected within the same batch. Skipping duplicate.")
                    continue
                ids_in_batch.add(original_id)

                hebrew = doc["hebrew_text"]
                english = doc["english_text"]

                # --- Create combined text for embedding ---
                # Add separators to potentially help the model distinguish languages
                combined_text = f"Hebrew:\n{hebrew}\n\nEnglish:\n{english}"
                # Alternative: just concatenate if separators don't help much
                # combined_text = hebrew + "\n\n" + english

                if not combined_text.strip():
                    print(f"Warning: Skipping document ID '{original_id}' due to empty combined text.")
                    continue

                # --- Get Embedding ---
                embedding = get_embedding(combined_text)
                if embedding is None:
                    print(f"Warning: Failed to get embedding for document ID '{original_id}'. Skipping.")
                    continue

                # --- Prepare Metadata ---
                # Ensure metadata values are strings or numbers, handle None/empty
                metadata_payload = {
                    "hebrew_text": hebrew if hebrew else "N/A",
                    "english_text": english if english else "N/A",
                    "source_name": doc.get("source_name", "Unknown"),
                    "original_id": original_id  # Store original ID in metadata too
                }
                # Optional: Clean metadata further if needed (e.g., truncate long texts)

                vectors_to_upload.append({
                    "id": original_id,  # Use the original document ID as the Pinecone vector ID
                    "values": embedding,
                    "metadata": metadata_payload
                })

            if not vectors_to_upload:
                print(f"Batch {i//batch_size + 1} resulted in no vectors to upload. Skipping API call.")
                continue

            # --- Upsert to Pinecone ---
            try:
                print(f"Upserting {len(vectors_to_upload)} vectors for batch {i//batch_size + 1}...")
                upsert_response = index.upsert(vectors=vectors_to_upload)
                print(f"  Upsert response: {upsert_response}")
                total_uploaded += upsert_response.upserted_count
            except Exception as upsert_err:
                print(f"Error upserting batch {i//batch_size + 1}: {upsert_err}")
                traceback.print_exc()
                # Decide whether to continue with the next batch or stop
                # return False  # Stop on first batch error

            batch_time = time.time() - batch_start_time
            print(f"Batch {i//batch_size + 1} processed in {batch_time:.2f} seconds.")
            time.sleep(0.1)  # Small delay between batches

        print(f"\nFinished uploading. Total vectors successfully upserted: {total_uploaded}")

        # Verify with index stats
        try:
            final_stats = index.describe_index_stats()
            print(f"Final index stats: {final_stats}")
        except Exception as stats_err:
            print(f"Could not fetch final index stats: {stats_err}")

        return True
    except Exception as e:
        print(f"An unexpected error occurred during the upload process: {e}")
        traceback.print_exc()
        return False
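
# Optional sanity check (illustrative sketch, not called anywhere in this script):
# embed a short query and fetch the nearest stored vectors with their metadata,
# to confirm that upserted documents are retrievable. The function name and
# printed fields are assumptions; index.query() is the standard Pinecone call.
def sanity_check_query(query_text: str, top_k: int = 3) -> None:
    """Run a quick similarity query against the index and print the matches."""
    if not index or not openai_client:
        print("Error: Pinecone index or OpenAI client not initialized. Cannot run sanity check.")
        return
    query_embedding = get_embedding(query_text)
    if query_embedding is None:
        return
    results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)
    for match in results.matches:
        source = match.metadata.get("source_name", "Unknown") if match.metadata else "Unknown"
        print(f"id={match.id} score={match.score:.4f} source={source}")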

def process_and_upload_file(file_path: str) -> bool:
    """Main function to process a JSON file and upload its documents."""
    if not os.path.exists(file_path):
        print(f"Error: File not found at '{file_path}'")
        return False
    if not file_path.lower().endswith(".json"):
        print(f"Error: This script currently only processes .json files. Found: '{file_path}'")
        return False
    if not openai_client or not index:
        print("Error: OpenAI client or Pinecone index not initialized. Cannot proceed.")
        return False

    print(f"\n--- Starting processing for file: {file_path} ---")
    start_time = time.time()

    # 1. Process the JSON file
    documents = process_json_file(file_path)
    if not documents:
        print(f"No valid documents found in '{file_path}'. Upload skipped.")
        return False  # Or True if "empty file processed successfully" is the desired outcome

    # 2. Upload the documents
    success = upload_documents(documents)

    end_time = time.time()
    print(f"--- Finished processing file: {file_path} in {end_time - start_time:.2f} seconds ---")
    if success:
        print(f"Successfully processed and uploaded data from {file_path}")
    else:
        print(f"Failed to upload data from {file_path}")
    return success

# --- Main Execution Block ---
if __name__ == "__main__":
    # --- Configuration for script execution ---
    # Set the directory containing your JSON files
    data_directory = "data"  # CHANGE THIS to your data folder path
    # ---

    if not os.path.isdir(data_directory):
        print(f"Error: Data directory '{data_directory}' not found.")
        print("Please create the directory and place your JSON files inside, or update the 'data_directory' variable.")
    else:
        print(f"Looking for JSON files in directory: '{data_directory}'")
        json_files = [f for f in os.listdir(data_directory) if f.lower().endswith(".json")]

        if not json_files:
            print(f"No .json files found in '{data_directory}'.")
        else:
            print(f"Found {len(json_files)} JSON files: {json_files}")
            overall_success = True
            for filename in json_files:
                file_path = os.path.join(data_directory, filename)
                success = process_and_upload_file(file_path)
                if not success:
                    overall_success = False
                    print(f"Processing failed for {filename}. Check logs above.")
                    # Optional: stop processing remaining files on failure
                    # break

            if overall_success:
                print("\nAll files processed successfully.")
            else:
                print("\nSome files encountered errors during processing.")

    # Example for single file upload:
    # file_to_upload = "path/to/your/single_file.json"
    # if os.path.exists(file_to_upload):
    #     process_and_upload_file(file_to_upload)
    # else:
    #     print(f"File {file_to_upload} not found")