# NOTE: The lines below replace non-code residue from a Hugging Face Spaces
# file-viewer export (status banner, file size, commit hash, gutter numbers).
# Original file: 15,874 bytes, revision 7f683f9.
# ingestion_service.py
import os
import json
import openai
import pinecone
from pinecone import ServerlessSpec, PodSpec # Import spec classes
from typing import List, Dict, Optional
import time
import traceback
import urllib.parse # Keep for potential future ID encoding if needed
# --- Configuration ---
# Secrets and locations come from the environment; all resolved values are
# echoed at import time so a misconfiguration is immediately visible in logs.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
# PINECONE_ENVIRONMENT is deprecated for serverless/starter, use cloud/region
PINECONE_CLOUD = os.environ.get("PINECONE_CLOUD", "aws") # Default cloud
PINECONE_REGION = os.environ.get("PINECONE_REGION", "us-east-1") # Default region
INDEX_NAME = "chassidus-index" # Ensure this matches your index name
EMBEDDING_MODEL = "text-embedding-3-large" # Ensure this matches your embedding model
EMBEDDING_DIMENSIONS = 3072 # Dimension for text-embedding-3-large
print(f"Using Pinecone Index: {INDEX_NAME}")
print(f"Using Pinecone Cloud: {PINECONE_CLOUD}")
print(f"Using Pinecone Region: {PINECONE_REGION}")
print(f"Using OpenAI Embedding Model: {EMBEDDING_MODEL} (Dimensions: {EMBEDDING_DIMENSIONS})")
# --- End Configuration ---
# --- Initialize OpenAI Client ---
# Created once at import time and shared by all helpers below. Left as None
# (with an error logged) when the API key is missing or construction fails;
# downstream functions check for None before use.
openai_client = None
if OPENAI_API_KEY:
    try:
        openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)
        print("OpenAI client initialized.")
    except Exception as e:
        print(f"Error initializing OpenAI client: {e}")
        traceback.print_exc()
else:
    print("ERROR: OPENAI_API_KEY not found. Ingestion requires it for embeddings.")
# --- Initialize Pinecone Client and Index ---
# Runs at import time. On success `index` holds a live handle to INDEX_NAME,
# creating the index first if it does not exist; on any failure `index` stays
# None and downstream functions refuse to upload.
pc = None
index = None
if PINECONE_API_KEY and PINECONE_CLOUD and PINECONE_REGION:
    try:
        print("Initializing Pinecone client...")
        pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)
        # Check if index exists
        if INDEX_NAME not in [idx.name for idx in pc.list_indexes().indexes]:
            print(f"Index '{INDEX_NAME}' does not exist. Creating it now...")
            # --- Create Index (Choose ONE spec type) ---
            # Option A: Serverless (Recommended for new projects, pay-as-you-go)
            try:
                pc.create_index(
                    name=INDEX_NAME,
                    dimension=EMBEDDING_DIMENSIONS,
                    metric="cosine", # or 'dotproduct', 'euclidean'
                    spec=ServerlessSpec(
                        cloud=PINECONE_CLOUD,
                        region=PINECONE_REGION
                    )
                )
                print(f"Serverless index '{INDEX_NAME}' created. Waiting for initialization...")
                # Poll until the control plane reports the new index ready.
                while not pc.describe_index(INDEX_NAME).status['ready']:
                    time.sleep(1)
                print("Index is ready.")
            except Exception as create_err:
                print(f"Error creating Serverless index '{INDEX_NAME}': {create_err}")
                traceback.print_exc()
                # Fallback or specific error handling needed here
            # Option B: Pod-based (Older style, requires specifying pod type/size)
            # Uncomment below and comment out ServerlessSpec if you need Pod-based
            # try:
            #     # Example: Using a free tier pod (s1.x1) - adjust if needed
            #     # Note: PINECONE_ENVIRONMENT might be needed for older pod-based index creation
            #     pinecone_environment = os.environ.get("PINECONE_ENVIRONMENT") # Get environment if needed for pod
            #     if not pinecone_environment:
            #         raise ValueError("PINECONE_ENVIRONMENT is required for pod-based index creation.")
            #     pc.create_index(
            #         name=INDEX_NAME,
            #         dimension=EMBEDDING_DIMENSIONS,
            #         metric="cosine",
            #         spec=PodSpec(
            #             environment=pinecone_environment, # Use environment here
            #             pod_type="p1.x1", # Example pod type, check Pinecone docs
            #             pods=1
            #         )
            #     )
            #     print(f"Pod-based index '{INDEX_NAME}' created in environment '{pinecone_environment}'. Waiting...")
            #     while not pc.describe_index(INDEX_NAME).status['ready']:
            #         time.sleep(1)
            #     print("Index is ready.")
            # except Exception as create_err:
            #     print(f"Error creating Pod-based index '{INDEX_NAME}': {create_err}")
            #     traceback.print_exc()
            #     # Fallback or specific error handling needed here
        else:
            print(f"Index '{INDEX_NAME}' already exists.")
        # Connect to the index
        print(f"Connecting to index '{INDEX_NAME}'...")
        index = pc.Index(INDEX_NAME)
        print("Connected to Pinecone index.")
        stats = index.describe_index_stats()
        print(f"Initial index stats: {stats}")
    except Exception as e:
        print(f"Error initializing Pinecone or connecting to index: {e}")
        traceback.print_exc()
else:
    print("ERROR: Pinecone API Key, Cloud, or Region not found. Cannot connect to Pinecone.")
# --- Helper Functions ---
def get_embedding(text: str, model=EMBEDDING_MODEL) -> Optional[List[float]]:
    """Generate an embedding vector for *text* using the OpenAI API.

    Args:
        text: Text to embed. Newlines are replaced with spaces, per
            OpenAI's recommendation for embedding inputs.
        model: Embedding model name; defaults to the module-level setting.

    Returns:
        The embedding as a list of floats, or None when the client is not
        initialized, the text is empty/None, or the API call fails.
    """
    if not openai_client:
        print("Error: OpenAI client not initialized, cannot generate embedding.")
        return None
    # Bug fix: validate BEFORE transforming. Previously a None argument hit
    # .replace() first and surfaced as an AttributeError in the generic
    # handler instead of the intended empty-input warning.
    if not text or not text.strip():
        print("Warning: Attempted to embed empty string.")
        return None
    text = text.replace("\n", " ")  # OpenAI recommends replacing newlines
    try:
        response = openai_client.embeddings.create(input=[text], model=model)
        return response.data[0].embedding
    except openai.APIError as e:
        print(f"OpenAI API Error getting embedding: {e}")
    except Exception:
        print(f"Error getting embedding for text snippet: '{text[:100]}...'")
        traceback.print_exc()
    return None
def process_json_file(file_path: str) -> List[Dict]:
    """Read a JSON file of document objects and normalize them.

    The file must contain a JSON list of objects with "id", "hebrew",
    and "english" keys. Items missing an id, missing both texts, or not
    shaped as dicts are skipped with a warning.

    Args:
        file_path: Path to the .json file to read (UTF-8).

    Returns:
        A list of dicts with keys "original_id", "hebrew_text",
        "english_text", and "source_name". Returns an empty list on any
        file or parse error.
    """
    documents: List[Dict] = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if not isinstance(data, list):
            print(f"Warning: Expected a list of objects in JSON file '{file_path}', found {type(data)}. Skipping.")
            return []
        for i, item in enumerate(data):
            if not isinstance(item, dict):
                print(f"Warning: Item {i} in file '{file_path}' is not a dictionary. Skipping.")
                continue
            original_id = item.get("id")
            hebrew_text = item.get("hebrew")
            english_text = item.get("english")
            # Bug fix: explicit None/blank check so falsy-but-valid ids
            # such as 0 are no longer silently dropped.
            if original_id is None or not str(original_id).strip():
                print(f"Warning: Missing 'id' in item {i} of file '{file_path}'. Skipping.")
                continue
            if not hebrew_text and not english_text:
                print(f"Warning: Missing both 'hebrew' and 'english' text in item {i} (ID: {original_id}) of file '{file_path}'. Skipping.")
                continue
            # Coerce to str so non-string values (e.g. numbers) don't
            # crash the .strip() calls below.
            hebrew_text = str(hebrew_text) if hebrew_text else ""
            english_text = str(english_text) if english_text else ""
            documents.append({
                "original_id": str(original_id),  # Vector IDs must be strings
                "hebrew_text": hebrew_text.strip(),
                "english_text": english_text.strip(),
                "source_name": os.path.basename(file_path),  # Provenance for metadata
            })
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from file '{file_path}': {e}")
        return []
    except Exception as e:
        print(f"Error processing file '{file_path}': {e}")
        traceback.print_exc()
        return []
    print(f"Processed {len(documents)} documents from '{file_path}'")
    return documents
def upload_documents(documents: List[Dict], batch_size: int = 100) -> bool:
    """Embed combined Hebrew+English text and upsert vectors into Pinecone.

    Each document's Hebrew and English texts are joined (with language
    labels) into one string for embedding; the separate texts are stored
    in the vector's metadata.

    Args:
        documents: Dicts produced by process_json_file (keys:
            "original_id", "hebrew_text", "english_text", "source_name").
        batch_size: Documents embedded/upserted per Pinecone call.

    Returns:
        True if every batch upserted without error (the empty-input case
        counts as success); False if the index is unavailable, any batch
        failed, or an unexpected error aborted the run.
    """
    if not index:
        print("Error: Pinecone index not initialized. Cannot upload.")
        return False
    if not documents:
        print("No documents provided to upload.")
        return True  # Nothing to do counts as success.
    total_uploaded = 0
    # Bug fix: per-batch upsert errors were logged but the function still
    # returned True, reporting success after a partial failure.
    failed_batches = 0
    try:
        num_batches = (len(documents) + batch_size - 1) // batch_size
        print(f"Preparing to upload {len(documents)} documents in {num_batches} batches of size {batch_size}...")
        for i in range(0, len(documents), batch_size):
            batch_start_time = time.time()
            batch = documents[i : i + batch_size]
            vectors_to_upload = []
            ids_in_batch = set()  # Guard against duplicate IDs within one upsert call.
            print(f"Processing batch {i//batch_size + 1}/{num_batches}...")
            for doc in batch:
                original_id = doc["original_id"]
                if original_id in ids_in_batch:
                    print(f"Warning: Duplicate ID '{original_id}' detected within the same batch. Skipping duplicate.")
                    continue
                ids_in_batch.add(original_id)
                hebrew = doc["hebrew_text"]
                english = doc["english_text"]
                # Language labels help the embedding model keep the two
                # languages distinct in the combined input.
                combined_text = f"Hebrew:\n{hebrew}\n\nEnglish:\n{english}"
                if not combined_text.strip():
                    print(f"Warning: Skipping document ID '{original_id}' due to empty combined text.")
                    continue
                embedding = get_embedding(combined_text)
                if embedding is None:
                    print(f"Warning: Failed to get embedding for document ID '{original_id}'. Skipping.")
                    continue
                # Metadata values must be strings/numbers; substitute "N/A"
                # so both text fields are always present.
                metadata_payload = {
                    "hebrew_text": hebrew if hebrew else "N/A",
                    "english_text": english if english else "N/A",
                    "source_name": doc.get("source_name", "Unknown"),
                    "original_id": original_id,  # Duplicate of the vector ID for convenience
                }
                vectors_to_upload.append({
                    "id": original_id,  # Original document ID doubles as the Pinecone vector ID
                    "values": embedding,
                    "metadata": metadata_payload,
                })
            if not vectors_to_upload:
                print(f"Batch {i//batch_size + 1} resulted in no vectors to upload. Skipping API call.")
                continue
            try:
                print(f"Upserting {len(vectors_to_upload)} vectors for batch {i//batch_size + 1}...")
                upsert_response = index.upsert(vectors=vectors_to_upload)
                print(f" Upsert response: {upsert_response}")
                total_uploaded += upsert_response.upserted_count
            except Exception as upsert_err:
                print(f"Error upserting batch {i//batch_size + 1}: {upsert_err}")
                traceback.print_exc()
                failed_batches += 1  # Keep going, but remember the failure.
            batch_time = time.time() - batch_start_time
            print(f"Batch {i//batch_size + 1} processed in {batch_time:.2f} seconds.")
            time.sleep(0.1)  # Small delay between batches (gentle on rate limits)
        print(f"\nFinished uploading. Total vectors successfully upserted: {total_uploaded}")
        # Verify with index stats (best effort; failures here are non-fatal).
        try:
            final_stats = index.describe_index_stats()
            print(f"Final index stats: {final_stats}")
        except Exception as stats_err:
            print(f"Could not fetch final index stats: {stats_err}")
        if failed_batches:
            print(f"{failed_batches} batch(es) failed to upsert.")
            return False
        return True
    except Exception as e:
        print(f"An unexpected error occurred during the upload process: {e}")
        traceback.print_exc()
        return False
def process_and_upload_file(file_path: str) -> bool:
    """Process a single JSON file and upload its documents to Pinecone.

    Returns True when parsing yielded documents and the upload succeeded,
    False otherwise.
    """
    # Guard clauses: refuse anything we cannot work with up front.
    if not os.path.exists(file_path):
        print(f"Error: File not found at '{file_path}'")
        return False
    if not file_path.lower().endswith(".json"):
        print(f"Error: This script currently only processes .json files. Found: '{file_path}'")
        return False
    if not openai_client or not index:
        print("Error: OpenAI client or Pinecone index not initialized. Cannot proceed.")
        return False

    print(f"\n--- Starting processing for file: {file_path} ---")
    started = time.time()

    # Step 1: parse the JSON file into normalized document dicts.
    parsed_docs = process_json_file(file_path)
    if not parsed_docs:
        print(f"No valid documents found in '{file_path}'. Upload skipped.")
        return False  # Or True if "empty file processed successfully" is the desired outcome

    # Step 2: embed and upsert the documents.
    upload_ok = upload_documents(parsed_docs)

    elapsed = time.time() - started
    print(f"--- Finished processing file: {file_path} in {elapsed:.2f} seconds ---")
    outcome = (
        f"Successfully processed and uploaded data from {file_path}"
        if upload_ok
        else f"Failed to upload data from {file_path}"
    )
    print(outcome)
    return upload_ok
# --- Main Execution Block ---
if __name__ == "__main__":
    # --- Configuration for script execution ---
    # Set the directory containing your JSON files
    data_directory = "data"  # CHANGE THIS to your data folder path
    # ---
    if not os.path.isdir(data_directory):
        print(f"Error: Data directory '{data_directory}' not found.")
        print("Please create the directory and place your JSON files inside, or update the 'data_directory' variable.")
    else:
        print(f"Looking for JSON files in directory: '{data_directory}'")
        json_files = [f for f in os.listdir(data_directory) if f.lower().endswith(".json")]
        if not json_files:
            print(f"No .json files found in '{data_directory}'.")
        else:
            print(f"Found {len(json_files)} JSON files: {json_files}")
            overall_success = True
            for filename in json_files:
                file_path = os.path.join(data_directory, filename)
                success = process_and_upload_file(file_path)
                if not success:
                    overall_success = False
                    # Bug fix: this message previously printed the literal
                    # text "(unknown)" instead of naming the failing file.
                    print(f"Processing failed for {file_path}. Check logs above.")
                    # Optional: stop processing remaining files on failure
                    # break
            if overall_success:
                print("\nAll files processed successfully.")
            else:
                print("\nSome files encountered errors during processing.")
    # Example for single file upload:
    # file_to_upload = "path/to/your/single_file.json"
    # if os.path.exists(file_to_upload):
    #     process_and_upload_file(file_to_upload)
    # else:
    #     print(f"File {file_to_upload} not found")