# ingestion_service.py

import os
import json
import openai
import pinecone
from pinecone import ServerlessSpec, PodSpec # Import spec classes
from typing import List, Dict, Optional
import time
import traceback
import urllib.parse # Keep for potential future ID encoding if needed

# --- Configuration ---
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
# PINECONE_ENVIRONMENT is deprecated for serverless/starter, use cloud/region
PINECONE_CLOUD = os.environ.get("PINECONE_CLOUD", "aws") # Default cloud
PINECONE_REGION = os.environ.get("PINECONE_REGION", "us-east-1") # Default region
INDEX_NAME = "chassidus-index" # Ensure this matches your index name
EMBEDDING_MODEL = "text-embedding-3-large" # Ensure this matches your embedding model
EMBEDDING_DIMENSIONS = 3072 # Dimension for text-embedding-3-large

print(f"Using Pinecone Index: {INDEX_NAME}")
print(f"Using Pinecone Cloud: {PINECONE_CLOUD}")
print(f"Using Pinecone Region: {PINECONE_REGION}")
print(f"Using OpenAI Embedding Model: {EMBEDDING_MODEL} (Dimensions: {EMBEDDING_DIMENSIONS})")
# --- End Configuration ---
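
# Example shell setup for the variables read above (a sketch; substitute your
# own keys, cloud, and region):
#   export OPENAI_API_KEY="sk-..."
#   export PINECONE_API_KEY="..."
#   export PINECONE_CLOUD="aws"        # optional, defaults to "aws"
#   export PINECONE_REGION="us-east-1" # optional, defaults to "us-east-1"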


# --- Initialize OpenAI Client ---
openai_client = None
if OPENAI_API_KEY:
    try:
        openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)
        print("OpenAI client initialized.")
    except Exception as e:
        print(f"Error initializing OpenAI client: {e}")
        traceback.print_exc()
else:
    print("ERROR: OPENAI_API_KEY not found. Ingestion requires it for embeddings.")


# --- Initialize Pinecone Client and Index ---
pc = None
index = None
if PINECONE_API_KEY and PINECONE_CLOUD and PINECONE_REGION:
    try:
        print("Initializing Pinecone client...")
        pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)

        # Check if index exists
        if INDEX_NAME not in [idx.name for idx in pc.list_indexes().indexes]:
            print(f"Index '{INDEX_NAME}' does not exist. Creating it now...")
            # --- Create Index (Choose ONE spec type) ---

            # Option A: Serverless (Recommended for new projects, pay-as-you-go)
            try:
                pc.create_index(
                    name=INDEX_NAME,
                    dimension=EMBEDDING_DIMENSIONS,
                    metric="cosine",  # or 'dotproduct', 'euclidean'
                    spec=ServerlessSpec(
                        cloud=PINECONE_CLOUD,
                        region=PINECONE_REGION
                    )
                )
                print(f"Serverless index '{INDEX_NAME}' created. Waiting for initialization...")
                while not pc.describe_index(INDEX_NAME).status['ready']:
                    time.sleep(1)
                print("Index is ready.")
            except Exception as create_err:
                print(f"Error creating Serverless index '{INDEX_NAME}': {create_err}")
                traceback.print_exc()
                # Fallback or specific error handling needed here

            # Option B: Pod-based (Older style, requires specifying pod type/size)
            # Uncomment below and comment out ServerlessSpec if you need Pod-based
            # try:
            #     # Example: Using a free tier pod (s1.x1) - adjust if needed
            #     # Note: PINECONE_ENVIRONMENT might be needed for older pod-based index creation
            #     pinecone_environment = os.environ.get("PINECONE_ENVIRONMENT") # Get environment if needed for pod
            #     if not pinecone_environment:
            #         raise ValueError("PINECONE_ENVIRONMENT is required for pod-based index creation.")
            #     pc.create_index(
            #         name=INDEX_NAME,
            #         dimension=EMBEDDING_DIMENSIONS,
            #         metric="cosine",
            #         spec=PodSpec(
            #             environment=pinecone_environment, # Use environment here
            #             pod_type="p1.x1", # Example pod type, check Pinecone docs
            #             pods=1
            #         )
            #     )
            #     print(f"Pod-based index '{INDEX_NAME}' created in environment '{pinecone_environment}'. Waiting...")
            #     while not pc.describe_index(INDEX_NAME).status['ready']:
            #         time.sleep(1)
            #     print("Index is ready.")
            # except Exception as create_err:
            #      print(f"Error creating Pod-based index '{INDEX_NAME}': {create_err}")
            #      traceback.print_exc()
            #      # Fallback or specific error handling needed here

        else:
            print(f"Index '{INDEX_NAME}' already exists.")

        # Connect to the index
        print(f"Connecting to index '{INDEX_NAME}'...")
        index = pc.Index(INDEX_NAME)
        print("Connected to Pinecone index.")
        stats = index.describe_index_stats()
        print(f"Initial index stats: {stats}")

    except Exception as e:
        print(f"Error initializing Pinecone or connecting to index: {e}")
        traceback.print_exc()
else:
    print("ERROR: Pinecone API Key, Cloud, or Region not found. Cannot connect to Pinecone.")


# --- Helper Functions ---

def get_embedding(text: str, model=EMBEDDING_MODEL) -> Optional[List[float]]:
    """Generate embedding for text using OpenAI API."""
    if not openai_client:
        print("Error: OpenAI client not initialized, cannot generate embedding.")
        return None
    try:
        text = text.replace("\n", " ") # OpenAI recommends replacing newlines
        if not text.strip(): # Handle empty strings
            print("Warning: Attempted to embed empty string.")
            return None
        response = openai_client.embeddings.create(input=[text], model=model)
        return response.data[0].embedding
    except openai.APIError as e:
        print(f"OpenAI API Error getting embedding: {e}")
    except Exception as e:
        print(f"Error getting embedding for text snippet: '{text[:100]}...'")
        traceback.print_exc()
    return None
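
# upload_documents() below embeds one document per API call. The embeddings
# endpoint also accepts a list of inputs and returns vectors in input order,
# so a batched variant can cut round-trips substantially. The helper below is
# a hypothetical sketch (get_embeddings_batch is not called anywhere in this
# script); swap it into the upload loop only after testing.
def get_embeddings_batch(texts: List[str], model=EMBEDDING_MODEL) -> Optional[List[List[float]]]:
    """Embed several texts in one API call; returns embeddings in input order."""
    if not openai_client:
        print("Error: OpenAI client not initialized, cannot generate embeddings.")
        return None
    cleaned = [t.replace("\n", " ") for t in texts]
    if any(not t.strip() for t in cleaned):
        # The API rejects empty inputs; callers should filter these out first
        print("Warning: Batch contains empty strings. Filter them before embedding.")
        return None
    try:
        response = openai_client.embeddings.create(input=cleaned, model=model)
        # response.data preserves input order; sort by index defensively anyway
        return [d.embedding for d in sorted(response.data, key=lambda d: d.index)]
    except Exception:
        print(f"Error getting embeddings for batch of {len(cleaned)} texts.")
        traceback.print_exc()
        return None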

def process_json_file(file_path: str) -> List[Dict]:
    """
    Process a JSON file containing a list of document objects.
    Each object should provide "id", "hebrew", and "english" keys.
    """
    documents = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            if not isinstance(data, list):
                print(f"Warning: Expected a list of objects in JSON file '{file_path}', found {type(data)}. Skipping.")
                return []

            for i, item in enumerate(data):
                if isinstance(item, dict):
                    original_id = item.get("id")
                    hebrew_text = item.get("hebrew")
                    english_text = item.get("english")

                    if not original_id:
                        print(f"Warning: Missing 'id' in item {i} of file '{file_path}'. Skipping.")
                        continue
                    if not hebrew_text and not english_text:
                        print(f"Warning: Missing both 'hebrew' and 'english' text in item {i} (ID: {original_id}) of file '{file_path}'. Skipping.")
                        continue
                    # Ensure texts are strings; default a missing side to "" rather than skipping the item
                    hebrew_text = hebrew_text or ""
                    english_text = english_text or ""

                    doc = {
                        "original_id": str(original_id), # Ensure ID is string
                        "hebrew_text": hebrew_text.strip(),
                        "english_text": english_text.strip(),
                        "source_name": os.path.basename(file_path) # Add source filename
                    }
                    documents.append(doc)
                else:
                    print(f"Warning: Item {i} in file '{file_path}' is not a dictionary. Skipping.")

    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from file '{file_path}': {e}")
        return []
    except Exception as e:
        print(f"Error processing file '{file_path}': {e}")
        traceback.print_exc()
        return []

    print(f"Processed {len(documents)} documents from '{file_path}'")
    return documents

def upload_documents(documents: List[Dict], batch_size: int = 100) -> bool:
    """
    Embeds combined Hebrew+English text and uploads vectors and metadata to Pinecone.
    Metadata includes separate hebrew_text and english_text.
    """
    if not index:
        print("Error: Pinecone index not initialized. Cannot upload.")
        return False
    if not documents:
        print("No documents provided to upload.")
        return True # Technically successful as there's nothing to do

    total_uploaded = 0
    try:
        num_batches = (len(documents) + batch_size - 1) // batch_size
        print(f"Preparing to upload {len(documents)} documents in {num_batches} batches of size {batch_size}...")

        for i in range(0, len(documents), batch_size):
            batch_start_time = time.time()
            batch = documents[i : i + batch_size]
            vectors_to_upload = []
            ids_in_batch = set()

            print(f"Processing batch {i//batch_size + 1}/{num_batches}...")

            for doc in batch:
                original_id = doc["original_id"]
                if original_id in ids_in_batch:
                    print(f"Warning: Duplicate ID '{original_id}' detected within the same batch. Skipping duplicate.")
                    continue
                ids_in_batch.add(original_id)

                hebrew = doc["hebrew_text"]
                english = doc["english_text"]

                # --- Create combined text for embedding ---
                # Add separators to potentially help the model distinguish languages
                combined_text = f"Hebrew:\n{hebrew}\n\nEnglish:\n{english}"
                # Alternative: Just concatenate if separators don't help much
                # combined_text = hebrew + "\n\n" + english

                if not combined_text.strip():
                    print(f"Warning: Skipping document ID '{original_id}' due to empty combined text.")
                    continue

                # --- Get Embedding ---
                embedding = get_embedding(combined_text)
                if embedding is None:
                    print(f"Warning: Failed to get embedding for document ID '{original_id}'. Skipping.")
                    continue

                # --- Prepare Metadata ---
                # Ensure metadata values are strings or numbers, handle None/empty
                metadata_payload = {
                    "hebrew_text": hebrew if hebrew else "N/A",
                    "english_text": english if english else "N/A",
                    "source_name": doc.get("source_name", "Unknown"),
                    "original_id": original_id # Store original ID in metadata too
                }
                # Optional: Clean metadata further if needed (e.g., truncate long texts)

                vectors_to_upload.append({
                    "id": original_id, # Use the original document ID as the Pinecone vector ID
                    "values": embedding,
                    "metadata": metadata_payload
                })

            if not vectors_to_upload:
                print(f"Batch {i//batch_size + 1} resulted in no vectors to upload. Skipping API call.")
                continue

            # --- Upsert to Pinecone ---
            try:
                print(f"Upserting {len(vectors_to_upload)} vectors for batch {i//batch_size + 1}...")
                upsert_response = index.upsert(vectors=vectors_to_upload)
                print(f"  Upsert response: {upsert_response}")
                total_uploaded += upsert_response.upserted_count
            except Exception as upsert_err:
                print(f"Error upserting batch {i//batch_size + 1}: {upsert_err}")
                traceback.print_exc()
                # Decide whether to continue with next batch or stop
                # return False # Stop on first batch error

            batch_time = time.time() - batch_start_time
            print(f"Batch {i//batch_size + 1} processed in {batch_time:.2f} seconds.")
            time.sleep(0.1) # Small delay between batches

        print(f"\nFinished uploading. Total vectors successfully upserted: {total_uploaded}")
        # Verify with index stats
        try:
            final_stats = index.describe_index_stats()
            print(f"Final index stats: {final_stats}")
        except Exception as stats_err:
            print(f"Could not fetch final index stats: {stats_err}")

        return True

    except Exception as e:
        print(f"An unexpected error occurred during the upload process: {e}")
        traceback.print_exc()
        return False
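
# The upsert call in upload_documents() is attempted once per batch; on error
# it logs and moves on. A hypothetical retry helper with exponential backoff
# (upsert_with_retry is a sketch, not called anywhere above) is one way to
# ride out transient rate limits or network blips:
def upsert_with_retry(batch_vectors: List[Dict], max_retries: int = 3, base_delay: float = 1.0):
    """Retry index.upsert with exponential backoff; re-raises after the final attempt."""
    for attempt in range(max_retries):
        try:
            return index.upsert(vectors=batch_vectors)
        except Exception as err:
            if attempt == max_retries - 1:
                raise
            delay = base_delay * (2 ** attempt)  # 1s, 2s, 4s, ... between attempts
            print(f"Upsert attempt {attempt + 1}/{max_retries} failed: {err}. Retrying in {delay:.1f}s...")
            time.sleep(delay)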

def process_and_upload_file(file_path: str) -> bool:
    """Main function to process a JSON file and upload its documents."""
    if not os.path.exists(file_path):
        print(f"Error: File not found at '{file_path}'")
        return False

    if not file_path.lower().endswith(".json"):
        print(f"Error: This script currently only processes .json files. Found: '{file_path}'")
        return False

    if not openai_client or not index:
        print("Error: OpenAI client or Pinecone index not initialized. Cannot proceed.")
        return False

    print(f"\n--- Starting processing for file: {file_path} ---")
    start_time = time.time()

    # 1. Process the JSON file
    documents = process_json_file(file_path)
    if not documents:
        print(f"No valid documents found in '{file_path}'. Upload skipped.")
        return False # Or True if "empty file processed successfully" is the desired outcome

    # 2. Upload the documents
    success = upload_documents(documents)

    end_time = time.time()
    print(f"--- Finished processing file: {file_path} in {end_time - start_time:.2f} seconds ---")

    if success:
        print(f"Successfully processed and uploaded data from {file_path}")
    else:
        print(f"Failed to upload data from {file_path}")

    return success

# --- Main Execution Block ---
if __name__ == "__main__":
    # --- Configuration for script execution ---
    # Set the directory containing your JSON files
    data_directory = "data" # CHANGE THIS to your data folder path
    # ---

    if not os.path.isdir(data_directory):
        print(f"Error: Data directory '{data_directory}' not found.")
        print("Please create the directory and place your JSON files inside, or update the 'data_directory' variable.")
    else:
        print(f"Looking for JSON files in directory: '{data_directory}'")
        json_files = [f for f in os.listdir(data_directory) if f.lower().endswith(".json")]

        if not json_files:
            print(f"No .json files found in '{data_directory}'.")
        else:
            print(f"Found {len(json_files)} JSON files: {json_files}")
            overall_success = True
            for filename in json_files:
                file_path = os.path.join(data_directory, filename)
                success = process_and_upload_file(file_path)
                if not success:
                    overall_success = False
                    print(f"Processing failed for {filename}. Check logs above.")
                    # Optional: stop processing remaining files on failure
                    # break

            if overall_success:
                print("\nAll files processed successfully.")
            else:
                print("\nSome files encountered errors during processing.")

    # Example for single file upload:
    # file_to_upload = "path/to/your/single_file.json"
    # if os.path.exists(file_to_upload):
    #     process_and_upload_file(file_to_upload)
    # else:
    #     print(f"File {file_to_upload} not found")