Upload 16 files
- .gitignore +160 -0
- .replit +31 -0
- analysis_service_anthropic.py +280 -0
- app.py +481 -0
- file_processor.py +315 -0
- generated-icon.png +0 -0
- generation_service_anthropic.py +103 -0
- generation_service_gemini.py +175 -0
- ingestion_service.py +367 -0
- main.py +21 -0
- package-lock.json +6 -0
- pyproject.toml +16 -0
- requirements.txt +10 -0
- retriever_pinecone.py +243 -0
- uv.lock +0 -0
- validation_service_openai.py +156 -0
.gitignore
ADDED
@@ -0,0 +1,160 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.replit
ADDED
@@ -0,0 +1,31 @@
entrypoint = "main.py"
modules = ["nodejs-20", "python-3.11"]

[nix]
channel = "stable-24_05"

[unitTest]
language = "python3"

[gitHubImport]
requiredFiles = [".replit", "replit.nix"]

[deployment]
run = ["sh", "-c", "python -m streamlit run --server.address 0.0.0.0 --server.headless true --server.enableCORS=false --server.enableXsrfProtection=false --server.enableWebsocketCompression=false app.py"]
deploymentTarget = "cloudrun"

[workflows]
runButton = "Run"

[[workflows.workflow]]
name = "Run"
author = 22737092
mode = "sequential"

[[workflows.workflow.tasks]]
task = "shell.exec"
args = "python main.py"

[[ports]]
localPort = 8501
externalPort = 80
analysis_service_anthropic.py
ADDED
@@ -0,0 +1,280 @@
# analysis_service_anthropic.py

import anthropic
import os
import json
import re
import traceback

# --- Configuration ---
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
# --- MODEL NAME CHANGED TO SPECIFIC VERSION, LATEST AS COMMENT ---
# Use the specific dated version for potentially more stable results in production.
# 'latest' alias points to this or newer snapshots.
# Original: ANALYSIS_MODEL = "claude-3-7-sonnet-latest"
ANALYSIS_MODEL = "claude-3-7-sonnet-20250219"  # Or use "claude-3-7-sonnet-latest"
# --- End Configuration ---

client = None
if ANTHROPIC_API_KEY:
    try:
        # Use the specific model name in initialization log
        client = anthropic.AsyncAnthropic(api_key=ANTHROPIC_API_KEY)
        print(
            f"Anthropic ASYNC client initialized for analysis (Model: {ANALYSIS_MODEL})."
        )
    except Exception as e:
        print(f"Error initializing Anthropic ASYNC client for analysis: {e}")
else:
    print("ANTHROPIC_API_KEY not found. Analysis service will not function.")


# --- Helper functions (clean_json_string, check_analyzer_status, clean_source_text) ---
def clean_json_string(json_string):
    """Attempts to clean common issues in JSON strings returned by LLMs."""
    if not isinstance(json_string, str):
        return ""
    # Remove trailing commas before closing brackets/braces
    cleaned = re.sub(r",(\s*[}\]])", r"\1", json_string)
    # Remove markdown code block fences
    cleaned = re.sub(r"^```json\s*", "", cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r"\s*```$", "", cleaned)
    return cleaned.strip()

def check_analyzer_status():
    """Checks if the Anthropic analyzer service is ready."""
    if not client:
        return False, "Anthropic client not initialized (check API key)."
    return True, f"Analysis service ready (Model: {ANALYSIS_MODEL})."  # Show model in status

def clean_source_text(text):
    """Cleans source text by removing specific patterns."""
    if not text: return ""
    # Remove @number patterns, <HAL> tags (case-insensitive), <br> tags, and normalize whitespace
    cleaned = text
    cleaned = re.sub(r'@\d+', '', cleaned)
    cleaned = re.sub(r'<HAL>', '', cleaned, flags=re.IGNORECASE)
    cleaned = cleaned.replace('<br>', ' ').replace('<br />', ' ')
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned
# --- End Helper Functions ---


async def analyze_source_relevance_async(paragraph_hebrew, paragraph_english,
                                         user_question):
    """
    Analyzes a Hebrew text paragraph for relevance to a user's question using Anthropic.

    Args:
        paragraph_hebrew (str): The Hebrew text snippet to analyze.
        paragraph_english (str): The English translation (currently unused but kept for signature consistency).
        user_question (str): The user's question in Hebrew.

    Returns:
        dict or None: A dictionary containing the analysis result (relevance score, headline, conclusion)
                      if successful, otherwise None.
    """
    global client
    ready, msg = check_analyzer_status()
    if not ready or client is None:
        print(f"Analyzer not ready: {msg}")
        return None

    # Ensure inputs are strings, even if empty
    paragraph_hebrew = str(paragraph_hebrew) if paragraph_hebrew is not None else ""
    user_question = str(user_question) if user_question is not None else ""

    if not paragraph_hebrew or not user_question:
        print(
            "Warning: Missing Hebrew paragraph or user question for analysis.")
        # Return a default non-relevant structure instead of None if needed by downstream logic
        # return {"relevance": {"is_relevant": False, "relevance_score": 1, "explanation": "קלט חסר."}, "headline": {"hebrew": "קלט חסר"}, "conclusion": {"hebrew": "קלט חסר."}}
        return None  # Keep returning None for now

    original_snippet = paragraph_hebrew[:60].replace('\n', ' ')
    cleaned_hebrew = clean_source_text(paragraph_hebrew)
    cleaned_snippet = cleaned_hebrew[:60].replace('\n', ' ')

    # === START NEW SYSTEM PROMPT ===
    system_prompt = """You are an expert analyst specializing in Chassidic texts, particularly the works of the Satmar Rebbe, Rabbi Yoel Teitelbaum (Divrei Yoel). Your task is to evaluate a single Hebrew paragraph provided by the user based *only* on its relevance to the user's specific Hebrew question.

You MUST output your analysis STRICTLY as a single, valid JSON object, with no other text before or after the JSON structure.

The JSON object must have the following structure:

{
  "relevance": {
    "is_relevant": boolean, // True if the paragraph directly discusses or provides significant information related to the user's question. False otherwise.
    "relevance_score": integer, // A score from 1 (completely irrelevant) to 10 (directly and fully answers a key aspect of the question). Assess based *only* on the provided paragraph content.
    "explanation": string // A concise explanation IN HEBREW justifying the score and relevance assessment, referring only to the content of the paragraph. Explain *why* it is or is not relevant.
  },
  "headline": {
    "hebrew": string // A very brief (3-7 words) headline IN HEBREW summarizing the paragraph's main topic *as it relates to the question*. If irrelevant, summarize the paragraph's general topic.
  },
  "conclusion": {
    "hebrew": string // A single sentence IN HEBREW summarizing the key takeaway or information *from the paragraph* that is relevant to the question. If the paragraph is irrelevant, state clearly in Hebrew that it does not address the question (e.g., "הפסקה אינה עוסקת בשאלה.").
  }
}

Base your entire analysis SOLELY on the Hebrew text paragraph provided. Do not use external knowledge. Ensure the output is valid JSON.
"""
    # === END NEW SYSTEM PROMPT ===

    # === START NEW USER MESSAGE CONTENT ===
    # Ensure inputs are properly escaped if they contain characters that could break JSON structure within the f-string, though unlikely here.
    # Using f-string for clarity, but ensure no direct injection vulnerability if inputs were different.
    user_message_content = f"""Please analyze the following Hebrew text passage based *only* on its content and relevance to the specific Hebrew question provided below. Adhere strictly to the JSON output format specified in the system prompt.

**User Question (Hebrew):**
{user_question}

**Hebrew Text Passage to Analyze:**
<paragraph>
{cleaned_hebrew}
</paragraph>
"""
    # === END NEW USER MESSAGE CONTENT ===

    print(
        f" -> Sending cleaned paragraph (Snippet: '{cleaned_snippet}...') for Claude analysis (Model: {ANALYSIS_MODEL}) regarding question: '{user_question[:60]}...'"
    )

    try:
        # --- API Call - Adheres to Messages API format ---
        message = await client.messages.create(
            model=ANALYSIS_MODEL,  # Correct parameter
            max_tokens=1024,  # Correct parameter (Estimate response size, maybe smaller?)
            system=system_prompt,  # Correct parameter (Using new detailed prompt)
            messages=[{  # Correct parameter and structure
                "role": "user",
                "content": user_message_content  # Using new detailed content
            }],
            temperature=0.1,  # Lower temperature for deterministic analysis
            # --- Thinking Parameter - Commented Out for Analysis ---
            # thinking={"type": "enabled", "budget_tokens": 16000} # Consider if needed for complex analysis
            # --- End Thinking Parameter ---
        )

        # --- Process the response ---
        if not message or not message.content or not isinstance(message.content, list) or not message.content:
            print(
                f" <- Analysis failed: Invalid message object or empty content list received from API for snippet '{cleaned_snippet}...'. Message: {message}"
            )
            return None

        first_block = message.content[0]
        if not first_block or not hasattr(first_block, 'text') or not first_block.text:
            print(
                f" <- Analysis failed: First content block is invalid or has no text for snippet '{cleaned_snippet}...'. First block: {first_block}"
            )
            return None

        raw_response_text = first_block.text
        print(f" <- Raw analysis response snippet: {raw_response_text[:200]}...")  # Log raw response

        # --- JSON Parsing Logic ---
        # Attempt to find JSON block first
        json_match = re.search(r"\{.*\}", raw_response_text, re.DOTALL)
        json_to_parse = None
        if json_match:
            json_block = json_match.group(0)
            # Further clean the extracted block
            json_to_parse = clean_json_string(json_block)
            print(f" -- Extracted JSON block: {json_to_parse[:100]}...")
        else:
            # If no block found, try cleaning the whole response (less reliable)
            json_to_parse = clean_json_string(raw_response_text)
            print(f" -- No JSON block found, attempting parse on cleaned full response: {json_to_parse[:100]}...")


        if not json_to_parse or not json_to_parse.startswith("{") or not json_to_parse.endswith("}"):
            print(f" <- Analysis failed: Could not extract valid JSON structure after cleaning. Cleaned data: '{json_to_parse[:100]}...'")
            # Log more context on failure
            print(f" -- Original raw response was: {raw_response_text}")
            return None

        try:
            analysis_result = json.loads(json_to_parse)
        except json.JSONDecodeError as json_err:
            print(f" <- Analysis failed: JSONDecodeError - {json_err}. Problematic JSON string (cleaned): '{json_to_parse}'")
            # Log more context on failure
            print(f" -- Original raw response was: {raw_response_text}")
            return None

        # --- Validate Structure ---
        # Add more verbose checks for debugging
        if not isinstance(analysis_result, dict):
            print(f" <- Analysis failed: Parsed result is not a dictionary. Type: {type(analysis_result)}")
            return None

        # Check top-level keys
        if not all(key in analysis_result for key in ['relevance', 'headline', 'conclusion']):
            print(f" <- Analysis failed: Missing top-level keys. Found: {list(analysis_result.keys())}")
            return None

        # Check nested structure and types
        relevance_data = analysis_result.get('relevance')
        headline_data = analysis_result.get('headline')
        conclusion_data = analysis_result.get('conclusion')

        if not isinstance(relevance_data, dict) or \
           not all(k in relevance_data for k in ['is_relevant', 'relevance_score', 'explanation']) or \
           not isinstance(relevance_data.get('is_relevant'), bool) or \
           not isinstance(relevance_data.get('relevance_score'), int) or \
           not isinstance(relevance_data.get('explanation'), str):
            print(f" <- Analysis failed: Incorrect structure or types in 'relevance' field. Data: {relevance_data}")
            return None

        if not isinstance(headline_data, dict) or \
           'hebrew' not in headline_data or \
           not isinstance(headline_data.get('hebrew'), str):
            print(f" <- Analysis failed: Incorrect structure or types in 'headline' field. Data: {headline_data}")
            return None

        if not isinstance(conclusion_data, dict) or \
           'hebrew' not in conclusion_data or \
           not isinstance(conclusion_data.get('hebrew'), str):
            print(f" <- Analysis failed: Incorrect structure or types in 'conclusion' field. Data: {conclusion_data}")
            return None

        # If all checks pass
        print(
            f" <- Analysis successful for snippet '{cleaned_snippet}...'. Score: {analysis_result.get('relevance', {}).get('relevance_score', 'N/A')}"
        )
        return analysis_result

    # --- Error Handling - Aligns with library exceptions ---
    except anthropic.APIStatusError as e:
        print(
            f" <- Anthropic API Status Error (Analysis): Status={e.status_code} Response={e.response} for snippet '{cleaned_snippet}...'. Model: {ANALYSIS_MODEL}"
        )
        if e.status_code == 400:
            print(
                f" <- NOTE: 400 Bad Request. Possible causes: Model name '{ANALYSIS_MODEL}' invalid/unavailable OR API parameters incorrect OR input/output token limits exceeded."
            )
        # Log relevant parts of the request if possible (be careful with sensitive data)
        print(f" -- Failing request details: Question='{user_question[:60]}...', Paragraph Snippet='{cleaned_snippet}...', System Prompt Length={len(system_prompt)}, User Content Length={len(user_message_content)}")
        return None
    except Exception as e:
        print(
            f" <- Unexpected error during Claude analysis API call ({type(e).__name__}) for snippet '{cleaned_snippet}...': {e}"
        )
        traceback.print_exc()
        # Log relevant parts of the request if possible
        print(f" -- Failing request details: Question='{user_question[:60]}...', Paragraph Snippet='{cleaned_snippet}...', System Prompt Length={len(system_prompt)}, User Content Length={len(user_message_content)}")
        return None

# --- Example Usage (No changes needed) ---
# Consider adding a small async test function here if needed
# import asyncio
# async def main():
#     test_q = "מהי חשיבות השמחה בעבודת ה'?"
#     test_p = "ועיקר עבודת ה' היא בשמחה, כמו שכתוב 'עבדו את ה' בשמחה', כי השמחה פותחת הלב ומאירה הנשמה, ומביאה לידי דביקות בהשי\"ת. ועל ידי העצבות ח\"ו נסתם הלב ואינו יכול לקבל אור הקדושה."
#     result = await analyze_source_relevance_async(test_p, "", test_q)
#     print("\n--- Test Analysis Result ---")
#     print(json.dumps(result, indent=2, ensure_ascii=False))
# if __name__ == "__main__":
#     if ANTHROPIC_API_KEY:
#         asyncio.run(main())
#     else:
#         print("Cannot run test: ANTHROPIC_API_KEY not set.")
app.py
ADDED
@@ -0,0 +1,481 @@
# app.py - LangSmith enabled, designed for Replit + Anthropic + OpenAI
import os
import streamlit as st
import time
import traceback
import json
import asyncio
import nest_asyncio
from typing import List, Dict
from dotenv import load_dotenv

load_dotenv()

# ----- SETUP SECRETS AND ENV -----
# Hardcoded (safe): you never need these in secrets!
os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGSMITH_TRACING"] = "true"
# The following must exist in your Replit secrets:
# OPENAI_API_KEY, ANTHROPIC_API_KEY, LANGSMITH_API_KEY, LANGSMITH_PROJECT
os.environ["OPENAI_API_KEY"] = os.environ["OPENAI_API_KEY"]
os.environ["ANTHROPIC_API_KEY"] = os.environ["ANTHROPIC_API_KEY"]
os.environ["LANGSMITH_API_KEY"] = os.environ["LANGSMITH_API_KEY"]
os.environ["LANGSMITH_PROJECT"] = os.environ["LANGSMITH_PROJECT"]
# ----------------------------------

from langsmith import traceable

nest_asyncio.apply()

from retriever_pinecone import find_similar_paragraphs, check_retriever_status
from analysis_service_anthropic import (
    analyze_source_relevance_async,
    check_analyzer_status,
    ANALYSIS_MODEL as ANTHROPIC_ANALYSIS_MODEL,
)
from generation_service_anthropic import (
    generate_response_stream_async as generate_anthropic,
    check_generator_status as check_anthropic_generator,
    GENERATION_MODEL as ANTHROPIC_GENERATION_MODEL,
)
from generation_service_gemini import (
    generate_response_stream_gemini as generate_gemini,
    check_gemini_generator_status,
    GENERATION_MODEL as GEMINI_GENERATION_MODEL,
)
from validation_service_openai import (
    validate_paragraph_relevance_gpt4o,
    check_openai_validator_status,
    VALIDATION_MODEL as GPT4O_VALIDATION_MODEL,
)

try:
    from generation_service_anthropic import format_context_for_prompt
    print("Format context function potentially available.")
except ImportError:
    print("Warning: format_context_for_prompt not imported.")

st.set_page_config(page_title="Divrey Yoel AI Chat", layout="wide")
st.markdown(
    """<style>
    .rtl-text { direction: rtl; text-align: right; }
    .hebrew-text { font-family: 'Arial Hebrew', 'David', sans-serif; direction: rtl; text-align: right; font-size: 1.1em; margin-bottom: 5px; }
    .source-info { font-size: 0.85em; color: #666; margin-bottom: 8px; }
    .expander-content > div { border-bottom: 1px solid #eee; padding-bottom: 15px; margin-bottom: 15px; }
    .expander-content > div:last-child { border-bottom: none; margin-bottom: 0; padding-bottom: 0; }
    .stChatMessage .stExpander { margin-top: 15px; border-left: 3px solid #ddd; padding-left: 10px; }
    .stStatus div[data-testid="stStatusContent"] p { direction: rtl; text-align: right; }
    .stButton > button[kind="header"] { direction: rtl; text-align: right; }
    .stExpander div[data-testid="stVerticalBlock"] code { display: block; text-align: right; direction: rtl; }
    .alert-warning { padding: 0.75rem 1.25rem; margin-bottom: 1rem; border: 1px solid transparent;
                     border-radius: 0.25rem; color: #856404; background-color: #fff3cd; border-color: #ffeeba;}
    </style>""",
    unsafe_allow_html=True,
)
st.markdown("<h1 class='rtl-text'>Divrey Yoel AI Chat</h1>", unsafe_allow_html=True)
st.markdown("<p class='rtl-text'>חיפוש בטקסטים חסידיים באמצעות RAG</p>", unsafe_allow_html=True)

# --- Status Checks & Sidebar ---
retriever_ready, retriever_msg = check_retriever_status()
anthropic_analyzer_ready, anthropic_analyzer_msg = check_analyzer_status()
anthropic_generator_ready, anthropic_generator_msg = check_anthropic_generator()
gemini_generator_ready, gemini_generator_msg = check_gemini_generator_status()
openai_validator_ready, openai_validator_msg = check_openai_validator_status()

st.sidebar.markdown("<h3 class='rtl-text'>מצב המערכת</h3>", unsafe_allow_html=True)
st.sidebar.markdown(
    f"<p class='rtl-text'><strong>מאחזר (Pinecone):</strong> {'✅' if retriever_ready else '❌'}</p>",
    unsafe_allow_html=True,
)
if not retriever_ready:
    st.sidebar.markdown(
        f"<div class='alert alert-warning rtl-text' role='alert'>{retriever_msg}</div>", unsafe_allow_html=True
    )
    st.markdown(
        "<p class='rtl-text' style='color: red;'><strong>שירות האחזור (Pinecone) אינו זמין. לא ניתן להמשיך.</strong></p>",
        unsafe_allow_html=True,
    )
    st.stop()

st.sidebar.markdown("<hr>", unsafe_allow_html=True)
st.sidebar.markdown(
    f"<p class='rtl-text'><strong>מנתח (Anthropic):</strong> {'✅ <small>(נדרש לשיטת Anthropic)</small>' if anthropic_analyzer_ready else '❌ <small>(נדרש לשיטת Anthropic)</small>'}</p>",
    unsafe_allow_html=True,
)
st.sidebar.markdown(
    f"<p class='rtl-text'><strong>מאמת (GPT-4o):</strong> {'✅ <small>(נדרש לשיטת GPT-4o)</small>' if openai_validator_ready else '❌ <small>(נדרש לשיטת GPT-4o)</small>'}</p>",
    unsafe_allow_html=True,
)
st.sidebar.markdown(
    f"<p class='rtl-text'><strong>מחולל (Anthropic):</strong> {'✅ <small>(נדרש לשיטות Anthropic/GPT-4o)</small>' if anthropic_generator_ready else '❌ <small>(נדרש לשיטות Anthropic/GPT-4o)</small>'}</p>",
    unsafe_allow_html=True,
)
st.sidebar.markdown(
    f"<p class='rtl-text'><strong>מחולל (Gemini):</strong> {'✅ <small>(נדרש לשיטת Gemini)</small>' if gemini_generator_ready else '❌ <small>(נדרש לשיטת Gemini)</small>'}</p>",
    unsafe_allow_html=True,
)
st.sidebar.markdown("<hr>", unsafe_allow_html=True)

st.sidebar.markdown("<h3 class='rtl-text'>הגדרות RAG</h3>", unsafe_allow_html=True)
pipeline_method = st.sidebar.selectbox(
    "בחר שיטת עיבוד:",
    options=[
        "Anthropic (ניתוח וסינון פרטני)",
        "Gemini (אחזור ויצירה ישירה)",
        "GPT-4o Paragraph Validator + Claude Synthesizer",
    ],
    index=2,
)
is_anthropic_pipeline = pipeline_method == "Anthropic (ניתוח וסינון פרטני)"
is_gemini_pipeline = pipeline_method == "Gemini (אחזור ויצירה ישירה)"
is_gpt4o_para_pipeline = pipeline_method == "GPT-4o Paragraph Validator + Claude Synthesizer"

n_retrieve = st.sidebar.slider(
    "מספר פסקאות לאחזור (Retrieve)", 1, 300, 100,
    help="כמה פסקאות לאחזר ראשונית (משותף לכל השיטות)."
)
n_analyze = st.sidebar.slider(
    "מספר פסקאות לניתוח (Anthropic בלבד)", 1, min(n_retrieve, 50), min(21, n_retrieve, 50),
    help="כמה פסקאות יישלחו לניתוח רלוונטיות פרטני ע'י Claude.",
    disabled=not is_anthropic_pipeline
)
relevance_thresh = st.sidebar.slider(
    "סף רלוונטיות (Anthropic בלבד)", 1, 10, 5,
    help="הציון המינימלי (1-10) שפסקה צריכה לקבל מ-Claude כדי להיחשב רלוונטית.",
    disabled=not is_anthropic_pipeline
)
n_validate = st.sidebar.slider(
    "מספר פסקאות לאימות (GPT-4o בלבד)", 1, min(n_retrieve, 100), min(50, n_retrieve),
    help="כמה מהפסקאות שאוחזרו יישלחו לאימות רלוונטיות פרטני ע'י GPT-4o.",
    disabled=not is_gpt4o_para_pipeline
)
n_final_context = st.sidebar.slider(
    "פסקאות מקסימום להקשר סופי (Gemini/Anthropic)", 1, n_retrieve, min(21, n_retrieve),
    help="Gemini/Anthropic: כמה מהפסקאות הטובות ביותר יישלחו ליצירה. GPT-4o: לא בשימוש ישיר (הקשר נקבע ע'י האימות).",
    disabled=is_gpt4o_para_pipeline
)

services_ready = (
    retriever_ready and
    ((anthropic_analyzer_ready and anthropic_generator_ready) if is_anthropic_pipeline else True) and
    (gemini_generator_ready if is_gemini_pipeline else True) and
    ((openai_validator_ready and anthropic_generator_ready) if is_gpt4o_para_pipeline else True)
)

if not services_ready and retriever_ready:
    st.markdown(
        f"<div class='alert alert-warning rtl-text' role='alert'>שירות(ים) חסרים. ודא שכל השירותים דרושים זמינים.</div>",
        unsafe_allow_html=True,
    )

@traceable
def run_rag_pipeline(pipeline_prompt: str, selected_pipeline_method: str, status_container=None):
    is_anthropic_pipeline = selected_pipeline_method == "Anthropic (ניתוח וסינון פרטני)"
    is_gemini_pipeline = selected_pipeline_method == "Gemini (אחזור ויצירה ישירה)"
    is_gpt4o_para_pipeline = selected_pipeline_method == "GPT-4o Paragraph Validator + Claude Synthesizer"
    result = {
        "full_response": "", "final_docs_data": [], "status_updates": [],
        "error": None, "analysis_flow": selected_pipeline_method
    }
    current_status_label = "מתחיל עיבוד..."
    message_placeholder = st.empty()
    try:
        current_status_label = f"1. מאחזר עד {n_retrieve} פסקאות מ-Pinecone..."
        start_retrieval = time.time()
        if status_container: status_container.update(label=current_status_label)
        retrieved_docs = find_similar_paragraphs(query_text=pipeline_prompt, n_results=n_retrieve)
        retrieval_time = time.time() - start_retrieval
        status_msg = f"אוחזרו {len(retrieved_docs)} פסקאות ב-{retrieval_time:.2f} שניות."
        result["status_updates"].append(f"1. {status_msg}")
        current_status_label = f"1. {status_msg}"
        if status_container: status_container.update(label=current_status_label)

        if not retrieved_docs:
            result["full_response"] = "<div class='rtl-text'>לא אותרו מקורות רלוונטיים לשאילתה.</div>"
            if status_container: status_container.update(label="לא נמצאו מסמכים.", state="complete")
            message_placeholder.markdown(result["full_response"], unsafe_allow_html=True)
            return result

        docs_for_generator = []
        generator_name = ""

        if is_anthropic_pipeline:
            generator_name = "Anthropic"
            analysis_count = min(len(retrieved_docs), n_analyze)
            current_status_label = f"2. [Anthropic] מנתח רלוונטיות פרטנית ({analysis_count} פסקאות)..."
            analysis_start_time = time.time()
            if status_container: status_container.update(label=current_status_label)
            async def run_anthropic_analysis():
                docs_to_analyze_local = retrieved_docs[:analysis_count]
                tasks = [analyze_source_relevance_async(d.get('hebrew_text',''), '', pipeline_prompt) for d in docs_to_analyze_local]
                analysis_results = await asyncio.gather(*tasks, return_exceptions=True)
                return docs_to_analyze_local, analysis_results
            try:
                loop = asyncio.get_event_loop_policy().get_event_loop()
                if loop.is_running(): nest_asyncio.apply(); loop = asyncio.get_event_loop_policy().get_event_loop()
                docs_analyzed, analysis_raw_results = loop.run_until_complete(run_anthropic_analysis())
            except Exception as loop_err: raise
            processed_for_filter = []; analysis_success_count = 0; analysis_fail_count = 0;
            for i, doc in enumerate(docs_analyzed):
                res = analysis_raw_results[i]
                if isinstance(res, dict) and 'relevance' in res:
                    doc['analysis'] = res; processed_for_filter.append(doc); analysis_success_count += 1
                elif isinstance(res, Exception): analysis_fail_count += 1;
                else: analysis_fail_count += 1;
            analysis_time = time.time() - analysis_start_time
            status_msg = f"ניתוח Anthropic פרטני הושלם ({analysis_success_count} הצלחות, {analysis_fail_count} כשלונות) ב-{analysis_time:.2f} שניות."
            result["status_updates"].append(f"2. {status_msg}")
            current_status_label = f"2. {status_msg}"
            if status_container: status_container.update(label=current_status_label)
            current_status_label = "3. [Anthropic] סינון לפי ציון רלוונטיות..."
            if status_container: status_container.update(label=current_status_label)
            filtered_docs = []
            for doc in processed_for_filter:
                try:
                    score = int(doc.get('analysis', {}).get('relevance', {}).get('relevance_score', '0'))
                    doc['analysis']['relevance']['numeric_score'] = score
                    if score >= relevance_thresh: filtered_docs.append(doc)
                except Exception as filter_err: pass
            filtered_docs.sort(key=lambda d: d.get('analysis',{}).get('relevance',{}).get('numeric_score', 0), reverse=True)
            docs_for_generator = filtered_docs[:n_final_context]
            status_msg = f"נבחרו {len(docs_for_generator)} פסקאות לאחר סינון Anthropic (סף: {relevance_thresh}, מקס': {n_final_context})."
            result["status_updates"].append(f"3. {status_msg}")
            current_status_label = f"3. {status_msg}"
            if status_container: status_container.update(label=current_status_label)
            if not docs_for_generator:
                result["full_response"] = "<div class='rtl-text'>לא נמצאו פסקאות רלוונטיות מספיק לאחר סינון Anthropic פרטני.</div>"
                if status_container: status_container.update(label="לא נמצאו פסקאות מסוננות.", state="complete")
                message_placeholder.markdown(result["full_response"], unsafe_allow_html=True)
                return result

        elif is_gemini_pipeline:
            generator_name = "Gemini"
            status_msg = "2. דילוג על שלב ניתוח/סינון (שיטת Gemini)."; result["status_updates"].append(status_msg)
            current_status_label = status_msg;
            if status_container: status_container.update(label=current_status_label)
            docs_for_generator = retrieved_docs[:n_final_context]
            status_msg = f"3. נבחרו {len(docs_for_generator)} פסקאות מובילות (לפי אחזור) להקשר עבור Gemini (מקס': {n_final_context})."
            result["status_updates"].append(status_msg)
            current_status_label = status_msg
            if status_container: status_container.update(label=current_status_label)
            if not docs_for_generator:
                result["full_response"] = "<div class='rtl-text'>לא אותרו מסמכים כלל (שגיאה פנימית).</div>"
                if status_container: status_container.update(label="שגיאה בבחירת הקשר.", state="error")
                message_placeholder.markdown(result["full_response"], unsafe_allow_html=True)
                return result

        elif is_gpt4o_para_pipeline:
            generator_name = "Anthropic"
            docs_to_validate = retrieved_docs[:n_validate]
            num_to_validate = len(docs_to_validate)
            if not docs_to_validate:
                result["full_response"] = "<div class='rtl-text'>שגיאה: אין מסמכים לאימות (לאחר אחזור).</div>"
                if status_container: status_container.update(label="שגיאה לפני אימות.", state="error")
                message_placeholder.markdown(result["full_response"], unsafe_allow_html=True)
                return result
            status_msg = f"2. נבחרו {num_to_validate} פסקאות מובילות לאימות פרטני (מתוך {len(retrieved_docs)})."
            result["status_updates"].append(status_msg)
            current_status_label = status_msg
            if status_container: status_container.update(label=current_status_label)
            current_status_label = f"3. [GPT-4o] מתחיל אימות מקבילי של {num_to_validate} פסקאות..."
            validation_start_time = time.time()
            if status_container: status_container.update(label=current_status_label)
            tasks = [validate_paragraph_relevance_gpt4o(doc, pipeline_prompt, i) for i, doc in enumerate(docs_to_validate)]
            validation_results = []
            try:
                loop = asyncio.get_event_loop_policy().get_event_loop()
                if loop.is_running(): nest_asyncio.apply(); loop = asyncio.get_event_loop_policy().get_event_loop()
                validation_results = loop.run_until_complete(asyncio.gather(*tasks, return_exceptions=True))
            except Exception as gather_err:
                result["error"] = f"שגיאה בביצוע האימות המקבילי: {gather_err}"
                result["full_response"] = f"<div class='rtl-text'>אירעה שגיאה קריטית בשלב אימות המידע.</div>";
                if status_container: status_container.update(label="שגיאה באימות!", state="error")
                message_placeholder.markdown(result["full_response"], unsafe_allow_html=True)
                return result
            validation_time = time.time() - validation_start_time
            passed_count = 0; failed_count = 0; filtered_paragraphs = []
            current_status_label = "4. [GPT-4o] סינון פסקאות לפי תוצאות האימות..."
            if status_container: status_container.update(label=current_status_label)
            for i, res in enumerate(validation_results):
                para_num = i + 1
                if isinstance(res, Exception): failed_count += 1;
                elif isinstance(res, dict) and res.get("validation"):
                    if res["validation"].get("contains_relevant_info") is True:
                        passed_count += 1; filtered_paragraphs.append(res.get("paragraph_data", {}))
                else: failed_count += 1;
            filtered_paragraphs = [p for p in filtered_paragraphs if p]
            status_msg_val = f"אימות GPT-4o פרטני הושלם ({passed_count} עברו, {num_to_validate - passed_count - failed_count} נדחו, {failed_count} נכשלו) ב-{validation_time:.2f} שניות."
            result["status_updates"].append(f"3. {status_msg_val}")
            status_msg_filter = f"נאספו {len(filtered_paragraphs)} פסקאות רלוונטיות לאחר אימות."
            result["status_updates"].append(f"4. {status_msg_filter}")
            current_status_label = f"4. {status_msg_filter}"
            if status_container: status_container.update(label=current_status_label)
            if not filtered_paragraphs:
                result["full_response"] = "<div class='rtl-text'>לא נמצא מידע רלוונטי בפסקאות שנבדקו ע'י GPT-4o.</div>"
                if status_container: status_container.update(label="לא נמצא מידע רלוונטי.", state="complete")
                message_placeholder.markdown(result["full_response"], unsafe_allow_html=True)
                return result
            docs_for_generator = filtered_paragraphs

        else:
            raise ValueError(f"שיטת עיבוד לא ידועה: {selected_pipeline_method}")

        current_status_label = f"5. מכין הקשר ({len(docs_for_generator)} פסקאות) ומחולל תשובה סופית ({generator_name})..."
        result["status_updates"].append(f"5. מכין הקשר ומחולל תשובה ({generator_name})...")
        if status_container: status_container.update(label=current_status_label)

        start_generation = time.time()
        final_response_text = ""
        generation_error_details = None
        result["final_docs_data"] = docs_for_generator

        try:
            if generator_name == "Gemini":
                generator_stream = generate_gemini(query=pipeline_prompt, context_documents=docs_for_generator)
                response_chunks = []
                for chunk in generator_stream:
                    if isinstance(chunk, str) and chunk.strip().startswith("--- שגיאה"):
                        generation_error_details = chunk.strip()
                        break
                    response_chunks.append(str(chunk))
                    temp_stream_response = "".join(response_chunks)
                    message_placeholder.markdown(f"<div class='rtl-text'>{temp_stream_response}▌</div>", unsafe_allow_html=True)
                if generation_error_details is None: final_response_text = "".join(response_chunks)
            elif generator_name == "Anthropic":
                async def consume_anthropic_stream():
                    history = [{"role": "user", "content": pipeline_prompt}]
                    local_chunks = []
                    async for chunk in generate_anthropic(messages=history, context_documents=docs_for_generator):
                        if isinstance(chunk, str) and chunk.strip().startswith("--- שגיאה"):
                            raise RuntimeError(f"Error yielded from Anthropic generator: {chunk.strip()}")
                        local_chunks.append(str(chunk))
                        temp_response = "".join(local_chunks)
                        message_placeholder.markdown(f"<div class='rtl-text'>{temp_response}▌</div>", unsafe_allow_html=True)
                    return "".join(local_chunks)
                try:
                    loop = asyncio.get_event_loop_policy().get_event_loop()
                    if loop.is_running(): nest_asyncio.apply(); loop = asyncio.get_event_loop_policy().get_event_loop()
                    final_response_text = loop.run_until_complete(consume_anthropic_stream())
                except Exception as consume_err:
                    generation_error_details = f"{type(consume_err).__name__}: {str(consume_err)}"
            else:
                raise RuntimeError(f"Generator name '{generator_name}' not recognized.")

        except Exception as gen_err:
            generation_error_details = f"{type(gen_err).__name__}: {str(gen_err)}"

        generation_time = time.time() - start_generation
        if generation_error_details:
            result["error"] = f"שגיאה במהלך יצירת התשובה ({generator_name}): {generation_error_details}"
            result["full_response"] = f"<div class='rtl-text'><strong>שגיאה ביצירת התשובה.</strong><br>פרטים: {generation_error_details}</div>"
            message_placeholder.markdown(result["full_response"], unsafe_allow_html=True)
        else:
            lines_to_remove = ["יהי רצון שנזכה לגאולה השלמה במהרה בימינו אמן.", "יהי רצון שנזכה...", "הכותב וחותם לכבוד התורה ולומדיה", "הכותב וחותם לכבוד התורה...", "בכבוד רב,", "בברכה,"]
            response_lines = final_response_text.strip().split('\n'); cleaned_lines = response_lines[:]
            while cleaned_lines:
                last_line = cleaned_lines[-1].strip()
                if any(last_line.lower() == ltr.lower() or last_line.lower().startswith(ltr.lower().replace('...','')) for ltr in lines_to_remove): cleaned_lines.pop()
                else: break
            final_response_text = "\n".join(cleaned_lines).strip()
            result["full_response"] = final_response_text
            message_placeholder.markdown(f"<div class='rtl-text'>{final_response_text}</div>", unsafe_allow_html=True)

    except Exception as e:
        pipeline_error_type = type(e).__name__; pipeline_error_msg = str(e)
        result["error"] = f"שגיאה בזמן הריצה: {pipeline_error_type}: {pipeline_error_msg}"
        result["full_response"] = f"<div class='rtl-text'><strong>שגיאה במהלך העיבוד ({pipeline_error_type})</strong><br>אנא נסה שוב מאוחר יותר.<details><summary>פרטים טכניים</summary><pre>{traceback.format_exc()}</pre></details></div>"
        message_placeholder.markdown(result["full_response"], unsafe_allow_html=True)
        if status_container: status_container.update(label="שגיאה בעיבוד!", state="error")
    return result

if "messages" not in st.session_state:
    st.session_state.messages = []

for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        content_display = message['content']
        if not content_display.strip().startswith(('<div', '<p', '<strong', '<details')):
            content_display = f"<div class='rtl-text'>{content_display}</div>"
        st.markdown(content_display, unsafe_allow_html=True)
        if message["role"] == "assistant" and "final_docs" in message and message["final_docs"]:
            final_docs_data = message.get("final_docs", [])
            pipeline_flow_used = message.get("analysis_flow", "לא ידוע")
            if final_docs_data:
                st.expander("מסמכים שנמצאו", expanded=False).write(final_docs_data)
                expander_title_text = f"הצג {len(final_docs_data)} פסקאות מקור שנשלחו למחולל"
                if pipeline_flow_used == "Anthropic (ניתוח וסינון פרטני)":
                    expander_title_text += " (לאחר סינון Anthropic פרטני)"
                elif pipeline_flow_used == "Gemini (אחזור ויצירה ישירה)":
                    expander_title_text += " (ללא סינון נוסף)"
                elif pipeline_flow_used == "GPT-4o Paragraph Validator + Claude Synthesizer":
                    expander_title_text += " (לאחר אימות GPT-4o פרטני)"
                else:
                    expander_title_text += " (לאחר עיבוד)"
                expander_title = f"<span class='rtl-text'>{expander_title_text}</span>"
                with st.expander(expander_title, expanded=False):
                    st.markdown("<div class='expander-content'>", unsafe_allow_html=True)
                    for i, doc in enumerate(final_docs_data):
                        score_info = ""
                        source_name = doc.get('source_name', 'לא ידוע')
                        original_id = doc.get('original_id', 'N/A')
                        hebrew_text = doc.get('hebrew_text', 'טקסט המקור חסר')
                        st.markdown(
                            f"<div class='source-info rtl-text'><strong>מקור {i+1}:</strong> ספר: {source_name}, ID: {original_id}{score_info}</div>",
                            unsafe_allow_html=True,
                        )
                        st.markdown(f"<div class='hebrew-text'>{hebrew_text}</div>", unsafe_allow_html=True)
                    st.markdown("</div>", unsafe_allow_html=True)

if prompt := st.chat_input("שאל שאלה בענייני חסידות...", disabled=not services_ready, key="chat_input"):
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(f"<div class='rtl-text'>{prompt}</div>", unsafe_allow_html=True)
    with st.chat_message("assistant"):
        status_control_asst = None
        rag_result_asst = None
        try:
            status_label = f"<span class='rtl-text'>מעבד בקשה באמצעות '{pipeline_method}'...</span>"
            with st.status(status_label, expanded=True) as status:
                status_control_asst = status
                rag_result_asst = run_rag_pipeline(
                    pipeline_prompt=prompt,
                    selected_pipeline_method=pipeline_method,
                    status_container=status_control_asst,
                )
            if rag_result_asst and isinstance(rag_result_asst, dict):
                pipeline_error_value = rag_result_asst.get("error")
                final_docs_value = rag_result_asst.get("final_docs_data", [])
                final_docs_to_store = []
                if pipeline_error_value is None:
                    final_docs_to_store = final_docs_value
                flow_to_store = rag_result_asst.get("analysis_flow", "Error")
                if pipeline_error_value is not None:
                    flow_to_store = "Error"
                st.session_state.messages.append({
                    "role": "assistant",
                    "content": rag_result_asst.get("full_response", "..."),
                    "final_docs": final_docs_to_store,
                    "analysis_flow": flow_to_store,
                })
                if rag_result_asst.get("status_updates"):
                    expander_label = "<span class='rtl-text'>הצג שלבי עיבוד</span>"
                    with st.expander(expander_label, expanded=False):
                        for update in rag_result_asst["status_updates"]:
                            st.markdown(f"<div class='rtl-text'><code>- {update}</code></div>", unsafe_allow_html=True)
            else:
                fallback_err_msg_html = "<div class='rtl-text'><strong>שגיאה בלתי צפויה בתקשורת עם מנגנון העיבוד (fallback).</strong></div>"
                st.session_state.messages.append({
                    "role": "assistant",
                    "content": fallback_err_msg_html,
                    "final_docs": [],
                    "analysis_flow": "Error",
                })
        except Exception as e:
            error_display_html = f"<div class='rtl-text'><strong>שגיאה קריטית!</strong><br><pre>{traceback.format_exc()}</pre></div>"
            st.error(error_display_html, icon="🔥")
            st.session_state.messages.append({
                "role": "assistant",
                "content": error_display_html,
                "final_docs": [],
                "analysis_flow": "Critical Error",
            })
file_processor.py
ADDED
@@ -0,0 +1,315 @@
import os
import openai
import json
import uuid
import re
import asyncio
import time
import argparse
from typing import List, Dict, Optional, Tuple
from dotenv import load_dotenv

# --- Required Libraries ---
try:
    from docx import Document
except ImportError:
    print("Requirement Missing: Please install 'python-docx' (`pip install python-docx`)")
    exit()
# PDF library (PyPDF2) import removed
try:
    from langdetect import detect, DetectorFactory, LangDetectException
    DetectorFactory.seed = 0
except ImportError:
    print("Requirement Missing: Please install 'langdetect' (`pip install langdetect`)")
    exit()

# --- Configuration ---
load_dotenv()
API_KEY = os.environ.get("OPENAI_API_KEY")
if not API_KEY:
    print("🛑 ERROR: OpenAI API key not found. Set OPENAI_API_KEY in your .env file.")
    exit()

OUTPUT_DIR = "data"
TRANSLATION_MODEL = "gpt-4o-mini"
MAX_CONCURRENT_TRANSLATIONS = 10
TARGET_LANGUAGE = "en"

# --- Chunking Configuration ---
PARAGRAPH_CHUNK_THRESHOLD = 2000  # Characters
CHUNK_SIZE = 800  # Characters
CHUNK_OVERLAP = 100  # Characters

# Validate chunking config
if CHUNK_OVERLAP >= CHUNK_SIZE:
    print(f"🛑 ERROR: CHUNK_OVERLAP ({CHUNK_OVERLAP}) must be less than CHUNK_SIZE ({CHUNK_SIZE}).")
    exit()

# --- Setup OpenAI Client ---
try:
    client = openai.AsyncOpenAI(api_key=API_KEY)
    print("✅ OpenAI Async Client Initialized.")
except Exception as e:
    print(f"🛑 ERROR: Failed to initialize OpenAI client: {e}")
    exit()

# --- Text Extraction Functions ---

def extract_text_from_docx(file_path: str) -> Optional[str]:
    """Extracts all text from a DOCX file."""
    try:
        doc = Document(file_path)
        full_text = [para.text for para in doc.paragraphs if para.text.strip()]
        print(f" 📄 Extracted {len(full_text)} paragraphs from DOCX: {os.path.basename(file_path)}")
        return "\n\n".join(full_text)  # Use double newline join as a base
    except Exception as e:
        print(f" ❌ ERROR reading DOCX file '{os.path.basename(file_path)}': {e}")
        return None

# --- PDF Extraction Function Removed ---

def extract_text_from_txt(file_path: str) -> Optional[str]:
    """Reads text from a TXT file."""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        print(f" 📄 Read TXT file: {os.path.basename(file_path)} (length: {len(text)} chars)")
        return text
    except Exception as e:
        print(f" ❌ ERROR reading TXT file '{os.path.basename(file_path)}': {e}")
        return None

# --- Text Processing Functions (segment, chunk, detect, translate - No changes needed here) ---

def _chunk_text(text: str, size: int, overlap: int) -> List[str]:
    """Helper function to chunk a single block of text."""
    # (Implementation remains the same as previous version)
    if not text: return []
    chunks = []
    start_index = 0
    text_len = len(text)
    while start_index < text_len:
        end_index = start_index + size
        chunk = text[start_index:end_index]
        chunks.append(chunk.strip())
        next_start = start_index + size - overlap
        if next_start <= start_index: next_start = start_index + 1
        start_index = next_start
        if start_index >= text_len: break
    return [c for c in chunks if c]

def segment_into_paragraphs_or_chunks(text: str) -> List[str]:
    """
    Segments text into paragraphs based on newlines.
    If a resulting paragraph exceeds PARAGRAPH_CHUNK_THRESHOLD,
    it chunks that specific paragraph instead.
    """
    # (Implementation remains the same as previous version)
    if not text: return []
    normalized_text = text.replace('\r\n', '\n').replace('\r', '\n')
    initial_segments = re.split(r'\n\s*\n+', normalized_text)
    initial_segments = [s.strip() for s in initial_segments if s.strip()]
    if len(initial_segments) <= 1 and '\n' in normalized_text:
        print(" Parsing: Double newline split yielded few segments, trying single newline split.")
        initial_segments = [s.strip() for s in normalized_text.split('\n') if s.strip()]
    if not initial_segments:
        print(" Parsing: No segments found after initial splitting.")
        return []
    print(f" Parsing: Initial segmentation yielded {len(initial_segments)} segments.")
    final_segments = []
    long_segment_count = 0
    for segment in initial_segments:
        if len(segment) > PARAGRAPH_CHUNK_THRESHOLD:
            long_segment_count += 1
            print(f" ❗ Segment ({len(segment)} chars > {PARAGRAPH_CHUNK_THRESHOLD}) is too long. Applying chunking (Size: {CHUNK_SIZE}, Overlap: {CHUNK_OVERLAP})...")
            chunks = _chunk_text(segment, CHUNK_SIZE, CHUNK_OVERLAP)
            print(f" -> Chunked into {len(chunks)} pieces.")
            final_segments.extend(chunks)
        elif segment:
            final_segments.append(segment)
    if long_segment_count > 0:
        print(f" Parsing: Chunking applied to {long_segment_count} long segments.")
    print(f" 🔪 Final segmentation/chunking resulted in {len(final_segments)} pieces.")
    return final_segments

def detect_language_safe(text: str, default_lang: str = "unknown") -> str:
    """Detects language, handling short text and errors."""
    # (Implementation remains the same as previous version)
    clean_text = text.strip()
    if not clean_text or len(clean_text) < 10: return default_lang
    try: return detect(clean_text)
    except LangDetectException: return default_lang
    except Exception as e:
        print(f" ❌ Unexpected error during language detection: {e}")
        return "error"

async def translate_paragraph(text: str, target_lang: str, semaphore: asyncio.Semaphore) -> Tuple[str, Optional[str]]:
    """Translates a single paragraph/chunk using OpenAI, with rate limiting."""
    # (Implementation remains the same as previous version)
    async with semaphore:
        detected_lang = detect_language_safe(text)
        if detected_lang != 'he': return text, None
        print(f" 🌍 Translating Hebrew segment to {target_lang.upper()}: '{text[:60]}...'")
        prompt = f"Translate the following Hebrew text accurately to {target_lang}. Provide only the translation, without any introductory phrases.\nHebrew Text:\n```heb\n{text}\n```\nTranslation:"
        retries = 1
        for attempt in range(retries + 1):
            try:
                response = await client.chat.completions.create(
                    model=TRANSLATION_MODEL, messages=[ {"role": "system", "content": f"You are an expert translator specializing in Hebrew to {target_lang} translation. Provide only the translated text."}, {"role": "user", "content": prompt} ],
                    max_tokens=int(len(text.split()) * 2.5) + 50, temperature=0.1, n=1, stop=None, )
                translation = response.choices[0].message.content.strip()
                if translation:
                    if translation.strip() == text.strip():
                        print(f" ⚠️ Translation attempt returned original text for: '{text[:60]}...'")
                        return text, "Translation Failed: Model returned original text"
                    return text, translation
                else:
                    print(f" ❌ Translation attempt returned empty response for: '{text[:60]}...'")
                    if attempt == retries: return text, "Translation Failed: Empty Response"
            except openai.RateLimitError as e:
                wait_time = 5 * (attempt + 1)
                print(f" ⏳ Rate limit hit during translation. Waiting {wait_time}s... ({e})")
                await asyncio.sleep(wait_time)
                if attempt == retries: return text, "Translation Failed: Rate Limited"
            except openai.APIError as e:
                print(f" ❌ OpenAI API Error during translation: {e}")
                wait_time = 3 * (attempt + 1); await asyncio.sleep(wait_time)
                if attempt == retries: return text, f"Translation Failed: API Error ({e.code})"
            except Exception as e:
                print(f" ❌ Unexpected error during translation: {e}")
                if attempt == retries: return text, f"Translation Failed: Unexpected Error ({type(e).__name__})"
            if attempt < retries: await asyncio.sleep(2 * (attempt + 1))
        return text, "Translation Failed: Max Retries"


# --- Main Processing Function ---

async def process_file(input_path: str, output_dir: str):
    """Processes a single DOCX or TXT file: extracts, segments/chunks, translates, saves JSON."""
    print(f"\n--- Processing file: {os.path.basename(input_path)} ---")
    start_time = time.time()
    file_ext = os.path.splitext(input_path)[1].lower()
    extracted_text: Optional[str] = None

    # 1. Extract Text (Only DOCX and TXT)
    if file_ext == ".docx":
        extracted_text = extract_text_from_docx(input_path)
    elif file_ext == ".txt":
        extracted_text = extract_text_from_txt(input_path)
    else:
        # This case should ideally not be hit if input is pre-filtered, but acts as safeguard
        print(f" ⚠️ Internal Skip: Unsupported extension '{file_ext}' passed to process_file.")
        return

    if not extracted_text or not extracted_text.strip():
        print(" ❌ Text extraction failed or returned empty. Skipping.")
        return

    # 2. Segment into Paragraphs or Chunks
    segments = segment_into_paragraphs_or_chunks(extracted_text)
    if not segments:
        print(" ❌ No paragraphs or chunks found after segmentation. Skipping.")
        return

    # 3. Translate Hebrew Segments (Asynchronously)
    output_data = []
    translation_semaphore = asyncio.Semaphore(MAX_CONCURRENT_TRANSLATIONS)
    tasks = []
    print(f" 🗣️ Preparing to translate {len(segments)} segments (max concurrent: {MAX_CONCURRENT_TRANSLATIONS})...")

    for i, seg_text in enumerate(segments):
        task = asyncio.create_task(translate_paragraph(seg_text, TARGET_LANGUAGE, translation_semaphore))
        tasks.append(task)

    translation_results = await asyncio.gather(*tasks)

    # 4. Format into JSON Structure
    print(" 📝 Formatting results into JSON...")
    translation_failures = 0
    for i, (original_he, translation_en) in enumerate(translation_results):
        failure_msg = "Translation Failed"
        is_failure = isinstance(translation_en, str) and failure_msg in translation_en
        if is_failure:
            translation_failures += 1
            english_text = translation_en  # Store the error message
        else:
            english_text = translation_en if translation_en else ""
        output_data.append({ "id": str(uuid.uuid4()), "hebrew": original_he, "english": english_text })

    if translation_failures > 0:
        print(f" ⚠️ Encountered {translation_failures} translation failures out of {len(segments)} segments.")

    # 5. Save to JSON File
    base_filename = os.path.splitext(os.path.basename(input_path))[0]
    output_filename = f"{base_filename}.json"
    output_path = os.path.join(output_dir, output_filename)

    try:
        os.makedirs(output_dir, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, ensure_ascii=False, indent=2)
        end_time = time.time()
        print(f"✅ Successfully saved {len(output_data)} segments to: {output_path}")
        print(f"⏱️ File processing time: {end_time - start_time:.2f} seconds")
    except Exception as e:
        print(f" ❌ ERROR saving JSON file '{output_path}': {e}")


# --- Script Execution ---

if __name__ == "__main__":
    # Update description to remove PDF mention
    parser = argparse.ArgumentParser(description="Process DOCX and TXT files into paragraph/chunk-based JSON with Hebrew-to-English translation.")
    parser.add_argument("input_paths", nargs='+', help="Path(s) to input file(s) or directory(ies) containing DOCX/TXT files.")
    parser.add_argument("-o", "--output_dir", default=OUTPUT_DIR, help=f"Directory to save output JSON files (default: '{OUTPUT_DIR}')")
    parser.add_argument("--chunk_threshold", type=int, default=PARAGRAPH_CHUNK_THRESHOLD, help="Max chars per paragraph before chunking.")
    parser.add_argument("--chunk_size", type=int, default=CHUNK_SIZE, help="Target chunk size in chars.")
    parser.add_argument("--chunk_overlap", type=int, default=CHUNK_OVERLAP, help="Chunk overlap in chars.")

    args = parser.parse_args()
    OUTPUT_DIR = args.output_dir
    PARAGRAPH_CHUNK_THRESHOLD = args.chunk_threshold
    CHUNK_SIZE = args.chunk_size
    CHUNK_OVERLAP = args.chunk_overlap

    if CHUNK_OVERLAP >= CHUNK_SIZE:
        print(f"🛑 ERROR: Chunk overlap ({CHUNK_OVERLAP}) must be less than chunk size ({CHUNK_SIZE}). Adjust --chunk_overlap or --chunk_size.")
        exit()

    print(f"🚀 Starting File Processor (DOCX & TXT only)...")  # Updated startup message
    print(f"📂 Output Directory: {os.path.abspath(OUTPUT_DIR)}")
    print(f"🔪 Paragraph/Chunking Settings: Threshold={PARAGRAPH_CHUNK_THRESHOLD}, Size={CHUNK_SIZE}, Overlap={CHUNK_OVERLAP}")

    files_to_process = []
    for path in args.input_paths:
        if os.path.isfile(path):
            files_to_process.append(path)
        elif os.path.isdir(path):
            print(f"📁 Scanning directory: {path}")
            for filename in os.listdir(path):
                full_path = os.path.join(path, filename)
                if os.path.isfile(full_path):
                    files_to_process.append(full_path)
        else:
            print(f"⚠️ Warning: Input path not found or not a file/directory: {path}")

    # Update supported extensions list
    supported_extensions = ('.docx', '.txt')
    valid_files = [f for f in files_to_process if f.lower().endswith(supported_extensions)]

    if not valid_files:
        # Update message for no supported files found
        print(f"\n🛑 No supported files ({', '.join(supported_extensions)}) found in the specified paths. Exiting.")
    else:
        print(f"\nFound {len(valid_files)} supported files to process:")
        for f in valid_files:
            print(f" - {os.path.basename(f)}")

        async def main():
            process_tasks = [process_file(f, OUTPUT_DIR) for f in valid_files]
            await asyncio.gather(*process_tasks)

        script_start_time = time.time()
        asyncio.run(main())
        script_end_time = time.time()
        print(f"\n🏁 File processing complete. Total script time: {script_end_time - script_start_time:.2f} seconds.")
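
A typical invocation of the processor above, assuming the source documents sit in an illustrative raw_docs/ folder (the flags map one-to-one onto the argparse options in the script):

    python file_processor.py raw_docs/ -o data --chunk_size 800 --chunk_overlap 100

Each output JSON is then a list of {"id": "<uuid4>", "hebrew": "<original segment>", "english": "<translation, empty string, or failure note>"} records, which is exactly the shape ingestion_service.py reads later in this upload.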
generated-icon.png
ADDED
generation_service_anthropic.py
ADDED
@@ -0,0 +1,103 @@
# generation_service_anthropic.py
# For LangSmith tracing; NO Braintrust; clean for OpenAI/Anthropic API

import os
import anthropic
import re
import traceback
from typing import List, Dict, AsyncGenerator
from langsmith import traceable

# --- Environment: ensure API key is injected (from Replit secrets) ---
os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["ANTHROPIC_API_KEY"] = os.environ["ANTHROPIC_API_KEY"]
os.environ["LANGSMITH_API_KEY"] = os.environ["LANGSMITH_API_KEY"]
os.environ["LANGSMITH_PROJECT"] = os.environ["LANGSMITH_PROJECT"]

# --- Anthropic config and client ---
ANTHROPIC_API_KEY = os.environ["ANTHROPIC_API_KEY"]
GENERATION_MODEL = "claude-3-7-sonnet-20250219"
client = anthropic.AsyncAnthropic(api_key=ANTHROPIC_API_KEY) if ANTHROPIC_API_KEY else None

def check_generator_status():
    if not client: return False, "Anthropic client not initialized."
    return True, f"Anthropic generation service ready (Model: {GENERATION_MODEL})."

def clean_source_text(text):
    if not text: return ""
    cleaned = text; cleaned = re.sub(r'@\d+', '', cleaned); cleaned = re.sub(r'<HAL>', '', cleaned, flags=re.IGNORECASE); cleaned = cleaned.replace('<br>', ' ').replace('<br />', ' '); cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned

def format_context_for_prompt(documents):
    if not documents: return ""
    formatted_docs = []
    language_key = 'hebrew_text'; id_key = 'original_id'
    for index, doc in enumerate(documents):
        full_text_original = doc.get(language_key, ''); doc_id = doc.get(id_key, f'unknown_{index+1}')
        full_text_cleaned = clean_source_text(full_text_original)
        if full_text_cleaned: formatted_docs.append(f"<source index=\"{index + 1}\" id=\"{doc_id}\">\n<full_text>{full_text_cleaned}</full_text>\n</source>")
    return "\n\n".join(formatted_docs)

EXAMPLE_RESPONSE_HEBREW = """<example response hebrew>
על פי המקורות שהובאו, חשיבות השמחה בעבודת ה' היא מרכזית. נאמר כי <quote source_index="1">עיקר עבודת ה' היא בשמחה</quote>, כפי הפסוק <quote source_index="1">'עבדו את ה' בשמחה'</quote>. הסיבה לכך היא <quote source_index="1">כי השמחה פותחת הלב ומאירה הנשמה, ומביאה לידי דביקות בהשי"ת</quote>. לעומת זאת, מצב של עצבות גורם לתוצאה הפוכה, שכן <quote source_index="1">על ידי העצבות ח"ו נסתם הלב ואינו יכול לקבל אור הקדושה</quote>. מקור נוסף מדגיש כי השמחה היא תנאי לקבלת רוח הקודש והשראת השכינה, כפי שנאמר <quote source_index="2">שאין השכינה שורה אלא מתוך שמחה של מצוה</quote>, וכן <quote source_index="2">שלא שרתה עליו שכינה מפני שהיה עצב</quote>, כפי שלמדו מיעקב אבינו.
</example response hebrew>"""

@traceable
async def generate_response_stream_async(
    messages: List[Dict],
    context_documents: List[Dict],
) -> AsyncGenerator:
    """
    Generates a response using Anthropic, yields text chunks.
    Traced with LangSmith.
    """
    global client
    ready, msg = check_generator_status()
    if not ready or client is None: yield f"--- שגיאה: {msg} ---"; return

    last_user_msg_content = "שאלה לא נמצאה"
    for msg_ in reversed(messages):
        if msg_.get("role") == "user": last_user_msg_content = str(msg_.get("content", "")); break

    try:
        formatted_context = format_context_for_prompt(context_documents)
        has_context = bool(formatted_context)
        if not has_context and context_documents:
            yield f"--- שגיאה: המקורות שסופקו ריקים לאחר ניקוי. ---"; return
        elif not has_context and not context_documents:
            yield f"--- שגיאה: לא סופקו מקורות להקשר. ---"; return
    except Exception as format_err:
        yield f"--- שגיאה בעיצוב ההקשר: {format_err} ---"; return

    # System prompt as before
    system_prompt = f"""<instructions>
You are an expert assistant specializing in Chassidic texts...
**Response Requirements:**
(Keep all instructions as before)
</instructions>

{EXAMPLE_RESPONSE_HEBREW}"""

    api_messages = []
    user_prompt_content = f"<context>\n{formatted_context}\n</context>\n\nBased *exclusively* on the source text provided... Question (Hebrew):\n{last_user_msg_content}"
    api_messages.append({"role": "user", "content": user_prompt_content})

    print(f" -> Sending request to Anthropic (Model: {GENERATION_MODEL})...")
    final_response_text_chunks = []

    try:
        async with client.messages.stream(
            model=GENERATION_MODEL, max_tokens=20000, system=system_prompt,
            messages=api_messages, temperature=1.0,
            thinking={"type": "enabled", "budget_tokens": 16000}
        ) as stream:
            print(f" -> Anthropic stream created successfully...")
            async for chunk in stream.text_stream:
                if chunk and chunk.strip():
                    final_response_text_chunks.append(chunk)
                    yield chunk

    except Exception as e:
        yield f"\n\n--- שגיאה: {type(e).__name__} - {e} ---"
        traceback.print_exc()
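
A minimal consumption sketch for the async generator above; the question and the single context document are made-up placeholders, but the dict keys (original_id, hebrew_text) match what format_context_for_prompt reads:

    import asyncio
    from generation_service_anthropic import generate_response_stream_async

    async def demo():
        # Placeholder source document and question, only for illustration
        docs = [{"original_id": "demo-1", "hebrew_text": "עבדו את ה' בשמחה כי השמחה פותחת הלב"}]
        messages = [{"role": "user", "content": "מהי חשיבות השמחה בעבודת ה'?"}]
        async for chunk in generate_response_stream_async(messages, docs):
            print(chunk, end="", flush=True)

    asyncio.run(demo())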
generation_service_gemini.py
ADDED
@@ -0,0 +1,175 @@
# generation_service_gemini.py

import google.generativeai as genai
import os
import re
import traceback
from typing import List, Dict, Generator  # Use standard Generator

# --- Attempt to Import Shared Functions ---
try:
    from generation_service_anthropic import clean_source_text
    print("Successfully imported clean_source_text from generation_service_anthropic.")
except ImportError:
    print("Warning: Could not import clean_source_text. Using fallback cleaner.")
    def clean_source_text(text):  # Fallback
        if not text: return ""
        cleaned = text; cleaned = re.sub(r'@\d+', '', cleaned); cleaned = re.sub(r'<HAL>', '', cleaned, flags=re.IGNORECASE); cleaned = cleaned.replace('<br>', ' ').replace('<br />', ' '); cleaned = re.sub(r'\s+', ' ', cleaned).strip()
        return cleaned
# --- End Fallback Definitions ---

# --- Configuration ---
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
GENERATION_MODEL = "gemini-2.5-pro-preview-03-25"  # Your model
# --- End Configuration ---

# --- Client Initialization ---
genai_client = None
if GOOGLE_API_KEY:
    try:
        genai.configure(api_key=GOOGLE_API_KEY)
        genai_client = genai.GenerativeModel(GENERATION_MODEL)
        print(f"Google AI client initialized for Gemini generation (Model: {GENERATION_MODEL}).")
    except Exception as e: print(f"Error initializing Google AI client: {e}"); traceback.print_exc()
else: print("GOOGLE_API_KEY not found. Gemini generation service will not function.")

def check_gemini_generator_status():
    if not genai_client: return False, f"Gemini generator client not initialized."
    return True, f"Gemini generation service ready (Model: {GENERATION_MODEL})."

# --- MODIFIED format_context_for_prompt (Keep ID attribute) ---
def format_context_for_prompt(documents: List[Dict]) -> str:
    if not documents: return ""
    formatted_docs = []
    language_key = 'hebrew_text'
    id_key = 'original_id'
    for index, doc in enumerate(documents):
        full_text_original = doc.get(language_key, '')
        paragraph_id = doc.get(id_key, f'unknown_id_{index+1}')
        try: full_text_cleaned = clean_source_text(full_text_original)
        except NameError: full_text_cleaned = full_text_original  # Fallback
        if full_text_cleaned:
            formatted_docs.append(
                f'<source index="{index + 1}" id="{paragraph_id}">\n'  # Keep id attribute
                f'<full_text>{full_text_cleaned}</full_text>\n'
                f'</source>'
            )
    return "\n\n".join(formatted_docs)
# --- END MODIFIED format_context_for_prompt ---


# --- *** NEW SIMPLIFIED EXAMPLE_RESPONSE_HEBREW *** ---
EXAMPLE_RESPONSE_HEBREW = """<example response hebrew>
במקורות שהובאו מצאנו כמה נקודות בנוגע לרדיפת פרעה אחר בני ישראל. ראשית, היה זה רצון השי"ת להביא את המצריים לים סוף "כדי שיטבעו" (ID: cc792519-8a96-4c2e-96a7-e940a3d6688f) ויתפרסם כבודו יתברך בעולם. הקב"ה סיבב זאת על ידי שהטעה את פרעה לחשוב שבני ישראל "נבוכים הם בארץ סגר עליהם המדבר" (ID: 2e0227b5-f359-4a60-ab51-2ba9f6c3fca5), מה שעורר אותו לרדוף אחריהם.

עוד מבואר כי נס קריעת ים סוף נועד להורות "הוראה מפורסמת היות בו יתברך פעולת ההפכים" (ID: cde20ae5-0374-4023-9f15-e721b4920db8), דהיינו שבאותו רגע שהיטיב לישראל וקרע לפניהם את הים, הוא הרע למצרים והטביעם בתוכו.

בנוגע לשאלה מדוע הים לא נקרע מיד, מובא שהיו טענות שונות, כגון שעדיין לא הושלם זמן הגלות של ת' שנה, וכן טענת המקטרג ש"הללו עובדי עבודה זרה והללו עובדי עבודה זרה" (ID: [Could be a different ID if cited]). טענות אלו נדחו, בין היתר, משום שהשעבוד הקשה השלים את הזמן, וכן מפני שעבודתם של ישראל היתה "באונס ושוגג" (ID: [Could be a different ID if cited]).
</example response hebrew>"""
# --- *** END NEW SIMPLIFIED EXAMPLE_RESPONSE_HEBREW *** ---

# --- Synchronous Generation Function ---
def generate_response_stream_gemini(
    query: str,
    context_documents: List[Dict]
) -> Generator[str, None, None]:
    global genai_client
    ready, msg = check_gemini_generator_status()
    if not ready or genai_client is None: yield f"שגיאה: ..."; return
    if not query: yield "שגיאה: ..."; return

    try:
        formatted_context = format_context_for_prompt(context_documents)
    except Exception as format_err: yield f"שגיאה ...: {format_err}"; return

    has_context = bool(formatted_context)
    if not has_context: yield "לא סופקו מקורות לעיון."; return

    # --- *** REVISED System Instruction Content for Simple Output *** ---
    system_instruction_content = f"""<instructions>
You are a highly knowledgeable assistant acting as a learned scholar specializing in Chassidic texts, particularly Divrei Yoel. Your function is to answer the user's Hebrew question based *strictly and exclusively* on the provided Hebrew source text passages found in the <context> section.

**Response Requirements:**

1. **Language:** Respond ONLY in formal, traditional Rabbinic/Torah Hebrew (עברית תורנית, לשון הקודש). ABSOLUTELY NO MODERN HEBREW. Use only Hebrew letters and standard punctuation.
2. **Content:** Base your answer *solely* on information present in the `<source>` passages provided in the context. Do not add external knowledge or opinions.
3. **Structure:** Write a clear, coherent answer to the user's question.
4. **Citations:** When directly quoting or closely paraphrasing a specific point from a source to support your answer, incorporate a **short, relevant snippet** of the source text directly into your sentence. Immediately following the snippet or the sentence containing it, you MUST add the paragraph ID in the format `(ID: <id_value>)`. Extract the `<id_value>` from the `id` attribute of the corresponding `<source>` tag in the context.
5. **Conciseness:** Keep quoted snippets brief and directly relevant to the point you are making.
6. **Irrelevant Sources:** If the provided sources do not contain information to answer the question, state this clearly (e.g., "על פי המקורות שסופקו, לא נמצאה תשובה לשאלה זו."). Do not invent answers.
7. **Format:** Output *only* the final Hebrew answer with embedded citations as described. Do not include greetings, apologies, the original question, or any meta-commentary about the process.
8. **Example Adherence:** Follow the style, language, and citation format shown in the example response below.

</instructions>

{EXAMPLE_RESPONSE_HEBREW}"""  # Use the NEW simplified example
    # --- *** END REVISED System Instruction Content *** ---

    # --- Prepare User Prompt Content ---
    user_prompt_content = f"<context>\n{formatted_context}\n</context>\n\nBased *exclusively* on the source text provided in the context above, please answer the following question according to the detailed instructions:\n\nQuestion:\n{query}"

    print(f" -> Sending request to Gemini (Model: {GENERATION_MODEL})...")
    print(f" -> Context size: ~{len(formatted_context)} characters")
    print(f" -> System Instruction Length: ~{len(system_instruction_content)} characters")
    print(f" -> User Prompt Length: ~{len(user_prompt_content)} characters")
    print(f" -> Query: '{query[:50]}...'")

    # --- API Call Block (Compatible with v0.8.5) ---
    try:
        generation_config = genai.types.GenerationConfig(temperature=0.2, max_output_tokens=8192)  # Keep large output for now
        safety_settings=[  # Keep safety settings
            {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
        ]
        contents_for_api = [ system_instruction_content, user_prompt_content ]  # Instructions first

        response_stream = genai_client.generate_content(
            contents=contents_for_api,  # Pass combined list
            generation_config=generation_config,
            safety_settings=safety_settings,
            stream=True
        )

        print(f" -> Gemini SYNC stream iterator created successfully...")
        chunk_count = 0
        # --- Stream Handling Loop (Keep existing robust logic) ---
        for chunk in response_stream:
            try:
                chunk_text = chunk.text
                block_reason_str = chunk.safety_block_reason
                if block_reason_str:
                    print(f" -> Gemini BLOCKED chunk: {block_reason_str}")
                    yield f"שגיאה: {block_reason_str}"
                    break  # Stop processing if blocked
                if chunk_text:
                    chunk_count += 1
                    yield chunk_text  # Yield to app.py
            except AttributeError as ae:
                print(f" -> Gemini chunk error: {ae}")
                yield f"שגיאה: {ae}"
                break

        # --- Final Check if No Chunks Yielded ---
        if chunk_count == 0: yield "(לא התקבלה תשובה טקסטואלית מ-Gemini)"

    # --- General Error Handling ---
    except Exception as e:
        # ... (print error, yield error message) ...
        error_type = type(e).__name__; error_msg = str(e)
        print(f" <- Error during Gemini SYNC stream ({error_type}): {error_msg}"); traceback.print_exc()
        yield f"\n\n--- שגיאה ביצירת התשובה מ-Gemini ({error_type}): {error_msg} ---"
    # --- END API Call Block ---


# --- Test function (Synchronous) ---
# Commenting out as manual check is better now
# def run_gemini_generation_test_sync():
#     print("\n--- Running Gemini SYNC Generation Test (Manual Check Needed) ---")
#     ...

# --- Main Execution Block ---
if __name__ == "__main__":
    pass  # Or run test if adapted
    # if GOOGLE_API_KEY: run_gemini_generation_test_sync()
    # else: print("\nError: GOOGLE_API_KEY environment variable not set.")
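
The Gemini path is synchronous, so the equivalent consumption sketch is a plain for-loop; the query and the single document below are placeholders, shaped like the dicts the retriever returns:

    from generation_service_gemini import generate_response_stream_gemini

    # Placeholder context document in the expected shape
    docs = [{"original_id": "demo-1", "hebrew_text": "עבדו את ה' בשמחה"}]
    for chunk in generate_response_stream_gemini("מהי חשיבות השמחה בעבודת ה'?", docs):
        print(chunk, end="", flush=True)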
ingestion_service.py
ADDED
@@ -0,0 +1,367 @@
# ingestion_service.py

import os
import json
import openai
import pinecone
from pinecone import ServerlessSpec, PodSpec  # Import spec classes
from typing import List, Dict, Optional
import time
import traceback
import urllib.parse  # Keep for potential future ID encoding if needed

# --- Configuration ---
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
# PINECONE_ENVIRONMENT is deprecated for serverless/starter, use cloud/region
PINECONE_CLOUD = os.environ.get("PINECONE_CLOUD", "aws")  # Default cloud
PINECONE_REGION = os.environ.get("PINECONE_REGION", "us-east-1")  # Default region
INDEX_NAME = "chassidus-index"  # Ensure this matches your index name
EMBEDDING_MODEL = "text-embedding-3-large"  # Ensure this matches your embedding model
EMBEDDING_DIMENSIONS = 3072  # Dimension for text-embedding-3-large

print(f"Using Pinecone Index: {INDEX_NAME}")
print(f"Using Pinecone Cloud: {PINECONE_CLOUD}")
print(f"Using Pinecone Region: {PINECONE_REGION}")
print(f"Using OpenAI Embedding Model: {EMBEDDING_MODEL} (Dimensions: {EMBEDDING_DIMENSIONS})")
# --- End Configuration ---


# --- Initialize OpenAI Client ---
openai_client = None
if OPENAI_API_KEY:
    try:
        openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)
        print("OpenAI client initialized.")
    except Exception as e:
        print(f"Error initializing OpenAI client: {e}")
        traceback.print_exc()
else:
    print("ERROR: OPENAI_API_KEY not found. Ingestion requires it for embeddings.")


# --- Initialize Pinecone Client and Index ---
pc = None
index = None
if PINECONE_API_KEY and PINECONE_CLOUD and PINECONE_REGION:
    try:
        print("Initializing Pinecone client...")
        pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)

        # Check if index exists
        if INDEX_NAME not in [idx.name for idx in pc.list_indexes().indexes]:
            print(f"Index '{INDEX_NAME}' does not exist. Creating it now...")
            # --- Create Index (Choose ONE spec type) ---

            # Option A: Serverless (Recommended for new projects, pay-as-you-go)
            try:
                pc.create_index(
                    name=INDEX_NAME,
                    dimension=EMBEDDING_DIMENSIONS,
                    metric="cosine",  # or 'dotproduct', 'euclidean'
                    spec=ServerlessSpec(
                        cloud=PINECONE_CLOUD,
                        region=PINECONE_REGION
                    )
                )
                print(f"Serverless index '{INDEX_NAME}' created. Waiting for initialization...")
                while not pc.describe_index(INDEX_NAME).status['ready']:
                    time.sleep(1)
                print("Index is ready.")
            except Exception as create_err:
                print(f"Error creating Serverless index '{INDEX_NAME}': {create_err}")
                traceback.print_exc()
                # Fallback or specific error handling needed here

            # Option B: Pod-based (Older style, requires specifying pod type/size)
            # Uncomment below and comment out ServerlessSpec if you need Pod-based
            # try:
            #     # Example: Using a free tier pod (s1.x1) - adjust if needed
            #     # Note: PINECONE_ENVIRONMENT might be needed for older pod-based index creation
            #     pinecone_environment = os.environ.get("PINECONE_ENVIRONMENT")  # Get environment if needed for pod
            #     if not pinecone_environment:
            #         raise ValueError("PINECONE_ENVIRONMENT is required for pod-based index creation.")
            #     pc.create_index(
            #         name=INDEX_NAME,
            #         dimension=EMBEDDING_DIMENSIONS,
            #         metric="cosine",
            #         spec=PodSpec(
            #             environment=pinecone_environment,  # Use environment here
            #             pod_type="p1.x1",  # Example pod type, check Pinecone docs
            #             pods=1
            #         )
            #     )
            #     print(f"Pod-based index '{INDEX_NAME}' created in environment '{pinecone_environment}'. Waiting...")
            #     while not pc.describe_index(INDEX_NAME).status['ready']:
            #         time.sleep(1)
            #     print("Index is ready.")
            # except Exception as create_err:
            #     print(f"Error creating Pod-based index '{INDEX_NAME}': {create_err}")
            #     traceback.print_exc()
            #     # Fallback or specific error handling needed here

        else:
            print(f"Index '{INDEX_NAME}' already exists.")

        # Connect to the index
        print(f"Connecting to index '{INDEX_NAME}'...")
        index = pc.Index(INDEX_NAME)
        print("Connected to Pinecone index.")
        stats = index.describe_index_stats()
        print(f"Initial index stats: {stats}")

    except Exception as e:
        print(f"Error initializing Pinecone or connecting to index: {e}")
        traceback.print_exc()
else:
    print("ERROR: Pinecone API Key, Cloud, or Region not found. Cannot connect to Pinecone.")


# --- Helper Functions ---

def get_embedding(text: str, model=EMBEDDING_MODEL) -> Optional[List[float]]:
    """Generate embedding for text using OpenAI API."""
    if not openai_client:
        print("Error: OpenAI client not initialized, cannot generate embedding.")
        return None
    try:
        text = text.replace("\n", " ")  # OpenAI recommends replacing newlines
        if not text.strip():  # Handle empty strings
            print("Warning: Attempted to embed empty string.")
            return None
        response = openai_client.embeddings.create(input=[text], model=model)
        return response.data[0].embedding
    except openai.APIError as e:
        print(f"OpenAI API Error getting embedding: {e}")
    except Exception as e:
        print(f"Error getting embedding for text snippet: '{text[:100]}...'")
        traceback.print_exc()
    return None

def process_json_file(file_path: str) -> List[Dict]:
    """
    Process a JSON file containing documents in the specified format.
    Reads objects with "id", "hebrew", "english" keys.
    """
    documents = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if not isinstance(data, list):
            print(f"Warning: Expected a list of objects in JSON file '{file_path}', found {type(data)}. Skipping.")
            return []

        for i, item in enumerate(data):
            if isinstance(item, dict):
                original_id = item.get("id")
                hebrew_text = item.get("hebrew")
                english_text = item.get("english")

                if not original_id:
                    print(f"Warning: Missing 'id' in item {i} of file '{file_path}'. Skipping.")
                    continue
                if not hebrew_text and not english_text:
                    print(f"Warning: Missing both 'hebrew' and 'english' text in item {i} (ID: {original_id}) of file '{file_path}'. Skipping.")
                    continue
                # Ensure texts are strings, default to empty if missing but not skipping
                hebrew_text = hebrew_text or ""
                english_text = english_text or ""

                doc = {
                    "original_id": str(original_id),  # Ensure ID is string
                    "hebrew_text": hebrew_text.strip(),
                    "english_text": english_text.strip(),
                    "source_name": os.path.basename(file_path)  # Add source filename
                }
                documents.append(doc)
            else:
                print(f"Warning: Item {i} in file '{file_path}' is not a dictionary. Skipping.")

    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from file '{file_path}': {e}")
        return []
    except Exception as e:
        print(f"Error processing file '{file_path}': {e}")
        traceback.print_exc()
        return []

    print(f"Processed {len(documents)} documents from '{file_path}'")
    return documents

def upload_documents(documents: List[Dict], batch_size: int = 100) -> bool:
    """
    Embeds combined Hebrew+English text and uploads vectors and metadata to Pinecone.
    Metadata includes separate hebrew_text and english_text.
    """
    if not index:
        print("Error: Pinecone index not initialized. Cannot upload.")
        return False
    if not documents:
        print("No documents provided to upload.")
        return True  # Technically successful as there's nothing to do

    total_uploaded = 0
    try:
        num_batches = (len(documents) + batch_size - 1) // batch_size
        print(f"Preparing to upload {len(documents)} documents in {num_batches} batches of size {batch_size}...")

        for i in range(0, len(documents), batch_size):
            batch_start_time = time.time()
            batch = documents[i : i + batch_size]
            vectors_to_upload = []
            ids_in_batch = set()

            print(f"Processing batch {i//batch_size + 1}/{num_batches}...")

            for doc in batch:
                original_id = doc["original_id"]
                if original_id in ids_in_batch:
                    print(f"Warning: Duplicate ID '{original_id}' detected within the same batch. Skipping duplicate.")
                    continue
                ids_in_batch.add(original_id)

                hebrew = doc["hebrew_text"]
                english = doc["english_text"]

                # --- Create combined text for embedding ---
                # Add separators to potentially help the model distinguish languages
                combined_text = f"Hebrew:\n{hebrew}\n\nEnglish:\n{english}"
                # Alternative: Just concatenate if separators don't help much
                # combined_text = hebrew + "\n\n" + english

                if not combined_text.strip():
                    print(f"Warning: Skipping document ID '{original_id}' due to empty combined text.")
                    continue

                # --- Get Embedding ---
                embedding = get_embedding(combined_text)
                if embedding is None:
                    print(f"Warning: Failed to get embedding for document ID '{original_id}'. Skipping.")
                    continue

                # --- Prepare Metadata ---
                # Ensure metadata values are strings or numbers, handle None/empty
                metadata_payload = {
                    "hebrew_text": hebrew if hebrew else "N/A",
                    "english_text": english if english else "N/A",
                    "source_name": doc.get("source_name", "Unknown"),
                    "original_id": original_id  # Store original ID in metadata too
                }
                # Optional: Clean metadata further if needed (e.g., truncate long texts)

                vectors_to_upload.append({
                    "id": original_id,  # Use the original document ID as the Pinecone vector ID
                    "values": embedding,
                    "metadata": metadata_payload
                })

            if not vectors_to_upload:
                print(f"Batch {i//batch_size + 1} resulted in no vectors to upload. Skipping API call.")
                continue

            # --- Upsert to Pinecone ---
            try:
                print(f"Upserting {len(vectors_to_upload)} vectors for batch {i//batch_size + 1}...")
                upsert_response = index.upsert(vectors=vectors_to_upload)
                print(f" Upsert response: {upsert_response}")
                total_uploaded += upsert_response.upserted_count
            except Exception as upsert_err:
                print(f"Error upserting batch {i//batch_size + 1}: {upsert_err}")
                traceback.print_exc()
                # Decide whether to continue with next batch or stop
                # return False  # Stop on first batch error

            batch_time = time.time() - batch_start_time
            print(f"Batch {i//batch_size + 1} processed in {batch_time:.2f} seconds.")
            time.sleep(0.1)  # Small delay between batches

        print(f"\nFinished uploading. Total vectors successfully upserted: {total_uploaded}")
        # Verify with index stats
        try:
            final_stats = index.describe_index_stats()
            print(f"Final index stats: {final_stats}")
        except Exception as stats_err:
            print(f"Could not fetch final index stats: {stats_err}")

        return True

    except Exception as e:
        print(f"An unexpected error occurred during the upload process: {e}")
        traceback.print_exc()
        return False

def process_and_upload_file(file_path: str) -> bool:
    """Main function to process a JSON file and upload its documents."""
    if not os.path.exists(file_path):
        print(f"Error: File not found at '{file_path}'")
        return False

    if not file_path.lower().endswith(".json"):
        print(f"Error: This script currently only processes .json files. Found: '{file_path}'")
        return False

    if not openai_client or not index:
        print("Error: OpenAI client or Pinecone index not initialized. Cannot proceed.")
        return False

    print(f"\n--- Starting processing for file: {file_path} ---")
    start_time = time.time()

    # 1. Process the JSON file
    documents = process_json_file(file_path)
    if not documents:
        print(f"No valid documents found in '{file_path}'. Upload skipped.")
        return False  # Or True if "empty file processed successfully" is the desired outcome

    # 2. Upload the documents
    success = upload_documents(documents)

    end_time = time.time()
    print(f"--- Finished processing file: {file_path} in {end_time - start_time:.2f} seconds ---")

    if success:
        print(f"Successfully processed and uploaded data from {file_path}")
    else:
        print(f"Failed to upload data from {file_path}")

    return success

# --- Main Execution Block ---
if __name__ == "__main__":
    # --- Configuration for script execution ---
    # Set the directory containing your JSON files
    data_directory = "data"  # CHANGE THIS to your data folder path
    # ---

    if not os.path.isdir(data_directory):
        print(f"Error: Data directory '{data_directory}' not found.")
        print("Please create the directory and place your JSON files inside, or update the 'data_directory' variable.")
    else:
        print(f"Looking for JSON files in directory: '{data_directory}'")
        json_files = [f for f in os.listdir(data_directory) if f.lower().endswith(".json")]

        if not json_files:
            print(f"No .json files found in '{data_directory}'.")
        else:
            print(f"Found {len(json_files)} JSON files: {json_files}")
            overall_success = True
            for filename in json_files:
                file_path = os.path.join(data_directory, filename)
                success = process_and_upload_file(file_path)
                if not success:
                    overall_success = False
                    print(f"Processing failed for {filename}. Check logs above.")
                    # Optional: stop processing remaining files on failure
                    # break

            if overall_success:
                print("\nAll files processed successfully.")
            else:
                print("\nSome files encountered errors during processing.")

    # Example for single file upload:
    # file_to_upload = "path/to/your/single_file.json"
    # if os.path.exists(file_to_upload):
    #     process_and_upload_file(file_to_upload)
    # else:
    #     print(f"File {file_to_upload} not found")
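
A minimal single-file driver sketch for the module above, assuming a translated JSON produced by file_processor.py sits at an illustrative path (the __main__ block above does the same thing for every file under data/):

    from ingestion_service import process_and_upload_file

    # Embeds each record's combined Hebrew+English text and upserts it into the
    # "chassidus-index" Pinecone index, using the record's original id as the vector id.
    process_and_upload_file("data/divrei_yoel_sample.json")  # illustrative path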
main.py
ADDED
@@ -0,0 +1,21 @@

# main.py
# Entry point for the Chat Chassidus RAG application

import os
import subprocess
import sys

# This file serves as an entry point to run the Streamlit app
# The actual application logic is in app.py

if __name__ == "__main__":
    # Run the Streamlit app with specific port and address for Replit
    # Disable WebSocket compression and CORS to prevent connection issues
    os.environ["STREAMLIT_SERVER_PORT"] = "8501"  # Use port forwarded to 80
    os.environ["STREAMLIT_SERVER_ADDRESS"] = "localhost"
    os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"
    os.environ["STREAMLIT_SERVER_ENABLE_WEBSOCKET_COMPRESSION"] = "false"
    os.environ["STREAMLIT_SERVER_ENABLE_CORS"] = "false"
    os.environ["STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION"] = "false"
    subprocess.run([sys.executable, "-m", "streamlit", "run", "app.py"], check=True)
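
In other words, launching the app locally is just `python main.py`, which applies the environment overrides above and then shells out to `streamlit run app.py`.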
package-lock.json
ADDED
@@ -0,0 +1,6 @@
{
  "name": "workspace",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {}
}
pyproject.toml
ADDED
@@ -0,0 +1,16 @@
[project]
name = "python-template"
version = "0.1.0"
description = ""
authors = ["Your Name <[email protected]>"]
requires-python = ">=3.11"
dependencies = [
    "anthropic>=0.49.0",
    "google-generativeai>=0.8.5",
    "nest-asyncio>=1.6.0",
    "openai>=1.72.0",
    "pinecone>=6.0.2",
    "python-dotenv>=1.1.0",
    "streamlit>=1.44.1",
    "langsmith"
]
requirements.txt
ADDED
@@ -0,0 +1,10 @@

anthropic>=0.49.0
google-generativeai>=0.8.5
nest-asyncio>=1.6.0
openai>=1.72.0
pinecone>=6.0.2
python-dotenv>=1.1.0
streamlit>=1.44.1
langchain>=0.0.335
langsmith>=0.0.56
retriever_pinecone.py
ADDED
@@ -0,0 +1,243 @@
# retriever_pinecone.py

import os
import time
import traceback
import urllib.parse  # Keep for potential future ID decoding if needed
from pinecone import Pinecone
import openai  # For generating query embeddings
from typing import List, Dict  # <<< --- ADD THIS IMPORT ---

# --- Configuration ---
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
# PINECONE_ENVIRONMENT is deprecated for serverless/starter, use index host or name directly
INDEX_NAME = "chassidus-index"  # Match the index used in upload script
EMBEDDING_MODEL = "text-embedding-3-large"  # Ensure this matches your embedding model

print(f"Retriever using Pinecone Index: {INDEX_NAME}")
# Removed Environment print, less relevant for v3 client usage
print(f"Retriever using OpenAI Embedding Model: {EMBEDDING_MODEL}")
# --- End Configuration ---

# --- Initialize OpenAI Client ---
openai_client = None
if OPENAI_API_KEY:
    try:
        openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)
        print("OpenAI client initialized for retriever.")
    except Exception as e:
        print(f"Error initializing OpenAI client for retriever: {e}")
        traceback.print_exc()
else:
    print(
        "Warning: OPENAI_API_KEY not found. Retriever requires it for query embeddings."
    )

# --- Initialize Pinecone Client and Index ---
pc = None
index = None
if PINECONE_API_KEY:
    try:
        print("Initializing Pinecone client for retriever...")
        pc = Pinecone(api_key=PINECONE_API_KEY)
        print(f"Connecting to index '{INDEX_NAME}'...")

        # Check if index exists before connecting
        if INDEX_NAME not in [idx.name for idx in pc.list_indexes().indexes]:
            print(
                f"Error: Index '{INDEX_NAME}' does not exist. Cannot connect retriever."
            )
        else:
            index = pc.Index(INDEX_NAME)
            print("Connected to Pinecone index for retriever.")
            # Verify connection with stats
            stats = index.describe_index_stats()
            print(f"Index stats: {stats}")
            if stats.total_vector_count == 0:
                print(f"Warning: Pinecone index '{INDEX_NAME}' is empty.")

    except Exception as e:
        print(
            f"Error initializing Pinecone or connecting to index for retriever: {e}"
        )
        traceback.print_exc()
else:
    print(
        "Error: PINECONE_API_KEY not found. Cannot initialize Pinecone client."
    )


# --- Status Check ---
def check_retriever_status():
    """Checks if the Pinecone retriever is ready."""
    status = True
    messages = []
    if not OPENAI_API_KEY:
        status = False
        messages.append("OpenAI API Key missing.")
    if not openai_client:
        status = False
        messages.append("OpenAI client initialization failed.")
    if not PINECONE_API_KEY:
        status = False
        messages.append("Pinecone API Key missing.")
    if not pc:
        status = False
        messages.append("Pinecone client failed to initialize.")
    if not index:  # Check if index object was successfully created
        status = False
        messages.append(
            f"Pinecone index '{INDEX_NAME}' could not be connected to or doesn't exist."
        )
    elif index:
        try:
            stats = index.describe_index_stats()
            if stats.total_vector_count == 0:
                messages.append(
                    f"Retriever ready, but Pinecone index '{INDEX_NAME}' is empty."
                )
        except Exception as stats_err:
            status = False
            messages.append(
                f"Failed to get stats for index '{INDEX_NAME}': {stats_err}")

    if status and not messages:
        messages.append("Retriever ready.")

    return status, " ".join(messages)


# --- Retrieval Function ---
def get_embedding(text, model=EMBEDDING_MODEL):
    """Generates embedding for the given text using OpenAI."""
    if not openai_client:
        raise ValueError("OpenAI client not initialized.")
    try:
        text = text.replace("\n", " ")
        response = openai_client.embeddings.create(input=[text], model=model)
        return response.data[0].embedding
    except Exception as e:
        print(f"Error getting embedding for text: '{text[:100]}...'")
        traceback.print_exc()
        return None


# This is line 114 where the error occurred, now List and Dict are defined via import
def find_similar_paragraphs(query_text: str,
                            n_results: int = 10) -> List[Dict]:
    """
    Retrieves similar paragraphs from Pinecone based on the query text.
    Searches against combined Hebrew+English embeddings.
    Retrieves metadata including separate hebrew_text and english_text.
    """
    ready, message = check_retriever_status()
    if not ready or index is None:  # Check index specifically
        print(f"Retriever not ready: {message}")
        return []

    print(f"\nRetrieving similar paragraphs for: '{query_text[:100]}...'")
    start_time = time.time()

    try:
        # 1. Get query embedding
        print("Generating query embedding...")
        query_embedding = get_embedding(query_text)
        if query_embedding is None:
            print("Failed to generate query embedding.")
            return []
        embed_time = time.time() - start_time
        print(f"Query embedding generated in {embed_time:.4f} seconds.")

        # 2. Query Pinecone
        print(
            f"Querying Pinecone index '{INDEX_NAME}' for top {n_results} results..."
        )
        query_start_time = time.time()
        response = index.query(
            vector=query_embedding,
            top_k=n_results,
            include_metadata=True  # Essential to get the text back
        )
        query_time = time.time() - query_start_time
        print(f"Pinecone query completed in {query_time:.4f} seconds.")

        # 3. Process results
        formatted_results = []
        if not response or not response.matches:
            print("No results found by Pinecone for this query.")
            return []

        print(
            f"Processing {len(response.matches)} raw results from Pinecone...")
        for match in response.matches:
            score = match.score  # Cosine similarity score (higher is better)
            vector_id = match.id  # The ID stored in Pinecone (should be original_id)
            metadata = match.metadata if match.metadata else {}

            # --- Extract data from metadata ---
            # Use .get() with defaults for robustness
            original_id = metadata.get(
                'original_id', vector_id)  # Fallback to vector_id if missing
            hebrew_text = metadata.get('hebrew_text', '')
            english_text = metadata.get('english_text', '')
            source_name = metadata.get('source_name', 'Unknown Source')

            # Calculate distance from similarity score (for consistency if needed)
            # Distance = 1 - Cosine Similarity
            distance = 1.0 - score

            doc_data = {
                "vector_id": vector_id,  # The ID used in Pinecone
                "original_id": original_id,  # The original ID from the source JSON
                "source_name": source_name,
                "hebrew_text": hebrew_text,
                "english_text": english_text,  # Include English text
                "distance": distance,  # Calculated distance (lower is better)
                "similarity_score": score,  # Direct score from Pinecone (higher is better)
            }
            formatted_results.append(doc_data)

        # Pinecone results are already sorted by score (descending),
        # which means distance is ascending (most similar first).

        total_retrieval_time = time.time() - start_time
        print(
            f"Retrieved and processed {len(formatted_results)} paragraphs from Pinecone in {total_retrieval_time:.2f} seconds."
        )
        return formatted_results

    except Exception as e:
        print(f"Error during Pinecone query or processing: {e}")
        traceback.print_exc()
        return []


# --- Main Test Block ---
if __name__ == "__main__":
    ready, msg = check_retriever_status()
    print(f"\nRetriever Status: {ready} - {msg}")
    if ready:
        print("\n--- Running Retriever Test ---")
224 |
+
test_query = "role of joy in divine service" # Test query in English
|
225 |
+
# test_query_he = "תפקיד השמחה בעבודת ה'" # Test query in Hebrew (optional)
|
226 |
+
|
227 |
+
retrieved_docs = find_similar_paragraphs(test_query, n_results=5)
|
228 |
+
|
229 |
+
if retrieved_docs:
|
230 |
+
print("\n--- Top Test Results ---")
|
231 |
+
for i, doc in enumerate(retrieved_docs):
|
232 |
+
print(
|
233 |
+
f"\n{i+1}. Score: {doc['similarity_score']:.4f} (Distance: {doc['distance']:.4f})"
|
234 |
+
)
|
235 |
+
print(
|
236 |
+
f" Source: {doc['source_name']} (Orig ID: {doc['original_id']}, VecID: {doc['vector_id']})"
|
237 |
+
)
|
238 |
+
print(f" Hebrew: {doc['hebrew_text'][:150]}...")
|
239 |
+
print(f" English: {doc['english_text'][:150]}...")
|
240 |
+
else:
|
241 |
+
print("No documents retrieved for the test query.")
|
242 |
+
else:
|
243 |
+
print(f"Cannot run test because retriever is not ready.")
|
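For orientation, here is a minimal usage sketch for the retriever above. The caller module name is hypothetical; only check_retriever_status and find_similar_paragraphs come from retriever_pinecone.py, and the result keys match the doc_data dictionary built there.

# example_caller.py (illustrative sketch, not part of this upload)
from retriever_pinecone import check_retriever_status, find_similar_paragraphs

ready, status_msg = check_retriever_status()
if ready:
    # Each result dict includes hebrew_text, english_text, source_name,
    # similarity_score (higher is better) and distance (1 - score).
    for doc in find_similar_paragraphs("role of joy in divine service", n_results=3):
        print(f"{doc['similarity_score']:.3f}  {doc['source_name']}")
else:
    print(f"Retriever unavailable: {status_msg}")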
uv.lock
ADDED
The diff for this file is too large to render.
See raw diff
validation_service_openai.py
ADDED
@@ -0,0 +1,156 @@
# validation_service_openai.py
# Works with LangSmith, OpenAI async, built for RAG validation

import os
import traceback
import openai
import asyncio
import json
from typing import Dict, Optional
from langsmith import traceable

# ----- ENVIRONMENT SETUP (Replit secret-based) -----
os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGSMITH_TRACING"] = "true"
# OPENAI_API_KEY, LANGSMITH_API_KEY and LANGSMITH_PROJECT are expected as
# environment secrets and are read directly from os.environ; using .get()
# below keeps a missing key from crashing the import, so the warning path
# further down can report it instead.
# ---------------------------------------------------

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
VALIDATION_MODEL = "gpt-4o"

# Initialize OpenAI Async Client
async_openai_client = None
if OPENAI_API_KEY:
    try:
        # (no need for wrap_openai here unless you want call-level traces)
        async_openai_client = openai.AsyncOpenAI(api_key=OPENAI_API_KEY)
        print("OpenAI ASYNC client initialized for validator service.")
    except Exception as e:
        print(f"Error initializing OpenAI ASYNC client for validator: {e}")
        traceback.print_exc()
else:
    print("Warning: OPENAI_API_KEY not found. Validator service (GPT-4o) requires it.")


def check_openai_validator_status():
    status = True
    messages = []
    if not OPENAI_API_KEY:
        status = False
        messages.append("OpenAI API Key missing.")
    if not async_openai_client:
        status = False
        messages.append("OpenAI Async client (for Validator) initialization failed.")
    if status and not messages:
        messages.append(f"OpenAI Validator service ready (Model: {VALIDATION_MODEL}).")
    return status, " ".join(messages)


@traceable
async def validate_paragraph_relevance_gpt4o(
        paragraph_data: Dict,
        user_question: str,
        paragraph_index: int) -> Optional[Dict]:
    """
    Uses GPT-4o to validate if a SINGLE paragraph (HE+EN text) contains relevant info.

    Args:
        paragraph_data: A dictionary for the paragraph (needs 'hebrew_text', 'english_text').
        user_question: The original user question in Hebrew.
        paragraph_index: The index of this paragraph in the list being validated.

    Returns:
        A dictionary containing the validation result and original paragraph data.
        Returns None if an error occurs during validation.
    """
    global async_openai_client
    if not async_openai_client:
        print(f"Error (Paragraph {paragraph_index}): OpenAI async client not available.")
        return None
    if not paragraph_data:
        return {
            "validation": {
                "contains_relevant_info": False,
                "justification": "Input paragraph data was empty."
            },
            "paragraph_data": {}
        }

    hebrew_text = paragraph_data.get('hebrew_text', '').strip()
    english_text = paragraph_data.get('english_text', '').strip()
    if not hebrew_text and not english_text:
        return {
            "validation": {
                "contains_relevant_info": False,
                "justification": "Paragraph text is empty."
            },
            "paragraph_data": paragraph_data
        }

    prompt_content = f"""User Question (Hebrew):
"{user_question}"

Text Paragraph (Paragraph {paragraph_index+1}):
Hebrew:
---
{hebrew_text if hebrew_text else "(No Hebrew text provided)"}
---
English:
---
{english_text if english_text else "(No English text provided)"}
---

Instruction:
Analyze the Text Paragraph provided above (considering both Hebrew and English versions if available). Determine if any information within this specific paragraph directly answers, or provides significant relevant details contributing to an answer for, the User Question (which is in Hebrew).
Respond ONLY with a valid JSON object containing exactly two keys:
1. 'contains_relevant_info': A boolean value (`true` if relevant information is found, `false` otherwise).
2. 'justification': A brief, 1-sentence explanation (in Hebrew) for your decision, especially if 'true'.

Example valid JSON output:
{{ "contains_relevant_info": true, "justification": "הפסקה דנה ישירות בסיבת העיכוב בקריעת הים." }}
OR
{{ "contains_relevant_info": false, "justification": "הפסקה עוסקת בעניין אחר ואינה רלוונטית לשאלה." }}

Output only the JSON object, nothing else.
"""

    try:
        response = await async_openai_client.chat.completions.create(
            model=VALIDATION_MODEL,
            messages=[{"role": "user", "content": prompt_content}],
            temperature=0.1,
            max_tokens=150,
            response_format={"type": "json_object"}
        )

        json_string = response.choices[0].message.content

        try:
            validation_result = json.loads(json_string)
            if not isinstance(validation_result, dict) or \
               'contains_relevant_info' not in validation_result or \
               'justification' not in validation_result or \
               not isinstance(validation_result['contains_relevant_info'], bool):
                print(f"Error (Paragraph {paragraph_index+1}): Parsed JSON has incorrect structure: {validation_result}")
                return None

            return {
                "validation": validation_result,
                "paragraph_data": paragraph_data
            }

        except json.JSONDecodeError as json_err:
            print(f"Error (Paragraph {paragraph_index+1}): Failed to decode JSON response: {json_err}. Response was: {json_string}")
            return None
        except Exception as parse_err:
            print(f"Error (Paragraph {paragraph_index+1}): Unexpected error parsing validation structure: {parse_err}")
            return None

    except openai.APIError as e:
        print(f"Error (Paragraph {paragraph_index+1}): OpenAI API Error during validation: {e}")
        return None
    except Exception as e:
        print(f"Error (Paragraph {paragraph_index+1}): Unexpected error during GPT-4o validation API call: {e}")
        traceback.print_exc()
        return None
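A minimal sketch of how this per-paragraph validator might be fanned out over a batch of retrieved paragraphs. The runner below is an assumption (in this Space the orchestration presumably lives in app.py); only validate_paragraph_relevance_gpt4o is taken from the file above, and the paragraph dicts are assumed to come from retriever_pinecone.find_similar_paragraphs.

# example_validation_runner.py (illustrative sketch, not part of this upload)
import asyncio
from validation_service_openai import validate_paragraph_relevance_gpt4o

async def validate_all(paragraphs, user_question):
    # One GPT-4o validation call per paragraph, issued concurrently.
    tasks = [
        validate_paragraph_relevance_gpt4o(p, user_question, i)
        for i, p in enumerate(paragraphs)
    ]
    results = await asyncio.gather(*tasks)
    # Keep only successful validations that were judged relevant.
    return [
        r for r in results
        if r is not None and r["validation"].get("contains_relevant_info")
    ]

# e.g. relevant = asyncio.run(validate_all(retrieved_docs, user_question))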