sivan22 committed · verified
Commit 7f683f9 · 1 Parent(s): bafd905

Upload 16 files

.gitignore ADDED
@@ -0,0 +1,160 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
.replit ADDED
@@ -0,0 +1,31 @@
+ entrypoint = "main.py"
+ modules = ["nodejs-20", "python-3.11"]
+
+ [nix]
+ channel = "stable-24_05"
+
+ [unitTest]
+ language = "python3"
+
+ [gitHubImport]
+ requiredFiles = [".replit", "replit.nix"]
+
+ [deployment]
+ run = ["sh", "-c", "python -m streamlit run --server.address 0.0.0.0 --server.headless true --server.enableCORS=false --server.enableXsrfProtection=false --server.enableWebsocketCompression=false app.py"]
+ deploymentTarget = "cloudrun"
+
+ [workflows]
+ runButton = "Run"
+
+ [[workflows.workflow]]
+ name = "Run"
+ author = 22737092
+ mode = "sequential"
+
+ [[workflows.workflow.tasks]]
+ task = "shell.exec"
+ args = "python main.py"
+
+ [[ports]]
+ localPort = 8501
+ externalPort = 80
analysis_service_anthropic.py ADDED
@@ -0,0 +1,280 @@
+ # analysis_service_anthropic.py
+
+ import anthropic
+ import os
+ import json
+ import re
+ import traceback
+
+ # --- Configuration ---
+ ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
+ # --- MODEL NAME CHANGED TO SPECIFIC VERSION, LATEST AS COMMENT ---
+ # Use the specific dated version for potentially more stable results in production.
+ # 'latest' alias points to this or newer snapshots.
+ # Original: ANALYSIS_MODEL = "claude-3-7-sonnet-latest"
+ ANALYSIS_MODEL = "claude-3-7-sonnet-20250219"  # Or use "claude-3-7-sonnet-latest"
+ # --- End Configuration ---
+
+ client = None
+ if ANTHROPIC_API_KEY:
+     try:
+         # Use the specific model name in initialization log
+         client = anthropic.AsyncAnthropic(api_key=ANTHROPIC_API_KEY)
+         print(
+             f"Anthropic ASYNC client initialized for analysis (Model: {ANALYSIS_MODEL})."
+         )
+     except Exception as e:
+         print(f"Error initializing Anthropic ASYNC client for analysis: {e}")
+ else:
+     print("ANTHROPIC_API_KEY not found. Analysis service will not function.")
+
+
+ # --- Helper functions (clean_json_string, check_analyzer_status, clean_source_text) ---
+ def clean_json_string(json_string):
+     """Attempts to clean common issues in JSON strings returned by LLMs."""
+     if not isinstance(json_string, str):
+         return ""
+     # Remove trailing commas before closing brackets/braces
+     cleaned = re.sub(r",(\s*[}\]])", r"\1", json_string)
+     # Remove markdown code block fences
+     cleaned = re.sub(r"^```json\s*", "", cleaned, flags=re.IGNORECASE)
+     cleaned = re.sub(r"\s*```$", "", cleaned)
+     return cleaned.strip()
+
+
+ def check_analyzer_status():
+     """Checks if the Anthropic analyzer service is ready."""
+     if not client:
+         return False, "Anthropic client not initialized (check API key)."
+     return True, f"Analysis service ready (Model: {ANALYSIS_MODEL})."  # Show model in status
+
+
+ def clean_source_text(text):
+     """Cleans source text by removing specific patterns."""
+     if not text: return ""
+     # Remove @number patterns, <HAL> tags (case-insensitive), <br> tags, and normalize whitespace
+     cleaned = text
+     cleaned = re.sub(r'@\d+', '', cleaned)
+     cleaned = re.sub(r'<HAL>', '', cleaned, flags=re.IGNORECASE)
+     cleaned = cleaned.replace('<br>', ' ').replace('<br />', ' ')
+     cleaned = re.sub(r'\s+', ' ', cleaned).strip()
+     return cleaned
+ # --- End Helper Functions ---
+
+
+ async def analyze_source_relevance_async(paragraph_hebrew, paragraph_english,
+                                          user_question):
+     """
+     Analyzes a Hebrew text paragraph for relevance to a user's question using Anthropic.
+
+     Args:
+         paragraph_hebrew (str): The Hebrew text snippet to analyze.
+         paragraph_english (str): The English translation (currently unused but kept for signature consistency).
+         user_question (str): The user's question in Hebrew.
+
+     Returns:
+         dict or None: A dictionary containing the analysis result (relevance score, headline, conclusion)
+                       if successful, otherwise None.
+     """
+     global client
+     ready, msg = check_analyzer_status()
+     if not ready or client is None:
+         print(f"Analyzer not ready: {msg}")
+         return None
+
+     # Ensure inputs are strings, even if empty
+     paragraph_hebrew = str(paragraph_hebrew) if paragraph_hebrew is not None else ""
+     user_question = str(user_question) if user_question is not None else ""
+
+     if not paragraph_hebrew or not user_question:
+         print("Warning: Missing Hebrew paragraph or user question for analysis.")
+         # Return a default non-relevant structure instead of None if needed by downstream logic
+         # return {"relevance": {"is_relevant": False, "relevance_score": 1, "explanation": "קלט חסר."}, "headline": {"hebrew": "קלט חסר"}, "conclusion": {"hebrew": "קלט חסר."}}
+         return None  # Keep returning None for now
+
+     original_snippet = paragraph_hebrew[:60].replace('\n', ' ')
+     cleaned_hebrew = clean_source_text(paragraph_hebrew)
+     cleaned_snippet = cleaned_hebrew[:60].replace('\n', ' ')
+
+     # === START NEW SYSTEM PROMPT ===
+     system_prompt = """You are an expert analyst specializing in Chassidic texts, particularly the works of the Satmar Rebbe, Rabbi Yoel Teitelbaum (Divrei Yoel). Your task is to evaluate a single Hebrew paragraph provided by the user based *only* on its relevance to the user's specific Hebrew question.
+
+ You MUST output your analysis STRICTLY as a single, valid JSON object, with no other text before or after the JSON structure.
+
+ The JSON object must have the following structure:
+
+ {
+   "relevance": {
+     "is_relevant": boolean, // True if the paragraph directly discusses or provides significant information related to the user's question. False otherwise.
+     "relevance_score": integer, // A score from 1 (completely irrelevant) to 10 (directly and fully answers a key aspect of the question). Assess based *only* on the provided paragraph content.
+     "explanation": string // A concise explanation IN HEBREW justifying the score and relevance assessment, referring only to the content of the paragraph. Explain *why* it is or is not relevant.
+   },
+   "headline": {
+     "hebrew": string // A very brief (3-7 words) headline IN HEBREW summarizing the paragraph's main topic *as it relates to the question*. If irrelevant, summarize the paragraph's general topic.
+   },
+   "conclusion": {
+     "hebrew": string // A single sentence IN HEBREW summarizing the key takeaway or information *from the paragraph* that is relevant to the question. If the paragraph is irrelevant, state clearly in Hebrew that it does not address the question (e.g., "הפסקה אינה עוסקת בשאלה.").
+   }
+ }
+
+ Base your entire analysis SOLELY on the Hebrew text paragraph provided. Do not use external knowledge. Ensure the output is valid JSON.
+ """
+     # === END NEW SYSTEM PROMPT ===
+
+     # === START NEW USER MESSAGE CONTENT ===
+     # Ensure inputs are properly escaped if they contain characters that could break JSON structure within the f-string, though unlikely here.
+     # Using f-string for clarity, but ensure no direct injection vulnerability if inputs were different.
+     user_message_content = f"""Please analyze the following Hebrew text passage based *only* on its content and relevance to the specific Hebrew question provided below. Adhere strictly to the JSON output format specified in the system prompt.
+
+ **User Question (Hebrew):**
+ {user_question}
+
+ **Hebrew Text Passage to Analyze:**
+ <paragraph>
+ {cleaned_hebrew}
+ </paragraph>
+ """
+     # === END NEW USER MESSAGE CONTENT ===
+
+     print(
+         f"  -> Sending cleaned paragraph (Snippet: '{cleaned_snippet}...') for Claude analysis (Model: {ANALYSIS_MODEL}) regarding question: '{user_question[:60]}...'"
+     )
+
+     try:
+         # --- API Call - Adheres to Messages API format ---
+         message = await client.messages.create(
+             model=ANALYSIS_MODEL,  # Correct parameter
+             max_tokens=1024,  # Correct parameter (Estimate response size, maybe smaller?)
+             system=system_prompt,  # Correct parameter (Using new detailed prompt)
+             messages=[{  # Correct parameter and structure
+                 "role": "user",
+                 "content": user_message_content  # Using new detailed content
+             }],
+             temperature=0.1,  # Lower temperature for deterministic analysis
+             # --- Thinking Parameter - Commented Out for Analysis ---
+             # thinking={"type": "enabled", "budget_tokens": 16000}  # Consider if needed for complex analysis
+             # --- End Thinking Parameter ---
+         )
+
+         # --- Process the response ---
+         if not message or not message.content or not isinstance(message.content, list) or not message.content:
+             print(
+                 f"  <- Analysis failed: Invalid message object or empty content list received from API for snippet '{cleaned_snippet}...'. Message: {message}"
+             )
+             return None
+
+         first_block = message.content[0]
+         if not first_block or not hasattr(first_block, 'text') or not first_block.text:
+             print(
+                 f"  <- Analysis failed: First content block is invalid or has no text for snippet '{cleaned_snippet}...'. First block: {first_block}"
+             )
+             return None
+
+         raw_response_text = first_block.text
+         print(f"  <- Raw analysis response snippet: {raw_response_text[:200]}...")  # Log raw response
+
+         # --- JSON Parsing Logic ---
+         # Attempt to find JSON block first
+         json_match = re.search(r"\{.*\}", raw_response_text, re.DOTALL)
+         json_to_parse = None
+         if json_match:
+             json_block = json_match.group(0)
+             # Further clean the extracted block
+             json_to_parse = clean_json_string(json_block)
+             print(f"  -- Extracted JSON block: {json_to_parse[:100]}...")
+         else:
+             # If no block found, try cleaning the whole response (less reliable)
+             json_to_parse = clean_json_string(raw_response_text)
+             print(f"  -- No JSON block found, attempting parse on cleaned full response: {json_to_parse[:100]}...")
+
+         if not json_to_parse or not json_to_parse.startswith("{") or not json_to_parse.endswith("}"):
+             print(f"  <- Analysis failed: Could not extract valid JSON structure after cleaning. Cleaned data: '{json_to_parse[:100]}...'")
+             # Log more context on failure
+             print(f"  -- Original raw response was: {raw_response_text}")
+             return None
+
+         try:
+             analysis_result = json.loads(json_to_parse)
+         except json.JSONDecodeError as json_err:
+             print(f"  <- Analysis failed: JSONDecodeError - {json_err}. Problematic JSON string (cleaned): '{json_to_parse}'")
+             # Log more context on failure
+             print(f"  -- Original raw response was: {raw_response_text}")
+             return None
+
+         # --- Validate Structure ---
+         # Add more verbose checks for debugging
+         if not isinstance(analysis_result, dict):
+             print(f"  <- Analysis failed: Parsed result is not a dictionary. Type: {type(analysis_result)}")
+             return None
+
+         # Check top-level keys
+         if not all(key in analysis_result for key in ['relevance', 'headline', 'conclusion']):
+             print(f"  <- Analysis failed: Missing top-level keys. Found: {list(analysis_result.keys())}")
+             return None
+
+         # Check nested structure and types
+         relevance_data = analysis_result.get('relevance')
+         headline_data = analysis_result.get('headline')
+         conclusion_data = analysis_result.get('conclusion')
+
+         if not isinstance(relevance_data, dict) or \
+                 not all(k in relevance_data for k in ['is_relevant', 'relevance_score', 'explanation']) or \
+                 not isinstance(relevance_data.get('is_relevant'), bool) or \
+                 not isinstance(relevance_data.get('relevance_score'), int) or \
+                 not isinstance(relevance_data.get('explanation'), str):
+             print(f"  <- Analysis failed: Incorrect structure or types in 'relevance' field. Data: {relevance_data}")
+             return None
+
+         if not isinstance(headline_data, dict) or \
+                 'hebrew' not in headline_data or \
+                 not isinstance(headline_data.get('hebrew'), str):
+             print(f"  <- Analysis failed: Incorrect structure or types in 'headline' field. Data: {headline_data}")
+             return None
+
+         if not isinstance(conclusion_data, dict) or \
+                 'hebrew' not in conclusion_data or \
+                 not isinstance(conclusion_data.get('hebrew'), str):
+             print(f"  <- Analysis failed: Incorrect structure or types in 'conclusion' field. Data: {conclusion_data}")
+             return None
+
+         # If all checks pass
+         print(
+             f"  <- Analysis successful for snippet '{cleaned_snippet}...'. Score: {analysis_result.get('relevance', {}).get('relevance_score', 'N/A')}"
+         )
+         return analysis_result
+
+     # --- Error Handling - Aligns with library exceptions ---
+     except anthropic.APIStatusError as e:
+         print(
+             f"  <- Anthropic API Status Error (Analysis): Status={e.status_code} Response={e.response} for snippet '{cleaned_snippet}...'. Model: {ANALYSIS_MODEL}"
+         )
+         if e.status_code == 400:
+             print(
+                 f"  <- NOTE: 400 Bad Request. Possible causes: Model name '{ANALYSIS_MODEL}' invalid/unavailable OR API parameters incorrect OR input/output token limits exceeded."
+             )
+         # Log relevant parts of the request if possible (be careful with sensitive data)
+         print(f"  -- Failing request details: Question='{user_question[:60]}...', Paragraph Snippet='{cleaned_snippet}...', System Prompt Length={len(system_prompt)}, User Content Length={len(user_message_content)}")
+         return None
+     except Exception as e:
+         print(
+             f"  <- Unexpected error during Claude analysis API call ({type(e).__name__}) for snippet '{cleaned_snippet}...': {e}"
+         )
+         traceback.print_exc()
+         # Log relevant parts of the request if possible
+         print(f"  -- Failing request details: Question='{user_question[:60]}...', Paragraph Snippet='{cleaned_snippet}...', System Prompt Length={len(system_prompt)}, User Content Length={len(user_message_content)}")
+         return None
+
+
+ # --- Example Usage (No changes needed) ---
+ # Consider adding a small async test function here if needed
+ # import asyncio
+ # async def main():
+ #     test_q = "מהי חשיבות השמחה בעבודת ה'?"
+ #     test_p = "ועיקר עבודת ה' היא בשמחה, כמו שכתוב 'עבדו את ה' בשמחה', כי השמחה פותחת הלב ומאירה הנשמה, ומביאה לידי דביקות בהשי\"ת. ועל ידי העצבות ח\"ו נסתם הלב ואינו יכול לקבל אור הקדושה."
+ #     result = await analyze_source_relevance_async(test_p, "", test_q)
+ #     print("\n--- Test Analysis Result ---")
+ #     print(json.dumps(result, indent=2, ensure_ascii=False))
+ # if __name__ == "__main__":
+ #     if ANTHROPIC_API_KEY:
+ #         asyncio.run(main())
+ #     else:
+ #         print("Cannot run test: ANTHROPIC_API_KEY not set.")
app.py ADDED
@@ -0,0 +1,481 @@
+ # app.py - LangSmith enabled, designed for Replit + Anthropic + OpenAI
+ import os
+ import streamlit as st
+ import time
+ import traceback
+ import json
+ import asyncio
+ import nest_asyncio
+ from typing import List, Dict
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ # ----- SETUP SECRETS AND ENV -----
+ # Hardcoded (safe): you never need these in secrets!
+ os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
+ os.environ["LANGSMITH_TRACING"] = "true"
+ # The following must exist in your Replit secrets:
+ # OPENAI_API_KEY, ANTHROPIC_API_KEY, LANGSMITH_API_KEY, LANGSMITH_PROJECT
+ os.environ["OPENAI_API_KEY"] = os.environ["OPENAI_API_KEY"]
+ os.environ["ANTHROPIC_API_KEY"] = os.environ["ANTHROPIC_API_KEY"]
+ os.environ["LANGSMITH_API_KEY"] = os.environ["LANGSMITH_API_KEY"]
+ os.environ["LANGSMITH_PROJECT"] = os.environ["LANGSMITH_PROJECT"]
+ # ----------------------------------
+
+ from langsmith import traceable
+
+ nest_asyncio.apply()
+
+ from retriever_pinecone import find_similar_paragraphs, check_retriever_status
+ from analysis_service_anthropic import (
+     analyze_source_relevance_async,
+     check_analyzer_status,
+     ANALYSIS_MODEL as ANTHROPIC_ANALYSIS_MODEL,
+ )
+ from generation_service_anthropic import (
+     generate_response_stream_async as generate_anthropic,
+     check_generator_status as check_anthropic_generator,
+     GENERATION_MODEL as ANTHROPIC_GENERATION_MODEL,
+ )
+ from generation_service_gemini import (
+     generate_response_stream_gemini as generate_gemini,
+     check_gemini_generator_status,
+     GENERATION_MODEL as GEMINI_GENERATION_MODEL,
+ )
+ from validation_service_openai import (
+     validate_paragraph_relevance_gpt4o,
+     check_openai_validator_status,
+     VALIDATION_MODEL as GPT4O_VALIDATION_MODEL,
+ )
+
+ try:
+     from generation_service_anthropic import format_context_for_prompt
+     print("Format context function potentially available.")
+ except ImportError:
+     print("Warning: format_context_for_prompt not imported.")
+
+ st.set_page_config(page_title="Divrey Yoel AI Chat", layout="wide")
+ st.markdown(
+     """<style>
+     .rtl-text { direction: rtl; text-align: right; }
+     .hebrew-text { font-family: 'Arial Hebrew', 'David', sans-serif; direction: rtl; text-align: right; font-size: 1.1em; margin-bottom: 5px; }
+     .source-info { font-size: 0.85em; color: #666; margin-bottom: 8px; }
+     .expander-content > div { border-bottom: 1px solid #eee; padding-bottom: 15px; margin-bottom: 15px; }
+     .expander-content > div:last-child { border-bottom: none; margin-bottom: 0; padding-bottom: 0; }
+     .stChatMessage .stExpander { margin-top: 15px; border-left: 3px solid #ddd; padding-left: 10px; }
+     .stStatus div[data-testid="stStatusContent"] p { direction: rtl; text-align: right; }
+     .stButton > button[kind="header"] { direction: rtl; text-align: right; }
+     .stExpander div[data-testid="stVerticalBlock"] code { display: block; text-align: right; direction: rtl; }
+     .alert-warning { padding: 0.75rem 1.25rem; margin-bottom: 1rem; border: 1px solid transparent;
+                      border-radius: 0.25rem; color: #856404; background-color: #fff3cd; border-color: #ffeeba;}
+     </style>""",
+     unsafe_allow_html=True,
+ )
+ st.markdown("<h1 class='rtl-text'>Divrey Yoel AI Chat</h1>", unsafe_allow_html=True)
+ st.markdown("<p class='rtl-text'>חיפוש בטקסטים חסידיים באמצעות RAG</p>", unsafe_allow_html=True)
+
+ # --- Status Checks & Sidebar ---
+ retriever_ready, retriever_msg = check_retriever_status()
+ anthropic_analyzer_ready, anthropic_analyzer_msg = check_analyzer_status()
+ anthropic_generator_ready, anthropic_generator_msg = check_anthropic_generator()
+ gemini_generator_ready, gemini_generator_msg = check_gemini_generator_status()
+ openai_validator_ready, openai_validator_msg = check_openai_validator_status()
+
+ st.sidebar.markdown("<h3 class='rtl-text'>מצב המערכת</h3>", unsafe_allow_html=True)
+ st.sidebar.markdown(
+     f"<p class='rtl-text'><strong>מאחזר (Pinecone):</strong> {'✅' if retriever_ready else '❌'}</p>",
+     unsafe_allow_html=True,
+ )
+ if not retriever_ready:
+     st.sidebar.markdown(
+         f"<div class='alert alert-warning rtl-text' role='alert'>{retriever_msg}</div>", unsafe_allow_html=True
+     )
+     st.markdown(
+         "<p class='rtl-text' style='color: red;'><strong>שירות האחזור (Pinecone) אינו זמין. לא ניתן להמשיך.</strong></p>",
+         unsafe_allow_html=True,
+     )
+     st.stop()
+
+ st.sidebar.markdown("<hr>", unsafe_allow_html=True)
+ st.sidebar.markdown(
+     f"<p class='rtl-text'><strong>מנתח (Anthropic):</strong> {'✅ <small>(נדרש לשיטת Anthropic)</small>' if anthropic_analyzer_ready else '❌ <small>(נדרש לשיטת Anthropic)</small>'}</p>",
+     unsafe_allow_html=True,
+ )
+ st.sidebar.markdown(
+     f"<p class='rtl-text'><strong>מאמת (GPT-4o):</strong> {'✅ <small>(נדרש לשיטת GPT-4o)</small>' if openai_validator_ready else '❌ <small>(נדרש לשיטת GPT-4o)</small>'}</p>",
+     unsafe_allow_html=True,
+ )
+ st.sidebar.markdown(
+     f"<p class='rtl-text'><strong>מחולל (Anthropic):</strong> {'✅ <small>(נדרש לשיטות Anthropic/GPT-4o)</small>' if anthropic_generator_ready else '❌ <small>(נדרש לשיטות Anthropic/GPT-4o)</small>'}</p>",
+     unsafe_allow_html=True,
+ )
+ st.sidebar.markdown(
+     f"<p class='rtl-text'><strong>מחולל (Gemini):</strong> {'✅ <small>(נדרש לשיטת Gemini)</small>' if gemini_generator_ready else '❌ <small>(נדרש לשיטת Gemini)</small>'}</p>",
+     unsafe_allow_html=True,
+ )
+ st.sidebar.markdown("<hr>", unsafe_allow_html=True)
+
+ st.sidebar.markdown("<h3 class='rtl-text'>הגדרות RAG</h3>", unsafe_allow_html=True)
+ pipeline_method = st.sidebar.selectbox(
+     "בחר שיטת עיבוד:",
+     options=[
+         "Anthropic (ניתוח וסינון פרטני)",
+         "Gemini (אחזור ויצירה ישירה)",
+         "GPT-4o Paragraph Validator + Claude Synthesizer",
+     ],
+     index=2,
+ )
+ is_anthropic_pipeline = pipeline_method == "Anthropic (ניתוח וסינון פרטני)"
+ is_gemini_pipeline = pipeline_method == "Gemini (אחזור ויצירה ישירה)"
+ is_gpt4o_para_pipeline = pipeline_method == "GPT-4o Paragraph Validator + Claude Synthesizer"
+
+ n_retrieve = st.sidebar.slider(
+     "מספר פסקאות לאחזור (Retrieve)", 1, 300, 100,
+     help="כמה פסקאות לאחזר ראשונית (משותף לכל השיטות)."
+ )
+ n_analyze = st.sidebar.slider(
+     "מספר פסקאות לניתוח (Anthropic בלבד)", 1, min(n_retrieve, 50), min(21, n_retrieve, 50),
+     help="כמה פסקאות יישלחו לניתוח רלוונטיות פרטני ע'י Claude.",
+     disabled=not is_anthropic_pipeline
+ )
+ relevance_thresh = st.sidebar.slider(
+     "סף רלוונטיות (Anthropic בלבד)", 1, 10, 5,
+     help="הציון המינימלי (1-10) שפסקה צריכה לקבל מ-Claude כדי להיחשב רלוונטית.",
+     disabled=not is_anthropic_pipeline
+ )
+ n_validate = st.sidebar.slider(
+     "מספר פסקאות לאימות (GPT-4o בלבד)", 1, min(n_retrieve, 100), min(50, n_retrieve),
+     help="כמה מהפסקאות שאוחזרו יישלחו לאימות רלוונטיות פרטני ע'י GPT-4o.",
+     disabled=not is_gpt4o_para_pipeline
+ )
+ n_final_context = st.sidebar.slider(
+     "פסקאות מקסימום להקשר סופי (Gemini/Anthropic)", 1, n_retrieve, min(21, n_retrieve),
+     help="Gemini/Anthropic: כמה מהפסקאות הטובות ביותר יישלחו ליצירה. GPT-4o: לא בשימוש ישיר (הקשר נקבע ע'י האימות).",
+     disabled=is_gpt4o_para_pipeline
+ )
+
+ services_ready = (
+     retriever_ready and
+     ((anthropic_analyzer_ready and anthropic_generator_ready) if is_anthropic_pipeline else True) and
+     (gemini_generator_ready if is_gemini_pipeline else True) and
+     ((openai_validator_ready and anthropic_generator_ready) if is_gpt4o_para_pipeline else True)
+ )
+
+ if not services_ready and retriever_ready:
+     st.markdown(
+         f"<div class='alert alert-warning rtl-text' role='alert'>שירות(ים) חסרים. ודא שכל השירותים דרושים זמינים.</div>",
+         unsafe_allow_html=True,
+     )
+
+ @traceable
+ def run_rag_pipeline(pipeline_prompt: str, selected_pipeline_method: str, status_container=None):
+     is_anthropic_pipeline = selected_pipeline_method == "Anthropic (ניתוח וסינון פרטני)"
+     is_gemini_pipeline = selected_pipeline_method == "Gemini (אחזור ויצירה ישירה)"
+     is_gpt4o_para_pipeline = selected_pipeline_method == "GPT-4o Paragraph Validator + Claude Synthesizer"
+     result = {
+         "full_response": "", "final_docs_data": [], "status_updates": [],
+         "error": None, "analysis_flow": selected_pipeline_method
+     }
+     current_status_label = "מתחיל עיבוד..."
+     message_placeholder = st.empty()
+     try:
+         current_status_label = f"1. מאחזר עד {n_retrieve} פסקאות מ-Pinecone..."
+         start_retrieval = time.time()
+         if status_container: status_container.update(label=current_status_label)
+         retrieved_docs = find_similar_paragraphs(query_text=pipeline_prompt, n_results=n_retrieve)
+         retrieval_time = time.time() - start_retrieval
+         status_msg = f"אוחזרו {len(retrieved_docs)} פסקאות ב-{retrieval_time:.2f} שניות."
+         result["status_updates"].append(f"1. {status_msg}")
+         current_status_label = f"1. {status_msg}"
+         if status_container: status_container.update(label=current_status_label)
+
+         if not retrieved_docs:
+             result["full_response"] = "<div class='rtl-text'>לא אותרו מקורות רלוונטיים לשאילתה.</div>"
+             if status_container: status_container.update(label="לא נמצאו מסמכים.", state="complete")
+             message_placeholder.markdown(result["full_response"], unsafe_allow_html=True)
+             return result
+
+         docs_for_generator = []
+         generator_name = ""
+
+         if is_anthropic_pipeline:
+             generator_name = "Anthropic"
+             analysis_count = min(len(retrieved_docs), n_analyze)
+             current_status_label = f"2. [Anthropic] מנתח רלוונטיות פרטנית ({analysis_count} פסקאות)..."
+             analysis_start_time = time.time()
+             if status_container: status_container.update(label=current_status_label)
+
+             async def run_anthropic_analysis():
+                 docs_to_analyze_local = retrieved_docs[:analysis_count]
+                 tasks = [analyze_source_relevance_async(d.get('hebrew_text', ''), '', pipeline_prompt) for d in docs_to_analyze_local]
+                 analysis_results = await asyncio.gather(*tasks, return_exceptions=True)
+                 return docs_to_analyze_local, analysis_results
+
+             try:
+                 loop = asyncio.get_event_loop_policy().get_event_loop()
+                 if loop.is_running(): nest_asyncio.apply(); loop = asyncio.get_event_loop_policy().get_event_loop()
+                 docs_analyzed, analysis_raw_results = loop.run_until_complete(run_anthropic_analysis())
+             except Exception as loop_err:
+                 raise
+             processed_for_filter = []; analysis_success_count = 0; analysis_fail_count = 0
+             for i, doc in enumerate(docs_analyzed):
+                 res = analysis_raw_results[i]
+                 if isinstance(res, dict) and 'relevance' in res:
+                     doc['analysis'] = res; processed_for_filter.append(doc); analysis_success_count += 1
+                 elif isinstance(res, Exception): analysis_fail_count += 1
+                 else: analysis_fail_count += 1
+             analysis_time = time.time() - analysis_start_time
+             status_msg = f"ניתוח Anthropic פרטני הושלם ({analysis_success_count} הצלחות, {analysis_fail_count} כשלונות) ב-{analysis_time:.2f} שניות."
+             result["status_updates"].append(f"2. {status_msg}")
+             current_status_label = f"2. {status_msg}"
+             if status_container: status_container.update(label=current_status_label)
+             current_status_label = "3. [Anthropic] סינון לפי ציון רלוונטיות..."
+             if status_container: status_container.update(label=current_status_label)
+             filtered_docs = []
+             for doc in processed_for_filter:
+                 try:
+                     score = int(doc.get('analysis', {}).get('relevance', {}).get('relevance_score', '0'))
+                     doc['analysis']['relevance']['numeric_score'] = score
+                     if score >= relevance_thresh: filtered_docs.append(doc)
+                 except Exception as filter_err:
+                     pass
+             filtered_docs.sort(key=lambda d: d.get('analysis', {}).get('relevance', {}).get('numeric_score', 0), reverse=True)
+             docs_for_generator = filtered_docs[:n_final_context]
+             status_msg = f"נבחרו {len(docs_for_generator)} פסקאות לאחר סינון Anthropic (סף: {relevance_thresh}, מקס': {n_final_context})."
+             result["status_updates"].append(f"3. {status_msg}")
+             current_status_label = f"3. {status_msg}"
+             if status_container: status_container.update(label=current_status_label)
+             if not docs_for_generator:
+                 result["full_response"] = "<div class='rtl-text'>לא נמצאו פסקאות רלוונטיות מספיק לאחר סינון Anthropic פרטני.</div>"
+                 if status_container: status_container.update(label="לא נמצאו פסקאות מסוננות.", state="complete")
+                 message_placeholder.markdown(result["full_response"], unsafe_allow_html=True)
+                 return result
+
+         elif is_gemini_pipeline:
+             generator_name = "Gemini"
+             status_msg = "2. דילוג על שלב ניתוח/סינון (שיטת Gemini)."; result["status_updates"].append(status_msg)
+             current_status_label = status_msg
+             if status_container: status_container.update(label=current_status_label)
+             docs_for_generator = retrieved_docs[:n_final_context]
+             status_msg = f"3. נבחרו {len(docs_for_generator)} פסקאות מובילות (לפי אחזור) להקשר עבור Gemini (מקס': {n_final_context})."
+             result["status_updates"].append(status_msg)
+             current_status_label = status_msg
+             if status_container: status_container.update(label=current_status_label)
+             if not docs_for_generator:
+                 result["full_response"] = "<div class='rtl-text'>לא אותרו מסמכים כלל (שגיאה פנימית).</div>"
+                 if status_container: status_container.update(label="שגיאה בבחירת הקשר.", state="error")
+                 message_placeholder.markdown(result["full_response"], unsafe_allow_html=True)
+                 return result
+
+         elif is_gpt4o_para_pipeline:
+             generator_name = "Anthropic"
+             docs_to_validate = retrieved_docs[:n_validate]
+             num_to_validate = len(docs_to_validate)
+             if not docs_to_validate:
+                 result["full_response"] = "<div class='rtl-text'>שגיאה: אין מסמכים לאימות (לאחר אחזור).</div>"
+                 if status_container: status_container.update(label="שגיאה לפני אימות.", state="error")
+                 message_placeholder.markdown(result["full_response"], unsafe_allow_html=True)
+                 return result
+             status_msg = f"2. נבחרו {num_to_validate} פסקאות מובילות לאימות פרטני (מתוך {len(retrieved_docs)})."
+             result["status_updates"].append(status_msg)
+             current_status_label = status_msg
+             if status_container: status_container.update(label=current_status_label)
+             current_status_label = f"3. [GPT-4o] מתחיל אימות מקבילי של {num_to_validate} פסקאות..."
+             validation_start_time = time.time()
+             if status_container: status_container.update(label=current_status_label)
+             tasks = [validate_paragraph_relevance_gpt4o(doc, pipeline_prompt, i) for i, doc in enumerate(docs_to_validate)]
+             validation_results = []
+             try:
+                 loop = asyncio.get_event_loop_policy().get_event_loop()
+                 if loop.is_running(): nest_asyncio.apply(); loop = asyncio.get_event_loop_policy().get_event_loop()
+                 validation_results = loop.run_until_complete(asyncio.gather(*tasks, return_exceptions=True))
+             except Exception as gather_err:
+                 result["error"] = f"שגיאה בביצוע האימות המקבילי: {gather_err}"
+                 result["full_response"] = "<div class='rtl-text'>אירעה שגיאה קריטית בשלב אימות המידע.</div>"
+                 if status_container: status_container.update(label="שגיאה באימות!", state="error")
+                 message_placeholder.markdown(result["full_response"], unsafe_allow_html=True)
+                 return result
+             validation_time = time.time() - validation_start_time
+             passed_count = 0; failed_count = 0; filtered_paragraphs = []
+             current_status_label = "4. [GPT-4o] סינון פסקאות לפי תוצאות האימות..."
+             if status_container: status_container.update(label=current_status_label)
+             for i, res in enumerate(validation_results):
+                 para_num = i + 1
+                 if isinstance(res, Exception): failed_count += 1
+                 elif isinstance(res, dict) and res.get("validation"):
+                     if res["validation"].get("contains_relevant_info") is True:
+                         passed_count += 1; filtered_paragraphs.append(res.get("paragraph_data", {}))
+                 else: failed_count += 1
+             filtered_paragraphs = [p for p in filtered_paragraphs if p]
+             status_msg_val = f"אימות GPT-4o פרטני הושלם ({passed_count} עברו, {num_to_validate - passed_count - failed_count} נדחו, {failed_count} נכשלו) ב-{validation_time:.2f} שניות."
+             result["status_updates"].append(f"3. {status_msg_val}")
+             status_msg_filter = f"נאספו {len(filtered_paragraphs)} פסקאות רלוונטיות לאחר אימות."
+             result["status_updates"].append(f"4. {status_msg_filter}")
+             current_status_label = f"4. {status_msg_filter}"
+             if status_container: status_container.update(label=current_status_label)
+             if not filtered_paragraphs:
+                 result["full_response"] = "<div class='rtl-text'>לא נמצא מידע רלוונטי בפסקאות שנבדקו ע'י GPT-4o.</div>"
+                 if status_container: status_container.update(label="לא נמצא מידע רלוונטי.", state="complete")
+                 message_placeholder.markdown(result["full_response"], unsafe_allow_html=True)
+                 return result
+             docs_for_generator = filtered_paragraphs
+
+         else:
+             raise ValueError(f"שיטת עיבוד לא ידועה: {selected_pipeline_method}")
+
+         current_status_label = f"5. מכין הקשר ({len(docs_for_generator)} פסקאות) ומחולל תשובה סופית ({generator_name})..."
+         result["status_updates"].append(f"5. מכין הקשר ומחולל תשובה ({generator_name})...")
+         if status_container: status_container.update(label=current_status_label)
+
+         start_generation = time.time()
+         final_response_text = ""
+         generation_error_details = None
+         result["final_docs_data"] = docs_for_generator
+
+         try:
+             if generator_name == "Gemini":
+                 generator_stream = generate_gemini(query=pipeline_prompt, context_documents=docs_for_generator)
+                 response_chunks = []
+                 for chunk in generator_stream:
+                     if isinstance(chunk, str) and chunk.strip().startswith("--- שגיאה"):
+                         generation_error_details = chunk.strip()
+                         break
+                     response_chunks.append(str(chunk))
+                     temp_stream_response = "".join(response_chunks)
+                     message_placeholder.markdown(f"<div class='rtl-text'>{temp_stream_response}▌</div>", unsafe_allow_html=True)
+                 if generation_error_details is None: final_response_text = "".join(response_chunks)
+             elif generator_name == "Anthropic":
+                 async def consume_anthropic_stream():
+                     history = [{"role": "user", "content": pipeline_prompt}]
+                     local_chunks = []
+                     async for chunk in generate_anthropic(messages=history, context_documents=docs_for_generator):
+                         if isinstance(chunk, str) and chunk.strip().startswith("--- שגיאה"):
+                             raise RuntimeError(f"Error yielded from Anthropic generator: {chunk.strip()}")
+                         local_chunks.append(str(chunk))
+                         temp_response = "".join(local_chunks)
+                         message_placeholder.markdown(f"<div class='rtl-text'>{temp_response}▌</div>", unsafe_allow_html=True)
+                     return "".join(local_chunks)
+                 try:
+                     loop = asyncio.get_event_loop_policy().get_event_loop()
+                     if loop.is_running(): nest_asyncio.apply(); loop = asyncio.get_event_loop_policy().get_event_loop()
+                     final_response_text = loop.run_until_complete(consume_anthropic_stream())
+                 except Exception as consume_err:
+                     generation_error_details = f"{type(consume_err).__name__}: {str(consume_err)}"
+             else:
+                 raise RuntimeError(f"Generator name '{generator_name}' not recognized.")
+
+         except Exception as gen_err:
+             generation_error_details = f"{type(gen_err).__name__}: {str(gen_err)}"
+
+         generation_time = time.time() - start_generation
+         if generation_error_details:
+             result["error"] = f"שגיאה במהלך יצירת התשובה ({generator_name}): {generation_error_details}"
+             result["full_response"] = f"<div class='rtl-text'><strong>שגיאה ביצירת התשובה.</strong><br>פרטים: {generation_error_details}</div>"
+             message_placeholder.markdown(result["full_response"], unsafe_allow_html=True)
+         else:
+             lines_to_remove = ["יהי רצון שנזכה לגאולה השלמה במהרה בימינו אמן.", "יהי רצון שנזכה...", "הכותב וחותם לכבוד התורה ולומדיה", "הכותב וחותם לכבוד התורה...", "בכבוד רב,", "בברכה,"]
+             response_lines = final_response_text.strip().split('\n'); cleaned_lines = response_lines[:]
+             while cleaned_lines:
+                 last_line = cleaned_lines[-1].strip()
+                 if any(last_line.lower() == ltr.lower() or last_line.lower().startswith(ltr.lower().replace('...', '')) for ltr in lines_to_remove):
+                     cleaned_lines.pop()
+                 else:
+                     break
+             final_response_text = "\n".join(cleaned_lines).strip()
+             result["full_response"] = final_response_text
+             message_placeholder.markdown(f"<div class='rtl-text'>{final_response_text}</div>", unsafe_allow_html=True)
+
+     except Exception as e:
+         pipeline_error_type = type(e).__name__; pipeline_error_msg = str(e)
+         result["error"] = f"שגיאה בזמן הריצה: {pipeline_error_type}: {pipeline_error_msg}"
+         result["full_response"] = f"<div class='rtl-text'><strong>שגיאה במהלך העיבוד ({pipeline_error_type})</strong><br>אנא נסה שוב מאוחר יותר.<details><summary>פרטים טכניים</summary><pre>{traceback.format_exc()}</pre></details></div>"
+         message_placeholder.markdown(result["full_response"], unsafe_allow_html=True)
+         if status_container: status_container.update(label="שגיאה בעיבוד!", state="error")
+     return result
+
+ if "messages" not in st.session_state:
+     st.session_state.messages = []
+
+ for message in st.session_state.messages:
+     with st.chat_message(message["role"]):
+         content_display = message['content']
+         if not content_display.strip().startswith(('<div', '<p', '<strong', '<details')):
+             content_display = f"<div class='rtl-text'>{content_display}</div>"
+         st.markdown(content_display, unsafe_allow_html=True)
+         if message["role"] == "assistant" and "final_docs" in message and message["final_docs"]:
+             final_docs_data = message.get("final_docs", [])
+             pipeline_flow_used = message.get("analysis_flow", "לא ידוע")
+             if final_docs_data:
+                 st.expander("מסמכים שנמצאו", expanded=False).write(final_docs_data)
+                 expander_title_text = f"הצג {len(final_docs_data)} פסקאות מקור שנשלחו למחולל"
+                 if pipeline_flow_used == "Anthropic (ניתוח וסינון פרטני)":
+                     expander_title_text += " (לאחר סינון Anthropic פרטני)"
+                 elif pipeline_flow_used == "Gemini (אחזור ויצירה ישירה)":
+                     expander_title_text += " (ללא סינון נוסף)"
+                 elif pipeline_flow_used == "GPT-4o Paragraph Validator + Claude Synthesizer":
+                     expander_title_text += " (לאחר אימות GPT-4o פרטני)"
+                 else:
+                     expander_title_text += " (לאחר עיבוד)"
+                 expander_title = f"<span class='rtl-text'>{expander_title_text}</span>"
+                 with st.expander(expander_title, expanded=False):
+                     st.markdown("<div class='expander-content'>", unsafe_allow_html=True)
+                     for i, doc in enumerate(final_docs_data):
+                         score_info = ""
+                         source_name = doc.get('source_name', 'לא ידוע')
+                         original_id = doc.get('original_id', 'N/A')
+                         hebrew_text = doc.get('hebrew_text', 'טקסט המקור חסר')
+                         st.markdown(
+                             f"<div class='source-info rtl-text'><strong>מקור {i+1}:</strong> ספר: {source_name}, ID: {original_id}{score_info}</div>",
+                             unsafe_allow_html=True,
+                         )
+                         st.markdown(f"<div class='hebrew-text'>{hebrew_text}</div>", unsafe_allow_html=True)
+                     st.markdown("</div>", unsafe_allow_html=True)
+
+ if prompt := st.chat_input("שאל שאלה בענייני חסידות...", disabled=not services_ready, key="chat_input"):
+     st.session_state.messages.append({"role": "user", "content": prompt})
+     with st.chat_message("user"):
+         st.markdown(f"<div class='rtl-text'>{prompt}</div>", unsafe_allow_html=True)
+     with st.chat_message("assistant"):
+         status_control_asst = None
+         rag_result_asst = None
+         try:
+             status_label = f"<span class='rtl-text'>מעבד בקשה באמצעות '{pipeline_method}'...</span>"
+             with st.status(status_label, expanded=True) as status:
+                 status_control_asst = status
+                 rag_result_asst = run_rag_pipeline(
+                     pipeline_prompt=prompt,
+                     selected_pipeline_method=pipeline_method,
+                     status_container=status_control_asst,
+                 )
+             if rag_result_asst and isinstance(rag_result_asst, dict):
+                 pipeline_error_value = rag_result_asst.get("error")
+                 final_docs_value = rag_result_asst.get("final_docs_data", [])
+                 final_docs_to_store = []
+                 if pipeline_error_value is None:
+                     final_docs_to_store = final_docs_value
+                 flow_to_store = rag_result_asst.get("analysis_flow", "Error")
+                 if pipeline_error_value is not None:
+                     flow_to_store = "Error"
+                 st.session_state.messages.append({
+                     "role": "assistant",
+                     "content": rag_result_asst.get("full_response", "..."),
+                     "final_docs": final_docs_to_store,
+                     "analysis_flow": flow_to_store,
+                 })
+                 if rag_result_asst.get("status_updates"):
+                     expander_label = "<span class='rtl-text'>הצג שלבי עיבוד</span>"
+                     with st.expander(expander_label, expanded=False):
+                         for update in rag_result_asst["status_updates"]:
+                             st.markdown(f"<div class='rtl-text'><code>- {update}</code></div>", unsafe_allow_html=True)
+             else:
+                 fallback_err_msg_html = "<div class='rtl-text'><strong>שגיאה בלתי צפויה בתקשורת עם מנגנון העיבוד (fallback).</strong></div>"
+                 st.session_state.messages.append({
+                     "role": "assistant",
+                     "content": fallback_err_msg_html,
+                     "final_docs": [],
+                     "analysis_flow": "Error",
+                 })
+         except Exception as e:
+             error_display_html = f"<div class='rtl-text'><strong>שגיאה קריטית!</strong><br><pre>{traceback.format_exc()}</pre></div>"
+             st.error(error_display_html, icon="🔥")
+             st.session_state.messages.append({
+                 "role": "assistant",
+                 "content": error_display_html,
+                 "final_docs": [],
+                 "analysis_flow": "Critical Error",
+             })
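As an aside, app.py repeats the same event-loop recovery idiom (get the loop, re-apply nest_asyncio if it is already running, then run_until_complete) in three places. A small helper could consolidate it; the sketch below uses a hypothetical name (`run_async`) and is not code from the commit, just the calls app.py already makes, factored out:

    import asyncio
    import nest_asyncio

    def run_async(coro):
        """Run a coroutine to completion, tolerating an already-running loop
        (the situation Streamlit creates) by re-applying nest_asyncio."""
        loop = asyncio.get_event_loop_policy().get_event_loop()
        if loop.is_running():
            nest_asyncio.apply()
            loop = asyncio.get_event_loop_policy().get_event_loop()
        return loop.run_until_complete(coro)

    # Usage, e.g.: validation_results = run_async(asyncio.gather(*tasks, return_exceptions=True))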
file_processor.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import openai
3
+ import json
4
+ import uuid
5
+ import re
6
+ import asyncio
7
+ import time
8
+ import argparse
9
+ from typing import List, Dict, Optional, Tuple
10
+ from dotenv import load_dotenv
11
+
12
+ # --- Required Libraries ---
13
+ try:
14
+ from docx import Document
15
+ except ImportError:
16
+ print("Requirement Missing: Please install 'python-docx' (`pip install python-docx`)")
17
+ exit()
18
+ # PDF library (PyPDF2) import removed
19
+ try:
20
+ from langdetect import detect, DetectorFactory, LangDetectException
21
+ DetectorFactory.seed = 0
22
+ except ImportError:
23
+ print("Requirement Missing: Please install 'langdetect' (`pip install langdetect`)")
24
+ exit()
25
+
26
+ # --- Configuration ---
27
+ load_dotenv()
28
+ API_KEY = os.environ.get("OPENAI_API_KEY")
29
+ if not API_KEY:
30
+ print("🛑 ERROR: OpenAI API key not found. Set OPENAI_API_KEY in your .env file.")
31
+ exit()
32
+
33
+ OUTPUT_DIR = "data"
34
+ TRANSLATION_MODEL = "gpt-4o-mini"
35
+ MAX_CONCURRENT_TRANSLATIONS = 10
36
+ TARGET_LANGUAGE = "en"
37
+
38
+ # --- Chunking Configuration ---
39
+ PARAGRAPH_CHUNK_THRESHOLD = 2000 # Characters
40
+ CHUNK_SIZE = 800 # Characters
41
+ CHUNK_OVERLAP = 100 # Characters
42
+
43
+ # Validate chunking config
44
+ if CHUNK_OVERLAP >= CHUNK_SIZE:
45
+ print(f"🛑 ERROR: CHUNK_OVERLAP ({CHUNK_OVERLAP}) must be less than CHUNK_SIZE ({CHUNK_SIZE}).")
46
+ exit()
47
+
48
+ # --- Setup OpenAI Client ---
49
+ try:
50
+ client = openai.AsyncOpenAI(api_key=API_KEY)
51
+ print("✅ OpenAI Async Client Initialized.")
52
+ except Exception as e:
53
+ print(f"🛑 ERROR: Failed to initialize OpenAI client: {e}")
54
+ exit()
55
+
56
+ # --- Text Extraction Functions ---
57
+
58
+ def extract_text_from_docx(file_path: str) -> Optional[str]:
59
+ """Extracts all text from a DOCX file."""
60
+ try:
61
+ doc = Document(file_path)
62
+ full_text = [para.text for para in doc.paragraphs if para.text.strip()]
63
+ print(f" 📄 Extracted {len(full_text)} paragraphs from DOCX: {os.path.basename(file_path)}")
64
+ return "\n\n".join(full_text) # Use double newline join as a base
65
+ except Exception as e:
66
+ print(f" ❌ ERROR reading DOCX file '{os.path.basename(file_path)}': {e}")
67
+ return None
68
+
69
+ # --- PDF Extraction Function Removed ---
70
+
71
+ def extract_text_from_txt(file_path: str) -> Optional[str]:
72
+ """Reads text from a TXT file."""
73
+ try:
74
+ with open(file_path, 'r', encoding='utf-8') as file:
75
+ text = file.read()
76
+ print(f" 📄 Read TXT file: {os.path.basename(file_path)} (length: {len(text)} chars)")
77
+ return text
78
+ except Exception as e:
79
+ print(f" ❌ ERROR reading TXT file '{os.path.basename(file_path)}': {e}")
80
+ return None
81
+
82
+ # --- Text Processing Functions (segment, chunk, detect, translate - No changes needed here) ---
83
+
84
+ def _chunk_text(text: str, size: int, overlap: int) -> List[str]:
85
+ """Helper function to chunk a single block of text."""
86
+ # (Implementation remains the same as previous version)
87
+ if not text: return []
88
+ chunks = []
89
+ start_index = 0
90
+ text_len = len(text)
91
+ while start_index < text_len:
92
+ end_index = start_index + size
93
+ chunk = text[start_index:end_index]
94
+ chunks.append(chunk.strip())
95
+ next_start = start_index + size - overlap
96
+ if next_start <= start_index: next_start = start_index + 1
97
+ start_index = next_start
98
+ if start_index >= text_len: break
99
+ return [c for c in chunks if c]
100
+
101
+ def segment_into_paragraphs_or_chunks(text: str) -> List[str]:
102
+ """
103
+ Segments text into paragraphs based on newlines.
104
+ If a resulting paragraph exceeds PARAGRAPH_CHUNK_THRESHOLD,
105
+ it chunks that specific paragraph instead.
106
+ """
107
+ # (Implementation remains the same as previous version)
108
+ if not text: return []
109
+ normalized_text = text.replace('\r\n', '\n').replace('\r', '\n')
110
+ initial_segments = re.split(r'\n\s*\n+', normalized_text)
111
+ initial_segments = [s.strip() for s in initial_segments if s.strip()]
112
+ if len(initial_segments) <= 1 and '\n' in normalized_text:
113
+ print(" Parsing: Double newline split yielded few segments, trying single newline split.")
114
+ initial_segments = [s.strip() for s in normalized_text.split('\n') if s.strip()]
115
+ if not initial_segments:
116
+ print(" Parsing: No segments found after initial splitting.")
117
+ return []
118
+ print(f" Parsing: Initial segmentation yielded {len(initial_segments)} segments.")
119
+ final_segments = []
120
+ long_segment_count = 0
121
+ for segment in initial_segments:
122
+ if len(segment) > PARAGRAPH_CHUNK_THRESHOLD:
123
+ long_segment_count += 1
124
+ print(f" ❗ Segment ({len(segment)} chars > {PARAGRAPH_CHUNK_THRESHOLD}) is too long. Applying chunking (Size: {CHUNK_SIZE}, Overlap: {CHUNK_OVERLAP})...")
125
+ chunks = _chunk_text(segment, CHUNK_SIZE, CHUNK_OVERLAP)
126
+ print(f" -> Chunked into {len(chunks)} pieces.")
127
+ final_segments.extend(chunks)
128
+ elif segment:
129
+ final_segments.append(segment)
130
+ if long_segment_count > 0:
131
+ print(f" Parsing: Chunking applied to {long_segment_count} long segments.")
132
+ print(f" 🔪 Final segmentation/chunking resulted in {len(final_segments)} pieces.")
133
+ return final_segments
134
+
135
+ def detect_language_safe(text: str, default_lang: str = "unknown") -> str:
136
+ """Detects language, handling short text and errors."""
137
+ # (Implementation remains the same as previous version)
138
+ clean_text = text.strip()
139
+ if not clean_text or len(clean_text) < 10: return default_lang
140
+ try: return detect(clean_text)
141
+ except LangDetectException: return default_lang
142
+ except Exception as e:
143
+ print(f" ❌ Unexpected error during language detection: {e}")
144
+ return "error"
145
+
146
+ async def translate_paragraph(text: str, target_lang: str, semaphore: asyncio.Semaphore) -> Tuple[str, Optional[str]]:
147
+ """Translates a single paragraph/chunk using OpenAI, with rate limiting."""
148
+ # (Implementation remains the same as previous version)
149
+ async with semaphore:
150
+ detected_lang = detect_language_safe(text)
151
+ if detected_lang != 'he': return text, None
152
+ print(f" 🌍 Translating Hebrew segment to {target_lang.upper()}: '{text[:60]}...'")
153
+ prompt = f"Translate the following Hebrew text accurately to {target_lang}. Provide only the translation, without any introductory phrases.\nHebrew Text:\n```heb\n{text}\n```\nTranslation:"
154
+ retries = 1
155
+ for attempt in range(retries + 1):
156
+ try:
157
+ response = await client.chat.completions.create(
158
+ model=TRANSLATION_MODEL, messages=[ {"role": "system", "content": f"You are an expert translator specializing in Hebrew to {target_lang} translation. Provide only the translated text."}, {"role": "user", "content": prompt} ],
159
+ max_tokens=int(len(text.split()) * 2.5) + 50, temperature=0.1, n=1, stop=None, )
160
+ translation = response.choices[0].message.content.strip()
161
+ if translation:
162
+ if translation.strip() == text.strip():
163
+ print(f" ⚠️ Translation attempt returned original text for: '{text[:60]}...'")
164
+ return text, "Translation Failed: Model returned original text"
165
+ return text, translation
166
+ else:
167
+ print(f" ❌ Translation attempt returned empty response for: '{text[:60]}...'")
168
+ if attempt == retries: return text, "Translation Failed: Empty Response"
169
+ except openai.RateLimitError as e:
170
+ wait_time = 5 * (attempt + 1)
171
+ print(f" ⏳ Rate limit hit during translation. Waiting {wait_time}s... ({e})")
172
+ await asyncio.sleep(wait_time)
173
+ if attempt == retries: return text, "Translation Failed: Rate Limited"
174
+ except openai.APIError as e:
175
+ print(f" ❌ OpenAI API Error during translation: {e}")
176
+ wait_time = 3 * (attempt + 1); await asyncio.sleep(wait_time)
177
+ if attempt == retries: return text, f"Translation Failed: API Error ({e.code})"
178
+ except Exception as e:
179
+ print(f" ❌ Unexpected error during translation: {e}")
180
+ if attempt == retries: return text, f"Translation Failed: Unexpected Error ({type(e).__name__})"
181
+ if attempt < retries: await asyncio.sleep(2 * (attempt + 1))
182
+ return text, "Translation Failed: Max Retries"
+
+
+ # --- Main Processing Function ---
+
+ async def process_file(input_path: str, output_dir: str):
+     """Processes a single DOCX or TXT file: extracts, segments/chunks, translates, saves JSON."""
+     print(f"\n--- Processing file: {os.path.basename(input_path)} ---")
+     start_time = time.time()
+     file_ext = os.path.splitext(input_path)[1].lower()
+     extracted_text: Optional[str] = None
+
+     # 1. Extract Text (Only DOCX and TXT)
+     if file_ext == ".docx":
+         extracted_text = extract_text_from_docx(input_path)
+     elif file_ext == ".txt":
+         extracted_text = extract_text_from_txt(input_path)
+     else:
+         # This case should ideally not be hit if input is pre-filtered, but acts as a safeguard
+         print(f" ⚠️ Internal Skip: Unsupported extension '{file_ext}' passed to process_file.")
+         return
+
+     if not extracted_text or not extracted_text.strip():
+         print(" ❌ Text extraction failed or returned empty. Skipping.")
+         return
+
+     # 2. Segment into Paragraphs or Chunks
+     segments = segment_into_paragraphs_or_chunks(extracted_text)
+     if not segments:
+         print(" ❌ No paragraphs or chunks found after segmentation. Skipping.")
+         return
+
+     # 3. Translate Hebrew Segments (Asynchronously)
+     output_data = []
+     translation_semaphore = asyncio.Semaphore(MAX_CONCURRENT_TRANSLATIONS)
+     tasks = []
+     print(f" 🗣️ Preparing to translate {len(segments)} segments (max concurrent: {MAX_CONCURRENT_TRANSLATIONS})...")
+
+     for i, seg_text in enumerate(segments):
+         task = asyncio.create_task(translate_paragraph(seg_text, TARGET_LANGUAGE, translation_semaphore))
+         tasks.append(task)
+
+     translation_results = await asyncio.gather(*tasks)
+
+     # 4. Format into JSON Structure
+     print(" 📝 Formatting results into JSON...")
+     translation_failures = 0
+     for i, (original_he, translation_en) in enumerate(translation_results):
+         failure_msg = "Translation Failed"
+         is_failure = isinstance(translation_en, str) and failure_msg in translation_en
+         if is_failure:
+             translation_failures += 1
+             english_text = translation_en  # Store the error message
+         else:
+             english_text = translation_en if translation_en else ""
+         output_data.append({ "id": str(uuid.uuid4()), "hebrew": original_he, "english": english_text })
+
+     if translation_failures > 0:
+         print(f" ⚠️ Encountered {translation_failures} translation failures out of {len(segments)} segments.")
+
+     # 5. Save to JSON File
+     base_filename = os.path.splitext(os.path.basename(input_path))[0]
+     output_filename = f"{base_filename}.json"
+     output_path = os.path.join(output_dir, output_filename)
+
+     try:
+         os.makedirs(output_dir, exist_ok=True)
+         with open(output_path, 'w', encoding='utf-8') as f:
+             json.dump(output_data, f, ensure_ascii=False, indent=2)
+         end_time = time.time()
+         print(f"✅ Successfully saved {len(output_data)} segments to: {output_path}")
+         print(f"⏱️ File processing time: {end_time - start_time:.2f} seconds")
+     except Exception as e:
+         print(f" ❌ ERROR saving JSON file '{output_path}': {e}")
+
+
+ # --- Script Execution ---
+
+ if __name__ == "__main__":
+     # Update description to remove PDF mention
+     parser = argparse.ArgumentParser(description="Process DOCX and TXT files into paragraph/chunk-based JSON with Hebrew-to-English translation.")
+     parser.add_argument("input_paths", nargs='+', help="Path(s) to input file(s) or directory(ies) containing DOCX/TXT files.")
+     parser.add_argument("-o", "--output_dir", default=OUTPUT_DIR, help=f"Directory to save output JSON files (default: '{OUTPUT_DIR}')")
+     parser.add_argument("--chunk_threshold", type=int, default=PARAGRAPH_CHUNK_THRESHOLD, help="Max chars per paragraph before chunking.")
+     parser.add_argument("--chunk_size", type=int, default=CHUNK_SIZE, help="Target chunk size in chars.")
+     parser.add_argument("--chunk_overlap", type=int, default=CHUNK_OVERLAP, help="Chunk overlap in chars.")
+
+     args = parser.parse_args()
+     OUTPUT_DIR = args.output_dir
+     PARAGRAPH_CHUNK_THRESHOLD = args.chunk_threshold
+     CHUNK_SIZE = args.chunk_size
+     CHUNK_OVERLAP = args.chunk_overlap
+
+     if CHUNK_OVERLAP >= CHUNK_SIZE:
+         print(f"🛑 ERROR: Chunk overlap ({CHUNK_OVERLAP}) must be less than chunk size ({CHUNK_SIZE}). Adjust --chunk_overlap or --chunk_size.")
+         exit()
+
+     print("🚀 Starting File Processor (DOCX & TXT only)...")  # Updated startup message
+     print(f"📂 Output Directory: {os.path.abspath(OUTPUT_DIR)}")
+     print(f"🔪 Paragraph/Chunking Settings: Threshold={PARAGRAPH_CHUNK_THRESHOLD}, Size={CHUNK_SIZE}, Overlap={CHUNK_OVERLAP}")
+
+     files_to_process = []
+     for path in args.input_paths:
+         if os.path.isfile(path):
+             files_to_process.append(path)
+         elif os.path.isdir(path):
+             print(f"📁 Scanning directory: {path}")
+             for filename in os.listdir(path):
+                 full_path = os.path.join(path, filename)
+                 if os.path.isfile(full_path):
+                     files_to_process.append(full_path)
+         else:
+             print(f"⚠️ Warning: Input path not found or not a file/directory: {path}")
+
+     # Update supported extensions list
+     supported_extensions = ('.docx', '.txt')
+     valid_files = [f for f in files_to_process if f.lower().endswith(supported_extensions)]
+
+     if not valid_files:
+         # Update message for no supported files found
+         print(f"\n🛑 No supported files ({', '.join(supported_extensions)}) found in the specified paths. Exiting.")
+     else:
+         print(f"\nFound {len(valid_files)} supported files to process:")
+         for f in valid_files:
+             print(f" - {os.path.basename(f)}")
+
+         async def main():
+             process_tasks = [process_file(f, OUTPUT_DIR) for f in valid_files]
+             await asyncio.gather(*process_tasks)
+
+         script_start_time = time.time()
+         asyncio.run(main())
+         script_end_time = time.time()
+         print(f"\n🏁 File processing complete. Total script time: {script_end_time - script_start_time:.2f} seconds.")
generated-icon.png ADDED
generation_service_anthropic.py ADDED
@@ -0,0 +1,103 @@
+ # generation_service_anthropic.py
+ # For LangSmith tracing; NO Braintrust; clean for OpenAI/Anthropic API
+
+ import os
+ import anthropic
+ import re
+ import traceback
+ from typing import List, Dict, AsyncGenerator
+ from langsmith import traceable
+
+ # --- Environment: ensure API keys are injected (from Replit secrets) ---
+ # The self-assignments below are no-ops that raise KeyError at import time
+ # if a required secret is missing (fail-fast).
+ os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
+ os.environ["LANGSMITH_TRACING"] = "true"
+ os.environ["ANTHROPIC_API_KEY"] = os.environ["ANTHROPIC_API_KEY"]
+ os.environ["LANGSMITH_API_KEY"] = os.environ["LANGSMITH_API_KEY"]
+ os.environ["LANGSMITH_PROJECT"] = os.environ["LANGSMITH_PROJECT"]
+
+ # --- Anthropic config and client ---
+ ANTHROPIC_API_KEY = os.environ["ANTHROPIC_API_KEY"]
+ GENERATION_MODEL = "claude-3-7-sonnet-20250219"
+ client = anthropic.AsyncAnthropic(api_key=ANTHROPIC_API_KEY) if ANTHROPIC_API_KEY else None
+
+ def check_generator_status():
+     if not client:
+         return False, "Anthropic client not initialized."
+     return True, f"Anthropic generation service ready (Model: {GENERATION_MODEL})."
+
+ def clean_source_text(text):
+     """Strips @page markers, <HAL> tags, <br> tags, and collapses whitespace."""
+     if not text:
+         return ""
+     cleaned = re.sub(r'@\d+', '', text)
+     cleaned = re.sub(r'<HAL>', '', cleaned, flags=re.IGNORECASE)
+     cleaned = cleaned.replace('<br>', ' ').replace('<br />', ' ')
+     cleaned = re.sub(r'\s+', ' ', cleaned).strip()
+     return cleaned
+
+ def format_context_for_prompt(documents):
+     if not documents:
+         return ""
+     formatted_docs = []
+     language_key = 'hebrew_text'
+     id_key = 'original_id'
+     for index, doc in enumerate(documents):
+         full_text_original = doc.get(language_key, '')
+         doc_id = doc.get(id_key, f'unknown_{index+1}')
+         full_text_cleaned = clean_source_text(full_text_original)
+         if full_text_cleaned:
+             formatted_docs.append(f"<source index=\"{index + 1}\" id=\"{doc_id}\">\n<full_text>{full_text_cleaned}</full_text>\n</source>")
+     return "\n\n".join(formatted_docs)
+
+ # Hebrew example answer demonstrating the required <quote source_index="..."> citation style.
+ EXAMPLE_RESPONSE_HEBREW = """<example response hebrew>
+ על פי המקורות שהובאו, חשיבות השמחה בעבודת ה' היא מרכזית. נאמר כי <quote source_index="1">עיקר עבודת ה' היא בשמחה</quote>, כפי הפסוק <quote source_index="1">'עבדו את ה' בשמחה'</quote>. הסיבה לכך היא <quote source_index="1">כי השמחה פותחת הלב ומאירה הנשמה, ומביאה לידי דביקות בהשי"ת</quote>. לעומת זאת, מצב של עצבות גורם לתוצאה הפוכה, שכן <quote source_index="1">על ידי העצבות ח"ו נסתם הלב ואינו יכול לקבל אור הקדושה</quote>. מקור נוסף מדגיש כי השמחה היא תנאי לקבלת רוח הקודש והשראת השכינה, כפי שנאמר <quote source_index="2">שאין השכינה שורה אלא מתוך שמחה של מצוה</quote>, וכן <quote source_index="2">שלא שרתה עליו שכינה מפני שהיה עצב</quote>, כפי שלמדו מיעקב אבינו.
+ </example response hebrew>"""
+
+ @traceable
+ async def generate_response_stream_async(
+     messages: List[Dict],
+     context_documents: List[Dict],
+ ) -> AsyncGenerator:
+     """
+     Generates a response using Anthropic, yields text chunks.
+     Traced with LangSmith.
+     """
+     global client
+     ready, msg = check_generator_status()
+     if not ready or client is None:
+         yield f"--- שגיאה: {msg} ---"  # Hebrew: "Error: {msg}"
+         return
+
+     last_user_msg_content = "שאלה לא נמצאה"  # Hebrew: "question not found"
+     for msg_ in reversed(messages):
+         if msg_.get("role") == "user":
+             last_user_msg_content = str(msg_.get("content", ""))
+             break
+
+     try:
+         formatted_context = format_context_for_prompt(context_documents)
+         has_context = bool(formatted_context)
+         if not has_context and context_documents:
+             yield "--- שגיאה: המקורות שסופקו ריקים לאחר ניקוי. ---"  # "the supplied sources are empty after cleaning"
+             return
+         elif not has_context and not context_documents:
+             yield "--- שגיאה: לא סופקו מקורות להקשר. ---"  # "no sources were supplied for context"
+             return
+     except Exception as format_err:
+         yield f"--- שגיאה בעיצוב ההקשר: {format_err} ---"  # "error formatting the context"
+         return
+
+     # System prompt as before
+     system_prompt = f"""<instructions>
+ You are an expert assistant specializing in Chassidic texts...
+ **Response Requirements:**
+ (Keep all instructions as before)
+ </instructions>
+
+ {EXAMPLE_RESPONSE_HEBREW}"""
+
+     api_messages = []
+     user_prompt_content = f"<context>\n{formatted_context}\n</context>\n\nBased *exclusively* on the source text provided... Question (Hebrew):\n{last_user_msg_content}"
+     api_messages.append({"role": "user", "content": user_prompt_content})
+
+     print(f" -> Sending request to Anthropic (Model: {GENERATION_MODEL})...")
+     final_response_text_chunks = []
+
+     try:
+         async with client.messages.stream(
+             model=GENERATION_MODEL, max_tokens=20000, system=system_prompt,
+             messages=api_messages, temperature=1.0,
+             thinking={"type": "enabled", "budget_tokens": 16000}
+         ) as stream:
+             print(" -> Anthropic stream created successfully...")
+             async for chunk in stream.text_stream:
+                 if chunk and chunk.strip():
+                     final_response_text_chunks.append(chunk)
+                     yield chunk
+
+     except Exception as e:
+         yield f"\n\n--- שגיאה: {type(e).__name__} - {e} ---"  # "Error: ..."
+         traceback.print_exc()
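+ # Usage sketch (illustrative only; the document/message dicts are assumptions
+ # based on the keys this module reads — 'hebrew_text', 'original_id', role/content):
+ #
+ # import asyncio
+ #
+ # async def _demo():
+ #     docs = [{"original_id": "abc-123", "hebrew_text": "טקסט מקור לדוגמה"}]
+ #     msgs = [{"role": "user", "content": "מה מבואר במקור?"}]
+ #     async for piece in generate_response_stream_async(msgs, docs):
+ #         print(piece, end="", flush=True)
+ #
+ # asyncio.run(_demo())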
generation_service_gemini.py ADDED
@@ -0,0 +1,175 @@
+ # generation_service_gemini.py
+
+ import google.generativeai as genai
+ import os
+ import re
+ import traceback
+ from typing import List, Dict, Generator  # Use standard Generator
+
+ # --- Attempt to Import Shared Functions ---
+ try:
+     from generation_service_anthropic import clean_source_text
+     print("Successfully imported clean_source_text from generation_service_anthropic.")
+ except ImportError:
+     print("Warning: Could not import clean_source_text. Using fallback cleaner.")
+
+     def clean_source_text(text):  # Fallback
+         if not text:
+             return ""
+         cleaned = re.sub(r'@\d+', '', text)
+         cleaned = re.sub(r'<HAL>', '', cleaned, flags=re.IGNORECASE)
+         cleaned = cleaned.replace('<br>', ' ').replace('<br />', ' ')
+         cleaned = re.sub(r'\s+', ' ', cleaned).strip()
+         return cleaned
+ # --- End Fallback Definitions ---
+
+ # --- Configuration ---
+ GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
+ GENERATION_MODEL = "gemini-2.5-pro-preview-03-25"  # Your model
+ # --- End Configuration ---
+
+ # --- Client Initialization ---
+ genai_client = None
+ if GOOGLE_API_KEY:
+     try:
+         genai.configure(api_key=GOOGLE_API_KEY)
+         genai_client = genai.GenerativeModel(GENERATION_MODEL)
+         print(f"Google AI client initialized for Gemini generation (Model: {GENERATION_MODEL}).")
+     except Exception as e:
+         print(f"Error initializing Google AI client: {e}")
+         traceback.print_exc()
+ else:
+     print("GOOGLE_API_KEY not found. Gemini generation service will not function.")
+
+ def check_gemini_generator_status():
+     if not genai_client:
+         return False, "Gemini generator client not initialized."
+     return True, f"Gemini generation service ready (Model: {GENERATION_MODEL})."
+
+ # --- MODIFIED format_context_for_prompt (Keep ID attribute) ---
+ def format_context_for_prompt(documents: List[Dict]) -> str:
+     if not documents:
+         return ""
+     formatted_docs = []
+     language_key = 'hebrew_text'
+     id_key = 'original_id'
+     for index, doc in enumerate(documents):
+         full_text_original = doc.get(language_key, '')
+         paragraph_id = doc.get(id_key, f'unknown_id_{index+1}')
+         try:
+             full_text_cleaned = clean_source_text(full_text_original)
+         except NameError:
+             full_text_cleaned = full_text_original  # Fallback
+         if full_text_cleaned:
+             formatted_docs.append(
+                 f'<source index="{index + 1}" id="{paragraph_id}">\n'  # Keep id attribute
+                 f'<full_text>{full_text_cleaned}</full_text>\n'
+                 f'</source>'
+             )
+     return "\n\n".join(formatted_docs)
+ # --- END MODIFIED format_context_for_prompt ---
+
+
+ # --- *** NEW SIMPLIFIED EXAMPLE_RESPONSE_HEBREW *** ---
+ # Hebrew example answer demonstrating the required "(ID: ...)" citation format.
+ EXAMPLE_RESPONSE_HEBREW = """<example response hebrew>
+ במקורות שהובאו מצאנו כמה נקודות בנוגע לרדיפת פרעה אחר בני ישראל. ראשית, היה זה רצון השי"ת להביא את המצריים לים סוף "כדי שיטבעו" (ID: cc792519-8a96-4c2e-96a7-e940a3d6688f) ויתפרסם כבודו יתברך בעולם. הקב"ה סיבב זאת על ידי שהטעה את פרעה לחשוב שבני ישראל "נבוכים הם בארץ סגר עליהם המדבר" (ID: 2e0227b5-f359-4a60-ab51-2ba9f6c3fca5), מה שעורר אותו לרדוף אחריהם.
+
+ עוד מבואר כי נס קריעת ים סוף נועד להורות "הוראה מפורסמת היות בו יתברך פעולת ההפכים" (ID: cde20ae5-0374-4023-9f15-e721b4920db8), דהיינו שבאותו רגע שהיטיב לישראל וקרע לפניהם את הים, הוא הרע למצרים והטביעם בתוכו.
+
+ בנוגע לשאלה מדוע הים לא נקרע מיד, מובא שהיו טענות שונות, כגון שעדיין לא הושלם זמן הגלות של ת' שנה, וכן טענת המקטרג ש"הללו עובדי עבודה זרה והללו עובדי עבודה זרה" (ID: [Could be a different ID if cited]). טענות אלו נדחו, בין היתר, משום שהשעבוד הקשה השלים את הזמן, וכן מפני שעבודתם של ישראל היתה "באונס ושוגג" (ID: [Could be a different ID if cited]).
+ </example response hebrew>"""
+ # --- *** END NEW SIMPLIFIED EXAMPLE_RESPONSE_HEBREW *** ---
+
+ # --- Synchronous Generation Function ---
+ def generate_response_stream_gemini(
+     query: str,
+     context_documents: List[Dict]
+ ) -> Generator[str, None, None]:
+     global genai_client
+     ready, msg = check_gemini_generator_status()
+     if not ready or genai_client is None:
+         yield "שגיאה: ..."  # Hebrew: "Error: ..." (message elided in source)
+         return
+     if not query:
+         yield "שגיאה: ..."
+         return
+
+     try:
+         formatted_context = format_context_for_prompt(context_documents)
+     except Exception as format_err:
+         yield f"שגיאה ...: {format_err}"
+         return
+
+     has_context = bool(formatted_context)
+     if not has_context:
+         yield "לא סופקו מקורות לעיון."  # Hebrew: "no sources were supplied for review"
+         return
+
+     # --- *** REVISED System Instruction Content for Simple Output *** ---
+     system_instruction_content = f"""<instructions>
+ You are a highly knowledgeable assistant acting as a learned scholar specializing in Chassidic texts, particularly Divrei Yoel. Your function is to answer the user's Hebrew question based *strictly and exclusively* on the provided Hebrew source text passages found in the <context> section.
+
+ **Response Requirements:**
+
+ 1. **Language:** Respond ONLY in formal, traditional Rabbinic/Torah Hebrew (עברית תורנית, לשון הקודש). ABSOLUTELY NO MODERN HEBREW. Use only Hebrew letters and standard punctuation.
+ 2. **Content:** Base your answer *solely* on information present in the `<source>` passages provided in the context. Do not add external knowledge or opinions.
+ 3. **Structure:** Write a clear, coherent answer to the user's question.
+ 4. **Citations:** When directly quoting or closely paraphrasing a specific point from a source to support your answer, incorporate a **short, relevant snippet** of the source text directly into your sentence. Immediately following the snippet or the sentence containing it, you MUST add the paragraph ID in the format `(ID: <id_value>)`. Extract the `<id_value>` from the `id` attribute of the corresponding `<source>` tag in the context.
+ 5. **Conciseness:** Keep quoted snippets brief and directly relevant to the point you are making.
+ 6. **Irrelevant Sources:** If the provided sources do not contain information to answer the question, state this clearly (e.g., "על פי המקורות שסופקו, לא נמצאה תשובה לשאלה זו."). Do not invent answers.
+ 7. **Format:** Output *only* the final Hebrew answer with embedded citations as described. Do not include greetings, apologies, the original question, or any meta-commentary about the process.
+ 8. **Example Adherence:** Follow the style, language, and citation format shown in the example response below.
+
+ </instructions>
+
+ {EXAMPLE_RESPONSE_HEBREW}"""  # Use the NEW simplified example
+     # --- *** END REVISED System Instruction Content *** ---
+
+     # --- Prepare User Prompt Content ---
+     user_prompt_content = f"<context>\n{formatted_context}\n</context>\n\nBased *exclusively* on the source text provided in the context above, please answer the following question according to the detailed instructions:\n\nQuestion:\n{query}"
+
+     print(f" -> Sending request to Gemini (Model: {GENERATION_MODEL})...")
+     print(f" -> Context size: ~{len(formatted_context)} characters")
+     print(f" -> System Instruction Length: ~{len(system_instruction_content)} characters")
+     print(f" -> User Prompt Length: ~{len(user_prompt_content)} characters")
+     print(f" -> Query: '{query[:50]}...'")
+
+     # --- API Call Block (Compatible with v0.8.5) ---
+     try:
+         generation_config = genai.types.GenerationConfig(temperature=0.2, max_output_tokens=8192)  # Keep large output for now
+         safety_settings = [  # Keep safety settings
+             {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
+             {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
+             {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
+         ]
+         contents_for_api = [system_instruction_content, user_prompt_content]  # Instructions first
+
+         response_stream = genai_client.generate_content(
+             contents=contents_for_api,  # Pass combined list
+             generation_config=generation_config,
+             safety_settings=safety_settings,
+             stream=True
+         )
+
+         print(" -> Gemini SYNC stream iterator created successfully...")
+         chunk_count = 0
+         # --- Stream Handling Loop ---
+         for chunk in response_stream:
+             try:
+                 # A blocked prompt surfaces via prompt_feedback.block_reason, and
+                 # chunk.text raises ValueError on a chunk with no text parts, so
+                 # check the block reason before touching the text.
+                 feedback = getattr(chunk, "prompt_feedback", None)
+                 block_reason = getattr(feedback, "block_reason", None)
+                 if block_reason:
+                     print(f" -> Gemini BLOCKED chunk: {block_reason}")
+                     yield f"שגיאה: {block_reason}"  # Hebrew: "Error: ..."
+                     break  # Stop processing if blocked
+                 chunk_text = chunk.text
+                 if chunk_text:
+                     chunk_count += 1
+                     yield chunk_text  # Yield to app.py
+             except (AttributeError, ValueError) as chunk_err:
+                 print(f" -> Gemini chunk error: {chunk_err}")
+                 yield f"שגיאה: {chunk_err}"
+                 break
+
+         # --- Final Check if No Chunks Yielded ---
+         if chunk_count == 0:
+             yield "(לא התקבלה תשובה טקסטואלית מ-Gemini)"  # "(no textual answer was received from Gemini)"
+
+     # --- General Error Handling ---
+     except Exception as e:
+         error_type = type(e).__name__
+         error_msg = str(e)
+         print(f" <- Error during Gemini SYNC stream ({error_type}): {error_msg}")
+         traceback.print_exc()
+         yield f"\n\n--- שגיאה ביצירת התשובה מ-Gemini ({error_type}): {error_msg} ---"  # "error generating the answer from Gemini"
+     # --- END API Call Block ---
+
+
+ # --- Test function (Synchronous) ---
+ # Commenting out as manual check is better now
+ # def run_gemini_generation_test_sync():
+ #     print("\n--- Running Gemini SYNC Generation Test (Manual Check Needed) ---")
+ #     ...
+
+ # --- Main Execution Block ---
+ if __name__ == "__main__":
+     pass  # Or run test if adapted
+     # if GOOGLE_API_KEY: run_gemini_generation_test_sync()
+     # else: print("\nError: GOOGLE_API_KEY environment variable not set.")
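+ # Usage sketch (illustrative; the document dict mirrors the keys read above,
+ # and the query string is hypothetical):
+ #
+ # docs = [{"original_id": "abc-123", "hebrew_text": "טקסט מקור לדוגמה"}]
+ # for piece in generate_response_stream_gemini("מה מבואר במקור?", docs):
+ #     print(piece, end="", flush=True)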
ingestion_service.py ADDED
@@ -0,0 +1,367 @@
+ # ingestion_service.py
+
+ import os
+ import json
+ import openai
+ import pinecone
+ from pinecone import ServerlessSpec, PodSpec  # Import spec classes
+ from typing import List, Dict, Optional
+ import time
+ import traceback
+ import urllib.parse  # Keep for potential future ID encoding if needed
+
+ # --- Configuration ---
+ OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+ PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
+ # PINECONE_ENVIRONMENT is deprecated for serverless/starter; use cloud/region
+ PINECONE_CLOUD = os.environ.get("PINECONE_CLOUD", "aws")  # Default cloud
+ PINECONE_REGION = os.environ.get("PINECONE_REGION", "us-east-1")  # Default region
+ INDEX_NAME = "chassidus-index"  # Ensure this matches your index name
+ EMBEDDING_MODEL = "text-embedding-3-large"  # Ensure this matches your embedding model
+ EMBEDDING_DIMENSIONS = 3072  # Dimension for text-embedding-3-large
+
+ print(f"Using Pinecone Index: {INDEX_NAME}")
+ print(f"Using Pinecone Cloud: {PINECONE_CLOUD}")
+ print(f"Using Pinecone Region: {PINECONE_REGION}")
+ print(f"Using OpenAI Embedding Model: {EMBEDDING_MODEL} (Dimensions: {EMBEDDING_DIMENSIONS})")
+ # --- End Configuration ---
+
+
+ # --- Initialize OpenAI Client ---
+ openai_client = None
+ if OPENAI_API_KEY:
+     try:
+         openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)
+         print("OpenAI client initialized.")
+     except Exception as e:
+         print(f"Error initializing OpenAI client: {e}")
+         traceback.print_exc()
+ else:
+     print("ERROR: OPENAI_API_KEY not found. Ingestion requires it for embeddings.")
+
+
+ # --- Initialize Pinecone Client and Index ---
+ pc = None
+ index = None
+ if PINECONE_API_KEY and PINECONE_CLOUD and PINECONE_REGION:
+     try:
+         print("Initializing Pinecone client...")
+         pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)
+
+         # Check if index exists
+         if INDEX_NAME not in [idx.name for idx in pc.list_indexes().indexes]:
+             print(f"Index '{INDEX_NAME}' does not exist. Creating it now...")
+             # --- Create Index (Choose ONE spec type) ---
+
+             # Option A: Serverless (Recommended for new projects, pay-as-you-go)
+             try:
+                 pc.create_index(
+                     name=INDEX_NAME,
+                     dimension=EMBEDDING_DIMENSIONS,
+                     metric="cosine",  # or 'dotproduct', 'euclidean'
+                     spec=ServerlessSpec(
+                         cloud=PINECONE_CLOUD,
+                         region=PINECONE_REGION
+                     )
+                 )
+                 print(f"Serverless index '{INDEX_NAME}' created. Waiting for initialization...")
+                 while not pc.describe_index(INDEX_NAME).status['ready']:
+                     time.sleep(1)
+                 print("Index is ready.")
+             except Exception as create_err:
+                 print(f"Error creating Serverless index '{INDEX_NAME}': {create_err}")
+                 traceback.print_exc()
+                 # Fallback or specific error handling needed here
+
+             # Option B: Pod-based (Older style, requires specifying pod type/size)
+             # Uncomment below and comment out ServerlessSpec if you need Pod-based
+             # try:
+             #     # Example: Using a free tier pod (s1.x1) - adjust if needed
+             #     # Note: PINECONE_ENVIRONMENT might be needed for older pod-based index creation
+             #     pinecone_environment = os.environ.get("PINECONE_ENVIRONMENT")  # Get environment if needed for pod
+             #     if not pinecone_environment:
+             #         raise ValueError("PINECONE_ENVIRONMENT is required for pod-based index creation.")
+             #     pc.create_index(
+             #         name=INDEX_NAME,
+             #         dimension=EMBEDDING_DIMENSIONS,
+             #         metric="cosine",
+             #         spec=PodSpec(
+             #             environment=pinecone_environment,  # Use environment here
+             #             pod_type="p1.x1",  # Example pod type, check Pinecone docs
+             #             pods=1
+             #         )
+             #     )
+             #     print(f"Pod-based index '{INDEX_NAME}' created in environment '{pinecone_environment}'. Waiting...")
+             #     while not pc.describe_index(INDEX_NAME).status['ready']:
+             #         time.sleep(1)
+             #     print("Index is ready.")
+             # except Exception as create_err:
+             #     print(f"Error creating Pod-based index '{INDEX_NAME}': {create_err}")
+             #     traceback.print_exc()
+             #     # Fallback or specific error handling needed here
+
+         else:
+             print(f"Index '{INDEX_NAME}' already exists.")
+
+         # Connect to the index
+         print(f"Connecting to index '{INDEX_NAME}'...")
+         index = pc.Index(INDEX_NAME)
+         print("Connected to Pinecone index.")
+         stats = index.describe_index_stats()
+         print(f"Initial index stats: {stats}")
+
+     except Exception as e:
+         print(f"Error initializing Pinecone or connecting to index: {e}")
+         traceback.print_exc()
+ else:
+     print("ERROR: Pinecone API Key, Cloud, or Region not found. Cannot connect to Pinecone.")
+
+
+ # --- Helper Functions ---
+
+ def get_embedding(text: str, model=EMBEDDING_MODEL) -> Optional[List[float]]:
+     """Generate embedding for text using OpenAI API."""
+     if not openai_client:
+         print("Error: OpenAI client not initialized, cannot generate embedding.")
+         return None
+     try:
+         text = text.replace("\n", " ")  # OpenAI recommends replacing newlines
+         if not text.strip():  # Handle empty strings
+             print("Warning: Attempted to embed empty string.")
+             return None
+         response = openai_client.embeddings.create(input=[text], model=model)
+         return response.data[0].embedding
+     except openai.APIError as e:
+         print(f"OpenAI API Error getting embedding: {e}")
+     except Exception as e:
+         print(f"Error getting embedding for text snippet: '{text[:100]}...'")
+         traceback.print_exc()
+     return None
+
+ def process_json_file(file_path: str) -> List[Dict]:
+     """
+     Process a JSON file containing documents in the specified format.
+     Reads objects with "id", "hebrew", "english" keys.
+     """
+     documents = []
+     try:
+         with open(file_path, 'r', encoding='utf-8') as f:
+             data = json.load(f)
+         if not isinstance(data, list):
+             print(f"Warning: Expected a list of objects in JSON file '{file_path}', found {type(data)}. Skipping.")
+             return []
+
+         for i, item in enumerate(data):
+             if isinstance(item, dict):
+                 original_id = item.get("id")
+                 hebrew_text = item.get("hebrew")
+                 english_text = item.get("english")
+
+                 if not original_id:
+                     print(f"Warning: Missing 'id' in item {i} of file '{file_path}'. Skipping.")
+                     continue
+                 if not hebrew_text and not english_text:
+                     print(f"Warning: Missing both 'hebrew' and 'english' text in item {i} (ID: {original_id}) of file '{file_path}'. Skipping.")
+                     continue
+                 # Ensure texts are strings, default to empty if missing but not skipping
+                 hebrew_text = hebrew_text or ""
+                 english_text = english_text or ""
+
+                 doc = {
+                     "original_id": str(original_id),  # Ensure ID is string
+                     "hebrew_text": hebrew_text.strip(),
+                     "english_text": english_text.strip(),
+                     "source_name": os.path.basename(file_path)  # Add source filename
+                 }
+                 documents.append(doc)
+             else:
+                 print(f"Warning: Item {i} in file '{file_path}' is not a dictionary. Skipping.")
+
+     except json.JSONDecodeError as e:
+         print(f"Error decoding JSON from file '{file_path}': {e}")
+         return []
+     except Exception as e:
+         print(f"Error processing file '{file_path}': {e}")
+         traceback.print_exc()
+         return []
+
+     print(f"Processed {len(documents)} documents from '{file_path}'")
+     return documents
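+ # Input shape read by process_json_file (illustrative sample; the ID and texts
+ # are placeholders, matching the JSON emitted by the translation script above):
+ #
+ # [
+ #   {
+ #     "id": "123e4567-e89b-12d3-a456-426614174000",
+ #     "hebrew": "טקסט עברי מקורי...",
+ #     "english": "The English translation..."
+ #   }
+ # ]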
+
+ def upload_documents(documents: List[Dict], batch_size: int = 100) -> bool:
+     """
+     Embeds combined Hebrew+English text and uploads vectors and metadata to Pinecone.
+     Metadata includes separate hebrew_text and english_text.
+     """
+     if not index:
+         print("Error: Pinecone index not initialized. Cannot upload.")
+         return False
+     if not documents:
+         print("No documents provided to upload.")
+         return True  # Technically successful as there's nothing to do
+
+     total_uploaded = 0
+     try:
+         num_batches = (len(documents) + batch_size - 1) // batch_size
+         print(f"Preparing to upload {len(documents)} documents in {num_batches} batches of size {batch_size}...")
+
+         for i in range(0, len(documents), batch_size):
+             batch_start_time = time.time()
+             batch = documents[i : i + batch_size]
+             vectors_to_upload = []
+             ids_in_batch = set()
+
+             print(f"Processing batch {i//batch_size + 1}/{num_batches}...")
+
+             for doc in batch:
+                 original_id = doc["original_id"]
+                 if original_id in ids_in_batch:
+                     print(f"Warning: Duplicate ID '{original_id}' detected within the same batch. Skipping duplicate.")
+                     continue
+                 ids_in_batch.add(original_id)
+
+                 hebrew = doc["hebrew_text"]
+                 english = doc["english_text"]
+
+                 # --- Create combined text for embedding ---
+                 # Add separators to potentially help the model distinguish languages
+                 combined_text = f"Hebrew:\n{hebrew}\n\nEnglish:\n{english}"
+                 # Alternative: Just concatenate if separators don't help much
+                 # combined_text = hebrew + "\n\n" + english
+
+                 if not combined_text.strip():
+                     print(f"Warning: Skipping document ID '{original_id}' due to empty combined text.")
+                     continue
+
+                 # --- Get Embedding ---
+                 embedding = get_embedding(combined_text)
+                 if embedding is None:
+                     print(f"Warning: Failed to get embedding for document ID '{original_id}'. Skipping.")
+                     continue
+
+                 # --- Prepare Metadata ---
+                 # Ensure metadata values are strings or numbers, handle None/empty
+                 metadata_payload = {
+                     "hebrew_text": hebrew if hebrew else "N/A",
+                     "english_text": english if english else "N/A",
+                     "source_name": doc.get("source_name", "Unknown"),
+                     "original_id": original_id  # Store original ID in metadata too
+                 }
+                 # Optional: Clean metadata further if needed (e.g., truncate long texts)
+
+                 vectors_to_upload.append({
+                     "id": original_id,  # Use the original document ID as the Pinecone vector ID
+                     "values": embedding,
+                     "metadata": metadata_payload
+                 })
+
+             if not vectors_to_upload:
+                 print(f"Batch {i//batch_size + 1} resulted in no vectors to upload. Skipping API call.")
+                 continue
+
+             # --- Upsert to Pinecone ---
+             try:
+                 print(f"Upserting {len(vectors_to_upload)} vectors for batch {i//batch_size + 1}...")
+                 upsert_response = index.upsert(vectors=vectors_to_upload)
+                 print(f"  Upsert response: {upsert_response}")
+                 total_uploaded += upsert_response.upserted_count
+             except Exception as upsert_err:
+                 print(f"Error upserting batch {i//batch_size + 1}: {upsert_err}")
+                 traceback.print_exc()
+                 # Decide whether to continue with next batch or stop
+                 # return False  # Stop on first batch error
+
+             batch_time = time.time() - batch_start_time
+             print(f"Batch {i//batch_size + 1} processed in {batch_time:.2f} seconds.")
+             time.sleep(0.1)  # Small delay between batches
+
+         print(f"\nFinished uploading. Total vectors successfully upserted: {total_uploaded}")
+         # Verify with index stats
+         try:
+             final_stats = index.describe_index_stats()
+             print(f"Final index stats: {final_stats}")
+         except Exception as stats_err:
+             print(f"Could not fetch final index stats: {stats_err}")
+
+         return True
+
+     except Exception as e:
+         print(f"An unexpected error occurred during the upload process: {e}")
+         traceback.print_exc()
+         return False
+
+ def process_and_upload_file(file_path: str) -> bool:
+     """Main function to process a JSON file and upload its documents."""
+     if not os.path.exists(file_path):
+         print(f"Error: File not found at '{file_path}'")
+         return False
+
+     if not file_path.lower().endswith(".json"):
+         print(f"Error: This script currently only processes .json files. Found: '{file_path}'")
+         return False
+
+     if not openai_client or not index:
+         print("Error: OpenAI client or Pinecone index not initialized. Cannot proceed.")
+         return False
+
+     print(f"\n--- Starting processing for file: {file_path} ---")
+     start_time = time.time()
+
+     # 1. Process the JSON file
+     documents = process_json_file(file_path)
+     if not documents:
+         print(f"No valid documents found in '{file_path}'. Upload skipped.")
+         return False  # Or True if "empty file processed successfully" is the desired outcome
+
+     # 2. Upload the documents
+     success = upload_documents(documents)
+
+     end_time = time.time()
+     print(f"--- Finished processing file: {file_path} in {end_time - start_time:.2f} seconds ---")
+
+     if success:
+         print(f"Successfully processed and uploaded data from {file_path}")
+     else:
+         print(f"Failed to upload data from {file_path}")
+
+     return success
+
+ # --- Main Execution Block ---
+ if __name__ == "__main__":
+     # --- Configuration for script execution ---
+     # Set the directory containing your JSON files
+     data_directory = "data"  # CHANGE THIS to your data folder path
+     # ---
+
+     if not os.path.isdir(data_directory):
+         print(f"Error: Data directory '{data_directory}' not found.")
+         print("Please create the directory and place your JSON files inside, or update the 'data_directory' variable.")
+     else:
+         print(f"Looking for JSON files in directory: '{data_directory}'")
+         json_files = [f for f in os.listdir(data_directory) if f.lower().endswith(".json")]
+
+         if not json_files:
+             print(f"No .json files found in '{data_directory}'.")
+         else:
+             print(f"Found {len(json_files)} JSON files: {json_files}")
+             overall_success = True
+             for filename in json_files:
+                 file_path = os.path.join(data_directory, filename)
+                 success = process_and_upload_file(file_path)
+                 if not success:
+                     overall_success = False
+                     print(f"Processing failed for {filename}. Check logs above.")
+                     # Optional: stop processing remaining files on failure
+                     # break
+
+             if overall_success:
+                 print("\nAll files processed successfully.")
+             else:
+                 print("\nSome files encountered errors during processing.")
+
+     # Example for single file upload:
+     # file_to_upload = "path/to/your/single_file.json"
+     # if os.path.exists(file_to_upload):
+     #     process_and_upload_file(file_to_upload)
+     # else:
+     #     print(f"File {file_to_upload} not found")
main.py ADDED
@@ -0,0 +1,21 @@
+
+ # main.py
+ # Entry point for the Chat Chassidus RAG application
+
+ import os
+ import subprocess
+ import sys
+
+ # This file serves as an entry point to run the Streamlit app
+ # The actual application logic is in app.py
+
+ if __name__ == "__main__":
+     # Run the Streamlit app with specific port and address for Replit
+     # Disable WebSocket compression and CORS to prevent connection issues
+     os.environ["STREAMLIT_SERVER_PORT"] = "8501"  # Use port forwarded to 80
+     os.environ["STREAMLIT_SERVER_ADDRESS"] = "localhost"
+     os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"
+     os.environ["STREAMLIT_SERVER_ENABLE_WEBSOCKET_COMPRESSION"] = "false"
+     os.environ["STREAMLIT_SERVER_ENABLE_CORS"] = "false"
+     os.environ["STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION"] = "false"
+     subprocess.run([sys.executable, "-m", "streamlit", "run", "app.py"], check=True)
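+ # Equivalent direct invocation without this wrapper (illustrative):
+ #   streamlit run app.py --server.port 8501 --server.address localhost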
package-lock.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "name": "workspace",
+   "lockfileVersion": 3,
+   "requires": true,
+   "packages": {}
+ }
pyproject.toml ADDED
@@ -0,0 +1,16 @@
+ [project]
+ name = "python-template"
+ version = "0.1.0"
+ description = ""
+ # PEP 621 requires authors as inline tables, not plain strings
+ authors = [{ name = "Your Name", email = "[email protected]" }]
+ requires-python = ">=3.11"
+ dependencies = [
+     "anthropic>=0.49.0",
+     "google-generativeai>=0.8.5",
+     "nest-asyncio>=1.6.0",
+     "openai>=1.72.0",
+     "pinecone>=6.0.2",
+     "python-dotenv>=1.1.0",
+     "streamlit>=1.44.1",
+     "langsmith",
+ ]
requirements.txt ADDED
@@ -0,0 +1,10 @@
+
+ anthropic>=0.49.0
+ google-generativeai>=0.8.5
+ nest-asyncio>=1.6.0
+ openai>=1.72.0
+ pinecone>=6.0.2
+ python-dotenv>=1.1.0
+ streamlit>=1.44.1
+ langchain>=0.0.335
+ langsmith>=0.0.56
retriever_pinecone.py ADDED
@@ -0,0 +1,243 @@
+ # retriever_pinecone.py
+
+ import os
+ import time
+ import traceback
+ import urllib.parse  # Keep for potential future ID decoding if needed
+ from pinecone import Pinecone
+ import openai  # For generating query embeddings
+ from typing import List, Dict  # <<< --- ADD THIS IMPORT ---
+
+ # --- Configuration ---
+ OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+ PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
+ # PINECONE_ENVIRONMENT is deprecated for serverless/starter; use index host or name directly
+ INDEX_NAME = "chassidus-index"  # Match the index used in the upload script
+ EMBEDDING_MODEL = "text-embedding-3-large"  # Ensure this matches your embedding model
+
+ print(f"Retriever using Pinecone Index: {INDEX_NAME}")
+ # Removed Environment print, less relevant for v3 client usage
+ print(f"Retriever using OpenAI Embedding Model: {EMBEDDING_MODEL}")
+ # --- End Configuration ---
+
+ # --- Initialize OpenAI Client ---
+ openai_client = None
+ if OPENAI_API_KEY:
+     try:
+         openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)
+         print("OpenAI client initialized for retriever.")
+     except Exception as e:
+         print(f"Error initializing OpenAI client for retriever: {e}")
+         traceback.print_exc()
+ else:
+     print("Warning: OPENAI_API_KEY not found. Retriever requires it for query embeddings.")
+
+ # --- Initialize Pinecone Client and Index ---
+ pc = None
+ index = None
+ if PINECONE_API_KEY:
+     try:
+         print("Initializing Pinecone client for retriever...")
+         pc = Pinecone(api_key=PINECONE_API_KEY)
+         print(f"Connecting to index '{INDEX_NAME}'...")
+
+         # Check if index exists before connecting
+         if INDEX_NAME not in [idx.name for idx in pc.list_indexes().indexes]:
+             print(f"Error: Index '{INDEX_NAME}' does not exist. Cannot connect retriever.")
+         else:
+             index = pc.Index(INDEX_NAME)
+             print("Connected to Pinecone index for retriever.")
+             # Verify connection with stats
+             stats = index.describe_index_stats()
+             print(f"Index stats: {stats}")
+             if stats.total_vector_count == 0:
+                 print(f"Warning: Pinecone index '{INDEX_NAME}' is empty.")
+
+     except Exception as e:
+         print(f"Error initializing Pinecone or connecting to index for retriever: {e}")
+         traceback.print_exc()
+ else:
+     print("Error: PINECONE_API_KEY not found. Cannot initialize Pinecone client.")
+
+
+ # --- Status Check ---
+ def check_retriever_status():
+     """Checks if the Pinecone retriever is ready."""
+     status = True
+     messages = []
+     if not OPENAI_API_KEY:
+         status = False
+         messages.append("OpenAI API Key missing.")
+     if not openai_client:
+         status = False
+         messages.append("OpenAI client initialization failed.")
+     if not PINECONE_API_KEY:
+         status = False
+         messages.append("Pinecone API Key missing.")
+     if not pc:
+         status = False
+         messages.append("Pinecone client failed to initialize.")
+     if not index:  # Check if index object was successfully created
+         status = False
+         messages.append(f"Pinecone index '{INDEX_NAME}' could not be connected to or doesn't exist.")
+     elif index:
+         try:
+             stats = index.describe_index_stats()
+             if stats.total_vector_count == 0:
+                 messages.append(f"Retriever ready, but Pinecone index '{INDEX_NAME}' is empty.")
+         except Exception as stats_err:
+             status = False
+             messages.append(f"Failed to get stats for index '{INDEX_NAME}': {stats_err}")
+
+     if status and not messages:
+         messages.append("Retriever ready.")
+
+     return status, " ".join(messages)
+
+
+ # --- Retrieval Function ---
+ def get_embedding(text, model=EMBEDDING_MODEL):
+     """Generates embedding for the given text using OpenAI."""
+     if not openai_client:
+         raise ValueError("OpenAI client not initialized.")
+     try:
+         text = text.replace("\n", " ")
+         response = openai_client.embeddings.create(input=[text], model=model)
+         return response.data[0].embedding
+     except Exception as e:
+         print(f"Error getting embedding for text: '{text[:100]}...'")
+         traceback.print_exc()
+         return None
+
+
+ # This is line 114 where the error occurred, now List and Dict are defined via import
+ def find_similar_paragraphs(query_text: str, n_results: int = 10) -> List[Dict]:
+     """
+     Retrieves similar paragraphs from Pinecone based on the query text.
+     Searches against combined Hebrew+English embeddings.
+     Retrieves metadata including separate hebrew_text and english_text.
+     """
+     ready, message = check_retriever_status()
+     if not ready or index is None:  # Check index specifically
+         print(f"Retriever not ready: {message}")
+         return []
+
+     print(f"\nRetrieving similar paragraphs for: '{query_text[:100]}...'")
+     start_time = time.time()
+
+     try:
+         # 1. Get query embedding
+         print("Generating query embedding...")
+         query_embedding = get_embedding(query_text)
+         if query_embedding is None:
+             print("Failed to generate query embedding.")
+             return []
+         embed_time = time.time() - start_time
+         print(f"Query embedding generated in {embed_time:.4f} seconds.")
+
+         # 2. Query Pinecone
+         print(f"Querying Pinecone index '{INDEX_NAME}' for top {n_results} results...")
+         query_start_time = time.time()
+         response = index.query(
+             vector=query_embedding,
+             top_k=n_results,
+             include_metadata=True  # Essential to get the text back
+         )
+         query_time = time.time() - query_start_time
+         print(f"Pinecone query completed in {query_time:.4f} seconds.")
+
+         # 3. Process results
+         formatted_results = []
+         if not response or not response.matches:
+             print("No results found by Pinecone for this query.")
+             return []
+
+         print(f"Processing {len(response.matches)} raw results from Pinecone...")
+         for match in response.matches:
+             score = match.score  # Cosine similarity score (higher is better)
+             vector_id = match.id  # The ID stored in Pinecone (should be original_id)
+             metadata = match.metadata if match.metadata else {}
+
+             # --- Extract data from metadata ---
+             # Use .get() with defaults for robustness
+             original_id = metadata.get('original_id', vector_id)  # Fallback to vector_id if missing
+             hebrew_text = metadata.get('hebrew_text', '')
+             english_text = metadata.get('english_text', '')
+             source_name = metadata.get('source_name', 'Unknown Source')
+
+             # Calculate distance from similarity score (for consistency if needed)
+             # Distance = 1 - Cosine Similarity
+             distance = 1.0 - score
+
+             doc_data = {
+                 "vector_id": vector_id,  # The ID used in Pinecone
+                 "original_id": original_id,  # The original ID from the source JSON
+                 "source_name": source_name,
+                 "hebrew_text": hebrew_text,
+                 "english_text": english_text,  # Include English text
+                 "distance": distance,  # Calculated distance (lower is better)
+                 "similarity_score": score,  # Direct score from Pinecone (higher is better)
+             }
+             formatted_results.append(doc_data)
+
+         # Pinecone results are already sorted by score (descending),
+         # which means distance is ascending (most similar first).
+
+         total_retrieval_time = time.time() - start_time
+         print(f"Retrieved and processed {len(formatted_results)} paragraphs from Pinecone in {total_retrieval_time:.2f} seconds.")
+         return formatted_results
+
+     except Exception as e:
+         print(f"Error during Pinecone query or processing: {e}")
+         traceback.print_exc()
+         return []
+
+
+ # --- Main Test Block ---
+ if __name__ == "__main__":
+     ready, msg = check_retriever_status()
+     print(f"\nRetriever Status: {ready} - {msg}")
+     if ready:
+         print("\n--- Running Retriever Test ---")
+         test_query = "role of joy in divine service"  # Test query in English
+         # test_query_he = "תפקיד השמחה בעבודת ה'"  # Test query in Hebrew (optional)
+
+         retrieved_docs = find_similar_paragraphs(test_query, n_results=5)
+
+         if retrieved_docs:
+             print("\n--- Top Test Results ---")
+             for i, doc in enumerate(retrieved_docs):
+                 print(f"\n{i+1}. Score: {doc['similarity_score']:.4f} (Distance: {doc['distance']:.4f})")
+                 print(f"   Source: {doc['source_name']} (Orig ID: {doc['original_id']}, VecID: {doc['vector_id']})")
+                 print(f"   Hebrew: {doc['hebrew_text'][:150]}...")
+                 print(f"   English: {doc['english_text'][:150]}...")
+         else:
+             print("No documents retrieved for the test query.")
+     else:
+         print("Cannot run test because retriever is not ready.")
uv.lock ADDED
The diff for this file is too large to render. See raw diff
 
validation_service_openai.py ADDED
@@ -0,0 +1,156 @@
+ # validation_service_openai.py
+ # Works with LangSmith, OpenAI async, built for RAG validation
+
+ import os
+ import traceback
+ import openai
+ import asyncio
+ import json
+ from typing import Dict, Optional
+ from langsmith import traceable
+
+ # ----- ENVIRONMENT SETUP (Replit secret-based) -----
+ # The self-assignments are no-ops that raise KeyError at import time
+ # if a required secret is missing (fail-fast).
+ os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
+ os.environ["LANGSMITH_TRACING"] = "true"
+ os.environ["OPENAI_API_KEY"] = os.environ["OPENAI_API_KEY"]
+ os.environ["LANGSMITH_API_KEY"] = os.environ["LANGSMITH_API_KEY"]
+ os.environ["LANGSMITH_PROJECT"] = os.environ["LANGSMITH_PROJECT"]
+ # ---------------------------------------------------
+
+ OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
+ VALIDATION_MODEL = "gpt-4o"
+
+ # Initialize OpenAI Async Client
+ async_openai_client = None
+ if OPENAI_API_KEY:
+     try:
+         # (no need for wrap_openai here unless you want call-level traces)
+         async_openai_client = openai.AsyncOpenAI(api_key=OPENAI_API_KEY)
+         print("OpenAI ASYNC client initialized for validator service.")
+     except Exception as e:
+         print(f"Error initializing OpenAI ASYNC client for validator: {e}")
+         traceback.print_exc()
+ else:
+     print("Warning: OPENAI_API_KEY not found. Validator service (GPT-4o) requires it.")
+
+ def check_openai_validator_status():
+     status = True
+     messages = []
+     if not OPENAI_API_KEY:
+         status = False
+         messages.append("OpenAI API Key missing.")
+     if not async_openai_client:
+         status = False
+         messages.append("OpenAI Async client (for Validator) initialization failed.")
+     if status and not messages:
+         messages.append(f"OpenAI Validator service ready (Model: {VALIDATION_MODEL}).")
+     return status, " ".join(messages)
+
+ @traceable
+ async def validate_paragraph_relevance_gpt4o(
+     paragraph_data: Dict,
+     user_question: str,
+     paragraph_index: int
+ ) -> Optional[Dict]:
+     """
+     Uses GPT-4o to validate if a SINGLE paragraph (HE+EN text) contains relevant info.
+
+     Args:
+         paragraph_data: A dictionary for the paragraph (needs 'hebrew_text', 'english_text').
+         user_question: The original user question in Hebrew.
+         paragraph_index: The index of this paragraph in the list being validated.
+
+     Returns:
+         A dictionary containing the validation result and original paragraph data.
+         Returns None if an error occurs during validation.
+     """
+     global async_openai_client
+     if not async_openai_client:
+         print(f"Error (Paragraph {paragraph_index}): OpenAI async client not available.")
+         return None
+     if not paragraph_data:
+         return {
+             "validation": {
+                 "contains_relevant_info": False,
+                 "justification": "Input paragraph data was empty."
+             },
+             "paragraph_data": {}
+         }
+
+     hebrew_text = paragraph_data.get('hebrew_text', '').strip()
+     english_text = paragraph_data.get('english_text', '').strip()
+     if not hebrew_text and not english_text:
+         return {
+             "validation": {
+                 "contains_relevant_info": False,
+                 "justification": "Paragraph text is empty."
+             },
+             "paragraph_data": paragraph_data
+         }
+
+     prompt_content = f"""User Question (Hebrew):
+ "{user_question}"
+
+ Text Paragraph (Paragraph {paragraph_index+1}):
+ Hebrew:
+ ---
+ {hebrew_text if hebrew_text else "(No Hebrew text provided)"}
+ ---
+ English:
+ ---
+ {english_text if english_text else "(No English text provided)"}
+ ---
+
+ Instruction:
+ Analyze the Text Paragraph provided above (considering both Hebrew and English versions if available). Determine if any information within this specific paragraph directly answers, or provides significant relevant details contributing to an answer for, the User Question (which is in Hebrew).
+ Respond ONLY with a valid JSON object containing exactly two keys:
+ 1. 'contains_relevant_info': A boolean value (`true` if relevant information is found, `false` otherwise).
+ 2. 'justification': A brief, 1-sentence explanation (in Hebrew) for your decision, especially if 'true'.
+
+ Example valid JSON output:
+ {{ "contains_relevant_info": true, "justification": "הפסקה דנה ישירות בסיבת העיכוב בקריעת הים." }}
+ OR
+ {{ "contains_relevant_info": false, "justification": "הפסקה עוסקת בעניין אחר ואינה רלוונטית לשאלה." }}
+
+ Output only the JSON object, nothing else.
+ """
+
+     try:
+         response = await async_openai_client.chat.completions.create(
+             model=VALIDATION_MODEL,
+             messages=[{"role": "user", "content": prompt_content}],
+             temperature=0.1,
+             max_tokens=150,
+             response_format={"type": "json_object"}
+         )
+
+         json_string = response.choices[0].message.content
+
+         try:
+             validation_result = json.loads(json_string)
+             if not isinstance(validation_result, dict) or \
+                'contains_relevant_info' not in validation_result or \
+                'justification' not in validation_result or \
+                not isinstance(validation_result['contains_relevant_info'], bool):
+                 print(f"Error (Paragraph {paragraph_index+1}): Parsed JSON has incorrect structure: {validation_result}")
+                 return None
+
+             return {
+                 "validation": validation_result,
+                 "paragraph_data": paragraph_data
+             }
+
+         except json.JSONDecodeError as json_err:
+             print(f"Error (Paragraph {paragraph_index+1}): Failed to decode JSON response: {json_err}. Response was: {json_string}")
+             return None
+         except Exception as parse_err:
+             print(f"Error (Paragraph {paragraph_index+1}): Unexpected error parsing validation structure: {parse_err}")
+             return None
+
+     except openai.APIError as e:
+         print(f"Error (Paragraph {paragraph_index+1}): OpenAI API Error during validation: {e}")
+         return None
+     except Exception as e:
+         print(f"Error (Paragraph {paragraph_index+1}): Unexpected error during GPT-4o validation API call: {e}")
+         traceback.print_exc()
+         return None
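+ # Usage sketch (illustrative; assumes retriever_pinecone.py is importable and
+ # the index is populated — the question string is hypothetical):
+ #
+ # import asyncio
+ # from retriever_pinecone import find_similar_paragraphs
+ #
+ # async def _validate_retrieved(question: str):
+ #     docs = find_similar_paragraphs(question, n_results=10)
+ #     results = await asyncio.gather(
+ #         *(validate_paragraph_relevance_gpt4o(doc, question, i)
+ #           for i, doc in enumerate(docs))
+ #     )
+ #     # Keep only paragraphs GPT-4o judged relevant (None = validation error)
+ #     return [r["paragraph_data"] for r in results
+ #             if r and r["validation"]["contains_relevant_info"]]
+ #
+ # relevant = asyncio.run(_validate_retrieved("מה חשיבות השמחה בעבודת ה'?"))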