dolphinium committed on
Commit
f43f2d3
·
1 Parent(s): 4d74be9

refactor: Modularize application into single-responsibility components

Browse files

Decompose the monolithic app.py into a clean, modular architecture to improve maintainability, readability, and scalability. Each new
module now has a clearly defined purpose, separating concerns like configuration, data, connections, and UI.

Key changes:
- `app.py`: Transformed into a lean entry point that initializes services and launches the UI.
- `config.py`: Created to manage all environment variables and configuration constants.
- `solr_metadata.py`: Isolates the static Solr field definitions, removing a large data structure from the main logic.
- `connections.py`: Centralizes the setup and initialization of external services (SSH Tunnel, Solr, Gemini LLM).
- `llm_prompts.py`: Contains all large f-string prompts for the LLM, cleaning up the data processing logic.
- `data_processing.py`: Holds the core data analysis functions for querying Solr and orchestrating the LLM.
- `ui.py`: Defines the Gradio user interface and its associated event-handling logic.

Files changed (7) hide show
  1. app.py +32 -803
  2. config.py +28 -0
  3. connections.py +54 -0
  4. data_processing.py +130 -0
  5. llm_prompts.py +346 -0
  6. solr_metadata.py +152 -0
  7. ui.py +175 -0
app.py CHANGED
@@ -1,813 +1,42 @@
1
- import gradio as gr
2
- import json
3
- import re
4
- import datetime
5
- import pandas as pd
6
- import pysolr
7
- import google.generativeai as genai
8
- from sshtunnel import SSHTunnelForwarder
9
- import matplotlib.pyplot as plt
10
- import seaborn as sns
11
- import io
12
- import os
13
  import logging
14
- import concurrent.futures
15
- from IPython.display import display, Markdown
16
- import copy
17
-
18
 
19
  # --- Suppress Matplotlib Debug Logs ---
20
  logging.getLogger('matplotlib').setLevel(logging.WARNING)
21
 
22
- # --- SSH Tunnel Configuration ---
23
- # It's recommended to load secrets securely, e.g., from environment variables
24
- SSH_HOST = os.environ.get('SSH_HOST')
25
- SSH_PORT = 5322
26
- SSH_USER = os.environ.get('SSH_USER')
27
- SSH_PASS = os.environ.get('SSH_PASS')
28
-
29
- # --- Solr Configuration ---
30
- REMOTE_SOLR_HOST = '69.167.186.48'
31
- REMOTE_SOLR_PORT = 8983
32
- LOCAL_BIND_PORT = 8983
33
- SOLR_CORE_NAME = 'news'
34
- SOLR_USER = os.environ.get('SOLR_USER')
35
- SOLR_PASS = os.environ.get('SOLR_PASS')
36
-
37
- # --- Google Gemini Configuration ---
38
- try:
39
- genai.configure(api_key=os.environ.get('GEMINI_API_KEY'))
40
- except Exception as e:
41
- print(f"❌ Gemini API Key Error: {e}. Please ensure 'GEMINI_API_KEY' is set in your environment.")
42
-
43
- # --- Global Variables ---
44
- ssh_tunnel_server = None
45
- solr_client = None
46
- llm_model = None
47
- is_initialized = False
48
-
49
- try:
50
- # 1. Start the SSH Tunnel
51
- ssh_tunnel_server = SSHTunnelForwarder(
52
- (SSH_HOST, SSH_PORT),
53
- ssh_username=SSH_USER,
54
- ssh_password=SSH_PASS,
55
- remote_bind_address=(REMOTE_SOLR_HOST, REMOTE_SOLR_PORT),
56
- local_bind_address=('127.0.0.1', LOCAL_BIND_PORT)
57
- )
58
- ssh_tunnel_server.start()
59
- print(f"🚀 SSH tunnel established: Local Port {ssh_tunnel_server.local_bind_port} -> Remote Solr.")
60
-
61
- # 2. Initialize the pysolr client
62
- solr_url = f'http://127.0.0.1:{ssh_tunnel_server.local_bind_port}/solr/{SOLR_CORE_NAME}'
63
- solr_client = pysolr.Solr(solr_url, auth=(SOLR_USER, SOLR_PASS), always_commit=True)
64
- solr_client.ping()
65
- print(f"✅ Solr connection successful on core '{SOLR_CORE_NAME}'.")
66
-
67
- # 3. Initialize the LLM
68
- llm_model = genai.GenerativeModel('gemini-2.5-flash', generation_config=genai.types.GenerationConfig(temperature=0))
69
- print(f"✅ LLM Model '{llm_model.model_name}' initialized.")
70
-
71
- print("✅ System Initialized Successfully.")
72
- is_initialized = True
73
-
74
- except Exception as e:
75
- print(f"\n❌ An error occurred during setup: {e}")
76
- if ssh_tunnel_server and ssh_tunnel_server.is_active:
77
- ssh_tunnel_server.stop()
78
-
79
-
80
- field_metadata = [
81
- {
82
- "field_name": "business_model",
83
- "type": "string (categorical)",
84
- "example_values": ["pharma/bio", "drug delivery", "pharma services"],
85
- "definition": "The primary business category of the company involved in the news. Use for filtering by high-level industry segments."
86
- },
87
- {
88
- "field_name": "news_type",
89
- "type": "string (categorical)",
90
- "example_values": ["product news", "financial news", "regulatory news"],
91
- "definition": "The category of the news article itself (e.g., financial, regulatory, acquisition). Use for filtering by the type of event being reported."
92
- },
93
- {
94
- "field_name": "event_type",
95
- "type": "string (categorical)",
96
- "example_values": ["phase 2", "phase 1", "pre clinical", "marketed"],
97
- "definition": "The clinical or developmental stage of a product or event discussed in the article. Essential for queries about clinical trial phases."
98
- },
99
- {
100
- "field_name": "source",
101
- "type": "string (categorical)",
102
- "example_values": ["Press Release", "PR Newswire", "Business Wire"],
103
- "definition": "The original source of the news article, such as a newswire or official report."
104
- },
105
- {
106
- "field_name": "company_name",
107
- "type": "string (exact match, for faceting)",
108
- "example_values": ["pfizer inc.", "astrazeneca plc", "roche"],
109
- "definition": "The canonical, standardized name of a company. **Crucially, you MUST use this field for `terms` faceting** to group results by a unique company. Do NOT use this for searching."
110
- },
111
- {
112
- "field_name": "company_name_s",
113
- "type": "string (multi-valued, for searching)",
114
- "example_values": ["pfizer inc.", "roche", "f. hoffmann-la roche ag", "nih"],
115
- "definition": "A field containing all known names and synonyms for a company. **You MUST use this field for all `query` parameter searches involving a company name** to ensure comprehensive results. Do NOT use for `terms` faceting."
116
- },
117
- {
118
- "field_name": "territory_hq_s",
119
- "type": "string (multi-valued, hierarchical)",
120
- "example_values": ["united states of america", "europe", "europe western"],
121
- "definition": "The geographic location (country and continent) of a company's headquarters. It is hierarchical. Use for filtering by location."
122
- },
123
- {
124
- "field_name": "therapeutic_category",
125
- "type": "string (specific)",
126
- "example_values": ["cancer, other", "cancer, nsclc metastatic", "alzheimer's"],
127
- "definition": "The specific disease or therapeutic area being targeted. Use for very specific disease queries."
128
- },
129
- {
130
- "field_name": "therapeutic_category_s",
131
- "type": "string (multi-valued, for searching)",
132
- "example_values": ["cancer", "oncology", "infections", "cns"],
133
- "definition": "Broader, multi-valued therapeutic categories and their synonyms. **Use this field for broad category searches** in the `query` parameter."
134
- },
135
- {
136
- "field_name": "compound_name",
137
- "type": "string (exact match, for faceting)",
138
- "example_values": ["opdivo injection solution", "keytruda injection solution"],
139
- "definition": "The specific, full trade name of a drug. **Use this field for `terms` faceting** on compounds."
140
- },
141
- {
142
- "field_name": "compound_name_s",
143
- "type": "string (multi-valued, for searching)",
144
- "example_values": ["nivolumab injection solution", "opdivo injection solution", "ono-4538 injection solution"],
145
- "definition": "A field with all known trade names and synonyms for a drug. **Use this field for all `query` parameter searches** involving a compound name."
146
- },
147
- {
148
- "field_name": "molecule_name",
149
- "type": "string (exact match, for faceting)",
150
- "example_values": ["cannabidiol", "paclitaxel", "pembrolizumab"],
151
- "definition": "The generic, non-proprietary name of the active molecule. **Use this field for `terms` faceting** on molecules."
152
- },
153
- {
154
- "field_name": "molecule_name_s",
155
- "type": "string (multi-valued, for searching)",
156
- "example_values": ["cbd", "s1-220", "a1002n5s"],
157
- "definition": "A field with all known generic names and synonyms for a molecule. **Use this field for all `query` parameter searches** involving a molecule name."
158
- },
159
- {
160
- "field_name": "highest_phase",
161
- "type": "string (categorical)",
162
- "example_values": ["marketed", "phase 2", "phase 1"],
163
- "definition": "The highest stage of development a drug has ever reached."
164
- },
165
- {
166
- "field_name": "drug_delivery_branch_s",
167
- "type": "string (multi-valued, for searching)",
168
- "example_values": ["injection", "parenteral", "oral", "injection, other", "oral, other"],
169
- "definition": "The method of drug administration. **Use this for `query` parameter searches about route of administration** as it contains broader, search-friendly terms."
170
- },
171
- {
172
- "field_name": "drug_delivery_branch",
173
- "type": "string (categorical, specific, for faceting)",
174
- "example_values": ["injection, other", "prefilled syringes", "np liposome", "oral enteric/delayed release"],
175
- "definition": "The most specific category of drug delivery technology. **Use this field for `terms` faceting** on specific delivery technologies."
176
- },
177
- {
178
- "field_name": "route_branch",
179
- "type": "string (categorical)",
180
- "example_values": ["injection", "oral", "topical", "inhalation"],
181
- "definition": "The primary route of drug administration. Good for faceting on exact routes."
182
- },
183
- {
184
- "field_name": "molecule_api_group",
185
- "type": "string (categorical)",
186
- "example_values": ["small molecules", "biologics", "nucleic acids"],
187
- "definition": "High-level classification of the drug's molecular type."
188
- },
189
- {
190
- "field_name": "content",
191
- "type": "text (full-text search)",
192
- "example_values": ["The largest study to date...", "balstilimab..."],
193
- "definition": "The full text content of the news article. Use for keyword searches on topics not covered by other specific fields."
194
- },
195
- {
196
- "field_name": "date",
197
- "type": "date",
198
- "example_values": ["2020-10-22T00:00:00Z"],
199
- "definition": "The full publication date and time in ISO 8601 format. Use for precise date range queries."
200
- },
201
- {
202
- "field_name": "date_year",
203
- "type": "number (year)",
204
- "example_values": [2020, 2021, 2022],
205
- "definition": "The 4-digit year of publication. **Use this for queries involving whole years** (e.g., 'in 2023', 'last year', 'since 2020')."
206
- },
207
- {
208
- "field_name": "total_deal_value_in_million",
209
- "type": "number (metric)",
210
- "example_values": [50, 120.5, 176.157, 1000],
211
- "definition": "The total value of a financial deal, in millions of USD. This is the primary numeric field for financial aggregations (sum, avg, etc.). To use this, you must also filter for news that has a deal value, e.g., 'total_deal_value_in_million:[0 TO *]'."
212
- }
213
- ]
214
-
215
- # Helper function to format the metadata for the prompt
216
- def format_metadata_for_prompt(metadata):
217
- formatted_string = ""
218
- for field in metadata:
219
- formatted_string += f"- **{field['field_name']}**\n"
220
- formatted_string += f" - **Type**: {field['type']}\n"
221
- formatted_string += f" - **Definition**: {field['definition']}\n"
222
- formatted_string += f" - **Examples**: {', '.join(map(str, field['example_values']))}\n\n"
223
- return formatted_string
224
- formatted_field_info = format_metadata_for_prompt(field_metadata)
225
-
226
-
227
- def parse_suggestions_from_report(report_text):
228
- """Extracts numbered suggestions from the report's markdown text."""
229
- suggestions_match = re.search(r"### (?:Deeper Dive: Suggested Follow-up Analyses|Suggestions for Further Exploration)\s*\n(.*?)$", report_text, re.DOTALL | re.IGNORECASE)
230
- if not suggestions_match: return []
231
- suggestions_text = suggestions_match.group(1)
232
- suggestions = re.findall(r"^\s*\d+\.\s*(.*)", suggestions_text, re.MULTILINE)
233
- return [s.strip() for s in suggestions]
234
-
235
-
236
- def llm_generate_analysis_plan_with_history(natural_language_query, field_metadata, chat_history):
237
  """
238
- Generates a complete analysis plan from a user query, considering chat history.
239
- This plan includes dimensions, measures, and requests for both quantitative (
240
- facet)
241
- and qualitative (grouping) data.
242
  """
243
- formatted_history = ""
244
- for user_msg, bot_msg in chat_history:
245
- if user_msg:
246
- formatted_history += f"- User: \"{user_msg}\"\n"
247
-
248
- prompt = f"""
249
- You are an expert data analyst and Solr query engineer. Your task is to convert a natural language question into a structured JSON "Analysis Plan". This plan will be used to run two separate, efficient queries: one for aggregate data (facets) and one for finding illustrative examples (grouping).
250
-
251
- ---
252
- ### CONTEXT & RULES
253
-
254
- 1. **Today's Date for Calculations**: {datetime.datetime.now().date().strftime("%Y-%m-%d")}
255
- 2. **Field Usage**: You MUST use the fields described in the 'Field Definitions'. Pay close attention to the definitions to select the correct field, especially the `_s` fields for searching. Do not use fields ending with `_s` in `group.field` or facet `field` unless necessary for the analysis.
256
- 3. **Dimension vs. Measure**:
257
- * `analysis_dimension`: The primary categorical field the user wants to group by (e.g., `company_name`, `route_branch`). This is the `group by` field.
258
- * `analysis_measure`: The metric to aggregate (e.g., `sum(total_deal_value_in_million)`) or the method of counting (`count`).
259
- * `sort_field_for_examples`: The raw field used to find the "best" example. If `analysis_measure` is `sum(field)`, this should be `field`. If `analysis_measure` is `count`, this should be a relevant field like `date`.
260
- 4. **Crucial Sorting Rules**:
261
- * For `group.sort`: If `analysis_measure` involves a function on a field (e.g., `sum(total_deal_value_in_million)`), you MUST use the full function: `group.sort: 'sum(total_deal_value_in_million) desc'`.
262
- * If `analysis_measure` is 'count', you MUST OMIT the `group.sort` parameter entirely.
263
- * For sorting, NEVER use 'date_year'; use 'date' instead.
264
- 5. **Output Format**: Your final output must be a single, raw JSON object. Do not add comments or markdown formatting.
265
-
266
- ---
267
- ### FIELD DEFINITIONS (Your Source of Truth)
268
-
269
- {formatted_field_info}
270
- ---
271
- ### CHAT HISTORY
272
- {formatted_history}
273
- ---
274
- ### EXAMPLES
275
-
276
- **User Query 1:** "What are the top 5 companies by total deal value in 2023?"
277
- **Correct JSON Output 1:**
278
- ```json
279
- {{
280
- "analysis_dimension": "company_name",
281
- "analysis_measure": "sum(total_deal_value_in_million)",
282
- "sort_field_for_examples": "total_deal_value_in_million",
283
- "query_filter": "date_year:2023 AND total_deal_value_in_million:[0 TO *]",
284
- "quantitative_request": {{
285
- "json.facet": {{
286
- "companies_by_deal_value": {{
287
- "type": "terms",
288
- "field": "company_name",
289
- "limit": 5,
290
- "sort": "total_value desc",
291
- "facet": {{
292
- "total_value": "sum(total_deal_value_in_million)"
293
- }}
294
- }}
295
- }}
296
- }},
297
- "qualitative_request": {{
298
- "group": true,
299
- "group.field": "company_name",
300
- "group.limit": 1,
301
- "group.sort": "sum(total_deal_value_in_million) desc",
302
- "sort": "total_deal_value_in_million desc"
303
- }}
304
- }}
305
- ```
306
-
307
- **User Query 2:** "What are the most common news types for infections this year?"
308
- **Correct JSON Output 2:**
309
- ```json
310
- {{
311
- "analysis_dimension": "news_type",
312
- "analysis_measure": "count",
313
- "sort_field_for_examples": "date",
314
- "query_filter": "therapeutic_category_s:infections AND date_year:{datetime.datetime.now().year}",
315
- "quantitative_request": {{
316
- "json.facet": {{
317
- "news_by_type": {{
318
- "type": "terms",
319
- "field": "news_type",
320
- "limit": 10,
321
- "sort": "count desc"
322
- }}
323
- }}
324
- }},
325
- "qualitative_request": {{
326
- "group": true,
327
- "group.field": "news_type",
328
- "group.limit": 1,
329
- "sort": "date desc"
330
- }}
331
- }}
332
- ```
333
- ---
334
- ### YOUR TASK
335
-
336
- Convert the following user query into a single, raw JSON "Analysis Plan" object, strictly following all rules and considering the chat history.
337
-
338
- **Current User Query:** `{natural_language_query}`
339
- """
340
- try:
341
- response = llm_model.generate_content(prompt)
342
- cleaned_text = re.sub(r'```json\s*|\s*```', '', response.text, flags=re.MULTILINE | re.DOTALL).strip()
343
- plan = json.loads(cleaned_text)
344
- return plan
345
- except Exception as e:
346
- raw_response_text = response.text if 'response' in locals() else 'N/A'
347
- print(f"Error in llm_generate_analysis_plan_with_history: {e}\nRaw Response:\n{raw_response_text}")
348
- return None
349
-
350
- def execute_quantitative_query(plan, solr):
351
- """Executes the facet query to get aggregate data."""
352
- if not plan or 'quantitative_request' not in plan or 'json.facet' not in plan.get('quantitative_request', {}):
353
- return None
354
- try:
355
- params = {
356
- "q": plan.get('query_filter', '*:*'),
357
- "rows": 0,
358
- "json.facet": json.dumps(plan['quantitative_request']['json.facet'])
359
- }
360
- results = solr.search(**params)
361
- return results.raw_response.get("facets", {})
362
- except Exception as e:
363
- print(f"Error in quantitative query: {e}")
364
- return None
365
-
366
- def execute_qualitative_query(plan, solr):
367
- """Executes the grouping query to get the best example docs."""
368
- if not plan or 'qualitative_request' not in plan:
369
- return None
370
- try:
371
- qual_request = copy.deepcopy(plan['qualitative_request'])
372
- params = {
373
- "q": plan.get('query_filter', '*:*'),
374
- "rows": 3, # Get a few examples per group
375
- "fl": "*,score",
376
- **qual_request
377
- }
378
- results = solr.search(**params)
379
- return results.grouped
380
- except Exception as e:
381
- print(f"Error in qualitative query: {e}")
382
- return None
383
-
384
- def llm_synthesize_enriched_report_stream(query, quantitative_data, qualitative_data, plan):
385
- """
386
- Generates an enriched report by synthesizing quantitative aggregates
387
- and qualitative examples, and streams the result.
388
- """
389
- qualitative_prompt_str = ""
390
- dimension = plan.get('analysis_dimension', 'N/A')
391
- if qualitative_data and dimension in qualitative_data:
392
- for group in qualitative_data.get(dimension, {}).get('groups', []):
393
- group_value = group.get('groupValue', 'N/A')
394
- if group.get('doclist', {}).get('docs'):
395
- doc = group.get('doclist', {}).get('docs', [{}])[0]
396
- title = doc.get('abstract', ['No Title'])
397
- content_list = doc.get('content', [])
398
- content_snip = (' '.join(content_list[0].split()[:40]) + '...') if content_list else 'No content available.'
399
- metric_val_raw = doc.get(plan.get('sort_field_for_examples'), 'N/A')
400
- metric_val = metric_val_raw[0] if isinstance(metric_val_raw, list) else metric_val_raw
401
-
402
- qualitative_prompt_str += f"- **For category `{group_value}`:**\n"
403
- qualitative_prompt_str += f" - **Top Example Title:** {title}\n"
404
- qualitative_prompt_str += f" - **Metric Value:** {metric_val}\n"
405
- qualitative_prompt_str += f" - **Content Snippet:** {content_snip}\n\n"
406
-
407
- prompt = f"""
408
- You are a top-tier business intelligence analyst. Your task is to write an insightful, data-driven report for an executive. You must synthesize quantitative data (the 'what') with qualitative examples (the 'why') to tell a complete story.
409
-
410
- ---
411
- ### AVAILABLE INFORMATION
412
-
413
- **1. The User's Core Question:**
414
- \"{query}\"
415
-
416
- **2. Quantitative Data (The 'What'):**
417
- This data shows the high-level aggregates.
418
- ```json
419
- {json.dumps(quantitative_data, indent=2)}
420
- ```
421
-
422
- **3. Qualitative Data (The 'Why'):**
423
- These are the single most significant documents driving the numbers for each category.
424
- {qualitative_prompt_str}
425
-
426
- ---
427
- ### REPORTING INSTRUCTIONS
428
-
429
- Your report must be in clean, professional Markdown and follow this structure precisely.
430
-
431
- **Report Structure:**
432
-
433
- `## Executive Summary`
434
- - A 1-2 sentence, top-line answer to the user's question based on the quantitative data.
435
-
436
- `### Key Findings`
437
- - Use bullet points to highlight the main figures from the quantitative data. Interpret the numbers.
438
-
439
- `### Key Drivers & Illustrative Examples`
440
- - **This is the most important section.** Explain the "so what?" behind the numbers.
441
- - Use the qualitative examples to explain *why* a category is high or low. Reference the top example document for each main category.
442
-
443
- `### Deeper Dive: Suggested Follow-up Analyses`
444
- - Propose 2-3 logical next questions based on your analysis to uncover deeper trends.
445
-
446
- ---
447
- **Generate the full report now, paying close attention to all formatting and spacing rules.**
448
- """
449
- try:
450
- response_stream = llm_model.generate_content(prompt, stream=True)
451
- for chunk in response_stream:
452
- yield chunk.text
453
- except Exception as e:
454
- print(f"Error in llm_synthesize_enriched_report_stream: {e}")
455
- yield "Sorry, I was unable to generate a report for this data."
456
-
457
-
458
- def llm_generate_visualization_code(query_context, facet_data):
459
- """Generates Python code for visualization based on query and data."""
460
- prompt = f"""
461
- You are a Python Data Visualization expert specializing in Matplotlib and Seaborn.
462
- Your task is to generate robust, error-free Python code to create a single, insightful visualization based on the user's query and the provided Solr facet data.
463
-
464
- **User's Analytical Goal:**
465
- \"{query_context}\"
466
-
467
- **Aggregated Data (from Solr Facets):**
468
- ```json
469
- {json.dumps(facet_data, indent=2)}
470
- ```
471
-
472
- ---
473
- ### **CRITICAL INSTRUCTIONS: CODE GENERATION RULES**
474
- You MUST follow these rules to avoid errors.
475
-
476
- **1. Identify the Data Structure FIRST:**
477
- Before writing any code, analyze the `facet_data` JSON to determine its structure. There are three common patterns. Choose the correct template below.
478
-
479
- * **Pattern A: Simple `terms` Facet.** The JSON has ONE main key (besides "count") which contains a list of "buckets". Each bucket has a "val" and a "count". Use this for standard bar charts.
480
- * **Pattern B: Multiple `query` Facets.** The JSON has MULTIPLE keys (besides "count"), and each key is an object containing metrics like "count" or "sum(...)". Use this for comparing a few distinct items (e.g., "oral vs injection").
481
- * **Pattern C: Nested `terms` Facet.** The JSON has one main key with a list of "buckets", but inside EACH bucket, there are nested metric objects. This is used for grouped comparisons (e.g., "compare 2024 vs 2025 across categories"). This almost always requires `pandas`.
482
-
483
- **2. Use the Correct Parsing Template:**
484
-
485
- ---
486
- **TEMPLATE FOR PATTERN A (Simple Bar Chart from `terms` facet):**
487
- ```python
488
- import matplotlib.pyplot as plt
489
- import seaborn as sns
490
- import pandas as pd
491
-
492
- plt.style.use('seaborn-v0_8-whitegrid')
493
- fig, ax = plt.subplots(figsize=(12, 8))
494
-
495
- # Dynamically find the main facet key (the one with 'buckets')
496
- facet_key = None
497
- for key, value in facet_data.items():
498
- if isinstance(value, dict) and 'buckets' in value:
499
- facet_key = key
500
- break
501
-
502
- if facet_key:
503
- buckets = facet_data[facet_key].get('buckets', [])
504
- # Check if buckets contain data
505
- if buckets:
506
- df = pd.DataFrame(buckets)
507
- # Check for a nested metric or use 'count'
508
- if 'total_deal_value' in df.columns and pd.api.types.is_dict_like(df['total_deal_value'].iloc):
509
- # Example for nested sum metric
510
- df['value'] = df['total_deal_value'].apply(lambda x: x.get('sum', 0))
511
- y_axis_label = 'Sum of Total Deal Value'
512
- else:
513
- df.rename(columns={{'count': 'value'}}, inplace=True)
514
- y_axis_label = 'Count'
515
-
516
- sns.barplot(data=df, x='val', y='value', ax=ax, palette='viridis')
517
- ax.set_xlabel('Category')
518
- ax.set_ylabel(y_axis_label)
519
- else:
520
- ax.text(0.5, 0.5, 'No data in buckets to plot.', ha='center')
521
-
522
-
523
- ax.set_title('Your Insightful Title Here')
524
- # Correct way to rotate labels to prevent errors
525
- plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
526
- plt.tight_layout()
527
- ```
528
- ---
529
- **TEMPLATE FOR PATTERN B (Comparison Bar Chart from `query` facets):**
530
- ```python
531
- import matplotlib.pyplot as plt
532
- import seaborn as sns
533
- import pandas as pd
534
-
535
- plt.style.use('seaborn-v0_8-whitegrid')
536
- fig, ax = plt.subplots(figsize=(10, 6))
537
-
538
- labels = []
539
- values = []
540
- # Iterate through top-level keys, skipping the 'count'
541
- for key, data_dict in facet_data.items():
542
- if key == 'count' or not isinstance(data_dict, dict):
543
- continue
544
- # Extract the label (e.g., 'oral_deals' -> 'Oral')
545
- label = key.replace('_deals', '').replace('_', ' ').title()
546
- # Find the metric value, which is NOT 'count'
547
- metric_value = 0
548
- for sub_key, sub_value in data_dict.items():
549
- if sub_key != 'count':
550
- metric_value = sub_value
551
- break # Found the metric
552
- labels.append(label)
553
- values.append(metric_value)
554
-
555
- if labels:
556
- sns.barplot(x=labels, y=values, ax=ax, palette='mako')
557
- ax.set_ylabel('Total Deal Value') # Or other metric name
558
- ax.set_xlabel('Category')
559
- else:
560
- ax.text(0.5, 0.5, 'No query facet data to plot.', ha='center')
561
-
562
-
563
- ax.set_title('Your Insightful Title Here')
564
- plt.tight_layout()
565
- ```
566
- ---
567
- **TEMPLATE FOR PATTERN C (Grouped Bar Chart from nested `terms` facet):**
568
- ```python
569
- import matplotlib.pyplot as plt
570
- import seaborn as sns
571
- import pandas as pd
572
-
573
- plt.style.use('seaborn-v0_8-whitegrid')
574
- fig, ax = plt.subplots(figsize=(14, 8))
575
-
576
- # Find the key that has the buckets
577
- facet_key = None
578
- for key, value in facet_data.items():
579
- if isinstance(value, dict) and 'buckets' in value:
580
- facet_key = key
581
- break
582
-
583
- if facet_key and facet_data[facet_key].get('buckets'):
584
- # This list comprehension is robust for parsing nested metrics
585
- plot_data = []
586
- for bucket in facet_data[facet_key]['buckets']:
587
- category = bucket['val']
588
- # Find all nested metrics (e.g., total_deal_value_2025)
589
- for sub_key, sub_value in bucket.items():
590
- if isinstance(sub_value, dict) and 'sum' in sub_value:
591
- # Extracts year from 'total_deal_value_2025' -> '2025'
592
- year = sub_key.split('_')[-1]
593
- value = sub_value['sum']
594
- plot_data.append({{'Category': category, 'Year': year, 'Value': value}})
595
-
596
- if plot_data:
597
- df = pd.DataFrame(plot_data)
598
- sns.barplot(data=df, x='Category', y='Value', hue='Year', ax=ax)
599
- ax.set_ylabel('Total Deal Value')
600
- ax.set_xlabel('Business Model')
601
- # Correct way to rotate labels to prevent errors
602
- plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
603
- else:
604
- ax.text(0.5, 0.5, 'No nested data found to plot.', ha='center')
605
- else:
606
- ax.text(0.5, 0.5, 'No data in buckets to plot.', ha='center')
607
-
608
- ax.set_title('Your Insightful Title Here')
609
- plt.tight_layout()
610
- ```
611
- ---
612
- **3. Final Code Generation:**
613
- - **DO NOT** include `plt.show()`.
614
- - **DO** set a dynamic and descriptive `ax.set_title()`, `ax.set_xlabel()`, and `ax.set_ylabel()`.
615
- - **DO NOT** wrap the code in ```python ... ```. Output only the raw Python code.
616
- - Adapt the chosen template to the specific keys and metrics in the provided `facet_data`.
617
-
618
- **Your Task:**
619
- Now, generate the Python code.
620
- """
621
- try:
622
- # Increase the timeout for potentially complex generation
623
- generation_config = genai.types.GenerationConfig(temperature=0, max_output_tokens=2048)
624
- response = llm_model.generate_content(prompt, generation_config=generation_config)
625
- # Clean the response to remove markdown formatting
626
- code = re.sub(r'^```python\s*|```$', '', response.text, flags=re.MULTILINE)
627
- return code
628
- except Exception as e:
629
- print(f"Error in llm_generate_visualization_code: {e}\nRaw response: {response.text}")
630
- return None
631
 
632
- def execute_viz_code_and_get_path(viz_code, facet_data):
633
- """Executes visualization code and returns the path to the saved plot image."""
634
- if not viz_code: return None
635
  try:
636
- if not os.path.exists('/tmp/plots'): os.makedirs('/tmp/plots')
637
- plot_path = f"/tmp/plots/plot_{datetime.datetime.now().timestamp()}.png"
638
- # The exec environment needs access to the required libraries and the data
639
- exec_globals = {'facet_data': facet_data, 'plt': plt, 'sns': sns, 'pd': pd}
640
- exec(viz_code, exec_globals)
641
- fig = exec_globals.get('fig')
642
- if fig:
643
- fig.savefig(plot_path, bbox_inches='tight')
644
- plt.close(fig) # Important to free up memory
645
- return plot_path
646
- return None
647
  except Exception as e:
648
- print(f"ERROR executing visualization code: {e}\n---Code---\n{viz_code}")
649
- return None
650
-
651
-
652
- def process_analysis_flow(user_input, history, state):
653
- """
654
- A generator that manages the conversation and yields tuples of UI updates for Gradio.
655
- This version uses the dual-query (quantitative/qualitative) approach.
656
- """
657
- if state is None:
658
- state = {'query_count': 0, 'last_suggestions': []}
659
- if history is None:
660
- history = []
661
-
662
- # Reset UI for new analysis
663
- yield (history, state, gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False))
664
-
665
- query_context = user_input.strip()
666
- if not query_context:
667
- history.append((user_input, "Please enter a question to analyze."))
668
- yield (history, state, None, None, None, None, None)
669
- return
670
-
671
- # 1. Acknowledge and generate plan
672
- history.append((user_input, f"Analyzing: '{query_context}'\n\n*Generating analysis plan...*"))
673
- yield (history, state, None, None, None, None, None)
674
-
675
- analysis_plan = llm_generate_analysis_plan_with_history(query_context, field_metadata, history)
676
- if not analysis_plan:
677
- history.append((None, "I'm sorry, I couldn't generate a valid analysis plan for that request. Please try rephrasing."))
678
- yield (history, state, None, None, None, None, None)
679
- return
680
-
681
- history.append((None, "✅ Analysis plan generated!"))
682
- plan_summary = f"""
683
- * **Analysis Dimension:** `{analysis_plan.get('analysis_dimension')}`
684
- * **Analysis Measure:** `{analysis_plan.get('analysis_measure')}`
685
- * **Query Filter:** `{analysis_plan.get('query_filter')}`
686
- """
687
- # Show the plan summary in the main chat
688
- history.append((None, plan_summary))
689
- # Put the full plan in the accordion
690
- formatted_plan = f"**Full Analysis Plan:**\n```json\n{json.dumps(analysis_plan, indent=2)}\n```"
691
- yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None)
692
-
693
-
694
- # 2. Execute Queries in Parallel
695
- history.append((None, "*Executing queries for aggregates and examples...*"))
696
- yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None)
697
-
698
- aggregate_data = None
699
- example_data = None
700
- with concurrent.futures.ThreadPoolExecutor() as executor:
701
- future_agg = executor.submit(execute_quantitative_query, analysis_plan, solr_client)
702
- future_ex = executor.submit(execute_qualitative_query, analysis_plan, solr_client)
703
- aggregate_data = future_agg.result()
704
- example_data = future_ex.result()
705
-
706
- if not aggregate_data or aggregate_data.get('count', 0) == 0:
707
- history.append((None, "No data was found for your query. Please try a different question."))
708
- yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None)
709
- return
710
-
711
- # Display retrieved data in accordions
712
- formatted_agg_data = f"**Quantitative (Aggregate) Data:**\n```json\n{json.dumps(aggregate_data, indent=2)}\n```"
713
- formatted_qual_data = f"**Qualitative (Example) Data:**\n```json\n{json.dumps(example_data, indent=2)}\n```"
714
- qual_data_display_update = gr.update(value=formatted_qual_data, visible=True)
715
- yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), gr.update(value=formatted_agg_data, visible=True), qual_data_display_update)
716
-
717
-
718
- # 3. Generate Visualization (in parallel with report)
719
- history.append((None, "βœ… Data retrieved. Generating visualization and final report..."))
720
- yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), gr.update(value=formatted_agg_data, visible=True), qual_data_display_update)
721
-
722
- with concurrent.futures.ThreadPoolExecutor() as executor:
723
- viz_future = executor.submit(llm_generate_visualization_code, query_context, aggregate_data)
724
-
725
- # 4. Generate and Stream Enriched Report
726
- report_text = ""
727
- stream_history = history[:]
728
- for chunk in llm_synthesize_enriched_report_stream(query_context, aggregate_data, example_data, analysis_plan):
729
- report_text += chunk
730
- yield (stream_history, state, None, gr.update(value=report_text, visible=True), gr.update(value=formatted_plan, visible=True), gr.update(value=formatted_agg_data, visible=True), qual_data_display_update)
731
-
732
- history.append((None, report_text))
733
-
734
- # Get visualization from future
735
- viz_code = viz_future.result()
736
- plot_path = execute_viz_code_and_get_path(viz_code, aggregate_data)
737
- output_plot = gr.update(value=plot_path, visible=True) if plot_path else gr.update(visible=False)
738
- if not plot_path:
739
- history.append((None, "*I was unable to generate a plot for this data.*\n"))
740
-
741
- yield (history, state, output_plot, report_text, gr.update(value=formatted_plan, visible=True), gr.update(value=formatted_agg_data, visible=True), qual_data_display_update)
742
-
743
- # 5. Finalize
744
- state['query_count'] += 1
745
- state['last_suggestions'] = parse_suggestions_from_report(report_text)
746
- next_prompt = "Analysis complete. What would you like to explore next?"
747
- history.append((None, next_prompt))
748
- yield (history, state, output_plot, report_text, gr.update(value=formatted_plan, visible=True), gr.update(value=formatted_agg_data, visible=True), qual_data_display_update)
749
-
750
-
751
- # --- Gradio UI ---
752
- with gr.Blocks(theme=gr.themes.Soft(), css="footer {display: none !important}") as demo:
753
- state = gr.State()
754
-
755
- with gr.Row():
756
- with gr.Column(scale=4):
757
- gr.Markdown("# πŸ’Š PharmaCircle AI Data Analyst")
758
- with gr.Column(scale=1):
759
- clear_button = gr.Button("πŸ”„ Start New Analysis", variant="primary")
760
-
761
- gr.Markdown("Ask a question to begin your analysis. I will generate an analysis plan, retrieve quantitative and qualitative data, create a visualization, and write an enriched report.")
762
-
763
- with gr.Row():
764
- with gr.Column(scale=1):
765
- chatbot = gr.Chatbot(label="Analysis Chat Log", height=700, show_copy_button=True)
766
- msg_textbox = gr.Textbox(placeholder="Ask a question, e.g., 'Show me the top 5 companies by total deal value in 2023'", label="Your Question", interactive=True)
767
-
768
- with gr.Column(scale=2):
769
- with gr.Accordion("Generated Analysis Plan", open=False):
770
- plan_display = gr.Markdown("Plan will appear here...", visible=True)
771
- with gr.Accordion("Retrieved Quantitative Data", open=False):
772
- quantitative_data_display = gr.Markdown("Aggregate data will appear here...", visible=False)
773
- with gr.Accordion("Retrieved Qualitative Data (Examples)", open=False):
774
- qualitative_data_display = gr.Markdown("Example data will appear here...", visible=False)
775
- plot_display = gr.Image(label="Visualization", type="filepath", visible=False)
776
- report_display = gr.Markdown("Report will be streamed here...", visible=False)
777
-
778
- # --- Event Wiring ---
779
- def reset_all():
780
- """Resets the entire UI for a new analysis session."""
781
- return (
782
- [], # chatbot
783
- None, # state
784
- "", # msg_textbox
785
- gr.update(value=None, visible=False), # plot_display
786
- gr.update(value=None, visible=False), # report_display
787
- gr.update(value=None, visible=False), # plan_display
788
- gr.update(value=None, visible=False), # quantitative_data_display
789
- gr.update(value=None, visible=False) # qualitative_data_display
790
- )
791
-
792
- msg_textbox.submit(
793
- fn=process_analysis_flow,
794
- inputs=[msg_textbox, chatbot, state],
795
- outputs=[chatbot, state, plot_display, report_display, plan_display, quantitative_data_display, qualitative_data_display],
796
- ).then(
797
- lambda: gr.update(value=""),
798
- None,
799
- [msg_textbox],
800
- queue=False,
801
- )
802
-
803
- clear_button.click(
804
- fn=reset_all,
805
- inputs=None,
806
- outputs=[chatbot, state, msg_textbox, plot_display, report_display, plan_display, quantitative_data_display, qualitative_data_display],
807
- queue=False
808
- )
809
-
810
- if is_initialized:
811
- demo.queue().launch(debug=True, share=True)
812
- else:
813
- print("\nSkipping Gradio launch due to initialization errors.")
 
1
+ """
2
+ Main entry point for the PharmaCircle AI Data Analyst application.
3
+ This script initializes the necessary services and launches the Gradio user interface.
4
+ It has been refactored to be a lean entry point, delegating all complex logic
5
+ to specialized modules.
6
+ """
 
 
 
 
 
 
7
  import logging
8
+ import connections
9
+ from ui import create_ui
 
 
10
 
11
  # --- Suppress Matplotlib Debug Logs ---
12
  logging.getLogger('matplotlib').setLevel(logging.WARNING)
13
 
14
def main():
    """
    Initializes connections and launches the Gradio application.

    Brings up the SSH tunnel, Solr client, and LLM model through the
    ``connections`` module, builds the Gradio UI, and guarantees the SSH
    tunnel is torn down when the application exits.
    """
    tunnel, solr, model = connections.initialize_connections()

    # Bail out early if any dependency failed to come up.
    if not all([tunnel, solr, model]):
        print("\nSkipping Gradio launch due to initialization errors.")
        # Ensure the tunnel is closed if it was partially opened.
        if tunnel and tunnel.is_active:
            tunnel.stop()
        return

    # Build the UI and hand it the live service handles.
    app = create_ui(model, solr)
    try:
        app.queue().launch(debug=True, share=True)
    except Exception as e:
        print(f"An error occurred while launching the Gradio app: {e}")
    finally:
        # Always tear down the SSH tunnel, even if launch crashed.
        print("\nClosing SSH tunnel...")
        if tunnel.is_active:
            tunnel.stop()
        print("SSH tunnel closed. Exiting.")


if __name__ == "__main__":
    main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration management for the PharmaCircle AI Data Analyst.
3
+
4
+ Loads all necessary secrets and configuration parameters from environment variables.
5
+ This approach keeps sensitive data out of the source code and allows for flexible deployment.
6
+ """
7
+
8
+ import os
9
+
10
+ # --- SSH Tunnel Configuration ---
11
+ # Credentials for establishing the SSH tunnel to the Solr server environment.
12
+ SSH_HOST = os.environ.get('SSH_HOST')
13
+ SSH_PORT = 5322
14
+ SSH_USER = os.environ.get('SSH_USER')
15
+ SSH_PASS = os.environ.get('SSH_PASS')
16
+
17
+ # --- Solr Configuration ---
18
+ # Details for the remote Solr instance and the local port for the tunnel.
19
+ REMOTE_SOLR_HOST = '69.167.186.48'
20
+ REMOTE_SOLR_PORT = 8983
21
+ LOCAL_BIND_PORT = 8983
22
+ SOLR_CORE_NAME = 'news'
23
+ SOLR_USER = os.environ.get('SOLR_USER')
24
+ SOLR_PASS = os.environ.get('SOLR_PASS')
25
+
26
+ # --- Google Gemini Configuration ---
27
+ # API key for accessing the Google Gemini large language model.
28
+ GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY')
connections.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Manages connections to external services: SSH, Solr, and Google Gemini.
3
+
4
+ This module centralizes the initialization logic, making the main application
5
+ cleaner and more focused on its primary task. It provides a single function
6
+ to set up all necessary connections.
7
+ """
8
+
9
+
10
+ import pysolr
11
+ import google.generativeai as genai
12
+ from sshtunnel import SSHTunnelForwarder
13
+ import config
14
+
15
def initialize_connections():
    """
    Establishes the SSH tunnel, and initializes Solr and Gemini clients.

    Returns:
        A tuple containing the initialized (ssh_tunnel_server, solr_client, llm_model).
        Returns (None, None, None) if any part of the initialization fails.
    """
    tunnel = None
    try:
        # Open an SSH tunnel that forwards a local port to the remote Solr host.
        tunnel = SSHTunnelForwarder(
            (config.SSH_HOST, config.SSH_PORT),
            ssh_username=config.SSH_USER,
            ssh_password=config.SSH_PASS,
            remote_bind_address=(config.REMOTE_SOLR_HOST, config.REMOTE_SOLR_PORT),
            local_bind_address=('127.0.0.1', config.LOCAL_BIND_PORT),
        )
        tunnel.start()
        print(f"🚀 SSH tunnel established: Local Port {tunnel.local_bind_port} -> Remote Solr.")

        # Point pysolr at the local end of the tunnel and verify connectivity.
        solr_url = f'http://127.0.0.1:{tunnel.local_bind_port}/solr/{config.SOLR_CORE_NAME}'
        solr = pysolr.Solr(solr_url, auth=(config.SOLR_USER, config.SOLR_PASS), always_commit=True)
        solr.ping()
        print(f"✅ Solr connection successful on core '{config.SOLR_CORE_NAME}'.")

        # Configure Gemini with a deterministic (temperature 0) model.
        genai.configure(api_key=config.GEMINI_API_KEY)
        model = genai.GenerativeModel('gemini-2.5-flash', generation_config=genai.types.GenerationConfig(temperature=0))
        print(f"✅ LLM Model '{model.model_name}' initialized.")

        print("✅ System Initialized Successfully.")
        return tunnel, solr, model

    except Exception as e:
        print(f"\n❌ An error occurred during setup: {e}")
        # Close the tunnel if it was opened before the failure occurred.
        if tunnel and tunnel.is_active:
            tunnel.stop()
        return None, None, None
data_processing.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Core data processing and analysis logic for the PharmaCircle AI Data Analyst.
3
+
4
+ This module orchestrates the main analysis workflow:
5
+ 1. Takes a user's natural language query.
6
+ 2. Uses the LLM to generate a structured analysis plan.
7
+ 3. Executes parallel queries against Solr for quantitative and qualitative data.
8
+ 4. Generates a data visualization using the LLM.
9
+ 5. Synthesizes the findings into a comprehensive, user-facing report.
10
+ """
11
+
12
+
13
+ import json
14
+ import re
15
+ import datetime
16
+ import pandas as pd
17
+ import matplotlib.pyplot as plt
18
+ import seaborn as sns
19
+ import os
20
+ import concurrent.futures
21
+ import copy
22
+ import google.generativeai as genai
23
+
24
+ from llm_prompts import (
25
+ get_analysis_plan_prompt,
26
+ get_synthesis_report_prompt,
27
+ get_visualization_code_prompt
28
+ )
29
+
30
def parse_suggestions_from_report(report_text):
    """Extracts numbered suggestions from the report's markdown text."""
    # Locate the follow-up section; everything after its heading is candidate text.
    section = re.search(
        r"### (?:Deeper Dive: Suggested Follow-up Analyses|Suggestions for Further Exploration)\s*\n(.*?)$",
        report_text,
        re.DOTALL | re.IGNORECASE,
    )
    if not section:
        return []
    # Pull out every "1. ..." style numbered line and strip surrounding whitespace.
    numbered_items = re.findall(r"^\s*\d+\.\s*(.*)", section.group(1), re.MULTILINE)
    return [item.strip() for item in numbered_items]
37
+
38
def llm_generate_analysis_plan_with_history(llm_model, natural_language_query, chat_history):
    """
    Generates a complete analysis plan from a user query, considering chat history.

    Args:
        llm_model: Initialized Gemini model used to generate the plan.
        natural_language_query: The user's question in plain English.
        chat_history: List of (user, bot) message tuples for conversational context.

    Returns:
        The parsed analysis-plan dict, or None if generation or JSON parsing failed.
    """
    prompt = get_analysis_plan_prompt(natural_language_query, chat_history)
    # Explicit sentinel instead of the fragile `'response' in locals()` check,
    # so the error handler can safely report the raw LLM output.
    response = None
    try:
        response = llm_model.generate_content(prompt)
        # Strip any ```json fences the model may have wrapped around the plan.
        cleaned_text = re.sub(r'```json\s*|\s*```', '', response.text, flags=re.MULTILINE | re.DOTALL).strip()
        return json.loads(cleaned_text)
    except Exception as e:
        # `response` is None when generate_content itself raised.
        raw_response_text = response.text if response is not None else 'N/A'
        print(f"Error in llm_generate_analysis_plan_with_history: {e}\nRaw Response:\n{raw_response_text}")
        return None
52
+
53
def execute_quantitative_query(solr_client, plan):
    """Executes the facet query to get aggregate data.

    Args:
        solr_client: pysolr.Solr-compatible client used to run the search.
        plan: Analysis plan dict; must contain quantitative_request["json.facet"].

    Returns:
        The "facets" section of the Solr response, or None on error or a bad plan.
    """
    if not plan or 'quantitative_request' not in plan or 'json.facet' not in plan.get('quantitative_request', {}):
        return None
    try:
        params = {
            # "*:*" is Solr's match-all query, used when the plan supplies no
            # filter. (Fixes the previous "*_*" default, which was a wildcard
            # term query against the default field, not match-all.)
            "q": plan.get('query_filter', '*:*'),
            "rows": 0,  # Aggregates only; no documents needed.
            "json.facet": json.dumps(plan['quantitative_request']['json.facet'])
        }
        results = solr_client.search(**params)
        return results.raw_response.get("facets", {})
    except Exception as e:
        print(f"Error in quantitative query: {e}")
        return None
68
+
69
def execute_qualitative_query(solr_client, plan):
    """Executes the grouping query to get the best example docs.

    Args:
        solr_client: pysolr.Solr-compatible client used to run the search.
        plan: Analysis plan dict; must contain a "qualitative_request" mapping.

    Returns:
        The grouped section of the Solr response, or None on error or a bad plan.
    """
    if not plan or 'qualitative_request' not in plan:
        return None
    try:
        # Deep-copy so merging the request into params never mutates the caller's plan.
        qual_request = copy.deepcopy(plan['qualitative_request'])
        params = {
            # "*:*" is Solr's match-all query, used when the plan supplies no
            # filter. (Fixes the previous "*_*" default, which was a wildcard
            # term query against the default field, not match-all.)
            "q": plan.get('query_filter', '*:*'),
            "rows": 3,  # Get a few examples per group
            "fl": "*,score",
            **qual_request
        }
        results = solr_client.search(**params)
        return results.grouped
    except Exception as e:
        print(f"Error in qualitative query: {e}")
        return None
86
+
87
def llm_synthesize_enriched_report_stream(llm_model, query, quantitative_data, qualitative_data, plan):
    """
    Generates an enriched report by synthesizing quantitative aggregates
    and qualitative examples, and streams the result.

    Yields:
        Text chunks of the report as the model produces them, or a single
        apology message if generation fails.
    """
    prompt = get_synthesis_report_prompt(query, quantitative_data, qualitative_data, plan)
    try:
        # Relay chunks to the caller as soon as the model emits them.
        for piece in llm_model.generate_content(prompt, stream=True):
            yield piece.text
    except Exception as e:
        print(f"Error in llm_synthesize_enriched_report_stream: {e}")
        yield "Sorry, I was unable to generate a report for this data."
100
+
101
def llm_generate_visualization_code(llm_model, query_context, facet_data):
    """Generates Python code for visualization based on query and data.

    Args:
        llm_model: Initialized Gemini model.
        query_context: The user's analytical question, used to guide the chart.
        facet_data: Aggregated Solr facet data the chart should visualize.

    Returns:
        Raw Python source for the plot (markdown fences stripped), or None on error.
    """
    prompt = get_visualization_code_prompt(query_context, facet_data)
    # Explicit sentinel: the previous handler referenced `response` unconditionally,
    # raising NameError whenever generate_content itself failed.
    response = None
    try:
        generation_config = genai.types.GenerationConfig(temperature=0, max_output_tokens=2048)
        response = llm_model.generate_content(prompt, generation_config=generation_config)
        # Strip ```python fences so the result can be exec'd directly.
        code = re.sub(r'^```python\s*|```$', '', response.text, flags=re.MULTILINE)
        return code
    except Exception as e:
        raw_response_text = response.text if response is not None else 'N/A'
        print(f"Error in llm_generate_visualization_code: {e}\nRaw response: {raw_response_text}")
        return None
112
+
113
def execute_viz_code_and_get_path(viz_code, facet_data):
    """Executes visualization code and returns the path to the saved plot image.

    SECURITY NOTE: `viz_code` is LLM-generated Python executed via exec().
    This is inherent to the app's design, but such code must never come from
    untrusted end users without sandboxing.

    Args:
        viz_code: Python source expected to define a Matplotlib figure named `fig`.
        facet_data: Aggregate data injected into the exec namespace.

    Returns:
        Filesystem path of the saved PNG, or None if there is no code,
        no `fig` was produced, or execution failed.
    """
    if not viz_code:
        return None
    try:
        # exist_ok avoids the check-then-create race of the previous version.
        os.makedirs('/tmp/plots', exist_ok=True)
        plot_path = f"/tmp/plots/plot_{datetime.datetime.now().timestamp()}.png"
        # Expose only the data and plotting libraries the generated code may use.
        exec_globals = {'facet_data': facet_data, 'plt': plt, 'sns': sns, 'pd': pd}
        exec(viz_code, exec_globals)
        fig = exec_globals.get('fig')
        if fig:
            fig.savefig(plot_path, bbox_inches='tight')
            plt.close(fig)  # Free figure memory; callers read the file from disk.
            return plot_path
        return None
    except Exception as e:
        print(f"ERROR executing visualization code: {e}\n---Code---\n{viz_code}")
        return None
130
+
llm_prompts.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Contains the prompt templates for interacting with the Gemini LLM.
3
+
4
+ Separating prompts from the application logic makes them easier to manage,
5
+ modify, and version. This module provides functions that return the formatted
6
+ prompt strings required by the data processing module.
7
+ """
8
+
9
+
10
+ import datetime
11
+ import json
12
+ from solr_metadata import format_metadata_for_prompt
13
+
14
def get_analysis_plan_prompt(natural_language_query, chat_history) -> str:
    """
    Generates the prompt for creating a Solr analysis plan from a user query.

    Args:
        natural_language_query: The user's current question.
        chat_history: List of (user, bot) message tuples; only the user turns
            are embedded in the prompt.

    Returns:
        The fully formatted prompt string for the Gemini model.
    """
    formatted_field_info = format_metadata_for_prompt()
    formatted_history = ""
    # Only user messages are included; bot replies are omitted to keep the prompt compact.
    for user_msg, bot_msg in chat_history:
        if user_msg:
            formatted_history += f"- User: \"{user_msg}\"\n"

    return f"""
You are an expert data analyst and Solr query engineer. Your task is to convert a natural language question into a structured JSON "Analysis Plan". This plan will be used to run two separate, efficient queries: one for aggregate data (facets) and one for finding illustrative examples (grouping).

---
### CONTEXT & RULES

1. **Today's Date for Calculations**: {datetime.datetime.now().date().strftime("%Y-%m-%d")}
2. **Field Usage**: You MUST use the fields described in the 'Field Definitions'. Pay close attention to the definitions to select the correct field, especially the `_s` fields for searching. Do not use fields ending with `_s` in `group.field` or facet `field` unless necessary for the analysis.
3. **Dimension vs. Measure**:
    * `analysis_dimension`: The primary categorical field the user wants to group by (e.g., `company_name`, `route_branch`). This is the `group by` field.
    * `analysis_measure`: The metric to aggregate (e.g., `sum(total_deal_value_in_million)`) or the method of counting (`count`).
    * `sort_field_for_examples`: The raw field used to find the "best" example. If `analysis_measure` is `sum(field)`, this should be `field`. If `analysis_measure` is `count`, this should be a relevant field like `date`.
4. **Crucial Sorting Rules**:
    * For `group.sort`: If `analysis_measure` involves a function on a field (e.g., `sum(total_deal_value_in_million)`), you MUST use the full function: `group.sort: 'sum(total_deal_value_in_million) desc'`.
    * If `analysis_measure` is 'count', you MUST OMIT the `group.sort` parameter entirely.
    * For sorting, NEVER use 'date_year'; use 'date' instead.
5. **Output Format**: Your final output must be a single, raw JSON object. Do not add comments or markdown formatting.

---
### FIELD DEFINITIONS (Your Source of Truth)

{formatted_field_info}
---
### CHAT HISTORY
{formatted_history}
---
### EXAMPLES

**User Query 1:** "What are the top 5 companies by total deal value in 2023?"
**Correct JSON Output 1:**
```json
{{
  "analysis_dimension": "company_name",
  "analysis_measure": "sum(total_deal_value_in_million)",
  "sort_field_for_examples": "total_deal_value_in_million",
  "query_filter": "date_year:2023 AND total_deal_value_in_million:[0 TO *]",
  "quantitative_request": {{
    "json.facet": {{
      "companies_by_deal_value": {{
        "type": "terms",
        "field": "company_name",
        "limit": 5,
        "sort": "total_value desc",
        "facet": {{
          "total_value": "sum(total_deal_value_in_million)"
        }}
      }}
    }}
  }},
  "qualitative_request": {{
    "group": true,
    "group.field": "company_name",
    "group.limit": 1,
    "group.sort": "sum(total_deal_value_in_million) desc",
    "sort": "total_deal_value_in_million desc"
  }}
}}
```

**User Query 2:** "What are the most common news types for infections this year?"
**Correct JSON Output 2:**
```json
{{
  "analysis_dimension": "news_type",
  "analysis_measure": "count",
  "sort_field_for_examples": "date",
  "query_filter": "therapeutic_category_s:infections AND date_year:{datetime.datetime.now().year}",
  "quantitative_request": {{
    "json.facet": {{
      "news_by_type": {{
        "type": "terms",
        "field": "news_type",
        "limit": 10,
        "sort": "count desc"
      }}
    }}
  }},
  "qualitative_request": {{
    "group": true,
    "group.field": "news_type",
    "group.limit": 1,
    "sort": "date desc"
  }}
}}
```
---
### YOUR TASK

Convert the following user query into a single, raw JSON "Analysis Plan" object, strictly following all rules and considering the chat history.

**Current User Query:** `{natural_language_query}`
"""
116
+
117
def get_synthesis_report_prompt(query, quantitative_data, qualitative_data, plan):
    """
    Generates the prompt for synthesizing a final report from the query results.

    Args:
        query: The user's original question.
        quantitative_data: Solr facet aggregates (the 'what').
        qualitative_data: Solr grouped results holding example docs (the 'why').
        plan: The analysis plan that produced both result sets.

    Returns:
        The fully formatted synthesis prompt string.
    """
    qualitative_prompt_str = ""
    dimension = plan.get('analysis_dimension', 'N/A')
    if qualitative_data and dimension in qualitative_data:
        for group in qualitative_data.get(dimension, {}).get('groups', []):
            group_value = group.get('groupValue', 'N/A')
            if group.get('doclist', {}).get('docs'):
                doc = group.get('doclist', {}).get('docs', [{}])[0]
                # Solr multi-valued fields come back as lists; unwrap for display.
                # (Fixes the title previously rendering as "['...']" in the prompt.)
                title_raw = doc.get('abstract', ['No Title'])
                title = title_raw[0] if isinstance(title_raw, list) else title_raw
                content_list = doc.get('content', [])
                content_snip = (' '.join(content_list[0].split()[:40]) + '...') if content_list else 'No content available.'
                metric_val_raw = doc.get(plan.get('sort_field_for_examples'), 'N/A')
                metric_val = metric_val_raw[0] if isinstance(metric_val_raw, list) else metric_val_raw

                qualitative_prompt_str += f"- **For category `{group_value}`:**\n"
                qualitative_prompt_str += f"  - **Top Example Title:** {title}\n"
                qualitative_prompt_str += f"  - **Metric Value:** {metric_val}\n"
                qualitative_prompt_str += f"  - **Content Snippet:** {content_snip}\n\n"

    return f"""
You are a top-tier business intelligence analyst. Your task is to write an insightful, data-driven report for an executive. You must synthesize quantitative data (the 'what') with qualitative examples (the 'why') to tell a complete story.

---
### AVAILABLE INFORMATION

**1. The User's Core Question:**
\"{query}\"

**2. Quantitative Data (The 'What'):**
This data shows the high-level aggregates.
```json
{json.dumps(quantitative_data, indent=2)}
```

**3. Qualitative Data (The 'Why'):**
These are the single most significant documents driving the numbers for each category.
{qualitative_prompt_str}

---
### REPORTING INSTRUCTIONS

Your report must be in clean, professional Markdown and follow this structure precisely.

**Report Structure:**

`## Executive Summary`
- A 1-2 sentence, top-line answer to the user's question based on the quantitative data.

`### Key Findings`
- Use bullet points to highlight the main figures from the quantitative data. Interpret the numbers.

`### Key Drivers & Illustrative Examples`
- **This is the most important section.** Explain the "so what?" behind the numbers.
- Use the qualitative examples to explain *why* a category is high or low. Reference the top example document for each main category.

`### Deeper Dive: Suggested Follow-up Analyses`
- Propose 2-3 logical next questions based on your analysis to uncover deeper trends.

---
**Generate the full report now, paying close attention to all formatting and spacing rules.**
"""
181
+
182
def get_visualization_code_prompt(query_context, facet_data) -> str:
    """
    Generates the prompt for creating Python visualization code.

    Args:
        query_context: The user's analytical goal, embedded in the prompt.
        facet_data: Aggregated Solr facet data serialized (as JSON) into the prompt.

    Returns:
        The fully formatted visualization-code prompt string, including the
        three parsing templates the model must choose from.
    """
    return f"""
You are a Python Data Visualization expert specializing in Matplotlib and Seaborn.
Your task is to generate robust, error-free Python code to create a single, insightful visualization based on the user's query and the provided Solr facet data.

**User's Analytical Goal:**
\"{query_context}\"

**Aggregated Data (from Solr Facets):**
```json
{json.dumps(facet_data, indent=2)}
```

---
### **CRITICAL INSTRUCTIONS: CODE GENERATION RULES**
You MUST follow these rules to avoid errors.

**1. Identify the Data Structure FIRST:**
Before writing any code, analyze the `facet_data` JSON to determine its structure. There are three common patterns. Choose the correct template below.

* **Pattern A: Simple `terms` Facet.** The JSON has ONE main key (besides "count") which contains a list of "buckets". Each bucket has a "val" and a "count". Use this for standard bar charts.
* **Pattern B: Multiple `query` Facets.** The JSON has MULTIPLE keys (besides "count"), and each key is an object containing metrics like "count" or "sum(...)". Use this for comparing a few distinct items (e.g., "oral vs injection").
* **Pattern C: Nested `terms` Facet.** The JSON has one main key with a list of "buckets", but inside EACH bucket, there are nested metric objects. This is used for grouped comparisons (e.g., "compare 2024 vs 2025 across categories"). This almost always requires `pandas`.

**2. Use the Correct Parsing Template:**

---
**TEMPLATE FOR PATTERN A (Simple Bar Chart from `terms` facet):**
```python
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(12, 8))

# Dynamically find the main facet key (the one with 'buckets')
facet_key = None
for key, value in facet_data.items():
    if isinstance(value, dict) and 'buckets' in value:
        facet_key = key
        break

if facet_key:
    buckets = facet_data[facet_key].get('buckets', [])
    # Check if buckets contain data
    if buckets:
        df = pd.DataFrame(buckets)
        # Check for a nested metric or use 'count'
        if 'total_deal_value' in df.columns and pd.api.types.is_dict_like(df['total_deal_value'].iloc):
            # Example for nested sum metric
            df['value'] = df['total_deal_value'].apply(lambda x: x.get('sum', 0))
            y_axis_label = 'Sum of Total Deal Value'
        else:
            df.rename(columns={{'count': 'value'}}, inplace=True)
            y_axis_label = 'Count'

        sns.barplot(data=df, x='val', y='value', ax=ax, palette='viridis')
        ax.set_xlabel('Category')
        ax.set_ylabel(y_axis_label)
    else:
        ax.text(0.5, 0.5, 'No data in buckets to plot.', ha='center')


ax.set_title('Your Insightful Title Here')
# Correct way to rotate labels to prevent errors
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.tight_layout()
```
---
**TEMPLATE FOR PATTERN B (Comparison Bar Chart from `query` facets):**
```python
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(10, 6))

labels = []
values = []
# Iterate through top-level keys, skipping the 'count'
for key, data_dict in facet_data.items():
    if key == 'count' or not isinstance(data_dict, dict):
        continue
    # Extract the label (e.g., 'oral_deals' -> 'Oral')
    label = key.replace('_deals', '').replace('_', ' ').title()
    # Find the metric value, which is NOT 'count'
    metric_value = 0
    for sub_key, sub_value in data_dict.items():
        if sub_key != 'count':
            metric_value = sub_value
            break # Found the metric
    labels.append(label)
    values.append(metric_value)

if labels:
    sns.barplot(x=labels, y=values, ax=ax, palette='mako')
    ax.set_ylabel('Total Deal Value') # Or other metric name
    ax.set_xlabel('Category')
else:
    ax.text(0.5, 0.5, 'No query facet data to plot.', ha='center')


ax.set_title('Your Insightful Title Here')
plt.tight_layout()
```
---
**TEMPLATE FOR PATTERN C (Grouped Bar Chart from nested `terms` facet):**
```python
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(14, 8))

# Find the key that has the buckets
facet_key = None
for key, value in facet_data.items():
    if isinstance(value, dict) and 'buckets' in value:
        facet_key = key
        break

if facet_key and facet_data[facet_key].get('buckets'):
    # This list comprehension is robust for parsing nested metrics
    plot_data = []
    for bucket in facet_data[facet_key]['buckets']:
        category = bucket['val']
        # Find all nested metrics (e.g., total_deal_value_2025)
        for sub_key, sub_value in bucket.items():
            if isinstance(sub_value, dict) and 'sum' in sub_value:
                # Extracts year from 'total_deal_value_2025' -> '2025'
                year = sub_key.split('_')[-1]
                value = sub_value['sum']
                plot_data.append({{'Category': category, 'Year': year, 'Value': value}})

    if plot_data:
        df = pd.DataFrame(plot_data)
        sns.barplot(data=df, x='Category', y='Value', hue='Year', ax=ax)
        ax.set_ylabel('Total Deal Value')
        ax.set_xlabel('Business Model')
        # Correct way to rotate labels to prevent errors
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
    else:
        ax.text(0.5, 0.5, 'No nested data found to plot.', ha='center')
else:
    ax.text(0.5, 0.5, 'No data in buckets to plot.', ha='center')

ax.set_title('Your Insightful Title Here')
plt.tight_layout()
```
---
**3. Final Code Generation:**
- **DO NOT** include `plt.show()`.
- **DO** set a dynamic and descriptive `ax.set_title()`, `ax.set_xlabel()`, and `ax.set_ylabel()`.
- **DO NOT** wrap the code in ```python ... ```. Output only the raw Python code.
- Adapt the chosen template to the specific keys and metrics in the provided `facet_data`.

**Your Task:**
Now, generate the Python code.
"""
+ """
solr_metadata.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Stores the detailed metadata for the Solr index fields.
3
+
4
+ This information is crucial for the LLM to understand the data schema,
5
+ enabling it to construct accurate and efficient Solr queries. Separating it
6
+ into its own module keeps the main application logic cleaner.
7
+ """
8
+
9
+ field_metadata = [
10
+ {
11
+ "field_name": "business_model",
12
+ "type": "string (categorical)",
13
+ "example_values": ["pharma/bio", "drug delivery", "pharma services"],
14
+ "definition": "The primary business category of the company involved in the news. Use for filtering by high-level industry segments."
15
+ },
16
+ {
17
+ "field_name": "news_type",
18
+ "type": "string (categorical)",
19
+ "example_values": ["product news", "financial news", "regulatory news"],
20
+ "definition": "The category of the news article itself (e.g., financial, regulatory, acquisition). Use for filtering by the type of event being reported."
21
+ },
22
+ {
23
+ "field_name": "event_type",
24
+ "type": "string (categorical)",
25
+ "example_values": ["phase 2", "phase 1", "pre clinical", "marketed"],
26
+ "definition": "The clinical or developmental stage of a product or event discussed in the article. Essential for queries about clinical trial phases."
27
+ },
28
+ {
29
+ "field_name": "source",
30
+ "type": "string (categorical)",
31
+ "example_values": ["Press Release", "PR Newswire", "Business Wire"],
32
+ "definition": "The original source of the news article, such as a newswire or official report."
33
+ },
34
+ {
35
+ "field_name": "company_name",
36
+ "type": "string (exact match, for faceting)",
37
+ "example_values": ["pfizer inc.", "astrazeneca plc", "roche"],
38
+ "definition": "The canonical, standardized name of a company. **Crucially, you MUST use this field for `terms` faceting** to group results by a unique company. Do NOT use this for searching."
39
+ },
40
+ {
41
+ "field_name": "company_name_s",
42
+ "type": "string (multi-valued, for searching)",
43
+ "example_values": ["pfizer inc.", "roche", "f. hoffmann-la roche ag", "nih"],
44
+ "definition": "A field containing all known names and synonyms for a company. **You MUST use this field for all `query` parameter searches involving a company name** to ensure comprehensive results. Do NOT use for `terms` faceting."
45
+ },
46
+ {
47
+ "field_name": "territory_hq_s",
48
+ "type": "string (multi-valued, hierarchical)",
49
+ "example_values": ["united states of america", "europe", "europe western"],
50
+ "definition": "The geographic location (country and continent) of a company's headquarters. It is hierarchical. Use for filtering by location."
51
+ },
52
+ {
53
+ "field_name": "therapeutic_category",
54
+ "type": "string (specific)",
55
+ "example_values": ["cancer, other", "cancer, nsclc metastatic", "alzheimer's"],
56
+ "definition": "The specific disease or therapeutic area being targeted. Use for very specific disease queries."
57
+ },
58
+ {
59
+ "field_name": "therapeutic_category_s",
60
+ "type": "string (multi-valued, for searching)",
61
+ "example_values": ["cancer", "oncology", "infections", "cns"],
62
+ "definition": "Broader, multi-valued therapeutic categories and their synonyms. **Use this field for broad category searches** in the `query` parameter."
63
+ },
64
+ {
65
+ "field_name": "compound_name",
66
+ "type": "string (exact match, for faceting)",
67
+ "example_values": ["opdivo injection solution", "keytruda injection solution"],
68
+ "definition": "The specific, full trade name of a drug. **Use this field for `terms` faceting** on compounds."
69
+ },
70
+ {
71
+ "field_name": "compound_name_s",
72
+ "type": "string (multi-valued, for searching)",
73
+ "example_values": ["nivolumab injection solution", "opdivo injection solution", "ono-4538 injection solution"],
74
+ "definition": "A field with all known trade names and synonyms for a drug. **Use this field for all `query` parameter searches** involving a compound name."
75
+ },
76
+ {
77
+ "field_name": "molecule_name",
78
+ "type": "string (exact match, for faceting)",
79
+ "example_values": ["cannabidiol", "paclitaxel", "pembrolizumab"],
80
+ "definition": "The generic, non-proprietary name of the active molecule. **Use this field for `terms` faceting** on molecules."
81
+ },
82
+ {
83
+ "field_name": "molecule_name_s",
84
+ "type": "string (multi-valued, for searching)",
85
+ "example_values": ["cbd", "s1-220", "a1002n5s"],
86
+ "definition": "A field with all known generic names and synonyms for a molecule. **Use this field for all `query` parameter searches** involving a molecule name."
87
+ },
88
+ {
89
+ "field_name": "highest_phase",
90
+ "type": "string (categorical)",
91
+ "example_values": ["marketed", "phase 2", "phase 1"],
92
+ "definition": "The highest stage of development a drug has ever reached."
93
+ },
94
+ {
95
+ "field_name": "drug_delivery_branch_s",
96
+ "type": "string (multi-valued, for searching)",
97
+ "example_values": ["injection", "parenteral", "oral", "injection, other", "oral, other"],
98
+ "definition": "The method of drug administration. **Use this for `query` parameter searches about route of administration** as it contains broader, search-friendly terms."
99
+ },
100
+ {
101
+ "field_name": "drug_delivery_branch",
102
+ "type": "string (categorical, specific, for faceting)",
103
+ "example_values": ["injection, other", "prefilled syringes", "np liposome", "oral enteric/delayed release"],
104
+ "definition": "The most specific category of drug delivery technology. **Use this field for `terms` faceting** on specific delivery technologies."
105
+ },
106
+ {
107
+ "field_name": "route_branch",
108
+ "type": "string (categorical)",
109
+ "example_values": ["injection", "oral", "topical", "inhalation"],
110
+ "definition": "The primary route of drug administration. Good for faceting on exact routes."
111
+ },
112
+ {
113
+ "field_name": "molecule_api_group",
114
+ "type": "string (categorical)",
115
+ "example_values": ["small molecules", "biologics", "nucleic acids"],
116
+ "definition": "High-level classification of the drug's molecular type."
117
+ },
118
+ {
119
+ "field_name": "content",
120
+ "type": "text (full-text search)",
121
+ "example_values": ["The largest study to date...", "balstilimab..."],
122
+ "definition": "The full text content of the news article. Use for keyword searches on topics not covered by other specific fields."
123
+ },
124
+ {
125
+ "field_name": "date",
126
+ "type": "date",
127
+ "example_values": ["2020-10-22T00:00:00Z"],
128
+ "definition": "The full publication date and time in ISO 8601 format. Use for precise date range queries."
129
+ },
130
+ {
131
+ "field_name": "date_year",
132
+ "type": "number (year)",
133
+ "example_values": [2020, 2021, 2022],
134
+ "definition": "The 4-digit year of publication. **Use this for queries involving whole years** (e.g., 'in 2023', 'last year', 'since 2020')."
135
+ },
136
+ {
137
+ "field_name": "total_deal_value_in_million",
138
+ "type": "number (metric)",
139
+ "example_values": [50, 120.5, 176.157, 1000],
140
+ "definition": "The total value of a financial deal, in millions of USD. This is the primary numeric field for financial aggregations (sum, avg, etc.). To use this, you must also filter for news that has a deal value, e.g., 'total_deal_value_in_million:[0 TO *]'."
141
+ }
142
+ ]
143
+
144
def format_metadata_for_prompt(fields=None):
    """Render Solr field metadata as a Markdown bullet list for LLM prompts.

    Args:
        fields: Optional iterable of field-metadata dicts, each with the
            keys 'field_name', 'type', 'definition' and 'example_values'.
            Defaults to the module-level ``field_metadata``, preserving the
            original zero-argument call signature.

    Returns:
        str: One Markdown bullet block per field, each terminated by a
        blank line. Empty string for an empty input.
    """
    if fields is None:
        fields = field_metadata
    # Collect per-field sections and join once at the end instead of
    # repeatedly concatenating with +=, which copies the growing string.
    sections = []
    for field in fields:
        examples = ', '.join(map(str, field['example_values']))
        sections.append(
            f"- **{field['field_name']}**\n"
            f" - **Type**: {field['type']}\n"
            f" - **Definition**: {field['definition']}\n"
            f" - **Examples**: {examples}\n\n"
        )
    return ''.join(sections)
ui.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Defines the Gradio user interface and manages the application's state
3
+ and event handling.
4
+
5
+ This module is responsible for the presentation layer of the application.
6
+ It creates the interactive components and orchestrates the analysis workflow
7
+ by calling functions from the data_processing module.
8
+ """
9
+
10
+
11
+ import gradio as gr
12
+ import json
13
+ import concurrent.futures
14
+ from data_processing import (
15
+ llm_generate_analysis_plan_with_history,
16
+ execute_quantitative_query,
17
+ execute_qualitative_query,
18
+ llm_synthesize_enriched_report_stream,
19
+ llm_generate_visualization_code,
20
+ execute_viz_code_and_get_path,
21
+ parse_suggestions_from_report
22
+ )
23
+
24
def create_ui(llm_model, solr_client):
    """
    Builds the Gradio UI and wires up all the event handlers.

    Args:
        llm_model: The initialized Google Gemini model client.
        solr_client: The initialized pysolr client.

    Returns:
        The assembled ``gr.Blocks`` application, ready to be launched.
    """
    with gr.Blocks(theme=gr.themes.Soft(), css="footer {display: none !important}") as demo:
        # Per-session state dict: 'query_count' and 'last_suggestions'
        # (initialized lazily inside process_analysis_flow).
        state = gr.State()

        with gr.Row():
            with gr.Column(scale=4):
                gr.Markdown("# PharmaCircle AI Data Analyst")
            with gr.Column(scale=1):
                clear_button = gr.Button("πŸ”„ Start New Analysis", variant="primary")

        gr.Markdown("Ask a question to begin your analysis. I will generate an analysis plan, retrieve quantitative and qualitative data, create a visualization, and write an enriched report.")

        with gr.Row():
            with gr.Column(scale=1):
                # Left column: conversation log plus the question input.
                chatbot = gr.Chatbot(label="Analysis Chat Log", height=700, show_copy_button=True)
                msg_textbox = gr.Textbox(placeholder="Ask a question, e.g., 'Show me the top 5 companies by total deal value in 2023'", label="Your Question", interactive=True)

            with gr.Column(scale=2):
                # Right column: collapsible intermediate artifacts, then the
                # final plot image and streamed report.
                with gr.Accordion("Generated Analysis Plan", open=False):
                    plan_display = gr.Markdown("Plan will appear here...", visible=True)
                with gr.Accordion("Retrieved Quantitative Data", open=False):
                    quantitative_data_display = gr.Markdown("Aggregate data will appear here...", visible=False)
                with gr.Accordion("Retrieved Qualitative Data (Examples)", open=False):
                    qualitative_data_display = gr.Markdown("Example data will appear here...", visible=False)
                plot_display = gr.Image(label="Visualization", type="filepath", visible=False)
                report_display = gr.Markdown("Report will be streamed here...", visible=False)

        def process_analysis_flow(user_input, history, state):
            """
            Manages the conversation and yields UI updates.

            Generator handler for ``msg_textbox.submit``. Every yield is a
            7-tuple matching the submit() ``outputs`` list, in order:
            (chatbot, state, plot_display, report_display, plan_display,
            quantitative_data_display, qualitative_data_display).
            A ``None`` element leaves that component unchanged.
            """
            # Lazily initialize session state and chat history on first use.
            if state is None:
                state = {'query_count': 0, 'last_suggestions': []}
            if history is None:
                history = []

            # Hide/clear all result panels at the start of a new analysis.
            yield (history, state, gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False))

            query_context = user_input.strip()
            if not query_context:
                history.append((user_input, "Please enter a question to analyze."))
                yield (history, state, None, None, None, None, None)
                return

            history.append((user_input, f"Analyzing: '{query_context}'\n\n*Generating analysis plan...*"))
            yield (history, state, None, None, None, None, None)

            # Step 1: LLM turns the question (plus chat history) into a
            # structured analysis plan; bail out if no valid plan comes back.
            analysis_plan = llm_generate_analysis_plan_with_history(llm_model, query_context, history)
            if not analysis_plan:
                history.append((None, "I'm sorry, I couldn't generate a valid analysis plan. Please try rephrasing."))
                yield (history, state, None, None, None, None, None)
                return

            history.append((None, "βœ… Analysis plan generated!"))
            plan_summary = f"""
* **Analysis Dimension:** `{analysis_plan.get('analysis_dimension')}`
* **Analysis Measure:** `{analysis_plan.get('analysis_measure')}`
* **Query Filter:** `{analysis_plan.get('query_filter')}`
"""
            history.append((None, plan_summary))
            formatted_plan = f"**Full Analysis Plan:**\n```json\n{json.dumps(analysis_plan, indent=2)}\n```"
            yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None)

            history.append((None, "*Executing queries for aggregates and examples...*"))
            yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None)

            # Step 2: run the aggregate (quantitative) and example
            # (qualitative) Solr queries concurrently; .result() blocks
            # until both complete.
            aggregate_data = None
            example_data = None
            with concurrent.futures.ThreadPoolExecutor() as executor:
                future_agg = executor.submit(execute_quantitative_query, solr_client, analysis_plan)
                future_ex = executor.submit(execute_qualitative_query, solr_client, analysis_plan)
                aggregate_data = future_agg.result()
                example_data = future_ex.result()

            if not aggregate_data or aggregate_data.get('count', 0) == 0:
                history.append((None, "No data was found for your query. Please try a different question."))
                yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None)
                return

            formatted_agg_data = f"**Quantitative (Aggregate) Data:**\n```json\n{json.dumps(aggregate_data, indent=2)}\n```"
            formatted_qual_data = f"**Qualitative (Example) Data:**\n```json\n{json.dumps(example_data, indent=2)}\n```"
            qual_data_display_update = gr.update(value=formatted_qual_data, visible=True)
            yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), gr.update(value=formatted_agg_data, visible=True), qual_data_display_update)

            history.append((None, "βœ… Data retrieved. Generating visualization and final report..."))
            yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), gr.update(value=formatted_agg_data, visible=True), qual_data_display_update)

            # Step 3: generate the plot code in the background while the
            # report streams to the user. NOTE(review): leaving the executor
            # context blocks until submitted work finishes (shutdown waits),
            # so the future-dependent steps are kept inside the `with`.
            with concurrent.futures.ThreadPoolExecutor() as executor:
                viz_future = executor.submit(llm_generate_visualization_code, llm_model, query_context, aggregate_data)

                report_text = ""
                # Stream against a snapshot of history so the chat log does
                # not change while chunks are appended to report_display.
                stream_history = history[:]
                for chunk in llm_synthesize_enriched_report_stream(llm_model, query_context, aggregate_data, example_data, analysis_plan):
                    report_text += chunk
                    yield (stream_history, state, None, gr.update(value=report_text, visible=True), gr.update(value=formatted_plan, visible=True), gr.update(value=formatted_agg_data, visible=True), qual_data_display_update)

                history.append((None, report_text))

                # Collect the visualization result and render it, if any.
                viz_code = viz_future.result()
                plot_path = execute_viz_code_and_get_path(viz_code, aggregate_data)
                output_plot = gr.update(value=plot_path, visible=True) if plot_path else gr.update(visible=False)
                if not plot_path:
                    history.append((None, "*I was unable to generate a plot for this data.*\n"))

                yield (history, state, output_plot, report_text, gr.update(value=formatted_plan, visible=True), gr.update(value=formatted_agg_data, visible=True), qual_data_display_update)

            # Update session state and invite a follow-up question.
            state['query_count'] += 1
            state['last_suggestions'] = parse_suggestions_from_report(report_text)
            next_prompt = "Analysis complete. What would you like to explore next?"
            history.append((None, next_prompt))
            yield (history, state, output_plot, report_text, gr.update(value=formatted_plan, visible=True), gr.update(value=formatted_agg_data, visible=True), qual_data_display_update)

        def reset_all():
            """Resets the entire UI for a new analysis session."""
            # Tuple order matches the clear_button.click outputs list:
            # chatbot, state, msg_textbox, plot, report, plan, quant, qual.
            return (
                [],
                None,
                "",
                gr.update(value=None, visible=False),
                gr.update(value=None, visible=False),
                gr.update(value=None, visible=False),
                gr.update(value=None, visible=False),
                gr.update(value=None, visible=False)
            )

        # Submitting a question drives the generator above; afterwards the
        # textbox is cleared without re-queuing.
        msg_textbox.submit(
            fn=process_analysis_flow,
            inputs=[msg_textbox, chatbot, state],
            outputs=[chatbot, state, plot_display, report_display, plan_display, quantitative_data_display, qualitative_data_display],
        ).then(
            lambda: gr.update(value=""),
            None,
            [msg_textbox],
            queue=False,
        )

        clear_button.click(
            fn=reset_all,
            inputs=None,
            outputs=[chatbot, state, msg_textbox, plot_display, report_display, plan_display, quantitative_data_display, qualitative_data_display],
            queue=False
        )

    return demo