Update app.py
Browse files
app.py
CHANGED
@@ -10,18 +10,16 @@
|
|
10 |
# SETUP: $ pip install -r requirements.txt
|
11 |
#
|
12 |
# AUTHOR: An MCP Expert in Data & AI Solutions
|
13 |
-
# VERSION: 4.
|
14 |
-
# LAST-UPDATE: 2023-10-29 (
|
15 |
|
16 |
from __future__ import annotations
|
17 |
|
18 |
import warnings
|
19 |
import logging
|
20 |
import os
|
21 |
-
import sys
|
22 |
-
import importlib.util
|
23 |
from datetime import datetime
|
24 |
-
from typing import Any, Dict,
|
25 |
|
26 |
import gradio as gr
|
27 |
import numpy as np
|
@@ -30,7 +28,7 @@ import plotly.express as px
|
|
30 |
import plotly.graph_objects as go
|
31 |
import google.generativeai as genai
|
32 |
|
33 |
-
# --- Local Adaptive Modules (Requires analysis_modules.py and requirements.txt
|
34 |
from analysis_modules import analyze_time_series, generate_word_cloud, perform_clustering
|
35 |
|
36 |
# --- Configuration & Setup ---
|
@@ -42,11 +40,9 @@ class Config:
|
|
42 |
GEMINI_MODEL = 'gemini-1.5-flash-latest'
|
43 |
MAX_UI_ROWS = 50000
|
44 |
|
45 |
-
# --- Core Analysis Engine
|
46 |
class DataAnalyzer:
|
47 |
-
|
48 |
-
# It should contain: __init__, metadata property, _extract_metadata,
|
49 |
-
# get_profiling_tables, get_overview_visuals, generate_ai_narrative
|
50 |
def __init__(self, df: pd.DataFrame):
|
51 |
if not isinstance(df, pd.DataFrame): raise TypeError("Input must be a pandas DataFrame.")
|
52 |
self.df = df
|
@@ -63,7 +59,7 @@ class DataAnalyzer:
|
|
63 |
numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
|
64 |
categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
|
65 |
datetime_cols = self.df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()
|
66 |
-
text_cols = [col for col in categorical_cols if self.df[col].str.len().mean() > 50]
|
67 |
|
68 |
high_corr_pairs = []
|
69 |
if len(numeric_cols) > 1:
|
@@ -104,23 +100,72 @@ class DataAnalyzer:
|
|
104 |
return fig_types, fig_missing, fig_corr
|
105 |
|
106 |
def generate_ai_narrative(self, api_key: str, context: Dict[str, Any]) -> str:
|
107 |
-
|
108 |
-
|
109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
|
111 |
# --- UI Creation ---
|
112 |
def create_ui():
|
113 |
"""Defines the complete, integrated Gradio user interface."""
|
114 |
-
|
115 |
-
# --- Reusable plotting functions for interactive tabs ---
|
116 |
def create_histogram(analyzer: DataAnalyzer, col: str) -> go.Figure:
|
117 |
if not col or not analyzer: return go.Figure()
|
118 |
return px.histogram(analyzer.df, x=col, title=f"<b>Distribution of {col}</b>", marginal="box", template="plotly_white")
|
119 |
-
|
120 |
def create_scatterplot(analyzer: DataAnalyzer, x_col: str, y_col:str, color_col:str) -> go.Figure:
|
121 |
if not all([analyzer, x_col, y_col]): return go.Figure()
|
122 |
return px.scatter(analyzer.df, x=x_col, y=y_col, color=color_col, title=f"<b>Scatter Plot: {x_col} vs. {y_col}</b>", template="plotly_white")
|
123 |
-
|
124 |
def analyze_single_column(analyzer: DataAnalyzer, col: str) -> Tuple[str, go.Figure]:
|
125 |
if not col or not analyzer: return "", go.Figure()
|
126 |
series = analyzer.df[col]
|
@@ -136,17 +181,14 @@ def create_ui():
|
|
136 |
|
137 |
with gr.Blocks(theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="blue"), title=Config.APP_TITLE) as demo:
|
138 |
state_analyzer = gr.State()
|
139 |
-
|
140 |
gr.Markdown(f"<h1>{Config.APP_TITLE}</h1>")
|
141 |
gr.Markdown("Upload your data to receive a complete standard analysis, plus specialized dashboards that unlock automatically based on your data's content.")
|
142 |
-
|
143 |
with gr.Row():
|
144 |
upload_button = gr.File(label="1. Upload Data File (CSV, Excel)", file_types=[".csv", ".xlsx", ".xls"], scale=3)
|
145 |
api_key_input = gr.Textbox(label="2. Enter Google Gemini API Key", type="password", scale=2)
|
146 |
analyze_button = gr.Button("β¨ Build My Dashboard", variant="primary", scale=1)
|
147 |
|
148 |
with gr.Tabs():
|
149 |
-
# --- Standard Tabs (Always Visible) ---
|
150 |
with gr.Tab("π€ AI Narrative"):
|
151 |
ai_report_output = gr.Markdown("### Your AI-generated report will appear here...")
|
152 |
with gr.Tab("π Profile"):
|
@@ -156,38 +198,29 @@ def create_ui():
|
|
156 |
plot_correlation = gr.Plot()
|
157 |
with gr.Tab("π¨ Interactive Explorer"):
|
158 |
with gr.Row():
|
159 |
-
with gr.Column(scale=1):
|
160 |
-
|
161 |
-
with gr.Column(scale=2):
|
162 |
-
plot_histogram = gr.Plot()
|
163 |
with gr.Row():
|
164 |
with gr.Column(scale=1):
|
165 |
dd_scatter_x = gr.Dropdown(label="X-Axis (Numeric)", interactive=True)
|
166 |
dd_scatter_y = gr.Dropdown(label="Y-Axis (Numeric)", interactive=True)
|
167 |
dd_scatter_color = gr.Dropdown(label="Color By (Optional)", interactive=True)
|
168 |
-
with gr.Column(scale=2):
|
169 |
-
plot_scatter = gr.Plot()
|
170 |
with gr.Tab("π Column Deep-Dive"):
|
171 |
dd_drilldown_col = gr.Dropdown(label="Select Column to Analyze", interactive=True)
|
172 |
-
with gr.Row():
|
173 |
-
md_drilldown_stats, plot_drilldown = gr.Markdown(), gr.Plot()
|
174 |
-
|
175 |
-
# --- Specialized, Adaptive Tabs ---
|
176 |
with gr.Tab("β Time-Series Analysis", visible=False) as tab_timeseries:
|
177 |
with gr.Row():
|
178 |
dd_ts_date = gr.Dropdown(label="Select Date/Time Column", interactive=True)
|
179 |
dd_ts_value = gr.Dropdown(label="Select Value Column", interactive=True)
|
180 |
plot_ts_decomp, md_ts_stats = gr.Plot(), gr.Markdown()
|
181 |
-
|
182 |
with gr.Tab("π Text Analysis", visible=False) as tab_text:
|
183 |
dd_text_col = gr.Dropdown(label="Select Text Column", interactive=True)
|
184 |
html_word_cloud = gr.HTML()
|
185 |
-
|
186 |
with gr.Tab("π§© Clustering (K-Means)", visible=False) as tab_cluster:
|
187 |
num_clusters = gr.Slider(minimum=2, maximum=10, value=4, step=1, label="Number of Clusters (K)", interactive=True)
|
188 |
plot_cluster, md_cluster_summary = gr.Plot(), gr.Markdown()
|
189 |
|
190 |
-
# --- Event Listeners ---
|
191 |
main_outputs = [
|
192 |
state_analyzer, ai_report_output,
|
193 |
profile_missing_df, profile_numeric_df, profile_categorical_df,
|
@@ -195,81 +228,53 @@ def create_ui():
|
|
195 |
dd_hist_col, dd_scatter_x, dd_scatter_y, dd_scatter_color, dd_drilldown_col,
|
196 |
tab_timeseries, dd_ts_date, dd_ts_value,
|
197 |
tab_text, dd_text_col,
|
198 |
-
tab_cluster, num_clusters
|
199 |
-
]
|
200 |
analyze_button.click(fn=run_full_analysis, inputs=[upload_button, api_key_input], outputs=main_outputs, show_progress="full")
|
201 |
-
|
202 |
-
# Listeners for standard interactive tabs
|
203 |
dd_hist_col.change(fn=create_histogram, inputs=[state_analyzer, dd_hist_col], outputs=plot_histogram)
|
204 |
scatter_inputs = [state_analyzer, dd_scatter_x, dd_scatter_y, dd_scatter_color]
|
205 |
-
for dd in [dd_scatter_x, dd_scatter_y, dd_scatter_color]:
|
206 |
-
dd.change(fn=create_scatterplot, inputs=scatter_inputs, outputs=plot_scatter)
|
207 |
dd_drilldown_col.change(fn=analyze_single_column, inputs=[state_analyzer, dd_drilldown_col], outputs=[md_drilldown_stats, plot_drilldown])
|
208 |
-
|
209 |
-
# Listeners for specialized adaptive tabs
|
210 |
ts_inputs = [state_analyzer, dd_ts_date, dd_ts_value]
|
211 |
-
for dd in [dd_ts_date, dd_ts_value]:
|
212 |
-
dd.change(fn=lambda a, d, v: analyze_time_series(a.df, d, v), inputs=ts_inputs, outputs=[plot_ts_decomp, md_ts_stats])
|
213 |
dd_text_col.change(fn=lambda a, t: generate_word_cloud(a.df, t), inputs=[state_analyzer, dd_text_col], outputs=html_word_cloud)
|
214 |
num_clusters.change(fn=lambda a, k: perform_clustering(a.df, a.metadata['numeric_cols'], k), inputs=[state_analyzer, num_clusters], outputs=[plot_cluster, md_cluster_summary])
|
215 |
-
|
216 |
return demo
|
217 |
|
218 |
# --- Main Application Logic & Orchestration ---
|
219 |
def run_full_analysis(file_obj: gr.File, api_key: str) -> list:
|
220 |
-
"""Orchestrates the complete standard and adaptive analysis."""
|
221 |
if file_obj is None: raise gr.Error("CRITICAL: No file uploaded.")
|
222 |
if not api_key: raise gr.Error("CRITICAL: Gemini API key is missing.")
|
223 |
-
|
224 |
try:
|
225 |
logging.info(f"Processing uploaded file: {file_obj.name}")
|
226 |
df = pd.read_csv(file_obj.name) if file_obj.name.endswith('.csv') else pd.read_excel(file_obj.name)
|
227 |
-
|
228 |
-
if len(df) > Config.MAX_UI_ROWS:
|
229 |
-
df = df.sample(n=Config.MAX_UI_ROWS, random_state=42)
|
230 |
-
|
231 |
analyzer = DataAnalyzer(df)
|
232 |
meta = analyzer.metadata
|
233 |
-
|
234 |
-
# --- Run all base analyses ---
|
235 |
ai_context = {'is_timeseries': bool(meta['datetime_cols']), 'has_text': bool(meta['text_cols'])}
|
236 |
ai_report = analyzer.generate_ai_narrative(api_key, context=ai_context)
|
237 |
missing_df, num_df, cat_df = analyzer.get_profiling_tables()
|
238 |
fig_types, fig_missing, fig_corr = analyzer.get_overview_visuals()
|
239 |
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
update_drill_dd = gr.Dropdown(choices=meta['columns'], label="Select Column to Analyze")
|
246 |
-
|
247 |
-
# --- Configure adaptive module visibility and dropdowns ---
|
248 |
-
show_ts_tab = gr.Tab(visible=bool(meta['datetime_cols']))
|
249 |
-
update_ts_date_dd = gr.Dropdown(choices=meta['datetime_cols'])
|
250 |
-
update_ts_value_dd = gr.Dropdown(choices=meta['numeric_cols'])
|
251 |
-
|
252 |
-
show_text_tab = gr.Tab(visible=bool(meta['text_cols']))
|
253 |
-
update_text_dd = gr.Dropdown(choices=meta['text_cols'])
|
254 |
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
show_text_tab, update_text_dd,
|
266 |
-
show_cluster_tab, update_cluster_slider
|
267 |
-
]
|
268 |
except Exception as e:
|
269 |
logging.error(f"A critical error occurred: {e}", exc_info=True)
|
270 |
raise gr.Error(f"Analysis Failed! Error: {str(e)}")
|
271 |
|
272 |
if __name__ == "__main__":
|
273 |
-
# You might want to run perform_pre_flight_checks() here
|
274 |
app_instance = create_ui()
|
275 |
app_instance.launch(debug=True, server_name="0.0.0.0")
|
|
|
10 |
# SETUP: $ pip install -r requirements.txt
|
11 |
#
|
12 |
# AUTHOR: An MCP Expert in Data & AI Solutions
|
13 |
+
# VERSION: 4.2 (Bugfix Edition: AI Narrative Engine Restored)
|
14 |
+
# LAST-UPDATE: 2023-10-29 (Fixed critical bug where AI was not being called)
|
15 |
|
16 |
from __future__ import annotations
|
17 |
|
18 |
import warnings
|
19 |
import logging
|
20 |
import os
|
|
|
|
|
21 |
from datetime import datetime
|
22 |
+
from typing import Any, Dict, Optional, Tuple
|
23 |
|
24 |
import gradio as gr
|
25 |
import numpy as np
|
|
|
28 |
import plotly.graph_objects as go
|
29 |
import google.generativeai as genai
|
30 |
|
31 |
+
# --- Local Adaptive Modules (Requires analysis_modules.py and requirements.txt) ---
|
32 |
from analysis_modules import analyze_time_series, generate_word_cloud, perform_clustering
|
33 |
|
34 |
# --- Configuration & Setup ---
|
|
|
40 |
GEMINI_MODEL = 'gemini-1.5-flash-latest'
|
41 |
MAX_UI_ROWS = 50000
|
42 |
|
43 |
+
# --- Core Analysis Engine ---
|
44 |
class DataAnalyzer:
|
45 |
+
"""The complete DataAnalyzer class, now with a fully functional AI engine."""
|
|
|
|
|
46 |
def __init__(self, df: pd.DataFrame):
|
47 |
if not isinstance(df, pd.DataFrame): raise TypeError("Input must be a pandas DataFrame.")
|
48 |
self.df = df
|
|
|
59 |
numeric_cols = self.df.select_dtypes(include=np.number).columns.tolist()
|
60 |
categorical_cols = self.df.select_dtypes(include=['object', 'category']).columns.tolist()
|
61 |
datetime_cols = self.df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()
|
62 |
+
text_cols = [col for col in categorical_cols if self.df[col].dropna().str.len().mean() > 50]
|
63 |
|
64 |
high_corr_pairs = []
|
65 |
if len(numeric_cols) > 1:
|
|
|
100 |
return fig_types, fig_missing, fig_corr
|
101 |
|
102 |
def generate_ai_narrative(self, api_key: str, context: Dict[str, Any]) -> str:
    """Generate a context-aware Markdown report about the dataset via the Gemini API.

    Args:
        api_key: Google Gemini API key handed to ``genai.configure``.
        context: Flags describing the dataset. Only the ``'is_timeseries'``
            and ``'has_text'`` keys are read; absent keys count as false.

    Returns:
        The model's response text, or a Markdown-formatted message explaining
        why no report was produced (empty/blocked response, or an exception
        raised while calling the API).
    """
    logging.info("Generating AI narrative with context: %s", list(context.keys()))
    meta = self.metadata

    snippet = self.df.head(5)
    try:
        data_snippet_md = snippet.to_markdown(index=False)
    except ImportError:
        # to_markdown depends on the optional 'tabulate' package; fall back to
        # a plain-text table so a missing extra does not abort the analysis.
        logging.warning("'tabulate' is not installed; using a plain-text data snippet.")
        data_snippet_md = snippet.to_string(index=False)

    is_timeseries = context.get('is_timeseries')
    has_text = context.get('has_text')
    mode_lines = ["**PRIMARY ANALYSIS MODES:**\n"]
    if is_timeseries:
        mode_lines.append("- **Time-Series Detected:** Focus on trends, seasonality, and stationarity. Suggest forecasting models.\n")
    if has_text:
        mode_lines.append("- **Long-Form Text Detected:** Note potential for NLP tasks like sentiment analysis or topic modeling.\n")
    if not is_timeseries and not has_text:
        mode_lines.append("- **General Tabular Data:** Focus on distributions, correlations, and potential for classification/regression modeling.\n")
    context_prompt = "".join(mode_lines)

    # The prompt body is kept flush-left so the Markdown sent to the model
    # carries no stray leading indentation.
    prompt = f"""
As "Cognitive Analyst," an elite AI data scientist, your task is to generate a comprehensive, multi-part data discovery report in Markdown format.

{context_prompt}
**DATASET METADATA:**
- **Shape:** {meta['shape'][0]} rows, {meta['shape'][1]} columns.
- **Data Quality Score:** {meta['data_quality_score']}%
- **Total Missing Values:** {meta['total_missing']:,}
- **Highly Correlated Pairs:** {meta['high_corr_pairs'] if meta['high_corr_pairs'] else 'None detected.'}
- **Data Snippet (First 5 Rows):**
{data_snippet_md}

**REQUIRED REPORT STRUCTURE:**

# π AI Data Discovery Report

## π 1. Executive Summary
* **Primary Objective:** (Deduce the likely purpose of this dataset. What problem could it solve?)
* **Key Finding:** (State the single most interesting insight you've discovered.)
* **Overall State:** (Briefly comment on the data's quality and readiness for analysis.)

## π§ 2. Deep Dive & Quality Assessment
* **Structural Profile:** (Describe the dataset's composition: numeric, categorical, text, time-series features.)
* **Data Quality Audit:** (Elaborate on the quality score and missing values. Are they a major concern?)
* **Redundancy Check:** (Comment on the detected high-correlation pairs and any risks.)

## π‘ 3. Actionable Recommendations
* **Data Cleaning:** (Provide a specific recommendation for handling missing data or outliers.)
* **Feature Engineering:** (Suggest creating a new, valuable feature.)
* **Next Analytical Steps:** (Propose a specific hypothesis to test or a suitable ML model to build.)
"""
    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(Config.GEMINI_MODEL)
        response = model.generate_content(prompt)
        if not response.parts:
            # An empty response is reported to the user instead of raising.
            feedback = response.prompt_feedback
            blocked_reason = feedback.block_reason.name if feedback else "Unknown"
            logging.warning("AI response blocked. Reason: %s", blocked_reason)
            return f"β **AI Report Generation Blocked by Safety Settings**\n**Reason:** `{blocked_reason}`."
        return response.text
    except Exception as e:
        # Best-effort boundary: an API failure becomes a readable message in
        # the report pane rather than failing the rest of the dashboard.
        logging.error(f"Gemini API call failed: {e}", exc_info=True)
        return f"β **AI Report Generation Failed**\n**Error:** `{str(e)}`"
|
159 |
|
160 |
# --- UI Creation ---
|
161 |
def create_ui():
|
162 |
"""Defines the complete, integrated Gradio user interface."""
|
|
|
|
|
163 |
def create_histogram(analyzer: DataAnalyzer, col: str) -> go.Figure:
    """Plot the distribution of one column with a marginal box plot.

    Returns an empty figure when no column is selected or no analyzer has
    been stored yet.
    """
    if col and analyzer:
        chart_title = f"<b>Distribution of {col}</b>"
        return px.histogram(
            analyzer.df,
            x=col,
            title=chart_title,
            marginal="box",
            template="plotly_white",
        )
    return go.Figure()
|
|
|
166 |
def create_scatterplot(analyzer: DataAnalyzer, x_col: str, y_col:str, color_col:str) -> go.Figure:
    """Draw a scatter plot of two columns, optionally colored by a third.

    Args:
        analyzer: Analyzer whose ``df`` supplies the data.
        x_col: Column name for the x-axis.
        y_col: Column name for the y-axis.
        color_col: Optional column name used for coloring; any falsy value
            (``None``, ``""``, empty selection) means "no color".

    Returns:
        The scatter figure, or an empty figure when the analyzer or either
        axis column is missing.
    """
    if not all([analyzer, x_col, y_col]):
        return go.Figure()
    return px.scatter(
        analyzer.df,
        x=x_col,
        y=y_col,
        # A cleared dropdown may deliver a falsy non-None value, which Plotly
        # Express would try to resolve as a column; normalize it to None.
        color=color_col or None,
        title=f"<b>Scatter Plot: {x_col} vs. {y_col}</b>",
        template="plotly_white",
    )
|
|
|
169 |
def analyze_single_column(analyzer: DataAnalyzer, col: str) -> Tuple[str, go.Figure]:
|
170 |
if not col or not analyzer: return "", go.Figure()
|
171 |
series = analyzer.df[col]
|
|
|
181 |
|
182 |
with gr.Blocks(theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="blue"), title=Config.APP_TITLE) as demo:
|
183 |
state_analyzer = gr.State()
|
|
|
184 |
gr.Markdown(f"<h1>{Config.APP_TITLE}</h1>")
|
185 |
gr.Markdown("Upload your data to receive a complete standard analysis, plus specialized dashboards that unlock automatically based on your data's content.")
|
|
|
186 |
with gr.Row():
|
187 |
upload_button = gr.File(label="1. Upload Data File (CSV, Excel)", file_types=[".csv", ".xlsx", ".xls"], scale=3)
|
188 |
api_key_input = gr.Textbox(label="2. Enter Google Gemini API Key", type="password", scale=2)
|
189 |
analyze_button = gr.Button("β¨ Build My Dashboard", variant="primary", scale=1)
|
190 |
|
191 |
with gr.Tabs():
|
|
|
192 |
with gr.Tab("π€ AI Narrative"):
|
193 |
ai_report_output = gr.Markdown("### Your AI-generated report will appear here...")
|
194 |
with gr.Tab("π Profile"):
|
|
|
198 |
plot_correlation = gr.Plot()
|
199 |
with gr.Tab("π¨ Interactive Explorer"):
|
200 |
with gr.Row():
|
201 |
+
with gr.Column(scale=1): dd_hist_col = gr.Dropdown(label="Select Column for Histogram", interactive=True)
|
202 |
+
with gr.Column(scale=2): plot_histogram = gr.Plot()
|
|
|
|
|
203 |
with gr.Row():
|
204 |
with gr.Column(scale=1):
|
205 |
dd_scatter_x = gr.Dropdown(label="X-Axis (Numeric)", interactive=True)
|
206 |
dd_scatter_y = gr.Dropdown(label="Y-Axis (Numeric)", interactive=True)
|
207 |
dd_scatter_color = gr.Dropdown(label="Color By (Optional)", interactive=True)
|
208 |
+
with gr.Column(scale=2): plot_scatter = gr.Plot()
|
|
|
209 |
with gr.Tab("π Column Deep-Dive"):
|
210 |
dd_drilldown_col = gr.Dropdown(label="Select Column to Analyze", interactive=True)
|
211 |
+
with gr.Row(): md_drilldown_stats, plot_drilldown = gr.Markdown(), gr.Plot()
|
|
|
|
|
|
|
212 |
with gr.Tab("β Time-Series Analysis", visible=False) as tab_timeseries:
|
213 |
with gr.Row():
|
214 |
dd_ts_date = gr.Dropdown(label="Select Date/Time Column", interactive=True)
|
215 |
dd_ts_value = gr.Dropdown(label="Select Value Column", interactive=True)
|
216 |
plot_ts_decomp, md_ts_stats = gr.Plot(), gr.Markdown()
|
|
|
217 |
with gr.Tab("π Text Analysis", visible=False) as tab_text:
|
218 |
dd_text_col = gr.Dropdown(label="Select Text Column", interactive=True)
|
219 |
html_word_cloud = gr.HTML()
|
|
|
220 |
with gr.Tab("π§© Clustering (K-Means)", visible=False) as tab_cluster:
|
221 |
num_clusters = gr.Slider(minimum=2, maximum=10, value=4, step=1, label="Number of Clusters (K)", interactive=True)
|
222 |
plot_cluster, md_cluster_summary = gr.Plot(), gr.Markdown()
|
223 |
|
|
|
224 |
main_outputs = [
|
225 |
state_analyzer, ai_report_output,
|
226 |
profile_missing_df, profile_numeric_df, profile_categorical_df,
|
|
|
228 |
dd_hist_col, dd_scatter_x, dd_scatter_y, dd_scatter_color, dd_drilldown_col,
|
229 |
tab_timeseries, dd_ts_date, dd_ts_value,
|
230 |
tab_text, dd_text_col,
|
231 |
+
tab_cluster, num_clusters]
|
|
|
232 |
analyze_button.click(fn=run_full_analysis, inputs=[upload_button, api_key_input], outputs=main_outputs, show_progress="full")
|
|
|
|
|
233 |
dd_hist_col.change(fn=create_histogram, inputs=[state_analyzer, dd_hist_col], outputs=plot_histogram)
|
234 |
scatter_inputs = [state_analyzer, dd_scatter_x, dd_scatter_y, dd_scatter_color]
|
235 |
+
for dd in [dd_scatter_x, dd_scatter_y, dd_scatter_color]: dd.change(fn=create_scatterplot, inputs=scatter_inputs, outputs=plot_scatter)
|
|
|
236 |
dd_drilldown_col.change(fn=analyze_single_column, inputs=[state_analyzer, dd_drilldown_col], outputs=[md_drilldown_stats, plot_drilldown])
|
|
|
|
|
237 |
ts_inputs = [state_analyzer, dd_ts_date, dd_ts_value]
|
238 |
+
for dd in [dd_ts_date, dd_ts_value]: dd.change(fn=lambda a, d, v: analyze_time_series(a.df, d, v), inputs=ts_inputs, outputs=[plot_ts_decomp, md_ts_stats])
|
|
|
239 |
dd_text_col.change(fn=lambda a, t: generate_word_cloud(a.df, t), inputs=[state_analyzer, dd_text_col], outputs=html_word_cloud)
|
240 |
num_clusters.change(fn=lambda a, k: perform_clustering(a.df, a.metadata['numeric_cols'], k), inputs=[state_analyzer, num_clusters], outputs=[plot_cluster, md_cluster_summary])
|
|
|
241 |
return demo
|
242 |
|
243 |
# --- Main Application Logic & Orchestration ---
|
244 |
def run_full_analysis(file_obj: gr.File, api_key: str) -> list:
    """Orchestrate the complete standard and adaptive analysis for one upload.

    Args:
        file_obj: Uploaded file; its ``name`` attribute is used as the path.
        api_key: Google Gemini API key forwarded to the AI narrative step.

    Returns:
        A list of 20 values: the analyzer, the AI report, three profiling
        tables, three overview figures, five dropdown updates for the
        standard tabs, and the tab/control updates for the time-series, text
        and clustering tabs. The order must stay aligned with the ``outputs``
        list wired to the analyze button.

    Raises:
        gr.Error: If no file or API key was given, or if any step fails.
    """
    if file_obj is None:
        raise gr.Error("CRITICAL: No file uploaded.")
    if not api_key:
        raise gr.Error("CRITICAL: Gemini API key is missing.")

    try:
        file_path = file_obj.name
        logging.info(f"Processing uploaded file: {file_path}")
        # Compare the extension case-insensitively so "DATA.CSV" is not
        # mistakenly handed to the Excel reader.
        if file_path.lower().endswith('.csv'):
            df = pd.read_csv(file_path)
        else:
            df = pd.read_excel(file_path)
        if len(df) > Config.MAX_UI_ROWS:
            # Fixed seed keeps the down-sampled view reproducible across runs.
            df = df.sample(n=Config.MAX_UI_ROWS, random_state=42)

        analyzer = DataAnalyzer(df)
        meta = analyzer.metadata
        numeric_cols = meta['numeric_cols']
        datetime_cols = meta['datetime_cols']
        text_cols = meta['text_cols']

        ai_context = {'is_timeseries': bool(datetime_cols), 'has_text': bool(text_cols)}
        ai_report = analyzer.generate_ai_narrative(api_key, context=ai_context)
        missing_df, num_df, cat_df = analyzer.get_profiling_tables()
        fig_types, fig_missing, fig_corr = analyzer.get_overview_visuals()

        # Pre-select the first (and second) numeric column where available.
        first_numeric = numeric_cols[0] if numeric_cols else None
        second_numeric = numeric_cols[1] if len(numeric_cols) > 1 else None
        update_hist_dd = gr.Dropdown(choices=numeric_cols, value=first_numeric)
        update_scatter_x = gr.Dropdown(choices=numeric_cols, value=first_numeric)
        update_scatter_y = gr.Dropdown(choices=numeric_cols, value=second_numeric)
        update_scatter_color = gr.Dropdown(choices=meta['columns'])
        update_drill_dd = gr.Dropdown(choices=meta['columns'])

        # Specialized tabs are revealed only when the data supports them.
        show_ts_tab = gr.Tab(visible=bool(datetime_cols))
        update_ts_date_dd = gr.Dropdown(choices=datetime_cols)
        update_ts_value_dd = gr.Dropdown(choices=numeric_cols)
        show_text_tab = gr.Tab(visible=bool(text_cols))
        update_text_dd = gr.Dropdown(choices=text_cols)
        can_cluster = len(numeric_cols) > 1
        show_cluster_tab = gr.Tab(visible=can_cluster)
        update_cluster_slider = gr.Slider(visible=can_cluster)

        return [
            analyzer, ai_report, missing_df, num_df, cat_df, fig_types, fig_missing, fig_corr,
            update_hist_dd, update_scatter_x, update_scatter_y, update_scatter_color, update_drill_dd,
            show_ts_tab, update_ts_date_dd, update_ts_value_dd,
            show_text_tab, update_text_dd,
            show_cluster_tab, update_cluster_slider,
        ]
    except Exception as e:
        logging.error(f"A critical error occurred: {e}", exc_info=True)
        # Chain the cause so the original traceback stays attached.
        raise gr.Error(f"Analysis Failed! Error: {str(e)}") from e
|
277 |
|
278 |
if __name__ == "__main__":
    # Build the Gradio interface and serve it on all network interfaces
    # with debug output enabled.
    create_ui().launch(debug=True, server_name="0.0.0.0")
|