Spaces:

mgbam
/

CognitiveEDA

Sleeping

App Files Files Community

CognitiveEDA / app.py

mgbam

Update app.py

5bd4d74 verified about 1 month ago

raw

history blame

16.9 kB

	# Odyssey - The AI Data Science Workspace
	# A state-of-the-art, AI-native analytic environment.
	# This script is a complete, self-contained Gradio application.

	import gradio as gr
	import pandas as pd
	import numpy as np
	import plotly.express as px
	import plotly.graph_objects as go
	import io, os, json, pickle, logging, warnings, uuid
	from contextlib import redirect_stdout
	from datetime import datetime

	# ML & Preprocessing Imports
	from sklearn.model_selection import cross_val_score, train_test_split
	from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
	from sklearn.linear_model import LogisticRegression, LinearRegression
	from sklearn.metrics import roc_curve, auc, confusion_matrix, r2_score, mean_squared_error
	from sklearn.preprocessing import LabelEncoder
	from sklearn.impute import KNNImputer

	# Optional: For AI features
	try:
	import google.generativeai as genai
	except ImportError:
	print("Warning: 'google-generativeai' not found. AI features will be disabled.")
	genai = None

	# --- Configuration ---
	# Suppress every Python warning for the whole process (no category or module filter is given).
	warnings.filterwarnings('ignore')
	# Configure the root logger: INFO and above, each record prefixed with timestamp and level name.
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

	# --- UI Theme & Icons ---
	# Dark, translucent look: a Monochrome base theme with gradient overrides
	# for the page background and the primary/secondary buttons.
	THEME = gr.themes.Monochrome(
	    primary_hue="indigo",
	    secondary_hue="blue",
	    neutral_hue="slate",
	).set(
	    body_background_fill="radial-gradient(circle, rgba(10,20,50,1) 0%, rgba(0,0,10,1) 100%);",
	    block_label_background_fill="rgba(255,255,255,0.05)",
	    block_background_fill="rgba(255,255,255,0.05)",
	    button_primary_background_fill="linear-gradient(90deg, #6A11CB 0%, #2575FC 100%)",
	    button_secondary_background_fill="linear-gradient(90deg, #556270 0%, #4ECDC4 100%)",
	    color_accent_soft="rgba(255,255,255,0.2)",
	)
	# Emoji shown next to each workspace module's name in buttons and headings.
	ICONS = {
	    "overview": "🔭",
	    "medic": "🧪",
	    "launchpad": "🚀",
	    "copilot": "💡",
	    "export": "📄",
	}

	# --- Helper Functions ---
	def safe_exec(code_string: str, local_vars: dict) -> tuple:
	    """Execute a string of Python code and capture what it produced.

	    The snippet runs in a single namespace: a copy of this module's globals
	    overlaid with ``local_vars``. Using one namespace (instead of separate
	    globals and locals) lets functions defined inside the snippet see the
	    injected names, and the copy keeps the snippet from rebinding the real
	    module globals.

	    NOTE(security): ``exec`` is not a sandbox. The snippet runs with full
	    builtins and the privileges of this process; only pass code you are
	    prepared to run.

	    Args:
	        code_string: Python source to execute.
	        local_vars: Names made available to the snippet. Names the snippet
	            creates or rebinds are written back into this dict.

	    Returns:
	        A 4-tuple ``(stdout, fig, df_out, error)``. On success ``error`` is
	        ``None``, ``stdout`` is the captured standard output, and ``fig`` /
	        ``df_out`` are the values bound to ``fig`` / ``df_result`` (``None``
	        when unset). On failure the first three items are ``None`` and
	        ``error`` is an ``"Execution Error: ..."`` message.
	    """
	    namespace = {**globals(), **local_vars}
	    snapshot = dict(namespace)
	    output_buffer = io.StringIO()
	    try:
	        with redirect_stdout(output_buffer):
	            exec(code_string, namespace)
	    except Exception as e:
	        return None, None, None, f"Execution Error: {str(e)}"

	    # Mirror only what the snippet created or rebound, so the caller's dict
	    # is not flooded with this module's globals.
	    for name, value in namespace.items():
	        if name not in snapshot or snapshot[name] is not value:
	            local_vars[name] = value

	    stdout = output_buffer.getvalue()
	    return stdout, local_vars.get('fig'), local_vars.get('df_result'), None

	# --- Core State & Project Management ---
	def init_state():
	    """Build a fresh, empty application state.

	    Returns:
	        A new dict whose project, dataframe, metadata and insight slots are
	        all ``None`` and whose ``chat_history`` is a new empty list.
	    """
	    blank = dict.fromkeys(
	        ("project_name", "df_original", "df_modified", "metadata", "insights")
	    )
	    # A new list per call, so separate states never share chat history.
	    blank["chat_history"] = []
	    return blank

	def save_project(state):
	    """Pickle the whole application state to ``<project_name>.odyssey``.

	    The file is written relative to the current working directory.

	    Args:
	        state: The application state dict; its ``project_name`` entry names
	            the output file.

	    Returns:
	        A ``gr.update`` carrying a status message: either a request for a
	        project name or confirmation of the saved file name.
	    """
	    if not state or not state.get("project_name"):
	        return gr.update(value="Project needs a name to save.", interactive=True)

	    # The project name is user-supplied. Keep only its last path component so
	    # a name such as "../../x" cannot write outside the working directory.
	    raw_name = str(state["project_name"]).replace("\\", "/").rstrip("/")
	    safe_name = os.path.basename(raw_name).strip()
	    if safe_name in ("", ".", ".."):
	        return gr.update(value="Project needs a name to save.", interactive=True)

	    filename = f"{safe_name}.odyssey"
	    with open(filename, "wb") as f:
	        pickle.dump(state, f)
	    return gr.update(value=f"✅ Project saved to {filename}", interactive=True)

	def load_project(file_obj):
	    """Load a ``.odyssey`` project file into an application state dict.

	    Args:
	        file_obj: The uploaded file; its ``.name`` attribute is used as the
	            path to open. A falsy value yields a blank state.

	    Returns:
	        The unpickled state with any missing keys filled from
	        ``init_state()``, or a blank state when nothing was uploaded or the
	        file does not contain a dict.
	    """
	    if not file_obj:
	        return init_state()

	    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
	    # Only open project files that come from a trusted source.
	    with open(file_obj.name, "rb") as f:
	        loaded = pickle.load(f)

	    if not isinstance(loaded, dict):
	        logging.warning(
	            "Ignoring project file: expected a dict, got %s", type(loaded).__name__
	        )
	        return init_state()

	    # Back-fill keys the file lacks so later lookups such as
	    # state['df_modified'] cannot raise KeyError.
	    return {**init_state(), **loaded}

	def prime_data(file_obj, project_name):
	    """Read an uploaded CSV, infer column types, and build the initial state.

	    Text columns are first tried as datetimes. Those that fail to parse are
	    converted to ``category`` when fewer than half of the rows (but more than
	    none) hold distinct values.

	    Args:
	        file_obj: The uploaded file; ``file_obj.name`` is passed to
	            ``pd.read_csv``. A falsy value yields a blank state.
	        project_name: Name for the project; when falsy a timestamped
	            ``Project_YYYYMMDD_HHMM`` name is generated.

	    Returns:
	        A state dict holding the dataframe (original plus a working copy),
	        its metadata, the computed insights, and an empty chat history.
	    """
	    if not file_obj:
	        return init_state()

	    frame = pd.read_csv(file_obj.name)

	    text_columns = frame.select_dtypes(include=['object']).columns
	    for name in text_columns:
	        try:
	            frame[name] = pd.to_datetime(frame[name], errors='raise')
	            continue
	        except (ValueError, TypeError):
	            pass
	        # Not a date column: treat it as categorical if values repeat enough.
	        unique_ratio = frame[name].nunique() / len(frame)
	        if 0.0 < unique_ratio < 0.5:
	            frame[name] = frame[name].astype('category')

	    schema = extract_metadata(frame)
	    findings = run_helios_engine(frame, schema)

	    if project_name:
	        label = project_name
	    else:
	        label = f"Project_{datetime.now().strftime('%Y%m%d_%H%M')}"

	    return {
	        "project_name": label,
	        "df_original": frame,
	        "df_modified": frame.copy(),
	        "metadata": schema,
	        "insights": findings,
	        "chat_history": [],
	    }

	def extract_metadata(df):
	    """Summarise a dataframe's schema.

	    Args:
	        df: The dataframe to describe.

	    Returns:
	        A dict with the frame's ``shape``, all ``columns``, the column names
	        grouped as ``numeric`` / ``categorical`` / ``datetime``, and a
	        ``dtypes`` mapping of column name to dtype name.
	    """
	    def names_of(**selector):
	        # Column names whose dtype matches the given select_dtypes filter.
	        return df.select_dtypes(**selector).columns.tolist()

	    dtype_names = {column: dtype.name for column, dtype in df.dtypes.items()}

	    return {
	        'shape': df.shape,
	        'columns': df.columns.tolist(),
	        'numeric': names_of(include=np.number),
	        'categorical': names_of(include=['object', 'category']),
	        'datetime': names_of(include='datetime'),
	        'dtypes': dtype_names,
	    }

	# --- Module-Specific Handlers ---

	def run_helios_engine(df, metadata):
	    """Compute the proactive insights shown in the Helios Overview.

	    Args:
	        df: The dataframe to analyse.
	        metadata: Schema dict; its ``'numeric'`` and ``'categorical'`` entries
	            (lists of column names) are read.

	    Returns:
	        A dict with:
	        ``missing_data``: Series of null counts for columns that have any,
	        largest first.
	        ``high_cardinality``: categorical columns with more than 50 distinct
	        values, mapped to that count.
	        ``outliers``: numeric columns mapped to the number of values outside
	        the 1.5 * IQR fences (columns with none are omitted).
	        ``ml_suggestions``: two-valued categorical columns tagged
	        "(Classification)" followed by numeric columns with more than 20
	        distinct values tagged "(Regression)".
	    """
	    insights = {}

	    missing = df.isnull().sum()
	    insights['missing_data'] = missing[missing > 0].sort_values(ascending=False)

	    # Count distinct values once per column; both checks below reuse them.
	    categorical_unique = {c: df[c].nunique() for c in metadata['categorical']}
	    insights['high_cardinality'] = {
	        c: n for c, n in categorical_unique.items() if n > 50
	    }

	    outliers = {}
	    for col in metadata['numeric']:
	        q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
	        iqr = q3 - q1
	        below = df[col] < (q1 - 1.5 * iqr)
	        above = df[col] > (q3 + 1.5 * iqr)
	        count = (below | above).sum()
	        if count > 0:
	            outliers[col] = count
	    insights['outliers'] = outliers

	    suggestions = [
	        f"{col} (Classification)"
	        for col, n in categorical_unique.items()
	        if n == 2
	    ]
	    suggestions.extend(
	        f"{col} (Regression)"
	        for col in metadata['numeric']
	        if df[col].nunique() > 20
	    )
	    insights['ml_suggestions'] = suggestions
	    return insights

	def prometheus_run_model(state, target, features, model_name):
	    """Train and evaluate a model for the Prometheus Launchpad.

	    Rows with missing values in the selected columns are dropped, and
	    ``category`` / ``object`` columns are label-encoded. The task is treated
	    as classification when the target has at most 10 distinct values,
	    otherwise as regression. The model is scored with 5-fold
	    cross-validation and then refit on a 70/30 split to draw the diagnostic
	    plot.

	    Args:
	        state: Application state dict; ``state['df_modified']`` is the data.
	        target: Name of the column to predict.
	        features: List of predictor column names.
	        model_name: "Random Forest", "Logistic Regression" or
	            "Linear Regression".

	    Returns:
	        ``(fig1, fig2, report)``. ``fig1`` is a ROC curve (two-class
	        targets), a confusion matrix (other class counts) or a residual plot
	        (regression). ``fig2`` is the feature-importance chart, or a
	        placeholder when the model exposes none. ``report`` is the
	        cross-validation summary. When the inputs are unusable both figures
	        are ``None`` and ``report`` explains why.
	    """
	    if not target or not features:
	        return None, None, "Select target and features."
	    if not state or state.get('df_modified') is None:
	        return None, None, "Load a dataset before training a model."

	    # Using the target as one of its own predictors would leak the answer.
	    features = [col for col in features if col != target]
	    if not features:
	        return None, None, "Select target and features."

	    columns = [target] + features
	    df = state['df_modified'].copy()
	    df.dropna(subset=columns, inplace=True)

	    for col in columns:
	        if df[col].dtype.name in ('category', 'object'):
	            df[col] = LabelEncoder().fit_transform(df[col])

	    X, y = df[features], df[target]
	    problem_type = "Classification" if y.nunique() <= 10 else "Regression"

	    # Factories rather than bare classes: LinearRegression takes no
	    # random_state, so each entry passes only the arguments it accepts.
	    model_factories = {
	        "Classification": {
	            "Random Forest": lambda: RandomForestClassifier(random_state=42),
	            "Logistic Regression": lambda: LogisticRegression(random_state=42),
	        },
	        "Regression": {
	            "Random Forest": lambda: RandomForestRegressor(random_state=42),
	            "Linear Regression": lambda: LinearRegression(),
	        },
	    }
	    if model_name not in model_factories[problem_type]:
	        return None, None, "Invalid model for this problem type."

	    model = model_factories[problem_type][model_name]()

	    if problem_type == "Classification":
	        scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
	        report = f"Cross-Validated Accuracy: {np.mean(scores):.3f} ± {np.std(scores):.3f}"
	        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
	        model.fit(X_train, y_train)
	        if len(model.classes_) == 2:
	            # A ROC curve is only defined for two classes. Name the positive
	            # label explicitly because numeric targets need not be 0/1.
	            y_prob = model.predict_proba(X_test)[:, 1]
	            fpr, tpr, _ = roc_curve(y_test, y_prob, pos_label=model.classes_[1])
	            fig1 = go.Figure(data=go.Scatter(x=fpr, y=tpr, mode='lines', name=f'ROC (AUC = {auc(fpr, tpr):.2f})'))
	            fig1.add_scatter(x=[0, 1], y=[0, 1], mode='lines', line=dict(dash='dash'), name='Random')
	            fig1.update_layout(title="ROC Curve")
	        else:
	            # Any other class count: show a confusion matrix instead.
	            class_labels = [str(label) for label in model.classes_]
	            matrix = confusion_matrix(y_test, model.predict(X_test), labels=model.classes_)
	            fig1 = go.Figure(data=go.Heatmap(z=matrix, x=class_labels, y=class_labels))
	            fig1.update_layout(
	                title="Confusion Matrix",
	                xaxis_title="Predicted",
	                yaxis_title="Actual",
	                xaxis_type="category",
	                yaxis_type="category",
	            )
	    else:  # Regression
	        scores = cross_val_score(model, X, y, cv=5, scoring='r2')
	        report = f"Cross-Validated R² Score: {np.mean(scores):.3f} ± {np.std(scores):.3f}"
	        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
	        model.fit(X_train, y_train)
	        preds = model.predict(X_test)
	        residuals = y_test - preds
	        fig1 = px.scatter(x=preds, y=residuals, title="Residuals vs. Predicted", labels={'x': 'Predicted', 'y': 'Residuals'})
	        fig1.add_hline(y=0, line_dash="dash")

	    if hasattr(model, 'feature_importances_'):
	        fi = pd.Series(model.feature_importances_, index=features).sort_values(ascending=False)
	        fig2 = px.bar(fi, title="Feature Importance")
	    else:
	        fig2 = go.Figure().update_layout(title="Feature Importance (Not available)")

	    return fig1, fig2, report

	def athena_respond(user_message, history, state, api_key):
	    """Handle one chat turn with the Athena AI co-pilot.

	    This is a generator: each UI update is yielded as
	    ``(history, fig, df_result, state)``. The model is asked for a JSON
	    object with "thought" and "code"; the thought is shown first, then the
	    code is run through ``safe_exec`` and its output appended.

	    Args:
	        user_message: The user's request text.
	        history: Chat history as a list of ``(user, bot)`` tuples; it is
	            appended to and updated in place.
	        state: Application state dict; ``state['df_modified']`` is exposed
	            to the generated code as ``df``.
	        api_key: Gemini API key.

	    Yields:
	        ``(history, fig, df_result, state)`` tuples for the chatbot, plot,
	        dataframe and state outputs.
	    """
	    # Early exits must yield before returning: a value passed to `return`
	    # inside a generator never reaches the caller iterating over it.
	    if not genai:
	        history.append((user_message, "Google AI library not installed. Cannot use Athena."))
	        yield history, None, None, state
	        return
	    if not api_key:
	        history.append((user_message, "Please enter your Gemini API key to use Athena."))
	        yield history, None, None, state
	        return
	    if not state or state.get('df_modified') is None:
	        history.append((user_message, "Please upload a dataset before asking Athena."))
	        yield history, None, None, state
	        return

	    history.append((user_message, None))

	    # DataFrame.info() prints its summary and returns None, so capture the
	    # text through `buf` to embed it in the prompt.
	    info_buffer = io.StringIO()
	    state['df_modified'].info(buf=info_buffer, verbose=False)
	    df_info = info_buffer.getvalue()

	    prompt = f"""
	    You are 'Athena', an AI data scientist. Your goal is to help a user by writing and executing Python code on a pandas DataFrame named `df`.

	    DataFrame Info:
	    {df_info}

	    Instructions:
	    1. Analyze the user's request: '{user_message}'.
	    2. Formulate a plan (thought).
	    3. Write Python code to execute the plan. You can use `pandas as pd`, `numpy as np`, and `plotly.express as px`.
	    4. To show a plot, assign it to a variable `fig`.
	    5. To show a dataframe, assign it to a variable `df_result`.
	    6. Use `print()` for text output.
	    7. NEVER modify `df` in place.
	    8. Respond ONLY with a single, valid JSON object with keys "thought" and "code".

	    Your JSON Response:
	    """
	    try:
	        # Configured inside the try so a bad key or model name is reported
	        # in the chat instead of raising out of the handler.
	        genai.configure(api_key=api_key)
	        model = genai.GenerativeModel('gemini-1.5-flash')

	        response = model.generate_content(prompt)
	        # The model may wrap its JSON in a Markdown code fence; strip it.
	        response_json = json.loads(response.text.strip().replace("```json", "").replace("```", ""))
	        thought = response_json.get("thought", "Thinking...")
	        code_to_run = response_json.get("code", "print('No code generated.')")

	        bot_thinking = f"🧠 Thinking: {thought}"
	        history[-1] = (user_message, bot_thinking)
	        yield history, None, None, state

	        local_vars = {'df': state['df_modified'], 'px': px, 'pd': pd, 'np': np}
	        stdout, fig_result, df_result, error = safe_exec(code_to_run, local_vars)

	        bot_response = bot_thinking + "\n\n---\n\n"
	        if error:
	            bot_response += f"💥 Error:\n```\n{error}\n```"
	        if stdout:
	            bot_response += f"📋 Output:\n```\n{stdout}\n```"
	        if not error and not stdout and not fig_result and not isinstance(df_result, pd.DataFrame):
	            bot_response += "✅ Code executed, but produced no direct output."

	        history[-1] = (user_message, bot_response)
	        state['chat_history'] = history  # Persist chat history
	        yield history, fig_result, df_result, state

	    except Exception as e:
	        error_msg = f"A critical error occurred with the AI model: {e}"
	        history[-1] = (user_message, error_msg)
	        yield history, None, None, state

	# --- UI Builder ---
	def build_ui():
	    """Construct the Gradio interface and wire up its event handlers.

	    The layout is a left sidebar (project controls and navigation buttons)
	    and a right workspace holding three panels, of which one is visible at
	    a time: Helios Overview, Prometheus Launchpad and Athena Co-pilot.

	    Returns:
	        The assembled ``gr.Blocks`` application (not yet launched).
	    """
	    # NOTE(review): the source this was recovered from had lost its
	    # indentation, so the container nesting below is reconstructed from the
	    # statement order. Confirm against the intended layout.
	    with gr.Blocks(theme=THEME, title="Odyssey AI Data Workspace") as demo:
	        state = gr.State(init_state())

	        with gr.Row():
	            # Left Sidebar - Command Center
	            with gr.Column(scale=1):
	                gr.Markdown("# 🦉 Odyssey")
	                with gr.Accordion("📂 Project", open=True):
	                    project_name_input = gr.Textbox(label="Project Name", value="New_Project")
	                    file_input = gr.File(label="Upload CSV", file_types=[".csv"])
	                    api_key_input = gr.Textbox(label="🔑 Gemini API Key", type="password", placeholder="Enter key...")
	                    with gr.Row():
	                        save_btn = gr.Button("Save")
	                        load_btn = gr.UploadButton("Load .odyssey")
	                    project_status = gr.Markdown()

	                # Navigation buttons
	                overview_btn = gr.Button(f"{ICONS['overview']} Helios Overview")
	                launchpad_btn = gr.Button(f"{ICONS['launchpad']} Prometheus Launchpad")
	                copilot_btn = gr.Button(f"{ICONS['copilot']} Athena Co-pilot")
	                export_btn = gr.Button(f"{ICONS['export']} Export Report", visible=False)

	            # Right Panel - Main Workspace
	            with gr.Column(scale=4):
	                # --- Helios Overview Panel ---
	                with gr.Column(visible=True) as overview_panel:
	                    gr.Markdown(f"# {ICONS['overview']} Helios Overview")
	                    helios_report_md = gr.Markdown("Upload a CSV and provide a project name to begin your Odyssey.")

	                # --- Prometheus Launchpad Panel ---
	                with gr.Column(visible=False) as launchpad_panel:
	                    gr.Markdown(f"# {ICONS['launchpad']} Prometheus Launchpad")
	                    with gr.Row():
	                        lp_target = gr.Dropdown(label="🎯 Target")
	                        # Multi-select dropdown so several features can be chosen.
	                        lp_features = gr.Dropdown(label="✨ Features", multiselect=True)
	                        lp_model = gr.Dropdown(choices=["Random Forest", "Logistic Regression", "Linear Regression"], label="🧠 Model")
	                    lp_run_btn = gr.Button("🚀 Launch Model Training (with CV)")
	                    lp_report_md = gr.Markdown()
	                    with gr.Row():
	                        lp_fig1 = gr.Plot()
	                        lp_fig2 = gr.Plot()

	                # --- Athena Co-pilot Panel ---
	                with gr.Column(visible=False) as copilot_panel:
	                    gr.Markdown(f"# {ICONS['copilot']} Athena Co-pilot")
	                    chatbot = gr.Chatbot(height=500, label="Chat History")
	                    with gr.Accordion("AI Generated Results", open=True):
	                        copilot_fig_output = gr.Plot()
	                        copilot_df_output = gr.DataFrame(interactive=False)
	                    chat_input = gr.Textbox(label="Your Request", placeholder="e.g., 'What's the correlation between all numeric columns?'")
	                    chat_submit = gr.Button("Send", variant="primary")

	        # --- Event Handling ---
	        panels = [overview_panel, launchpad_panel, copilot_panel]

	        def switch_panel(btn_idx):
	            """Return visibility updates that show only panel ``btn_idx``."""
	            return [gr.update(visible=i == btn_idx) for i in range(len(panels))]

	        overview_btn.click(lambda: switch_panel(0), None, panels)
	        launchpad_btn.click(lambda: switch_panel(1), None, panels)
	        copilot_btn.click(lambda: switch_panel(2), None, panels)

	        def on_upload_or_load(state_data):
	            """Refresh the overview, column dropdowns and chat from ``state_data``."""
	            if not state_data:
	                state_data = init_state()

	            helios_md = "No data loaded."
	            if state_data.get('insights'):
	                insights = state_data['insights']
	                md = f"## 🔭 Proactive Insights for `{state_data.get('project_name')}`\n"
	                md += f"Dataset has {state_data['metadata']['shape'][0]} rows and {state_data['metadata']['shape'][1]} columns.\n\n"
	                if suggestions := insights.get('ml_suggestions'):
	                    md += "### 🔮 Potential ML Targets\n" + "\n".join(f"- `{s}`" for s in suggestions) + "\n"
	                if not insights.get('missing_data', pd.Series()).empty:
	                    md += "\n### 💧 Missing Data\nFound missing values in these columns:\n" + insights['missing_data'].to_frame('Missing Count').to_markdown() + "\n"
	                helios_md = md

	            # A blank state stores metadata=None under an existing key, so a
	            # .get() default would not apply; fall back with `or` instead.
	            all_cols = (state_data.get('metadata') or {}).get('columns', [])
	            return {
	                state: state_data,
	                helios_report_md: helios_md,
	                lp_target: gr.update(choices=all_cols),
	                lp_features: gr.update(choices=all_cols),
	                chatbot: state_data.get('chat_history') or [],
	            }

	        refresh_outputs = [state, helios_report_md, lp_target, lp_features, chatbot]
	        file_input.upload(prime_data, [file_input, project_name_input], state).then(
	            on_upload_or_load, state, refresh_outputs
	        )
	        load_btn.upload(load_project, load_btn, state).then(
	            on_upload_or_load, state, refresh_outputs
	        )
	        save_btn.click(save_project, state, project_status)

	        lp_run_btn.click(prometheus_run_model, [state, lp_target, lp_features, lp_model], [lp_fig1, lp_fig2, lp_report_md])

	        # After Athena finishes, clear the request box.
	        chat_submit.click(
	            athena_respond,
	            [chat_input, chatbot, state, api_key_input],
	            [chatbot, copilot_fig_output, copilot_df_output, state]
	        ).then(lambda: "", outputs=chat_input)

	    return demo

	# --- Main Execution ---
	if __name__ == "__main__":
	    # Build the interface once, then serve it with Gradio's debug mode on.
	    app = build_ui()
	    app.launch(debug=True)