# Spaces: Sleeping  (page-scrape header; kept as a comment so the file parses as Python)
# Odyssey - The AI Data Science Workspace
# A state-of-the-art, AI-native analytic environment.
# This script is a complete, self-contained Gradio application.
import io
import json
import logging
import os
import pickle
import re
import uuid
import warnings
from contextlib import redirect_stdout
from datetime import datetime

import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# ML & Preprocessing Imports
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import roc_curve, auc, confusion_matrix, r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder

# Optional: For AI features
try:
    import google.generativeai as genai
except ImportError:
    print("Warning: 'google-generativeai' not found. AI features will be disabled.")
    genai = None
# --- Configuration --- | |
# Silence library warnings globally and use one timestamped log format.
warnings.filterwarnings('ignore')
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- UI Theme & Icons ---
# CSS variable overrides applied on top of the Monochrome base theme.
_THEME_OVERRIDES = {
    "body_background_fill": "radial-gradient(circle, rgba(10,20,50,1) 0%, rgba(0,0,10,1) 100%);",
    "block_label_background_fill": "rgba(255,255,255,0.05)",
    "block_background_fill": "rgba(255,255,255,0.05)",
    "button_primary_background_fill": "linear-gradient(90deg, #6A11CB 0%, #2575FC 100%)",
    "button_secondary_background_fill": "linear-gradient(90deg, #556270 0%, #4ECDC4 100%)",
    "color_accent_soft": "rgba(255,255,255,0.2)",
}
THEME = gr.themes.Monochrome(
    primary_hue="indigo",
    secondary_hue="blue",
    neutral_hue="slate",
).set(**_THEME_OVERRIDES)

# Icon prefix for each module's navigation button and panel heading.
# NOTE(review): these glyphs look like mis-decoded emoji; they are kept
# byte-for-byte here -- confirm the intended characters against the original.
ICONS = dict(
    overview="๐ญ",
    medic="๐งช",
    launchpad="๐",
    copilot="๐ก",
    export="๐",
)
# --- Helper Functions --- | |
def safe_exec(code_string: str, local_vars: dict) -> tuple:
    """Execute a string of Python code and capture what it prints.

    The code runs in ONE merged namespace: a copy of this module's globals
    overlaid with ``local_vars``. Using a single dict (instead of separate
    globals/locals) matters because functions defined by the executed code
    resolve free names through their globals; with split namespaces a helper
    such as ``def f(): return df.shape`` would fail with ``NameError``.
    Working on a copy also stops the executed code from rebinding this
    module's own globals.

    SECURITY NOTE: this is *not* a sandbox. "Safe" only means exceptions are
    caught and reported; the code still has full interpreter access.

    Args:
        code_string: Python source to execute.
        local_vars: Names made available to the code. Names the code creates
            or rebinds are written back into this dict on success.

    Returns:
        ``(stdout, fig, df_result, error)``. On success ``error`` is ``None``
        and ``fig`` / ``df_result`` are whatever the code bound to those names
        (or ``None``). On failure the first three items are ``None`` and
        ``error`` is an "Execution Error: ..." message.
    """
    module_globals = globals()
    namespace = {**module_globals, **local_vars}
    output_buffer = io.StringIO()
    try:
        with redirect_stdout(output_buffer):
            exec(code_string, namespace)
    except Exception as e:
        return None, None, None, f"Execution Error: {str(e)}"

    # Publish results back to the caller's dict, skipping untouched module
    # globals so the caller does not receive a dump of this module.
    for name, value in namespace.items():
        if name not in module_globals or module_globals[name] is not value:
            local_vars[name] = value

    return output_buffer.getvalue(), local_vars.get('fig'), local_vars.get('df_result'), None
# --- Core State & Project Management --- | |
def init_state():
    """Return a fresh, empty application state dictionary.

    Every data slot starts as ``None``; ``chat_history`` starts as a new empty
    list so separate states never share one list object.
    """
    empty_slots = ("project_name", "df_original", "df_modified", "metadata", "insights")
    state = dict.fromkeys(empty_slots)
    state["chat_history"] = []
    return state
def save_project(state):
    """Pickle the whole application state to ``<project_name>.odyssey``.

    The project name comes from a user-editable textbox, so it is reduced to
    a plain file name before use: directory components are dropped and any
    character outside ``[A-Za-z0-9_.-]`` (Unicode word characters included)
    becomes ``_``. This keeps the file in the current working directory.

    Args:
        state: The application state dict (see ``init_state``).

    Returns:
        A ``gr.update`` carrying a status message for the project status
        component: success, "needs a name", or the reason the save failed.
    """
    needs_name = gr.update(value="Project needs a name to save.", interactive=True)
    if not state or not state.get("project_name"):
        return needs_name

    base_name = os.path.basename(str(state["project_name"]).strip())
    safe_name = re.sub(r"[^\w.-]+", "_", base_name).strip("._")
    if not safe_name:
        return needs_name

    filename = f"{safe_name}.odyssey"
    try:
        with open(filename, "wb") as f:
            pickle.dump(state, f)
    except (OSError, pickle.PicklingError, TypeError, AttributeError) as exc:
        # Report the failure in the UI instead of crashing the event handler.
        logging.error("Could not save project to %s: %s", filename, exc)
        return gr.update(value=f"Could not save project: {exc}", interactive=True)

    # NOTE(review): the leading glyph below looks like a mis-decoded emoji;
    # kept byte-for-byte -- confirm the intended character.
    return gr.update(value=f"โ Project saved to {filename}", interactive=True)
def load_project(file_obj):
    """Load a ``.odyssey`` file and return the application state it holds.

    Args:
        file_obj: The uploaded file. Either an object exposing the temp-file
            path as ``.name`` or a plain path string; both are accepted.
            A falsy value yields a blank state.

    Returns:
        The unpickled state dict, or ``init_state()`` when nothing was given.

    Raises:
        ValueError: If the file cannot be unpickled or does not hold a dict.
    """
    if not file_obj:
        return init_state()

    path = getattr(file_obj, "name", file_obj)

    # SECURITY NOTE: pickle.load can execute arbitrary code embedded in the
    # file. Only load .odyssey files from a trusted source; a data-only format
    # would be needed to accept untrusted uploads safely.
    try:
        with open(path, "rb") as f:
            loaded = pickle.load(f)
    except (pickle.UnpicklingError, AttributeError, EOFError, ImportError, IndexError) as exc:
        raise ValueError(f"Could not read project file: {exc}") from exc

    # Downstream handlers call ``.get`` on the state, so anything other than a
    # dict would only fail later with a confusing error.
    if not isinstance(loaded, dict):
        raise ValueError("Not a valid .odyssey project file.")
    return loaded
def prime_data(file_obj, project_name):
    """Load a CSV, infer column types, analyse it, and build the initial state.

    Object columns are first tried as datetimes. Those that do not parse are
    converted to ``category`` when fewer than half of the rows hold distinct
    values (and at least one value exists).

    Args:
        file_obj: The uploaded CSV. Either an object exposing the temp-file
            path as ``.name`` or a plain path string. A falsy value yields a
            blank state.
        project_name: Name for the project; when empty or whitespace-only a
            timestamped default is used.

    Returns:
        A state dict holding the original frame, a working copy, the metadata
        and the Helios insights.
    """
    if not file_obj:
        return init_state()

    csv_path = getattr(file_obj, "name", file_obj)
    df = pd.read_csv(csv_path)
    n_rows = len(df)

    for col in df.select_dtypes(include=['object']).columns:
        try:
            df[col] = pd.to_datetime(df[col], errors='raise')
        except (ValueError, TypeError, OverflowError):
            # Not a date column: treat low-cardinality text as categorical.
            # The n_rows check avoids dividing by zero on a header-only file.
            if n_rows and 0.0 < df[col].nunique() / n_rows < 0.5:
                df[col] = df[col].astype('category')

    metadata = extract_metadata(df)
    insights = run_helios_engine(df, metadata)
    clean_name = (project_name or "").strip()
    return {
        "project_name": clean_name or f"Project_{datetime.now().strftime('%Y%m%d_%H%M')}",
        "df_original": df, "df_modified": df.copy(), "metadata": metadata,
        "insights": insights, "chat_history": []
    }
def extract_metadata(df):
    """Summarise a DataFrame's schema.

    Args:
        df: The DataFrame to describe.

    Returns:
        A dict with:
        - ``shape``: ``df.shape``
        - ``columns``: all column labels
        - ``numeric``: columns selected by ``np.number``
        - ``categorical``: ``object`` and ``category`` columns
        - ``datetime``: naive *and* timezone-aware datetime columns
        - ``dtypes``: mapping of column label to dtype name
    """
    def columns_of(kinds):
        return df.select_dtypes(include=kinds).columns.tolist()

    return {
        'shape': df.shape,
        'columns': df.columns.tolist(),
        'numeric': columns_of(np.number),
        'categorical': columns_of(['object', 'category']),
        # 'datetime' alone selects only naive datetime64 columns; tz-aware
        # columns have to be requested separately via 'datetimetz'.
        'datetime': columns_of(['datetime', 'datetimetz']),
        'dtypes': {col: dtype.name for col, dtype in df.dtypes.items()},
    }
# --- Module-Specific Handlers --- | |
def run_helios_engine(df, metadata):
    """Compute the proactive insights shown in the Helios Overview.

    Args:
        df: The DataFrame to analyse.
        metadata: Schema dict; only its ``'numeric'`` and ``'categorical'``
            column lists are read here.

    Returns:
        A dict with:
        - ``missing_data``: Series of null counts (> 0 only), descending
        - ``high_cardinality``: categorical column -> distinct count (> 50)
        - ``outliers``: numeric column -> count of values outside 1.5 * IQR
        - ``ml_suggestions``: "<col> (Classification)" for categorical columns
          with exactly two values, then "<col> (Regression)" for numeric
          columns with more than 20 distinct values
    """
    categorical = metadata['categorical']
    numeric = metadata['numeric']
    insights = {}

    missing = df.isnull().sum()
    insights['missing_data'] = missing[missing > 0].sort_values(ascending=False)

    # Distinct counts are needed by several checks below; compute each once.
    unique_counts = {col: df[col].nunique() for col in [*categorical, *numeric]}

    insights['high_cardinality'] = {
        col: unique_counts[col] for col in categorical if unique_counts[col] > 50
    }

    outliers = {}
    for col in numeric:
        q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
        spread = q3 - q1
        outside = (df[col] < (q1 - 1.5 * spread)) | (df[col] > (q3 + 1.5 * spread))
        count = int(outside.sum())  # plain int rather than a NumPy scalar
        if count > 0:
            outliers[col] = count
    insights['outliers'] = outliers

    suggestions = [f"{col} (Classification)" for col in categorical if unique_counts[col] == 2]
    suggestions += [f"{col} (Regression)" for col in numeric if unique_counts[col] > 20]
    insights['ml_suggestions'] = suggestions
    return insights
def prometheus_run_model(state, target, features, model_name):
    """Train and evaluate a model for the Prometheus Launchpad.

    Rows with missing values in the selected columns are dropped, and
    ``object`` / ``category`` columns are label-encoded. The task is treated
    as classification when the target has at most 10 distinct values,
    otherwise as regression. The model is scored with 5-fold cross-validation
    and then refit on a 70/30 split to produce the diagnostic plot.

    Args:
        state: Application state dict; ``state['df_modified']`` is read.
        target: Name of the target column.
        features: Feature column name(s); the target is removed if present.
        model_name: "Random Forest", "Logistic Regression" or
            "Linear Regression".

    Returns:
        ``(fig1, fig2, report)``. ``fig1`` is a ROC curve (binary
        classification), a confusion-matrix heatmap (other classification) or
        a residual plot (regression). ``fig2`` is the feature-importance
        chart, or a placeholder figure when the model exposes none. On invalid
        input both figures are ``None`` and ``report`` explains why.
    """
    if not target or not features:
        return None, None, "Select target and features."

    source_df = (state or {}).get('df_modified')
    if source_df is None:
        return None, None, "No data loaded. Upload a CSV or load a project first."

    if isinstance(features, str):
        features = [features]
    # A target listed among its own predictors leaks the label into the model.
    features = [col for col in features if col != target]
    if not features:
        return None, None, "Select target and features."

    df = source_df.dropna(subset=[target] + features).copy()
    if len(df) < 5:
        return None, None, "Not enough complete rows for 5-fold cross-validation."

    for col in [target] + features:
        if df[col].dtype.name in ['category', 'object']:
            df[col] = LabelEncoder().fit_transform(df[col])

    X, y = df[features], df[target]
    n_classes = y.nunique()
    if n_classes < 2:
        return None, None, "Target needs at least two distinct values."
    problem_type = "Classification" if n_classes <= 10 else "Regression"

    MODELS = {"Classification": {"Random Forest": RandomForestClassifier, "Logistic Regression": LogisticRegression},
              "Regression": {"Random Forest": RandomForestRegressor, "Linear Regression": LinearRegression}}
    if model_name not in MODELS[problem_type]:
        return None, None, "Invalid model for this problem type."

    model_cls = MODELS[problem_type][model_name]
    # Not every estimator takes random_state (LinearRegression does not), so
    # only pass it where the estimator declares it.
    seed_kwargs = {"random_state": 42} if "random_state" in model_cls().get_params() else {}
    model = model_cls(**seed_kwargs)

    if problem_type == "Classification":
        scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
        report = f"**Cross-Validated Accuracy:** {np.mean(scores):.3f} ± {np.std(scores):.3f}"
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
        model.fit(X_train, y_train)
        class_labels = model.classes_
        if len(class_labels) == 2:
            # Column 1 of predict_proba belongs to classes_[1]; name it as the
            # positive label so targets not coded as {0, 1} still work.
            y_prob = model.predict_proba(X_test)[:, 1]
            fpr, tpr, _ = roc_curve(y_test, y_prob, pos_label=class_labels[1])
            fig1 = go.Figure(data=go.Scatter(x=fpr, y=tpr, mode='lines', name=f'ROC (AUC = {auc(fpr, tpr):.2f})'))
            fig1.add_scatter(x=[0, 1], y=[0, 1], mode='lines', line=dict(dash='dash'), name='Random')
            fig1.update_layout(title="ROC Curve")
        else:
            # A single ROC curve is undefined for more than two classes; show
            # the confusion matrix on the held-out split instead.
            matrix = confusion_matrix(y_test, model.predict(X_test), labels=class_labels)
            tick_labels = [str(label) for label in class_labels]
            fig1 = go.Figure(data=go.Heatmap(z=matrix, x=tick_labels, y=tick_labels))
            fig1.update_layout(title="Confusion Matrix", xaxis_title="Predicted", yaxis_title="Actual")
    else:  # Regression
        scores = cross_val_score(model, X, y, cv=5, scoring='r2')
        report = f"**Cross-Validated R² Score:** {np.mean(scores):.3f} ± {np.std(scores):.3f}"
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        residuals = y_test - preds
        fig1 = px.scatter(x=preds, y=residuals, title="Residuals vs. Predicted", labels={'x': 'Predicted', 'y': 'Residuals'})
        fig1.add_hline(y=0, line_dash="dash")

    if hasattr(model, 'feature_importances_'):
        fi = pd.Series(model.feature_importances_, index=features).sort_values(ascending=False)
        fig2 = px.bar(fi, title="Feature Importance")
    else:
        fig2 = go.Figure().update_layout(title="Feature Importance (Not available)")
    return fig1, fig2, report
def athena_respond(user_message, history, state, api_key):
    """Handle one chat turn with the Athena AI co-pilot (generator).

    Asks the Gemini model for a JSON object with "thought" and "code", shows
    the thought, runs the code through ``safe_exec`` and then shows the
    result.

    This function is a generator, so every exit path must ``yield`` its
    update: a bare ``return value`` inside a generator is never delivered to
    the caller.

    Args:
        user_message: The user's request text.
        history: Chat history as a list of ``(user, bot)`` tuples; extended
            in place.
        state: Application state dict; ``df_modified`` is read and
            ``chat_history`` is written.
        api_key: Gemini API key.

    Yields:
        ``(history, fig, df_result, state)`` tuples: first the model's
        "thinking" message, then the final answer (or a single error message).
    """
    if history is None:
        history = []

    def reply_only(message):
        history.append((user_message, message))
        return history, None, None, state

    if not genai:
        yield reply_only("Google AI library not installed. Cannot use Athena.")
        return
    if not api_key:
        yield reply_only("Please enter your Gemini API key to use Athena.")
        return
    df = (state or {}).get('df_modified')
    if df is None:
        yield reply_only("Please upload a CSV or load a project before asking Athena.")
        return

    history.append((user_message, None))
    try:
        # Configure the API
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-flash')

        # DataFrame.info() prints and returns None, so capture its text in a
        # buffer; interpolating the call directly would put "None" in the prompt.
        info_buffer = io.StringIO()
        df.info(verbose=False, buf=info_buffer)
        df_info = info_buffer.getvalue().strip()

        prompt = "\n".join([
            "",
            "You are 'Athena', an AI data scientist. Your goal is to help a user by writing and executing Python code on a pandas DataFrame named `df`.",
            "**DataFrame Info:**",
            df_info,
            "**Instructions:**",
            f"1. Analyze the user's request: '{user_message}'.",
            "2. Formulate a plan (thought).",
            "3. Write Python code to execute the plan. You can use `pandas as pd`, `numpy as np`, and `plotly.express as px`.",
            "4. To show a plot, assign it to a variable `fig`.",
            "5. To show a dataframe, assign it to a variable `df_result`.",
            "6. Use `print()` for text output.",
            "7. **NEVER** modify `df` in place.",
            '8. Respond **ONLY** with a single, valid JSON object with keys "thought" and "code".',
            "**Your JSON Response:**",
            "",
        ])

        response = model.generate_content(prompt)
        response_json = json.loads(response.text.strip().replace("```json", "").replace("```", ""))
        thought = response_json.get("thought", "Thinking...")
        code_to_run = response_json.get("code", "print('No code generated.')")

        # NOTE(review): the glyphs leading the chat strings below look like
        # mis-decoded emoji; kept byte-for-byte -- confirm the intended ones.
        bot_thinking = f"๐ง **Thinking:** *{thought}*"
        history[-1] = (user_message, bot_thinking)
        yield history, None, None, state

        # Run on a copy so generated code cannot alter the project data, even
        # if it ignores the "never modify df in place" instruction.
        local_vars = {'df': df.copy(), 'px': px, 'pd': pd, 'np': np}
        stdout, fig_result, df_result, error = safe_exec(code_to_run, local_vars)

        bot_response = bot_thinking + "\n\n---\n\n"
        if error:
            bot_response += f"๐ฅ **Error:**\n```\n{error}\n```"
        if stdout:
            bot_response += f"๐ **Output:**\n```\n{stdout}\n```"
        if not error and not stdout and fig_result is None and not isinstance(df_result, pd.DataFrame):
            bot_response += "โ Code executed, but produced no direct output."
        history[-1] = (user_message, bot_response)
        state['chat_history'] = history  # Persist chat history
        yield history, fig_result, df_result, state
    except Exception as e:
        # Top-level boundary for the chat turn: surface the failure in the
        # chat rather than crashing the event handler.
        error_msg = f"A critical error occurred with the AI model: {e}"
        history[-1] = (user_message, error_msg)
        state['chat_history'] = history
        yield history, None, None, state
# --- UI Builder --- | |
def build_ui():
    """Construct the whole Gradio interface and wire up its events.

    Layout: a left sidebar (project controls, API key, navigation buttons)
    and a right workspace holding three panels -- Helios Overview, Prometheus
    Launchpad and Athena Co-pilot -- of which one is visible at a time.

    NOTE(review): the original indentation was lost, so the nesting of a few
    components (the API-key box relative to the "Project" accordion, the model
    dropdown relative to the target/features row) is a best reconstruction --
    confirm against the intended layout. Non-ASCII label glyphs look
    mis-decoded and are kept byte-for-byte.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    with gr.Blocks(theme=THEME, title="Odyssey AI Data Workspace") as demo:
        state = gr.State(init_state())
        with gr.Row():
            # Left Sidebar - Command Center
            with gr.Column(scale=1):
                gr.Markdown("# ๐ฆ Odyssey")
                with gr.Accordion("๐ Project", open=True):
                    project_name_input = gr.Textbox(label="Project Name", value="New_Project")
                    file_input = gr.File(label="Upload CSV", file_types=[".csv"])
                    api_key_input = gr.Textbox(label="๐ Gemini API Key", type="password", placeholder="Enter key...")
                    with gr.Row():
                        save_btn = gr.Button("Save")
                        load_btn = gr.UploadButton("Load .odyssey")
                    project_status = gr.Markdown()
                # Navigation buttons
                overview_btn = gr.Button(f"{ICONS['overview']} Helios Overview")
                launchpad_btn = gr.Button(f"{ICONS['launchpad']} Prometheus Launchpad")
                copilot_btn = gr.Button(f"{ICONS['copilot']} Athena Co-pilot")
                export_btn = gr.Button(f"{ICONS['export']} Export Report", visible=False)
            # Right Panel - Main Workspace
            with gr.Column(scale=4):
                # --- Helios Overview Panel ---
                with gr.Column(visible=True) as overview_panel:
                    gr.Markdown(f"# {ICONS['overview']} Helios Overview")
                    helios_report_md = gr.Markdown("Upload a CSV and provide a project name to begin your Odyssey.")
                # --- Prometheus Launchpad Panel ---
                with gr.Column(visible=False) as launchpad_panel:
                    gr.Markdown(f"# {ICONS['launchpad']} Prometheus Launchpad")
                    with gr.Row():
                        lp_target = gr.Dropdown(label="๐ฏ Target")
                        # Multi-select dropdown so several features can be chosen.
                        lp_features = gr.Dropdown(label="โจ Features", multiselect=True)
                        lp_model = gr.Dropdown(choices=["Random Forest", "Logistic Regression", "Linear Regression"], label="๐ง Model")
                    lp_run_btn = gr.Button("๐ Launch Model Training (with CV)")
                    lp_report_md = gr.Markdown()
                    with gr.Row():
                        lp_fig1 = gr.Plot()
                        lp_fig2 = gr.Plot()
                # --- Athena Co-pilot Panel ---
                with gr.Column(visible=False) as copilot_panel:
                    gr.Markdown(f"# {ICONS['copilot']} Athena Co-pilot")
                    chatbot = gr.Chatbot(height=500, label="Chat History")
                    with gr.Accordion("AI Generated Results", open=True):
                        copilot_fig_output = gr.Plot()
                        copilot_df_output = gr.DataFrame(interactive=False)
                    chat_input = gr.Textbox(label="Your Request", placeholder="e.g., 'What's the correlation between all numeric columns?'")
                    chat_submit = gr.Button("Send", variant="primary")

        # --- Event Handling ---
        panels = [overview_panel, launchpad_panel, copilot_panel]

        def switch_panel(btn_idx):
            """Show only the panel at ``btn_idx``; hide the others."""
            return [gr.update(visible=i == btn_idx) for i in range(len(panels))]

        overview_btn.click(lambda: switch_panel(0), None, panels)
        launchpad_btn.click(lambda: switch_panel(1), None, panels)
        copilot_btn.click(lambda: switch_panel(2), None, panels)

        def on_upload_or_load(state_data):
            """Refresh the UI after a CSV upload or a project load."""
            state_data = state_data or init_state()
            # A blank state stores metadata/insights as None (the key exists),
            # so ``.get(key, {})`` would return None; use ``or`` instead.
            metadata = state_data.get('metadata') or {}
            insights = state_data.get('insights')

            helios_md = "No data loaded."
            if insights:
                md = f"## ๐ญ Proactive Insights for `{state_data.get('project_name')}`\n"
                md += f"Dataset has **{metadata['shape'][0]} rows** and **{metadata['shape'][1]} columns**.\n\n"
                if suggestions := insights.get('ml_suggestions'):
                    md += "### ๐ฎ Potential ML Targets\n" + "\n".join(f"- `{s}`" for s in suggestions) + "\n"
                missing = insights.get('missing_data')
                if missing is not None and not missing.empty:
                    try:
                        table = missing.to_frame('Missing Count').to_markdown()
                    except ImportError:
                        # to_markdown needs the optional 'tabulate' package;
                        # fall back to a bullet list when it is absent.
                        table = "\n".join(f"- `{col}`: {count}" for col, count in missing.items())
                    md += "\n### ๐ง Missing Data\nFound missing values in these columns:\n" + table + "\n"
                helios_md = md

            all_cols = metadata.get('columns', [])
            return {
                state: state_data,
                helios_report_md: helios_md,
                lp_target: gr.update(choices=all_cols),
                lp_features: gr.update(choices=all_cols),
                chatbot: state_data.get('chat_history', [])
            }

        refresh_outputs = [state, helios_report_md, lp_target, lp_features, chatbot]
        file_input.upload(prime_data, [file_input, project_name_input], state).then(
            on_upload_or_load, state, refresh_outputs
        )
        load_btn.upload(load_project, load_btn, state).then(
            on_upload_or_load, state, refresh_outputs
        )
        save_btn.click(save_project, state, project_status)
        lp_run_btn.click(prometheus_run_model, [state, lp_target, lp_features, lp_model], [lp_fig1, lp_fig2, lp_report_md])
        chat_submit.click(
            athena_respond,
            [chat_input, chatbot, state, api_key_input],
            [chatbot, copilot_fig_output, copilot_df_output, state]
        ).then(lambda: "", outputs=chat_input)
    return demo
# --- Main Execution --- | |
if __name__ == "__main__":
    # Script entry point: build the interface, then serve it with debug=True.
    app = build_ui()
    app.launch(debug=True)