DashBoard / app.py
MatteoScript's picture
Update app.py
8d5b1f3 verified
raw
history blame
92.4 kB
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from io import BytesIO # Importa BytesIO per gestire file in memoria
try:
import scipy.stats # Per correlazione spearman opzionale
SCIPY_AVAILABLE = True
except ImportError:
SCIPY_AVAILABLE = False
# Sposta l'avviso della libreria scipy dopo il caricamento del file,
# così non appare se non viene caricato nessun file.
# st.sidebar.warning("Libreria 'scipy' non trovata...") # Rimosso da qui
# --- Configuration ---
# Page-level Streamlit settings: wide layout and the browser tab title.
st.set_page_config(layout="wide", page_title="Dashboard Analisi Clima")
# --- Constants & Helper Functions ---
# (low, high) score ranges mapped to the sentiment bucket label.
SCORE_BUCKETS = {
(0, 2.5): "Critico",
(2.5, 4.5): "Neutrale",
(4.5, 7): "Positivo" # Assumes the scale tops out at 6; 7 simply covers everything > 4.5
}
# Chart color associated with each sentiment bucket label.
BUCKET_COLORS = {"Critico": "#d62728", "Neutrale": "#ff7f0e", "Positivo": "#2ca02c"}
THRESHOLD_LOW = 3.0 # Slightly adjusted for the bullet chart
THRESHOLD_HIGH = 4.5 # Slightly adjusted for the bullet chart
# Plotly template passed to the figures; alternatives tried: "seaborn", "plotly_dark", "ggplot2".
PLOTLY_TEMPLATE = "plotly_white"
def categorize_score(score):
    """Return the sentiment bucket label for a single survey score.

    Missing values (anything ``pd.isna`` flags) map to "Non Risposto".
    Scores in [0, 2.5] are "Critico", scores in (2.5, 4.5] are "Neutrale"
    and scores in (4.5, 7] are "Positivo"; the upper bounds are inclusive,
    so exactly 2.5 and exactly 4.5 land in the lower bucket.  Any other
    value yields "Sconosciuto".
    """
    if pd.isna(score):
        return "Non Risposto"
    # Anything outside the 0-7 window is not expected for valid survey data.
    if not 0 <= score <= 7:
        return "Sconosciuto"
    # Inside the window the buckets are told apart by their upper bound only.
    if score <= 2.5:
        label = "Critico"
    elif score <= 4.5:
        label = "Neutrale"
    else:
        label = "Positivo"
    return label
@st.cache_data
def load_and_prepare_data(uploaded_file_object):
    """Parse an uploaded survey CSV and derive the metadata the dashboard needs.

    The file is read as a ``;``-delimited CSV.  Columns whose CSV header
    starts with ``Unnamed:`` are dropped.  The first data row is then
    promoted to be the column header, while the CSV's own header names are
    kept aside and used to infer each question's category.  The first three
    columns are renamed to ``Genere``, ``Fascia_Eta`` and ``Sede``; every
    other column is treated as a question and converted to numeric.

    Args:
        uploaded_file_object: In-memory uploaded file exposing ``getvalue()``
            (bytes), or ``None`` when nothing has been uploaded.

    Returns:
        A 7-tuple ``(df, demographic_cols, question_cols,
        question_to_category_map, numeric_question_cols, response_scale,
        overall_satisfaction_question)``.  ``response_scale`` is the
        ``(min, max)`` observed across numeric question columns, falling back
        to ``(1, 6)``.  ``overall_satisfaction_question`` is ``None`` when no
        candidate column is found.  All seven items are ``None`` when no file
        was given, the CSV could not be read, or it contains no rows.
    """
    failure = (None,) * 7
    if uploaded_file_object is None:
        return failure

    try:
        raw_bytes = uploaded_file_object.getvalue()
        try:
            df_orig = pd.read_csv(BytesIO(raw_bytes), delimiter=';', encoding='utf-8')
        except UnicodeDecodeError:
            # latin-1 maps every byte value, so this retry cannot fail on
            # decoding; a further 'iso-8859-1' attempt would be the same codec.
            df_orig = pd.read_csv(BytesIO(raw_bytes), delimiter=';', encoding='latin-1')
    except Exception as e:
        st.error(f"Errore durante la lettura del CSV caricato: {e}")
        return failure

    unnamed_cols = [col for col in df_orig.columns if str(col).startswith('Unnamed:')]
    df = df_orig.drop(columns=unnamed_cols)

    # The first data row carries the real column headers; without it there is
    # nothing to promote and ``iloc[0]`` would raise IndexError.
    if len(df.index) == 0:
        st.error("Il file CSV caricato non contiene righe di dati.")
        return failure

    csv_columns = df.columns.tolist()
    header_cells = df.iloc[0].tolist()
    df = df.iloc[1:].reset_index(drop=True)

    # Clean the promoted header: empty cells fall back to the CSV column name,
    # or to a positional placeholder when that name is not usable either.
    cleaned_header = []
    for position, (cell, csv_name) in enumerate(zip(header_cells, csv_columns)):
        text = str(cell).strip() if pd.notna(cell) else ""
        if not text:
            fallback = str(csv_name).strip()
            if fallback and not fallback.startswith('Unnamed:'):
                text = fallback
            else:
                text = f"Colonna_Sconosciuta_{position}"
        cleaned_header.append(text)

    # Make column names unique: the n-th repeat of a name gets the suffix
    # "_n".  Tracking every emitted name also prevents a generated "X_1" from
    # clashing with a column literally called "X_1".
    occurrences = {}
    used_names = set()
    final_header = []
    for col_name in cleaned_header:
        unique_name = col_name
        while unique_name in used_names:
            occurrences[col_name] = occurrences.get(col_name, 0) + 1
            unique_name = f"{col_name}_{occurrences[col_name]}"
        used_names.add(unique_name)
        final_header.append(unique_name)
    df.columns = final_header

    # --- Category Mapping ---
    def infer_category(csv_col_name):
        """Derive a category label from a column name of the CSV header row."""
        name = str(csv_col_name).strip()
        base_name = name.split('.')[0].strip()
        is_generic = name.isdigit() or 'domanda' in name.lower()
        if pd.notna(csv_col_name) and not is_generic and base_name:
            return base_name
        if '.' in name:
            # A generic name still yields a category when it ends in ".<digits>"
            # (presumably the suffix pandas appends to repeated header names).
            if name.split('.')[-1].isdigit() and base_name:
                return base_name
        elif not is_generic and name:
            return name
        return "Categoria Sconosciuta"

    # The first (up to) three columns are demographics; the rest are questions.
    demographic_count = min(3, len(final_header))
    question_to_category_map = {}
    question_pairs = zip(final_header[demographic_count:], csv_columns[demographic_count:])
    for final_col_name, csv_name in question_pairs:
        category = infer_category(csv_name)
        category = category.replace("Parità di genere", "Parità Genere")
        question_to_category_map[final_col_name] = category

    # --- Demographic Columns ---
    # zip() stops at the shorter input, so files with fewer than three columns
    # simply get fewer demographic columns.
    rename_map = dict(zip(final_header, ('Genere', 'Fascia_Eta', 'Sede')))
    df = df.rename(columns=rename_map)
    demographic_cols = list(rename_map.values())

    # Filter out rows whose 'Sede' cell holds a summary label instead of a site.
    if 'Sede' in df.columns:
        anomalous_sede = ['Media', 'Mediana', 'Media sezione', 'Totale', 'Scarto quadratico medio']
        summary_labels = {label.lower() for label in anomalous_sede}
        is_summary_row = df['Sede'].astype(str).str.strip().str.lower().isin(summary_labels)
        # .copy() so the column assignments below act on an independent frame.
        df = df[~is_summary_row].copy()

    # Fill missing demographic data with a single explicit label.
    for col in demographic_cols:
        if col in df.columns:
            df[col] = df[col].astype(str).fillna('Non specificato').replace(['nan', 'None', ''], 'Non specificato')

    # Identify question columns based on the map (final unique names).
    question_cols = [col for col in question_to_category_map if col in df.columns]

    # --- Type Conversion ---
    # Convert every non-numeric question column (object or string dtype):
    # accept comma decimals, blank out placeholders, coerce the rest to NaN.
    for col in question_cols:
        if not pd.api.types.is_numeric_dtype(df[col]):
            as_text = df[col].astype(str).str.replace(',', '.', regex=False)
            as_text = as_text.replace(['nan', 'N/A', '', '-', 'None'], np.nan, regex=False)
            df[col] = pd.to_numeric(as_text, errors='coerce')
    numeric_question_cols = df[question_cols].select_dtypes(include=np.number).columns.tolist()

    # Determine the response scale from the data; (1, 6) is the fallback.
    response_scale = (1, 6)
    if numeric_question_cols:
        # Ignore rows where ALL numeric questions are NaN.
        answered = df[numeric_question_cols].dropna(how='all')
        if not answered.empty:
            min_val = answered.min(skipna=True).min(skipna=True)
            max_val = answered.max(skipna=True).max(skipna=True)
            if pd.notna(min_val) and pd.notna(max_val):
                response_scale = (min_val, max_val)

    # --- Identify Overall Satisfaction Question ---
    overall_satisfaction_question = None
    possible_satisfaction_cats = ['Riepilogo', 'Generale', 'Soddisfazione Complessiva']
    possible_satisfaction_cols = [q for q in numeric_question_cols
                                  if question_to_category_map.get(q) in possible_satisfaction_cats]
    if possible_satisfaction_cols:
        overall_satisfaction_question = possible_satisfaction_cols[0]
    else:
        # No column sits in a summary-like category: fall back to a keyword
        # search on the question text, ignoring any "_<n>" uniqueness suffix.
        keywords = ['soddisfazione', 'complessivamente', 'generale', 'valutazione']
        for q in numeric_question_cols:
            stem, sep, suffix = q.rpartition('_')
            base_label = stem if sep and suffix.isdigit() else q
            if any(keyword in base_label.lower() for keyword in keywords):
                overall_satisfaction_question = q  # keep the final unique name
                st.info(f"Domanda soddisfazione generale identificata: '{q}' (basata su '{base_label}')")
                break
    if not overall_satisfaction_question and numeric_question_cols:
        st.warning("Impossibile identificare automaticamente la domanda sulla soddisfazione generale. Alcune analisi potrebbero essere limitate.")

    return df, demographic_cols, question_cols, question_to_category_map, numeric_question_cols, response_scale, overall_satisfaction_question
# --- Main script start ---
# Sidebar widget used to upload the survey CSV file
st.sidebar.title('Sondaggio')
uploaded_file = st.sidebar.file_uploader("Carica il tuo file CSV", type="csv")
st.sidebar.divider()
# Proceed only if a file has been uploaded
if uploaded_file is not None:
# The scipy warning is emitted here so that it only appears once a file is being processed
if not SCIPY_AVAILABLE:
st.sidebar.warning("Libreria 'scipy' non trovata. La correlazione Spearman non sarà disponibile. Installa con: pip install scipy")
# --- Load Data ---
# Call the loading function, passing it the uploaded file object
try:
df_full, demographic_cols, question_cols, question_to_category_map, numeric_question_cols, response_scale, overall_satisfaction_question = load_and_prepare_data(uploaded_file)
if df_full is None:
st.error("Caricamento o preparazione dati fallito. Controlla il file CSV.")
st.stop() # Stop execution if loading fails
elif df_full.empty:
st.warning("Il file CSV caricato risulta vuoto dopo la pulizia iniziale.")
# Execution could stop here, or continue and show empty-data warnings
# st.stop()
except Exception as e:
st.error(f"Errore critico durante l'inizializzazione dei dati dal file caricato: {e}")
st.exception(e) # Print the full traceback for debugging
st.stop() # Stop execution on a critical error
# --- FROM HERE ON, THE DASHBOARD CODE IS UNCHANGED ---
# --- BUT IT IS EXECUTED ONLY IF uploaded_file IS NOT None ---
# --- App Title ---
st.title("🚀 Dashboard Analisi Clima")
# ==============================================================================
# --- Sidebar ---
# ==============================================================================
# Builds one multiselect per demographic column and derives df_filtered from df_full.
st.sidebar.title("Filtri & Controlli")
st.sidebar.subheader("👤 Filtri Demografici")
# Maps each demographic column name to the list of values kept by the user
selected_filters = {}
if demographic_cols:
# Use df_full for filter options to show all possibilities
for demo_col in demographic_cols:
# Ensure the column exists in df_full before creating filter
if demo_col in df_full.columns:
unique_values = sorted(df_full[demo_col].astype(str).unique())
if len(unique_values) > 1:
# All values are pre-selected, so the initial view is unfiltered
selected_filters[demo_col] = st.sidebar.multiselect(
f"{demo_col}",
options=unique_values,
default=unique_values
)
else:
# If only one value, no need for multiselect, just store it
selected_filters[demo_col] = unique_values
else:
st.sidebar.warning(f"Colonna demografica '{demo_col}' definita ma non trovata nel DataFrame.")
# Apply filters - start from df_full each time filters change
df_filtered = df_full.copy()
for col, selected_values in selected_filters.items():
# Check if the column exists in df_filtered before applying the filter
# NOTE: an empty selection is falsy, so that column is left unfiltered
if col in df_filtered.columns and selected_values:
# Ensure selected_values are strings for comparison if the column is string
if df_filtered[col].dtype == 'object':
selected_values_str = [str(v) for v in selected_values]
df_filtered = df_filtered[df_filtered[col].astype(str).isin(selected_values_str)]
else: # Keep original type for non-object columns if filtering is needed
df_filtered = df_filtered[df_filtered[col].isin(selected_values)]
else:
st.sidebar.warning("Nessuna colonna demografica valida trovata per i filtri.")
df_filtered = df_full.copy() if df_full is not None else pd.DataFrame() # Use full data if available, else empty
# Sidebar key metrics computed on df_filtered: respondent count, mean of the
# overall-satisfaction question, lowest/highest category means, and the
# driver table (driver_df) that later tabs read.
st.sidebar.divider()
st.sidebar.subheader("📊 Metriche Chiave (Filtrate)")
# Recalculate total respondents after filtering
total_respondents_filtered = len(df_filtered) if df_filtered is not None else 0
st.sidebar.metric("Rispondenti Filtrati", total_respondents_filtered)
# --- Calculate metrics only if df_filtered is not empty ---
# Defaults, kept when the metrics below cannot be computed
avg_overall_filtered = np.nan
avg_scores_per_category_f = pd.Series(dtype=float)
driver_df = pd.DataFrame() # Initialize empty driver dataframe
# Default correlation method
corr_method_sidebar = 'pearson'
if SCIPY_AVAILABLE:
corr_method_sidebar = 'spearman' # Prefer Spearman if scipy is available
if df_filtered is not None and not df_filtered.empty and numeric_question_cols:
# Ensure overall satisfaction question exists in the filtered numeric columns
if overall_satisfaction_question and overall_satisfaction_question in df_filtered.columns and pd.api.types.is_numeric_dtype(df_filtered[overall_satisfaction_question]):
overall_sat_data = df_filtered[overall_satisfaction_question].dropna()
if not overall_sat_data.empty:
avg_overall_filtered = overall_sat_data.mean()
# The delta shown next to the metric is measured against the middle of the response scale
midpoint = (response_scale[0] + response_scale[1]) / 2 if response_scale else 3.5 # Fallback midpoint
delta_vs_mid = avg_overall_filtered - midpoint
st.sidebar.metric("Soddisfazione Generale Media", f"{avg_overall_filtered:.2f}", f"{delta_vs_mid:+.2f} vs Midpoint ({midpoint:.1f})")
else:
st.sidebar.metric("Soddisfazione Generale Media", "N/D (no data)")
else:
st.sidebar.metric("Soddisfazione Generale Media", "N/D (Domanda non trovata/valida)")
# Calculate category averages on filtered data
numeric_cols_in_filtered = [col for col in numeric_question_cols if col in df_filtered.columns]
if numeric_cols_in_filtered:
avg_scores_per_question_f = df_filtered[numeric_cols_in_filtered].mean(axis=0, skipna=True)
df_avg_scores_f = pd.DataFrame({'Domanda': avg_scores_per_question_f.index, 'Punteggio Medio': avg_scores_per_question_f.values})
df_avg_scores_f['Categoria'] = df_avg_scores_f['Domanda'].map(question_to_category_map).fillna("Senza Categoria")
df_avg_scores_f.dropna(subset=['Punteggio Medio'], inplace=True)
if not df_avg_scores_f.empty:
# Exclude "Senza Categoria" from min/max display if desired
avg_scores_valid_cat = df_avg_scores_f[df_avg_scores_f['Categoria'] != "Senza Categoria"]
if not avg_scores_valid_cat.empty:
# Category score = mean of its questions' means, sorted ascending (first = weakest, last = strongest)
avg_scores_per_category_f = avg_scores_valid_cat.groupby('Categoria')['Punteggio Medio'].mean().sort_values()
if not avg_scores_per_category_f.empty:
min_cat_score = avg_scores_per_category_f.iloc[0]
max_cat_score = avg_scores_per_category_f.iloc[-1]
delta_min = f"{min_cat_score - avg_overall_filtered:.2f} vs Sod. Gen." if not np.isnan(avg_overall_filtered) else None
delta_max = f"{max_cat_score - avg_overall_filtered:.2f} vs Sod. Gen." if not np.isnan(avg_overall_filtered) else None
st.sidebar.metric(f"⚠️ Cat. Punteggio MIN", f"{avg_scores_per_category_f.index[0]} ({min_cat_score:.2f})", delta_min, delta_color="inverse")
st.sidebar.metric(f"✅ Cat. Punteggio MAX", f"{avg_scores_per_category_f.index[-1]} ({max_cat_score:.2f})", delta_max, delta_color="normal")
else:
st.sidebar.text("N/D per Categorie (Vuote dopo agg.)")
else:
st.sidebar.text("N/D per Categorie (Solo 'Senza Cat.')")
else:
st.sidebar.text("N/D per Categorie (No medie domande)")
else:
st.sidebar.text("N/D per Categorie (No colonne numeriche)")
# --- Calculate Driver Data (Correlation) ---
# A "driver" row pairs a question's correlation with overall satisfaction and its mean score
if overall_satisfaction_question and overall_satisfaction_question in df_filtered.columns and pd.api.types.is_numeric_dtype(df_filtered[overall_satisfaction_question]):
# Ensure overall satisfaction has variance
if df_filtered[overall_satisfaction_question].nunique(dropna=True) > 1:
# Candidates: every other numeric question that has more than one distinct answer
driver_candidate_cols = [col for col in numeric_cols_in_filtered if col != overall_satisfaction_question and df_filtered[col].nunique(dropna=True) > 1]
if driver_candidate_cols:
try:
# Calculate correlations
correlations = df_filtered[driver_candidate_cols].corrwith(df_filtered[overall_satisfaction_question], method=corr_method_sidebar).dropna()
# Calculate average scores for the same candidates
avg_scores_drivers = df_filtered[driver_candidate_cols].mean(skipna=True)
# Combine into driver_df
if not correlations.empty:
driver_df = pd.DataFrame({'Correlazione': correlations})
# Add avg scores safely, aligning index
driver_df = driver_df.join(avg_scores_drivers.rename('Punteggio Medio'), how='inner') # Inner join ensures only questions with both corr and avg score remain
if not driver_df.empty:
driver_df['Categoria'] = driver_df.index.map(question_to_category_map).fillna("Senza Categoria")
driver_df.dropna(subset=['Categoria', 'Correlazione', 'Punteggio Medio'], inplace=True) # Drop if essential data missing
if not driver_df.empty:
driver_df['Domanda'] = driver_df.index
# Short label: question text longer than 50 characters is cut to 47 plus "..."
driver_df['Domanda_Breve'] = driver_df['Domanda'].apply(lambda x: str(x)[:47] + "..." if len(str(x)) > 50 else str(x))
driver_df['Correlazione_Abs'] = driver_df['Correlazione'].abs()
else:
driver_df = pd.DataFrame() # Ensure it's empty if join fails
else:
st.sidebar.info("Nessuna correlazione significativa calcolata per i driver.")
except Exception as e:
st.sidebar.warning(f"Errore nel calcolo correlazioni driver: {e}")
else:
st.sidebar.info("Nessuna domanda candidata (con varianza) trovata per l'analisi driver.")
else:
st.sidebar.info("La domanda di soddisfazione generale non ha varianza nei dati filtrati.")
else: # If df_filtered is empty or no numeric questions
st.sidebar.text("Dati insufficienti o non disponibili per le metriche.")
if total_respondents_filtered == 0:
st.sidebar.text("Nessun rispondente selezionato.")
st.sidebar.metric("Soddisfazione Generale Media", "N/D")
st.sidebar.text("N/D per Categorie")
st.sidebar.divider()
st.sidebar.info("Utilizza i filtri per esplorare i dati. Le metriche e i grafici si aggiornano dinamicamente.")
# ==============================================================================
# --- Create Tabs ---
# ==============================================================================
# Tab labels, in display order; the positional assignments below rely on this order.
tab_list = [
"🎯 Sintesi Chiave",
"🗺️ Mappa Domande", # New Tab for Question Map
"👥 Demografia Dettagliata",
"📊 Generale & Categorie",
"🔍 Confronti & Driver",
"📈 Grafici Avanzati"
]
tabs = st.tabs(tab_list)
# Bind each tab container to a descriptive name (index matches tab_list position)
tab_summary = tabs[0]
tab_map = tabs[1]
tab_demo = tabs[2]
tab_overall = tabs[3]
tab_comp = tabs[4]
tab_advanced = tabs[5]
# ==============================================================================
# --- TAB Summary: Key Takeaways ---
# ==============================================================================
# Shows headline bullets, a sentiment pie, a satisfaction gauge and a short
# narrative, all derived from values computed in the sidebar section
# (avg_overall_filtered, avg_scores_per_category_f, driver_df).
with tab_summary:
# Content remains largely the same, but relies on variables calculated in sidebar
st.header("🎯 Sintesi Chiave (Basata sui Filtri Correnti)")
if df_filtered is None or df_filtered.empty:
st.warning("Nessun dato disponibile con i filtri selezionati.")
else:
st.markdown(f"Analisi basata su **{total_respondents_filtered}** rispondenti.")
col_s1, col_s2, col_s3 = st.columns([2, 1, 1]) # Adjusted columns for gauge
with col_s1:
st.subheader("Punti Salienti:")
if not np.isnan(avg_overall_filtered):
max_scale = response_scale[1] if response_scale else 6 # Fallback max scale
st.markdown(f"- **Soddisfazione Generale:** {avg_overall_filtered:.2f} / {max_scale:.0f}")
else:
st.markdown(f"- **Soddisfazione Generale:** N/D")
# avg_scores_per_category_f is sorted ascending: last entry is the strongest area, first the weakest
if not avg_scores_per_category_f.empty:
st.markdown(f"- **Area Più Forte:** {avg_scores_per_category_f.index[-1]} (Media: {avg_scores_per_category_f.iloc[-1]:.2f})")
st.markdown(f"- **Area Più Debole:** {avg_scores_per_category_f.index[0]} (Media: {avg_scores_per_category_f.iloc[0]:.2f})")
else:
st.markdown("- Dati categorie non disponibili.")
# Driver info from pre-calculated driver_df
if not driver_df.empty:
try:
# Top positive driver
top_driver = driver_df.sort_values('Correlazione', ascending=False).iloc[0]
st.markdown(f"- **Driver Positivo Principale:** {top_driver['Domanda_Breve']} (Corr: {top_driver['Correlazione']:.2f})")
# Top area for improvement (high correlation, low score) - using dynamic means
avg_corr_summary = driver_df['Correlazione'].mean()
avg_score_summary = driver_df['Punteggio Medio'].mean()
potential_improvement_df = driver_df[(driver_df['Correlazione'] > avg_corr_summary) & (driver_df['Punteggio Medio'] < avg_score_summary)]
if not potential_improvement_df.empty:
potential_improvement = potential_improvement_df.sort_values('Punteggio Medio').iloc[0] # Lowest score among high-impact, low-perf
st.markdown(f"- **Focus Miglioramento:** {potential_improvement['Domanda_Breve']} (Score: {potential_improvement['Punteggio Medio']:.2f}, Corr: {potential_improvement['Correlazione']:.2f})")
else:
st.markdown("- *Focus Miglioramento:* (Nessun driver critico trovato con medie correnti)")
except IndexError:
st.markdown("- *Driver Principali:* (Errore nell'accesso ai dati driver)")
except Exception as e:
st.markdown(f"- *Driver Principali:* (Errore: {e})")
else:
st.markdown("- *Driver Principali:* (Dati non disponibili o insufficienti)")
with col_s2:
st.subheader("Sentiment") # Combined Pie and Gauge
if overall_satisfaction_question and overall_satisfaction_question in df_filtered.columns:
overall_satisfaction_data_f = df_filtered[overall_satisfaction_question].dropna()
if pd.api.types.is_numeric_dtype(overall_satisfaction_data_f) and not overall_satisfaction_data_f.empty:
# Sentiment Pie Chart
bucket_counts = overall_satisfaction_data_f.apply(categorize_score).value_counts()
# Add 'Non Risposto' if it exists
# non_risposto_count = df_filtered[overall_satisfaction_question].isna().sum() # Needs careful handling if mixing counts and percentages
# NOTE: NaNs were dropped above, so "Non Risposto" is always 0 here; any "Sconosciuto" label is discarded by the reindex
bucket_counts = bucket_counts.reindex(list(BUCKET_COLORS.keys()) + ["Non Risposto"], fill_value=0) # Ensure all buckets + Non Risposto
bucket_perc = (bucket_counts / bucket_counts.sum() * 100) if bucket_counts.sum() > 0 else bucket_counts
# Define colors including for "Non Risposto"
plot_colors = BUCKET_COLORS.copy()
plot_colors["Non Risposto"] = "#bbbbbb" # Grey for non-responded
fig_sentiment_pie = px.pie(values=bucket_perc.values, names=bucket_perc.index,
title="Distribuzione Sentiment", hole=0.4,
color=bucket_perc.index, color_discrete_map=plot_colors,
template=PLOTLY_TEMPLATE)
fig_sentiment_pie.update_traces(textinfo='percent+label', sort=False, # Keep defined order
pull=[0.05 if name=="Critico" else 0 for name in bucket_perc.index])
fig_sentiment_pie.update_layout(showlegend=False, margin=dict(t=30, b=10, l=10, r=10), height=250) # Compact layout
st.plotly_chart(fig_sentiment_pie, use_container_width=True)
else:
st.write("Dati soddisfazione non numerici/vuoti.")
else:
st.write("Domanda soddisfazione non trovata.")
with col_s3:
st.subheader("Valore Medio")
if not np.isnan(avg_overall_filtered):
min_scale, max_scale = response_scale if response_scale else (1, 6)
midpoint = (min_scale + max_scale) / 2
# Gauge bands use THRESHOLD_LOW / THRESHOLD_HIGH, not the bucket limits used by categorize_score
fig_gauge = go.Figure(go.Indicator(
mode = "gauge+number",
value = avg_overall_filtered,
domain = {'x': [0, 1], 'y': [0, 1]},
title = {'text': "Soddisfazione Generale", 'font': {'size': 16}},
gauge = {
'axis': {'range': [min_scale, max_scale], 'tickwidth': 1, 'tickcolor': "darkblue"},
'bar': {'color': "steelblue"},
'bgcolor': "white",
'borderwidth': 2,
'bordercolor': "gray",
'steps': [
{'range': [min_scale, THRESHOLD_LOW], 'color': BUCKET_COLORS['Critico']},
{'range': [THRESHOLD_LOW, THRESHOLD_HIGH], 'color': BUCKET_COLORS['Neutrale']},
{'range': [THRESHOLD_HIGH, max_scale], 'color': BUCKET_COLORS['Positivo']}],
'threshold': {
'line': {'color': "black", 'width': 3},
'thickness': 0.9,
'value': midpoint } # Show midpoint
}))
fig_gauge.update_layout(height=250, margin=dict(t=40, b=10, l=10, r=10)) # Compact layout
st.plotly_chart(fig_gauge, use_container_width=True)
else:
st.write(" ") # Placeholder
st.write(" ")
st.info("Gauge non disponibile (media N/D).")
st.markdown("---")
st.subheader("Riflessioni Rapide:")
# Text fragments for the narrative box; "N/D" stands in for any value that could not be computed
satisfaction_text = f"{avg_overall_filtered:.2f}" if not np.isnan(avg_overall_filtered) else "N/D"
strongest_area_text = f"{avg_scores_per_category_f.index[-1]}" if not avg_scores_per_category_f.empty else "N/D"
weakest_area_text = f"{avg_scores_per_category_f.index[0]}" if not avg_scores_per_category_f.empty else "N/D"
st.info(f"""
Questa sintesi evidenzia i risultati principali per il gruppo selezionato ({total_respondents_filtered} persone).
La soddisfazione generale si attesta a **{satisfaction_text}**.
Le aree di forza (**{strongest_area_text}**) e di debolezza (**{weakest_area_text}**)
richiedono attenzione specifica. Esplora le altre schede per dettagli, confronti e visualizzazioni avanzate.
""")
# ==============================================================================
# --- TAB Map: Category -> Question Mapping ---
# ==============================================================================
# Lists every question with its category: first as one sortable table,
# then as one expander per category.
with tab_map:
st.header("🗺️ Mappa Categorie e Domande")
st.write("Questa sezione mostra quali domande appartengono a ciascuna categoria identificata durante il caricamento dei dati.")
if question_to_category_map:
# Create DataFrame from the mapping dictionary
map_df = pd.DataFrame(question_to_category_map.items(), columns=['Domanda', 'Categoria'])
# Sort for better readability
map_df = map_df.sort_values(by=['Categoria', 'Domanda']).reset_index(drop=True)
st.dataframe(map_df, use_container_width=True)
# Optional: Display grouped by category
st.divider()
st.subheader("Domande Raggruppate per Categoria")
categories_in_map = map_df['Categoria'].unique()
for category in sorted(categories_in_map):
with st.expander(f"**{category}**"):
questions_in_cat = map_df[map_df['Categoria'] == category]['Domanda'].tolist()
for q in questions_in_cat:
st.markdown(f"- {q}")
else:
st.warning("La mappa tra domande e categorie non è disponibile.")
# ==============================================================================
# --- TAB Demo: Demographics ---
# ==============================================================================
# One donut chart per demographic column, followed by a combined
# Sunburst/Treemap of respondent counts across all demographic columns.
with tab_demo:
st.header("👥 Analisi Demografica Dettagliata (Filtrata)")
if df_filtered is None or df_filtered.empty:
st.warning("Nessun dato disponibile con i filtri selezionati.")
elif not demographic_cols:
st.warning("Nessuna colonna demografica configurata per l'analisi.")
else:
st.write(f"Visualizzazione basata su **{len(df_filtered)}** rispondenti selezionati.")
valid_demo_cols_plots = [col for col in demographic_cols if col in df_filtered.columns] # Use only valid cols for plotting
if not valid_demo_cols_plots:
st.warning("Nessuna colonna demografica valida trovata nei dati filtrati per la visualizzazione.")
else:
# --- Basic Distribution Pies ---
st.subheader("Distribuzione Base")
num_demo_cols = len(valid_demo_cols_plots)
cols_pie = st.columns(num_demo_cols)
pie_colors = [px.colors.qualitative.Pastel1, px.colors.qualitative.Pastel2, px.colors.qualitative.Set3] # Cycle through color schemes
for i, demo_col in enumerate(valid_demo_cols_plots):
with cols_pie[i % num_demo_cols]: # Cycle through columns
if not df_filtered[demo_col].dropna().empty:
# Define order for age if applicable
category_orders = {}
if 'Eta' in demo_col:
# Known age bands come first in this fixed order; any other value follows, sorted
age_order_guess = ['Fino a 30 anni', '31-40 anni', '41-50 anni', 'Oltre i 50 anni', 'Non specificato']
actual_ages = df_filtered[demo_col].unique()
ordered_actual = [age for age in age_order_guess if age in actual_ages]
ordered_actual.extend(sorted([age for age in actual_ages if age not in age_order_guess]))
category_orders={demo_col: ordered_actual}
fig_pie = px.pie(df_filtered.dropna(subset=[demo_col]), names=demo_col, hole=0.4,
color_discrete_sequence=pie_colors[i % len(pie_colors)], template=PLOTLY_TEMPLATE,
title=f"Per {demo_col}", category_orders=category_orders)
fig_pie.update_traces(textposition='inside', textinfo='percent+label')
fig_pie.update_layout(showlegend=False, title_x=0.5, margin=dict(t=40, b=0, l=0, r=0), height=300)
st.plotly_chart(fig_pie, use_container_width=True)
else:
st.write(f"Dati '{demo_col}' non disponibili.")
st.markdown("---")
# --- Hierarchical Views: Sunburst & Treemap ---
st.subheader("Visualizzazioni Gerarchiche/Proporzionali")
if len(valid_demo_cols_plots) >= 2: # Need at least 2 demographics for interesting hierarchy
chart_type_hier = st.radio("Scegli tipo grafico gerarchico:", ["Sunburst", "Treemap"], horizontal=True, key="hier_chart_sel")
# Aggregate counts for combinations
try:
# One row per observed combination of demographic values, with its respondent count
df_grouped_hier = df_filtered.groupby(valid_demo_cols_plots, observed=True).size().reset_index(name='Conteggio')
if not df_grouped_hier.empty:
# Use first valid demo col for coloring
color_col_hier = valid_demo_cols_plots[0]
if chart_type_hier == "Sunburst":
fig_hier = px.sunburst(df_grouped_hier, path=valid_demo_cols_plots, values='Conteggio',
title=f"Distribuzione Combinata (Sunburst): {', '.join(valid_demo_cols_plots)}",
template=PLOTLY_TEMPLATE,
color=color_col_hier,
color_discrete_sequence=px.colors.qualitative.Pastel)
fig_hier.update_layout(margin=dict(t=50, l=25, r=25, b=25))
st.plotly_chart(fig_hier, use_container_width=True)
elif chart_type_hier == "Treemap":
# The treemap path is prefixed with a constant "Tutti" root level
fig_hier = px.treemap(df_grouped_hier, path=[px.Constant("Tutti")] + valid_demo_cols_plots, values='Conteggio',
title=f"Distribuzione Combinata (Treemap): {', '.join(valid_demo_cols_plots)}",
template=PLOTLY_TEMPLATE,
color=color_col_hier,
color_discrete_sequence=px.colors.qualitative.Pastel)
fig_hier.update_layout(margin=dict(t=50, l=25, r=25, b=25))
st.plotly_chart(fig_hier, use_container_width=True)
else:
st.info("Nessun dato aggregato per la visualizzazione gerarchica.")
except Exception as e:
st.error(f"Errore durante l'aggregazione per il grafico gerarchico: {e}")
else:
st.info("Sono necessarie almeno due colonne demografiche valide per le visualizzazioni gerarchiche.")
# ==============================================================================
# --- TAB Overall: Overall, Categories & Questions ---
# ==============================================================================
with tab_overall:
st.header("📊 Analisi Generale, Categorie e Domande (Filtrata)")
if df_filtered is None or df_filtered.empty:
st.warning("Nessun dato disponibile con i filtri selezionati.")
else:
# --- Overall Satisfaction Distribution ---
# Renders the overall-satisfaction question as (left) a bar chart of answer
# counts per score and (right) a textual sentiment breakdown by score bucket.
st.subheader("⭐ Soddisfazione Generale Complessiva")
if overall_satisfaction_question and overall_satisfaction_question in df_filtered.columns:
# Missing answers are removed up front, so everything below works on answered rows only.
overall_satisfaction_data_f = df_filtered[overall_satisfaction_question].dropna()
if pd.api.types.is_numeric_dtype(overall_satisfaction_data_f) and not overall_satisfaction_data_f.empty:
col_ov1, col_ov2 = st.columns([2,1])
with col_ov1:
# Bar chart of distribution
overall_counts_f = overall_satisfaction_data_f.value_counts().sort_index()
fig_overall_satisfaction = px.bar(overall_counts_f, x=overall_counts_f.index, y=overall_counts_f.values,
labels={'x': f'Punteggio ({response_scale[0]:.0f}-{response_scale[1]:.0f})', 'y': 'Numero Risposte'},
text_auto=True, color_discrete_sequence=px.colors.sequential.Blues_r, template=PLOTLY_TEMPLATE,
title="Distribuzione Punteggi Soddisfazione Generale")
# One tick per integer score.
fig_overall_satisfaction.update_layout(xaxis = dict(tickmode = 'linear', dtick=1), title_x=0.5)
st.plotly_chart(fig_overall_satisfaction, use_container_width=True)
with col_ov2:
# Sentiment display
# The two blank writes are vertical spacers.
st.write(" ")
st.write(" ")
st.write("**Distribuzione Sentiment:**")
# Map each score to its bucket label, then count labels.
bucket_counts = overall_satisfaction_data_f.apply(categorize_score).value_counts()
# Reindex to a fixed label order; labels outside this list (e.g. "Sconosciuto")
# are dropped here and therefore excluded from the total below. Because NaN
# was dropped above, "Non Risposto" is always 0 at this point.
bucket_counts = bucket_counts.reindex(list(BUCKET_COLORS.keys()) + ["Non Risposto"], fill_value=0)
total_valid_responses = bucket_counts.sum()
if total_valid_responses > 0:
bucket_perc = (bucket_counts / total_valid_responses * 100)
plot_colors = BUCKET_COLORS.copy()
plot_colors["Non Risposto"] = "#bbbbbb"
for bucket in plot_colors.keys(): # Iterate in defined order
if bucket in bucket_perc.index: # Check if bucket exists
perc = bucket_perc.get(bucket, 0)
count = bucket_counts.get(bucket, 0)
# Coloured square + label, rendered as raw HTML.
st.markdown(f"<span style='color:{plot_colors.get(bucket, 'black')}; font-size: 1.1em;'>■</span> **{bucket}:** {perc:.1f}% ({count})", unsafe_allow_html=True)
else:
st.write("Nessuna risposta valida per il sentiment.")
else: st.warning("Dati soddisfazione generale non disponibili/numerici.")
else: st.warning("Domanda soddisfazione generale non trovata.")
st.markdown("---")
# --- Category Averages ---
# Shows the mean score of each category either as one horizontal bar chart
# (bars coloured by threshold) or as one bullet gauge per category.
st.subheader("📈 Punteggio Medio per Categoria")
if not avg_scores_per_category_f.empty:
cat_avg_chart_type = st.radio("Visualizza medie categorie come:", ["Bar Chart", "Bullet Chart"], horizontal=True, key="cat_avg_type")
if cat_avg_chart_type == "Bar Chart":
avg_scores_plot = avg_scores_per_category_f.copy()
# Per-bar colour: above THRESHOLD_HIGH -> positive, below THRESHOLD_LOW -> critical,
# anything in between (bounds included) -> neutral.
color_map = []
for score in avg_scores_plot.values:
if score > THRESHOLD_HIGH: color_map.append(BUCKET_COLORS["Positivo"])
elif score < THRESHOLD_LOW: color_map.append(BUCKET_COLORS["Critico"])
else: color_map.append(BUCKET_COLORS["Neutrale"])
fig_avg_category = go.Figure(go.Bar(
x=avg_scores_plot.values, y=avg_scores_plot.index, orientation='h',
text=[f'{score:.2f}' for score in avg_scores_plot.values], marker_color=color_map ))
fig_avg_category.update_traces(textposition='outside')
fig_avg_category.update_layout(
xaxis_title=f'Punteggio Medio ({response_scale[0]:.0f}-{response_scale[1]:.0f})', yaxis_title='Categoria',
yaxis={'categoryorder':'total ascending'}, template=PLOTLY_TEMPLATE, title="Medie Categorie (Colorate per Soglia)")
# Reference line at the overall-satisfaction mean, when it is available.
if not np.isnan(avg_overall_filtered):
fig_avg_category.add_vline(x=avg_overall_filtered, line_width=2, line_dash="dash", line_color="grey", annotation_text="Media Sod. Gen.")
st.plotly_chart(fig_avg_category, use_container_width=True)
elif cat_avg_chart_type == "Bullet Chart":
st.write("Grafico Bullet: Confronta la media di categoria con la media generale e le soglie.")
# Falls back to a 1-6 scale when response_scale is falsy.
min_scale, max_scale = response_scale if response_scale else (1, 6)
# Highest-scoring categories are rendered first.
avg_scores_plot = avg_scores_per_category_f.copy().sort_values(ascending=False)
for category, score in avg_scores_plot.items():
# One compact gauge per category: bar = category mean, black marker = overall
# mean (or the scale midpoint when the overall mean is NaN), background
# steps = critical / neutral / positive ranges.
fig_bullet = go.Figure(go.Indicator(
mode = "gauge+number+delta",
value = score,
delta = {'reference': avg_overall_filtered, 'suffix': ' vs Media Gen.'} if not np.isnan(avg_overall_filtered) else None,
title = {'text': category, 'font': {'size': 14}},
gauge = {
'shape': "bullet",
'axis': {'range': [min_scale, max_scale]},
'threshold': {
'line': {'color': "black", 'width': 2},
'thickness': 0.75,
'value': avg_overall_filtered if not np.isnan(avg_overall_filtered) else (min_scale+max_scale)/2 },
'bgcolor': "white",
'steps': [
{'range': [min_scale, THRESHOLD_LOW], 'color': BUCKET_COLORS['Critico']},
{'range': [THRESHOLD_LOW, THRESHOLD_HIGH], 'color': BUCKET_COLORS['Neutrale']},
{'range': [THRESHOLD_HIGH, max_scale], 'color': BUCKET_COLORS['Positivo']}],
'bar': {'color': 'darkblue', 'thickness': 0.5}
}))
# Wide left margin leaves room for the category title next to the gauge.
fig_bullet.update_layout(height=100, margin=dict(l=200, r=50, t=30, b=10))
st.plotly_chart(fig_bullet, use_container_width=True)
else:
st.warning("Impossibile calcolare medie per categoria (potrebbero essere tutte 'Senza Categoria' o vuote).")
st.markdown("---")
# --- Detailed Question Analysis ---
# Per-question drill-down for one selected category. The user picks a category
# and a chart type: a single box plot across all questions of the category, or
# one chart per question (answer counts, or a 100% stacked bar of the answer
# distribution).
def _score_to_color(score, scale_min, scale_max, palette):
    """Return the palette colour for *score* on the [scale_min, scale_max] scale.

    The position is clamped to the palette bounds so that scores outside the
    scale never wrap around through a negative index, and a degenerate scale
    (max <= min) falls back to the first colour instead of dividing by zero.
    """
    span = scale_max - scale_min
    if span <= 0:
        return palette[0]
    position = int((score - scale_min) / span * len(palette))
    return palette[max(0, min(len(palette) - 1, position))]


st.subheader("❓ Analisi Dettagliata per Domanda")
# Get categories present in the calculated averages
categories_with_averages = avg_scores_per_category_f.index.unique().tolist()
if not categories_with_averages and question_to_category_map:
    # Fallback: take the categories from the question->category map, leaving
    # out the two placeholder buckets.
    placeholder_categories = {"Senza Categoria", "Categoria Sconosciuta"}
    categories_with_averages = sorted(set(question_to_category_map.values()) - placeholder_categories)

if categories_with_averages:  # Proceed only if there are valid categories
    col_q1, col_q2 = st.columns([1, 1])
    with col_q1:
        selected_category = st.selectbox("Seleziona Categoria:", options=categories_with_averages, key="cat_select_q")
    with col_q2:
        plot_type = st.radio("Tipo Grafico Domande:", ["Distribuzione % (Stacked)", "Conteggi (Bar)", "Box Plot"], horizontal=True, key="q_plot_type")
    if selected_category:
        st.write(f"**Dettaglio Domande: '{selected_category}'**")
        # Questions mapped to the selected category that exist in the filtered
        # data and are known to be numeric.
        questions_in_category = [q for q, cat in question_to_category_map.items()
                                 if cat == selected_category and q in df_filtered.columns and q in numeric_question_cols]
        if not questions_in_category:
            st.write("Nessuna domanda numerica valida trovata per questa categoria nei dati filtrati.")
        elif plot_type == "Box Plot":
            df_box_cat = df_filtered[questions_in_category].copy()
            if not df_box_cat.empty:
                df_box_melted = df_box_cat.melt(var_name='Domanda', value_name='Punteggio')
                # Shorten question names for the y-axis
                df_box_melted['Domanda_Breve'] = df_box_melted['Domanda'].apply(lambda x: x[:67]+"..." if len(x) > 70 else x)
                df_box_melted.dropna(subset=['Punteggio'], inplace=True)
                if not df_box_melted.empty:
                    fig_box = px.box(df_box_melted, x='Punteggio', y='Domanda_Breve', orientation='h',
                                     title=f"Distribuzione Punteggi per Domanda in '{selected_category}'",
                                     template=PLOTLY_TEMPLATE, points=False)  # points="all" can be noisy
                    # Height grows with the number of questions
                    fig_box.update_layout(yaxis={'categoryorder':'total descending'}, height=max(400, len(questions_in_category)*50))
                    st.plotly_chart(fig_box, use_container_width=True)
                else:
                    st.warning("Nessun dato valido per il Box Plot dopo il dropna.")
            else:
                st.warning("DataFrame vuoto per il Box Plot.")
        else:  # Stacked or Counts Bar Chart: one chart per question
            # Same 1-6 fallback the other sections use when no scale is known.
            scale_min, scale_max = response_scale if response_scale else (1, 6)
            palette = px.colors.sequential.Blues_r
            for question in questions_in_category:
                question_data_f = df_filtered[question].dropna()
                if not pd.api.types.is_numeric_dtype(question_data_f) or question_data_f.empty:
                    st.caption(f"Dati per '{question[:50]}...' non numerici o vuoti.")
                    continue
                avg_q = question_data_f.mean()
                q_display = question if len(question) < 100 else question[:97] + "..."
                st.markdown(f"**{q_display}** (Media: {avg_q:.2f})")
                if plot_type == "Conteggi (Bar)":
                    counts_q = question_data_f.value_counts().sort_index()
                    if not counts_q.empty:
                        fig_q = px.bar(counts_q, x=counts_q.index, y=counts_q.values,
                                       labels={'x': 'Punteggio', 'y': 'Numero Risposte'}, text_auto='.2s',
                                       height=250, template=PLOTLY_TEMPLATE, color_discrete_sequence=px.colors.sequential.Blues_r)
                        fig_q.update_layout(xaxis = dict(tickmode = 'linear', dtick=1), margin=dict(t=5, b=5, l=5, r=5))
                        st.plotly_chart(fig_q, use_container_width=True)
                    else:
                        st.caption("Nessun dato per questo grafico.")
                elif plot_type == "Distribuzione % (Stacked)":
                    counts_q_norm = question_data_f.value_counts(normalize=True).sort_index() * 100
                    if not counts_q_norm.empty:
                        counts_q_df = counts_q_norm.reset_index()
                        counts_q_df.columns = ['Punteggio', 'Percentuale']
                        counts_q_df['Punteggio'] = counts_q_df['Punteggio'].astype(str)  # For discrete colors
                        # Key the colour map on the exact labels stored in the
                        # column. Re-deriving the keys via str(float(...)) turns
                        # "1" into "1.0" for integer-typed questions, so the map
                        # would never match and the colours would be ignored.
                        score_color_map = {
                            label: _score_to_color(float(label), scale_min, scale_max, palette)
                            for label in counts_q_df['Punteggio']
                        }
                        fig_q = px.bar(counts_q_df, x='Percentuale', y=[' ']*len(counts_q_df),  # Single bar
                                       color='Punteggio', orientation='h',
                                       text=[f"{p:.1f}%" for p in counts_q_df['Percentuale']],
                                       height=150, template=PLOTLY_TEMPLATE,
                                       color_discrete_map=score_color_map  # Apply color map
                                       )
                        fig_q.update_layout(xaxis_ticksuffix="%", yaxis_title="", xaxis_title="% Rispondenti",
                                            legend_title="Punteggio", showlegend=True, margin=dict(t=5, b=5, l=5, r=5),
                                            xaxis_range=[0,100], yaxis_visible=False,
                                            legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1))
                        fig_q.update_traces(textposition='inside', textfont_color='white')  # Ensure text is visible
                        st.plotly_chart(fig_q, use_container_width=True)
                    else:
                        st.caption("Nessun dato per questo grafico.")
else:
    st.info("Nessuna categoria valida trovata per l'analisi dettagliata delle domande.")
# ==============================================================================
# --- TAB Comparisons: Comparisons, Drivers & More ---
# ==============================================================================
with tab_comp:
st.header("🔍 Confronti Demografici & Analisi Driver (Filtrata)")
if df_filtered is None or df_filtered.empty:
st.warning("Nessun dato disponibile con i filtri selezionati.")
elif not numeric_question_cols:
st.warning("Nessuna domanda numerica trovata per le analisi di confronto.")
else:
# --- Prepare Melted Data ---
@st.cache_data  # Cache the melting process
def get_melted_data(df, id_vars, value_vars, cat_map):
    """Reshape *df* to long format: one row per (respondent, question) answer.

    Args:
        df: Wide DataFrame with one column per question.
        id_vars: Columns to keep as identifiers on every output row.
        value_vars: Question columns to unpivot.
        cat_map: Mapping from question name to category name.

    Returns:
        A DataFrame with the identifier columns plus 'Domanda', 'Punteggio'
        and 'Categoria' (questions missing from *cat_map* get
        "Senza Categoria"); rows without a score are removed. An empty
        DataFrame is returned when no value column or no identifier column
        is present in *df*.
    """
    if not value_vars:
        return pd.DataFrame()

    # Keep only the requested columns that actually exist in the frame.
    kept_ids = [name for name in id_vars if name in df.columns]
    kept_values = [name for name in value_vars if name in df.columns]
    if not (kept_values and kept_ids):
        # Need both ID and value columns.
        return pd.DataFrame()

    long_df = df[kept_ids + kept_values].melt(
        id_vars=kept_ids,
        value_vars=kept_values,
        var_name='Domanda',
        value_name='Punteggio',
    )
    long_df['Categoria'] = long_df['Domanda'].map(cat_map).fillna("Senza Categoria")
    long_df = long_df.dropna(subset=['Punteggio'])
    return long_df
# Build the long-format ("melted") frame used by the comparison sections.
# It stays empty when there is no demographic column or no numeric question.
numeric_cols_in_filtered = [col for col in numeric_question_cols if col in df_filtered.columns]
valid_demographic_cols = [col for col in demographic_cols if col in df_filtered.columns]
df_melted_f = pd.DataFrame() # Initialize empty
if valid_demographic_cols and numeric_cols_in_filtered:
df_melted_f = get_melted_data(df_filtered, valid_demographic_cols, numeric_cols_in_filtered, question_to_category_map)
# --- Demographic Comparisons (Violin / Box Plots) ---
# Compares the score distribution of the selected categories across the
# values of one demographic column.
st.subheader("🎻 Confronti Demografici (Distribuzione Punteggi per Categoria)")
if not df_melted_f.empty and valid_demographic_cols:
col_comp1, col_comp2 = st.columns(2)
with col_comp1:
# Select demographic group for comparison
comparison_group_v_options = [col for col in valid_demographic_cols if df_filtered[col].nunique() > 1] # Only those with multiple values
if comparison_group_v_options:
comparison_group_v = st.selectbox("Confronta Distribuzioni per:", comparison_group_v_options, key="dist_group")
else:
comparison_group_v = None
st.info("Nessuna colonna demografica con valori multipli per il confronto.")
with col_comp2:
dist_plot_type = st.radio("Tipo Grafico Distribuzione:", ["Violin Plot", "Box Plot"], horizontal=True, key="dist_plot_type")
if comparison_group_v: # Proceed only if a valid comparison group is selected
# Select categories to show (use averages calculated in sidebar)
categories_with_averages = avg_scores_per_category_f.index.unique().tolist()
if categories_with_averages:
# Pre-select the three categories with the lowest mean score.
default_cats_dist = avg_scores_per_category_f.nsmallest(3).index.tolist()
default_cats_dist = [cat for cat in default_cats_dist if cat in categories_with_averages] # Ensure defaults are valid
selected_cats_dist = st.multiselect("Seleziona Categorie da Visualizzare:", options=categories_with_averages, default=default_cats_dist, key="cat_dist")
if selected_cats_dist:
# Filter melted data for selected categories and ensure comparison group is not NA
df_dist = df_melted_f[(df_melted_f['Categoria'].isin(selected_cats_dist)) &
(df_melted_f[comparison_group_v].notna()) &
(df_melted_f[comparison_group_v] != 'Non specificato')] # Exclude 'Non specificato'? Optional.
if not df_dist.empty:
# Ensure hover data columns exist
hover_data = [col for col in valid_demographic_cols if col in df_dist.columns]
# Both plot functions are called with the same keyword arguments below.
plot_func = px.violin if dist_plot_type == "Violin Plot" else px.box
caption_text = ("Il grafico a violino mostra la densità della distribuzione..." if dist_plot_type == "Violin Plot"
else "Il box plot mostra mediana, quartili...")
fig_dist = plot_func(df_dist, x='Categoria', y='Punteggio', color=comparison_group_v,
points=False, # 'all', False, 'outliers'
hover_data=hover_data,
category_orders={'Categoria': selected_cats_dist}, # Use selected order
template=PLOTLY_TEMPLATE, title=f"Distribuzione Punteggi per {comparison_group_v}")
# Pad the y-axis half a point beyond the response scale on both ends.
fig_dist.update_layout(yaxis_range=[response_scale[0]-0.5, response_scale[1]+0.5])
st.plotly_chart(fig_dist, use_container_width=True)
st.caption(caption_text)
else:
st.warning(f"Nessun dato per le categorie e gruppo '{comparison_group_v}' selezionati.")
else:
st.info("Seleziona almeno una categoria per visualizzare il confronto.")
else:
st.warning("Medie per categoria non disponibili.")
else:
st.info("Dati o colonne demografiche insufficienti per i confronti.")
st.markdown("---")
# --- Driver Analysis ---
# Plots each question by performance (mean score) against impact (its
# correlation with overall satisfaction), in one of three chart styles.
# All three read the driver_df computed outside this section.
st.subheader("🎯 Analisi Driver (Impatto vs Performance)")
if not driver_df.empty: # Use pre-calculated driver_df from sidebar
driver_plot_type = st.radio("Visualizza Analisi Driver come:", ["Scatter Plot", "Density Heatmap", "Bar Chart (Top/Bottom)"], horizontal=True, key="driver_plot_type")
if driver_plot_type == "Scatter Plot":
# (Code for Scatter Plot - seems okay, uses driver_df)
fig_scatter_drivers = px.scatter(driver_df, x='Punteggio Medio', y='Correlazione',
color='Categoria',
size='Correlazione_Abs', size_max=18,
hover_data=['Domanda_Breve', 'Punteggio Medio', 'Correlazione'],
template=PLOTLY_TEMPLATE, title=f"Driver: Impatto (Corr. {corr_method_sidebar.capitalize()}) vs Performance")
# The two mean lines split the plane into the four quadrants named in the caption.
avg_corr = driver_df['Correlazione'].mean()
avg_score_all_q = driver_df['Punteggio Medio'].mean()
fig_scatter_drivers.add_vline(x=avg_score_all_q, line_width=1, line_dash="dash", line_color="grey", annotation_text="Media Perf.")
fig_scatter_drivers.add_hline(y=avg_corr, line_width=1, line_dash="dash", line_color="grey", annotation_text="Media Impatto")
fig_scatter_drivers.update_layout(xaxis_title="Performance (Punteggio Medio Domanda)", yaxis_title=f"Impatto (Corr. {corr_method_sidebar.capitalize()} con Sod. Gen.)")
st.plotly_chart(fig_scatter_drivers, use_container_width=True)
# NOTE(review): the caption names a colour per quadrant, but the points are
# coloured by 'Categoria' and no quadrant shading is drawn here.
st.caption("Quadranti (vs medie): Alto Dx (Verde)=Forza Chiave; Alto Sx (Giallo)=Priorità Alta; Basso Sx (Rosso)=Priorità Bassa; Basso Dx (Blu)=Mantenimento Secondario. Dimensione = forza correlazione.")
elif driver_plot_type == "Density Heatmap":
# (Code for Density Heatmap - seems okay, uses driver_df)
fig_density_driver = px.density_heatmap(driver_df, x="Punteggio Medio", y="Correlazione",
marginal_x="histogram", marginal_y="histogram",
text_auto=False,
template=PLOTLY_TEMPLATE, title=f"Densità Driver: Impatto (Corr. {corr_method_sidebar.capitalize()}) vs Performance")
avg_corr = driver_df['Correlazione'].mean()
avg_score_all_q = driver_df['Punteggio Medio'].mean()
fig_density_driver.add_vline(x=avg_score_all_q, line_width=1, line_dash="dash", line_color="grey")
fig_density_driver.add_hline(y=avg_corr, line_width=1, line_dash="dash", line_color="grey")
fig_density_driver.update_layout(xaxis_title="Performance (Punteggio Medio Domanda)", yaxis_title=f"Impatto (Corr. {corr_method_sidebar.capitalize()} con Sod. Gen.)")
st.plotly_chart(fig_density_driver, use_container_width=True)
st.caption("Mostra dove si concentrano le domande nel piano Impatto-Performance.")
elif driver_plot_type == "Bar Chart (Top/Bottom)":
# (Code for Bar Chart - seems okay, uses driver_df)
top_n = st.slider("Numero Top/Bottom Driver da mostrare:", min_value=3, max_value=15, value=8, key="driver_topn")
# Keep the first row for each repeated index label.
driver_df_unique = driver_df.loc[~driver_df.index.duplicated(keep='first')]
top_drivers = driver_df_unique.sort_values('Correlazione', ascending=False).head(top_n)
bottom_drivers = driver_df_unique.sort_values('Correlazione', ascending=True).head(top_n) # Gets most negative
# Combine and ensure uniqueness (in case a driver is both top N pos and top N neg in small datasets)
drivers_to_plot = pd.concat([top_drivers, bottom_drivers]).drop_duplicates().sort_values('Correlazione')
if not drivers_to_plot.empty:
# Chart height scales with the number of bars (30 px each, minimum 400).
fig_drivers_bar = px.bar(drivers_to_plot, x='Correlazione', y='Domanda_Breve', orientation='h',
color='Categoria', template=PLOTLY_TEMPLATE, height=max(400, len(drivers_to_plot)*30),
title=f"Top/Bottom {top_n} Domande per Correlazione ({corr_method_sidebar.capitalize()}) con Sod. Gen.")
fig_drivers_bar.update_layout(yaxis={'categoryorder':'total ascending'}, xaxis_title=f"Correlazione {corr_method_sidebar.capitalize()}", yaxis_title="Domanda")
st.plotly_chart(fig_drivers_bar, use_container_width=True)
st.caption(f"Mostra le domande con la correlazione ({corr_method_sidebar}) più forte (positiva e negativa) con la soddisfazione generale.")
else:
st.warning("Nessun dato driver da mostrare nel grafico a barre.")
else:
st.warning("Impossibile calcolare l'analisi dei driver. Verifica la presenza e la varianza della domanda di soddisfazione generale e delle altre domande numeriche.")
st.markdown("---")
# --- Anomaly Detection & Recommendations ---
# Left column: flags demographic-group / category combinations whose mean
# score deviates strongly from the category mean (deviation expressed in units
# of the category's standard deviation of individual scores).
# Right column: names the weakest category, lists high-impact / low-score
# questions from driver_df, and shows a generic suggestion for that category.
st.subheader("⚠️ Rilevamento Potenziali Punti d'Attenzione & Suggerimenti 💡")
# Use melted data calculated earlier
if not df_melted_f.empty and valid_demographic_cols and not avg_scores_per_category_f.empty:
    col_anom, col_sugg = st.columns(2)
    with col_anom:
        st.write("**Possibili Punti d'Attenzione (Z-Score per Gruppo/Categoria):**")
        # Bound before the try so the KeyError handler can safely inspect it
        # even when the failure happens before the merge.
        merged_stats = None
        try:
            # Overall mean and standard deviation per category on the filtered data
            overall_cat_stats = df_melted_f.groupby('Categoria')['Punteggio'].agg(['mean', 'std']).reset_index()
            overall_cat_stats = overall_cat_stats.rename(columns={'mean': 'mean_overall', 'std': 'std_overall'})
            # Mean per (all demographic columns combined, category)
            group_means = df_melted_f.groupby(valid_demographic_cols + ['Categoria'], observed=True)['Punteggio'].mean().reset_index()
            group_means = group_means.rename(columns={'Punteggio': 'mean_group'})
            if not group_means.empty and not overall_cat_stats.empty:
                merged_stats = pd.merge(group_means, overall_cat_stats, on='Categoria', how='left')
                # A Z-score is only meaningful with a defined, non-negligible std.
                # .copy() avoids SettingWithCopyWarning on the assignment below.
                merged_stats_valid_std = merged_stats[merged_stats['std_overall'].notna() & (merged_stats['std_overall'] > 0.01)].copy()
                if not merged_stats_valid_std.empty:
                    merged_stats_valid_std['Z_Score'] = (merged_stats_valid_std['mean_group'] - merged_stats_valid_std['mean_overall']) / merged_stats_valid_std['std_overall']
                    z_score_threshold = st.slider("Soglia Z-Score per Attenzione:", min_value=1.0, max_value=3.0, value=1.75, step=0.25, key="zscore_thresh")
                    potential_anomalies = merged_stats_valid_std[abs(merged_stats_valid_std['Z_Score']) > z_score_threshold].sort_values(by='Z_Score')
                    if not potential_anomalies.empty:
                        st.write(f"Gruppi/Categorie con punteggio medio deviante (> {z_score_threshold:.2f} dev. std. dalla media della categoria):")
                        for _, row in potential_anomalies.head(10).iterrows():  # Limit display
                            group_desc = " / ".join(f"{col}={row[col]}" for col in valid_demographic_cols)
                            direction = "⚠️ Basso" if row['Z_Score'] < 0 else "✅ Alto"
                            st.markdown(f"- {direction}: **{group_desc}** in **'{row['Categoria']}'** (Media Gruppo: {row['mean_group']:.2f}, Z: {row['Z_Score']:.2f})")
                    else:
                        st.info(f"Nessun punto d'attenzione rilevato con soglia Z-Score > {z_score_threshold:.2f} nei dati filtrati.")
                else:
                    st.info("Deviazione standard non calcolabile o nulla per le categorie, impossibile calcolare Z-score.")
            else:
                st.info("Dati insufficienti per calcolare medie di gruppo o statistiche di categoria.")
        except KeyError as e:
            st.error(f"Errore Chiave durante il calcolo Z-Score: '{e}'. Verifica i nomi delle colonne dopo il merge.")
            # Show the merged frame for debugging only if the merge was reached;
            # otherwise the handler itself would fail with a NameError.
            if merged_stats is not None:
                st.dataframe(merged_stats.head())
        except Exception as e:
            st.error(f"Errore generico durante il calcolo Z-Score: {e}")
    with col_sugg:
        st.write("**Suggerimenti Basati sui Driver & Punteggi Bassi:**")
        # Pick the weakest category by value instead of by position, so the
        # result does not depend on how the averages were sorted upstream.
        valid_category_avgs = avg_scores_per_category_f.dropna()
        if not valid_category_avgs.empty:
            lowest_cat_name = valid_category_avgs.idxmin()
            lowest_cat_score = valid_category_avgs.min()
            st.markdown(f"**Area più debole (media bassa):** '{lowest_cat_name}' ({lowest_cat_score:.2f}).")
            if not driver_df.empty:
                # Thresholds are the means across all driver questions.
                low_score_threshold = driver_df['Punteggio Medio'].mean()
                high_impact_threshold = driver_df['Correlazione'].mean()
                critical_drivers = driver_df[
                    (driver_df['Punteggio Medio'] < low_score_threshold) &
                    (driver_df['Correlazione'] > high_impact_threshold)
                ].sort_values('Correlazione', ascending=False)
                if not critical_drivers.empty:
                    st.markdown("**Priorità Alte (Bassa Performance, Alto Impatto):**")
                    for _, row in critical_drivers.head(5).iterrows():
                        st.markdown(f"- *{row['Domanda_Breve']}* (Cat: {row['Categoria']}, Score: {row['Punteggio Medio']:.2f}, Corr: {row['Correlazione']:.2f})")
                    st.warning("Intervenire su queste domande potrebbe avere il maggior impatto positivo sulla soddisfazione generale.")
                else:
                    st.info("Nessuna domanda trovata nel quadrante 'Priorità Alte' con le soglie attuali.")
            # Generic suggestions, keyed by category name
            suggestions = {
                "Stress e benessere": "Considerare iniziative per la gestione dello stress, flessibilità lavorativa, e supporto psicologico.",
                # ... (rest of suggestions map) ...
                "Apertura e inclusione": "Programmi D&I, garantire libertà di espressione e sicurezza psicologica."
            }
            default_suggestion = "Approfondire le cause specifiche tramite focus group o interviste mirate."
            st.markdown("**Possibili Azioni Generiche per l'Area più Debole:**")
            st.info(suggestions.get(lowest_cat_name, default_suggestion))
        else:
            st.write("Nessun dato medio per categoria disponibile per generare suggerimenti.")
else:
    st.info("Dati insufficienti per rilevare anomalie o fornire suggerimenti.")
# ==============================================================================
# --- TAB Advanced: More Complex Visualizations ---
# ==============================================================================
with tab_advanced:
st.header("📈 Grafici Avanzati (Filtrati)")
if df_filtered is None or df_filtered.empty:
st.warning("Nessun dato disponibile con i filtri selezionati.")
elif not numeric_question_cols:
st.warning("Nessuna domanda numerica trovata per le analisi avanzate.")
else:
# Use the melted data prepared in the Comparisons tab if available
if 'df_melted_f' not in locals() or df_melted_f.empty:
# Try to recreate df_melted_f if not available
numeric_cols_in_filtered = [col for col in numeric_question_cols if col in df_filtered.columns]
valid_demographic_cols = [col for col in demographic_cols if col in df_filtered.columns]
if valid_demographic_cols and numeric_cols_in_filtered:
df_melted_f = get_melted_data(df_filtered, valid_demographic_cols, numeric_cols_in_filtered, question_to_category_map)
else:
df_melted_f = pd.DataFrame()
if df_melted_f.empty and not numeric_cols_in_filtered: # Check again if still empty or no numerics
st.warning("Dati insufficienti per i grafici avanzati.")
else:
# --- 1. Correlation Heatmap ---
# Pairwise correlation between all numeric questions that have more than one
# distinct value in the filtered data.
st.subheader("🔥 Heatmap di Correlazione tra Domande Numeriche")
corr_method_options = ['pearson']
if SCIPY_AVAILABLE:
    # Spearman is only offered when scipy could be imported.
    corr_method_options.append('spearman')
corr_method_adv = st.radio("Metodo Correlazione:", corr_method_options, horizontal=True, key="corr_method_adv")
numeric_cols_in_filtered_adv = [col for col in numeric_question_cols if col in df_filtered.columns and df_filtered[col].nunique(dropna=True) > 1]
if len(numeric_cols_in_filtered_adv) > 1:
    # Unique, readable labels: question text truncated to 30 characters, plus
    # a positional suffix so that truncated labels cannot collide.
    corr_labels = {
        q: (f"{str(q)[:27]}..." if len(str(q)) > 30 else str(q)) + f" [{i}]"
        for i, q in enumerate(numeric_cols_in_filtered_adv)
    }
    df_corr = df_filtered[numeric_cols_in_filtered_adv].rename(columns=corr_labels)
    try:
        corr_matrix = df_corr.corr(method=corr_method_adv)
        if not corr_matrix.empty:
            fig_heatmap = px.imshow(
                corr_matrix,
                text_auto=".2f",
                aspect="auto",
                color_continuous_scale='RdBu_r',
                range_color=[-1, 1],
                template=PLOTLY_TEMPLATE,
                title=f"Heatmap Correlazione ({corr_method_adv.capitalize()}) tra Domande"
            )
            heatmap_height = max(600, len(numeric_cols_in_filtered_adv) * 20)
            fig_heatmap.update_layout(height=heatmap_height, xaxis_tickangle=-45)
            st.plotly_chart(fig_heatmap, use_container_width=True)
            # 'RdBu_r' runs from blue (low) to red (high); with range [-1, 1]
            # blue marks negative and red marks positive correlation, so the
            # caption has to say exactly that.
            st.caption("Blu = correlazione negativa, Rosso = correlazione positiva.")
        else:
            st.warning("Matrice di correlazione vuota.")
    except Exception as e:
        st.warning(f"Errore nel calcolo heatmap: {e}")
else:
    st.info("Servono almeno due domande numeriche con varianza per la heatmap.")
st.markdown("---")
# --- 2. Radar Chart ---
# Compares the per-category mean score of the selected groups of one
# demographic column, one polar trace per group.
st.subheader("🕸️ Radar Chart: Confronto Medie Categorie per Gruppo Demografico")
if not avg_scores_per_category_f.empty and valid_demographic_cols and not df_melted_f.empty:
    radar_demo_options = [col for col in valid_demographic_cols if df_filtered[col].nunique() > 1]
    if radar_demo_options:
        radar_demo_col = st.selectbox("Seleziona Gruppo Demografico per Confronto Radar:", radar_demo_options, key="radar_demo")
        # Group options are offered as strings, without the 'Non specificato' placeholder.
        available_groups = sorted(df_filtered[radar_demo_col].astype(str).unique())
        available_groups = [g for g in available_groups if g != 'Non specificato']
        if len(available_groups) > 1:
            groups_to_compare = st.multiselect(f"Seleziona '{radar_demo_col}' da confrontare:", options=available_groups, default=available_groups[:min(len(available_groups), 3)], key="radar_groups")
            if groups_to_compare:
                # The selected values are strings, so the data must be compared
                # and grouped on the stringified column as well; otherwise a
                # non-string demographic column would never match.
                group_labels = df_melted_f[radar_demo_col].astype(str)
                radar_data = df_melted_f[group_labels.isin(groups_to_compare)].copy()
                radar_data[radar_demo_col] = radar_data[radar_demo_col].astype(str)
                avg_radar = radar_data.groupby(['Categoria', radar_demo_col], observed=True)['Punteggio'].mean().unstack()
                avg_radar = avg_radar.dropna(axis=0, how='all')  # Drop categories with no data
                if not avg_radar.empty:
                    categories_radar = avg_radar.index.tolist()
                    fig_radar = go.Figure()
                    color_sequence = px.colors.qualitative.Plotly
                    for i, group in enumerate(groups_to_compare):
                        if group in avg_radar.columns:
                            fig_radar.add_trace(go.Scatterpolar(
                                r=avg_radar[group].values, theta=categories_radar, fill='toself', name=str(group),
                                line_color=color_sequence[i % len(color_sequence)]  # Cycle through colors
                            ))
                    # Falls back to a 1-6 scale when response_scale is falsy.
                    min_scale_radar, max_scale_radar = response_scale if response_scale else (1, 6)
                    fig_radar.update_layout(
                        polar=dict(radialaxis=dict(visible=True, range=[min_scale_radar-0.5, max_scale_radar+0.5])),
                        showlegend=True, title=f"Confronto Medie Categorie Radar per {radar_demo_col}", template=PLOTLY_TEMPLATE)
                    st.plotly_chart(fig_radar, use_container_width=True)
                else:
                    st.warning(f"Nessun dato medio disponibile per i gruppi selezionati.")
            else:
                st.info(f"Seleziona almeno un gruppo.")
        else:
            st.info(f"Solo un gruppo disponibile in '{radar_demo_col}'.")
    else:
        st.info("Nessuna colonna demografica con valori multipli per il confronto Radar.")
else:
    st.info("Dati insufficienti (medie categorie, demo, melted) per il grafico Radar.")
st.markdown("---")
# --- 3. Parallel Coordinates Plot ---
# One line per demographic group, one axis per selected category; each axis
# shows the group's mean score in that category. A manual colour legend is
# rendered below the chart because the built-in colour bar is hidden.
st.subheader("|| Parrallel Coordinates: Pattern Medie Categorie per Gruppo")
st.warning("Attenzione: Questo grafico può essere lento o illeggibile con molti dati/categorie.")
if not avg_scores_per_category_f.empty and valid_demographic_cols and not df_melted_f.empty:
    cats_parallel_options = avg_scores_per_category_f.index.unique().tolist()
    if cats_parallel_options:
        default_cats_parallel = cats_parallel_options[:min(len(cats_parallel_options), 8)]
        cats_parallel = st.multiselect("Seleziona Categorie (Dimensioni):", cats_parallel_options, default=default_cats_parallel, key="par_cats")
        if cats_parallel:
            parallel_demo_options = [col for col in valid_demographic_cols if df_filtered[col].nunique() > 1]
            if parallel_demo_options:
                parallel_demo_col = st.selectbox("Colora Linee per Gruppo Demografico:", parallel_demo_options, key="par_demo")
                # Mean score per (group, category), pivoted to one column per category.
                df_parallel_prep = df_melted_f[df_melted_f['Categoria'].isin(cats_parallel)]
                df_parallel = df_parallel_prep.groupby([parallel_demo_col, 'Categoria'], observed=True)['Punteggio'].mean().unstack()
                # Groups missing a mean for any selected category are dropped.
                df_parallel = df_parallel.dropna().reset_index()
                if not df_parallel.empty and parallel_demo_col in df_parallel.columns:
                    # Parcoords colours lines by a numeric value: give each group an index.
                    unique_groups_par = df_parallel[parallel_demo_col].unique()
                    group_map = {name: i for i, name in enumerate(unique_groups_par)}
                    df_parallel['color_val'] = df_parallel[parallel_demo_col].map(group_map)
                    dimensions = []
                    for cat in cats_parallel:
                        if cat in df_parallel.columns:
                            dimensions.append(dict(
                                range = [response_scale[0], response_scale[1]] if response_scale else [1,6],
                                label = str(cat)[:20] + '...' if len(str(cat))>20 else str(cat),
                                values = df_parallel[cat] ))
                    if dimensions:
                        color_palette_par = px.colors.qualitative.Plotly
                        # Build a colorscale with exactly one stop per group and pin
                        # cmin/cmax to the index range, so index i lands on stop i.
                        # Feeding the raw palette as a continuous scale would
                        # interpolate across all its colours and the lines would not
                        # match the legend below.
                        n_groups = len(group_map)
                        group_colors = [color_palette_par[idx % len(color_palette_par)] for idx in range(n_groups)]
                        if n_groups > 1:
                            group_colorscale = [[idx / (n_groups - 1), col] for idx, col in enumerate(group_colors)]
                        else:
                            # A colorscale needs stops at both 0 and 1.
                            group_colorscale = [[0.0, group_colors[0]], [1.0, group_colors[0]]]
                        fig_parallel = go.Figure(data=
                            go.Parcoords(
                                line = dict(color = df_parallel['color_val'],
                                            colorscale = group_colorscale,
                                            cmin = 0,
                                            cmax = max(n_groups - 1, 1),
                                            showscale = False),
                                dimensions = dimensions ))
                        fig_parallel.update_layout( title=f"Medie Categorie per {parallel_demo_col} (Parallel Coordinates)", template=PLOTLY_TEMPLATE)
                        st.plotly_chart(fig_parallel, use_container_width=True)
                        # Manual legend, wrapped over at most five columns
                        st.write(f"**Legenda Colori ({parallel_demo_col}):**")
                        n_legend_cols = min(n_groups, 5)
                        cols_legend = st.columns(n_legend_cols)
                        for position, (name, num) in enumerate(group_map.items()):
                            color = group_colors[num]
                            with cols_legend[position % n_legend_cols]:
                                st.markdown(f"<span style='color:{color}; font-weight:bold;'>■</span> {name}", unsafe_allow_html=True)
                    else:
                        st.warning("Nessuna dimensione valida per Parallel Coordinates.")
                else:
                    st.warning(f"Nessun dato medio aggregato per {parallel_demo_col}.")
            else:
                st.info("Nessuna colonna demografica con valori multipli per colorare le linee.")
        else:
            st.info("Seleziona almeno una categoria (dimensione).")
    else:
        st.info("Nessuna categoria disponibile per Parallel Coordinates.")
else:
    st.info("Dati insufficienti (medie categorie, demo, melted) per Parallel Coordinates.")
st.markdown("---")
# --- 4. Stacked Area Chart ---
# (Code for Stacked Area Chart - kept similar, relies on df_melted_f)
# For one category, shows the percentage of answers in each sentiment bucket
# across the values of one demographic column.
st.subheader("📊 Stacked Area Chart: Distribuzione Risposte per Categoria su Gruppo Ordinato")
if not df_melted_f.empty and valid_demographic_cols:
# Prefer columns whose name contains 'Eta' or 'Anzianita'; otherwise offer all of them.
ordered_demo_options = [col for col in valid_demographic_cols if 'Eta' in col or 'Anzianita' in col]
if not ordered_demo_options: ordered_demo_options = valid_demographic_cols # Fallback
if ordered_demo_options:
area_demo_col = st.selectbox("Seleziona Gruppo Demografico Ordinato:", ordered_demo_options, key="area_demo")
area_cat_options = avg_scores_per_category_f.index.unique().tolist()
if area_cat_options:
area_category = st.selectbox("Seleziona Categoria:", area_cat_options, key="area_cat")
df_area_prep = df_melted_f[(df_melted_f['Categoria'] == area_category) & df_melted_f[area_demo_col].notna()].copy()
if not df_area_prep.empty:
# Bucket every score, count answers per (group, bucket), then convert the
# counts to a percentage within each group.
df_area_prep['Sentiment'] = df_area_prep['Punteggio'].apply(categorize_score)
df_area = df_area_prep.groupby([area_demo_col, 'Sentiment'], observed=True).size().reset_index(name='Conteggio')
df_area['Percentuale'] = df_area.groupby(area_demo_col)['Conteggio'].transform(lambda x: x / float(x.sum()) * 100 if x.sum() > 0 else 0)
# NOTE(review): a (group, bucket) pair with no answers has no row at all here;
# confirm the stacked areas render as intended when buckets are missing.
category_orders = {}
group_order = None
if 'Eta' in area_demo_col:
# Known age bands first, in this order; any other value follows alphabetically.
age_order_guess = ['Fino a 30 anni', '31-40 anni', '41-50 anni', 'Oltre i 50 anni', 'Non specificato']
actual_groups = df_area[area_demo_col].unique()
group_order = [g for g in age_order_guess if g in actual_groups]
group_order.extend(sorted([g for g in actual_groups if g not in age_order_guess]))
category_orders={area_demo_col: group_order}
# Ensure Sentiment order for stacking
sentiment_order = ["Critico", "Neutrale", "Positivo", "Non Risposto"]
category_orders['Sentiment'] = [s for s in sentiment_order if s in df_area['Sentiment'].unique()]
plot_colors = BUCKET_COLORS.copy()
plot_colors["Non Risposto"] = "#bbbbbb"
if not df_area.empty:
fig_area = px.area(df_area, x=area_demo_col, y='Percentuale', color='Sentiment',
title=f"Distribuzione Sentiment (%) per '{area_category}' per {area_demo_col}",
labels={'Percentuale': '% Rispondenti'},
category_orders=category_orders,
color_discrete_map=plot_colors,
template=PLOTLY_TEMPLATE)
fig_area.update_layout(yaxis_range=[0, 100], yaxis_ticksuffix="%")
st.plotly_chart(fig_area, use_container_width=True)
else: st.warning("Nessun dato aggregato per l'Area Chart.")
else: st.warning(f"Nessun dato trovato per la categoria '{area_category}'.")
else: st.info("Nessuna categoria valida trovata.")
else: st.info("Nessuna colonna demografica disponibile per l'Area Chart.")
else: st.info("Dati insufficienti (melted, demo) per l'Area Chart.")
# --- Download Button ---
# Sidebar control that offers the currently filtered data as a
# semicolon-separated UTF-8 CSV built in memory.
st.sidebar.divider()
st.sidebar.subheader("📥 Download Dati Filtrati")
if df_filtered is None or df_filtered.empty:
    st.sidebar.info("Nessun dato filtrato da scaricare.")
else:
    output = BytesIO()
    try:
        # Serialise a copy of the filtered frame into the in-memory buffer.
        df_to_download = df_filtered.copy()
        df_to_download.to_csv(output, index=False, encoding='utf-8', sep=';')
        # Rewind so the download starts at the first byte.
        output.seek(0)
        st.sidebar.download_button(
            label="Scarica Dati Filtrati Correnti (CSV)",
            data=output,
            file_name='dati_sondaggio_filtrati_avanzato.csv',
            mime='text/csv',
            key='download_csv',
        )
    except Exception as e:
        st.sidebar.error(f"Errore durante la creazione del CSV: {e}")
# --- Footer ---
# Separator, timestamp computation and a static caption.
st.markdown("---")
# Timestamp in the Europe/Rome timezone, with a naive local fallback when the
# timezone lookup fails. The value is computed but not shown in the caption.
timestamp_format = '%Y-%m-%d %H:%M:%S'
try:
    rome_now = pd.Timestamp.now(tz='Europe/Rome')
    current_time_str = rome_now.strftime(timestamp_format + ' %Z')
except Exception:  # Fallback if timezone fails
    current_time_str = pd.Timestamp.now().strftime(timestamp_format)
st.caption("Dashboard Analisi Clima")
# Otherwise (uploaded_file is None): show only the page title and a prompt to upload a CSV.
else:
st.title("🚀 Dashboard Analisi Clima")
st.info("Per iniziare, carica un file CSV usando il widget qui sopra.")