noumanjavaid commited on
Commit
50aedaa
·
verified ·
1 Parent(s): d6bc3d3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +662 -2
app.py CHANGED
@@ -6,6 +6,7 @@ import plotly.graph_objects as go
6
  import google.generativeai as genai
7
  import os
8
  import warnings
 
9
  from dotenv import load_dotenv
10
  load_dotenv()
11
 
@@ -25,7 +26,7 @@ def configure_gemini():
25
  if not GEMINI_API_KEY:
26
  st.error("⚠️ AI helper not available (missing API key).")
27
  return None
28
- genai.configure(api_key=GEMINI_API_KEY)
29
  model = genai.GenerativeModel('gemini-1.5-flash')
30
  return model
31
  except Exception as e:
@@ -291,4 +292,663 @@ if st.session_state.analysis_step >= 2:
291
  # Fill in missing categories
292
  categorical_cols = df_processed.select_dtypes(exclude=[np.number, 'datetime64[ns]']).columns
293
  for col in categorical_cols:
294
- if df_processed[col].isnull().any():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  import google.generativeai as genai
7
  import os
8
  import warnings
9
+ from sklearn.ensemble import RandomForestRegressor
10
  from dotenv import load_dotenv
11
  load_dotenv()
12
 
 
26
  if not GEMINI_API_KEY:
27
  st.error("⚠️ AI helper not available (missing API key).")
28
  return None
29
+ genai.configure(api_key=GEMINI_API_KEY)  # SECURITY: never commit a literal API key — read it from the environment; the previously committed key must be revoked/rotated
30
  model = genai.GenerativeModel('gemini-1.5-flash')
31
  return model
32
  except Exception as e:
 
292
  # Fill in missing categories
293
  categorical_cols = df_processed.select_dtypes(exclude=[np.number, 'datetime64[ns]']).columns
294
  for col in categorical_cols:
295
+ if df_processed[col].isnull().any():
296
+ try:
297
+ mode_val = df_processed[col].mode()[0]
298
+ df_processed[col] = df_processed[col].fillna(mode_val)
299
+ except IndexError:
300
+ df_processed[col] = df_processed[col].fillna('Unknown')
301
+ st.write("- Filled in missing categories with most common values")
302
+
303
+ # Store cleaned data for future use
304
+ st.session_state.cleaned_df = df_processed
305
+ st.session_state.analysis_step = max(st.session_state.analysis_step, 3)
306
+ st.success("✅ Data prepared successfully!")
307
+
308
+ # Now show the payment timing overview
309
+ if selected_analysis == "Payment Timing Overview":
310
+ st.subheader("📊 Payment Timing Overview")
311
+ cleaned_df = st.session_state.cleaned_df
312
+ payment_col = colmap.get('target_col')
313
+
314
+ if payment_col and payment_col in cleaned_df.columns:
315
+ # Create a layout with columns
316
+ col1, col2 = st.columns(2)
317
+
318
+ with col1:
319
+ # Create a histogram showing payment timing distribution
320
+ st.write("#### Distribution of Payment Timing")
321
+ fig_hist = px.histogram(
322
+ cleaned_df,
323
+ x=payment_col,
324
+ nbins=20,
325
+ title=f"How quickly payments are made",
326
+ labels={payment_col: "Days to Payment"},
327
+ color_discrete_sequence=['#3366CC'],
328
+ )
329
+ fig_hist.update_layout(
330
+ xaxis_title="Days to Payment",
331
+ yaxis_title="Number of Invoices",
332
+ showlegend=False
333
+ )
334
+
335
+ # Add a vertical line for average
336
+ mean_days = cleaned_df[payment_col].mean()
337
+ fig_hist.add_vline(
338
+ x=mean_days,
339
+ line_dash="dash",
340
+ line_color="red",
341
+ annotation_text=f"Average: {mean_days:.1f} days",
342
+ annotation_position="top right"
343
+ )
344
+
345
+ # Add a vertical line for on-time (0 days)
346
+ if colmap.get('payment_timing_vs_due'):
347
+ fig_hist.add_vline(
348
+ x=0,
349
+ line_dash="dash",
350
+ line_color="green",
351
+ annotation_text="Due Date",
352
+ annotation_position="top left"
353
+ )
354
+
355
+ st.plotly_chart(fig_hist, use_container_width=True)
356
+
357
+ with col2:
358
+ # Create a box plot showing the spread of payment timings
359
+ st.write("#### Payment Timing Statistics")
360
+ fig_box = px.box(
361
+ cleaned_df,
362
+ y=payment_col,
363
+ title="Range of Payment Timings",
364
+ points="all",
365
+ labels={payment_col: "Days to Payment"},
366
+ color_discrete_sequence=['#3366CC'],
367
+ )
368
+ fig_box.update_layout(
369
+ yaxis_title="Days to Payment",
370
+ showlegend=False
371
+ )
372
+ st.plotly_chart(fig_box, use_container_width=True)
373
+
374
+ # Payment timing stats
375
+ st.write("#### Key Payment Statistics")
376
+ col1, col2, col3, col4 = st.columns(4)
377
+ with col1:
378
+ st.metric("Average Days", f"{cleaned_df[payment_col].mean():.1f}")
379
+ with col2:
380
+ st.metric("Median Days", f"{cleaned_df[payment_col].median():.1f}")
381
+ with col3:
382
+ early_percent = (cleaned_df[payment_col] <= 0).mean() * 100 if 'payment_timing_vs_due' in colmap.values() else None
383
+ if early_percent is not None:
384
+ st.metric("% Paid On Time", f"{early_percent:.1f}%")
385
+ else:
386
+ st.metric("Minimum Days", f"{cleaned_df[payment_col].min():.1f}")
387
+ with col4:
388
+ st.metric("Maximum Days", f"{cleaned_df[payment_col].max():.1f}")
389
+
390
+ # Get AI explanation if payment_col is days vs due date
391
+ if colmap.get('payment_timing_vs_due') == payment_col:
392
+ # Ask AI for explanation of payment patterns
393
+ prompt = f"""Based on this payment data summary:
394
+ - Average days to payment: {cleaned_df[payment_col].mean():.1f}
395
+ - Median days to payment: {cleaned_df[payment_col].median():.1f}
396
+ - % paid on time: {early_percent:.1f}%
397
+ - Maximum days late: {cleaned_df[payment_col].max():.1f}
398
+
399
+ Please explain in simple terms what this tells us about how customers are paying their invoices.
400
+ Keep your explanation short (3-4 sentences), non-technical, and focused on what this means for the business.
401
+ """
402
+
403
+ with st.expander("💡 What does this mean for my business?", expanded=True):
404
+ explanation = ask_gemini(prompt)
405
+ st.markdown(explanation)
406
+
407
+ else:
408
+ st.error("No payment timing column available. Please check your column mappings.")
409
+
410
+ # --- Step 3.2: Payment Patterns by Category ---
411
+ if selected_analysis == "Payment Patterns by Category":
412
+ st.subheader("📊 Payment Patterns by Category")
413
+ cleaned_df = st.session_state.cleaned_df
414
+ payment_col = colmap.get('target_col')
415
+
416
+ # Define possible category columns and let user select which to analyze
417
+ category_cols = []
418
+ for key in ['revenue_type', 'payment_method', 'customer_id']:
419
+ col = colmap.get(key)
420
+ if col and col in cleaned_df.columns:
421
+ category_cols.append(col)
422
+
423
+ if not category_cols:
424
+ st.warning("No category columns were identified. Please go back to column mapping and identify at least one of: Revenue Type, Payment Method, or Customer ID.")
425
+ else:
426
+ # Let user select which category to analyze
427
+ selected_category = st.selectbox(
428
+ "Select category to analyze:",
429
+ options=category_cols,
430
+ key="category_selector"
431
+ )
432
+
433
+ if selected_category and payment_col and payment_col in cleaned_df.columns:
434
+ # Limit categories to top 10 by frequency to avoid cluttered charts
435
+ top_categories = cleaned_df[selected_category].value_counts().nlargest(10).index
436
+ filtered_df = cleaned_df[cleaned_df[selected_category].isin(top_categories)]
437
+
438
+ # Create layout with columns
439
+ col1, col2 = st.columns(2)
440
+
441
+ with col1:
442
+ # Box plot showing payment timing by category
443
+ st.write(f"#### Payment Timing by {selected_category}")
444
+ fig_category_box = px.box(
445
+ filtered_df,
446
+ x=selected_category,
447
+ y=payment_col,
448
+ color=selected_category,
449
+ title=f"How different {selected_category} categories pay",
450
+ labels={payment_col: "Days to Payment"},
451
+ )
452
+ fig_category_box.update_layout(
453
+ xaxis_title=selected_category,
454
+ yaxis_title="Days to Payment",
455
+ xaxis={'categoryorder':'total descending'}
456
+ )
457
+ st.plotly_chart(fig_category_box, use_container_width=True)
458
+
459
+ with col2:
460
+ # Bar chart showing average payment time by category
461
+ st.write(f"#### Average Payment Time by {selected_category}")
462
+ category_avg = filtered_df.groupby(selected_category)[payment_col].mean().reset_index()
463
+ category_avg = category_avg.sort_values(payment_col)
464
+
465
+ fig_category_bar = px.bar(
466
+ category_avg,
467
+ x=selected_category,
468
+ y=payment_col,
469
+ color=selected_category,
470
+ title=f"Average days to payment by {selected_category}",
471
+ labels={payment_col: "Average Days to Payment"},
472
+ )
473
+ fig_category_bar.update_layout(
474
+ xaxis_title=selected_category,
475
+ yaxis_title="Average Days to Payment",
476
+ showlegend=False
477
+ )
478
+ st.plotly_chart(fig_category_bar, use_container_width=True)
479
+
480
+ # Calculate statistics by category
481
+ category_stats = filtered_df.groupby(selected_category).agg({
482
+ payment_col: ['mean', 'median', 'count'],
483
+ })
484
+ category_stats.columns = [' '.join(col).strip() for col in category_stats.columns.values]
485
+ category_stats = category_stats.reset_index().rename(
486
+ columns={f"{payment_col} mean": "Avg Days",
487
+ f"{payment_col} median": "Median Days",
488
+ f"{payment_col} count": "Count"}
489
+ )
490
+ category_stats["Avg Days"] = category_stats["Avg Days"].round(1)
491
+ category_stats["Median Days"] = category_stats["Median Days"].round(1)
492
+
493
+ # Show the statistics table
494
+ st.write(f"#### Statistics by {selected_category}")
495
+ st.dataframe(category_stats, use_container_width=True)
496
+
497
+ # Get AI explanation based on category
498
+ fastest_category = category_stats.loc[category_stats["Avg Days"].idxmin()][selected_category]
499
+ slowest_category = category_stats.loc[category_stats["Avg Days"].idxmax()][selected_category]
500
+ avg_diff = category_stats["Avg Days"].max() - category_stats["Avg Days"].min()
501
+
502
+ prompt = f"""Based on this payment data analysis by {selected_category}:
503
+ - Fastest paying category: {fastest_category} (average {category_stats['Avg Days'].min():.1f} days)
504
+ - Slowest paying category: {slowest_category} (average {category_stats['Avg Days'].max():.1f} days)
505
+ - Difference between fastest and slowest: {avg_diff:.1f} days
506
+
507
+ Please explain in simple terms what this tells us about how different {selected_category} categories are paying.
508
+ Keep your explanation short (3-4 sentences), non-technical, and focused on what this means for the business.
509
+ """
510
+
511
+ with st.expander("💡 What does this pattern mean?", expanded=True):
512
+ explanation = ask_gemini(prompt)
513
+ st.markdown(explanation)
514
+
515
+ # --- Step 3.3: Customer Groups ---
516
+ if selected_analysis == "Customer Groups":
517
+ st.subheader("👥 Customer Payment Behavior Groups")
518
+ cleaned_df = st.session_state.cleaned_df
519
+ payment_col = colmap.get('target_col')
520
+ amount_col = colmap.get('amount')
521
+ customer_col = colmap.get('customer_id')
522
+
523
+ if not customer_col or not customer_col in cleaned_df.columns:
524
+ st.warning("Please identify a Customer ID column in the column mapping step to see customer groups.")
525
+ elif not payment_col or not payment_col in cleaned_df.columns:
526
+ st.warning("No payment timing column available. Please check your column mappings.")
527
+ else:
528
+ # Create customer-level summary
529
+ st.write("Analyzing customer payment patterns...")
530
+
531
+ # Group by customer and calculate statistics
532
+ customer_stats = cleaned_df.groupby(customer_col).agg({
533
+ payment_col: ['mean', 'median', 'min', 'max', 'count'],
534
+ })
535
+ customer_stats.columns = [' '.join(col).strip() for col in customer_stats.columns.values]
536
+ customer_stats = customer_stats.reset_index()
537
+
538
+ # Add amount stats if available
539
+ if amount_col and amount_col in cleaned_df.columns:
540
+ amount_stats = cleaned_df.groupby(customer_col).agg({
541
+ amount_col: ['mean', 'sum'],
542
+ })
543
+ amount_stats.columns = [' '.join(col).strip() for col in amount_stats.columns.values]
544
+ amount_stats = amount_stats.reset_index()
545
+ customer_stats = customer_stats.merge(amount_stats, on=customer_col)
546
+
547
+ # Define payment behavior groups based on mean payment time
548
+ # Only apply if we have payment timing vs due date
549
+ if colmap.get('payment_timing_vs_due') == payment_col:
550
+ # Create behavior groups
551
+ conditions = [
552
+ (customer_stats[f"{payment_col} mean"] < -5), # Very early (>5 days before due)
553
+ (customer_stats[f"{payment_col} mean"] >= -5) & (customer_stats[f"{payment_col} mean"] < 0), # Early (0-5 days before due)
554
+ (customer_stats[f"{payment_col} mean"] >= 0) & (customer_stats[f"{payment_col} mean"] < 15), # On time to slightly late (0-15 days)
555
+ (customer_stats[f"{payment_col} mean"] >= 15) & (customer_stats[f"{payment_col} mean"] < 30), # Moderately late (15-30 days)
556
+ (customer_stats[f"{payment_col} mean"] >= 30) # Very late (>30 days)
557
+ ]
558
+ values = ['Very Early Payers', 'Early Payers', 'On-time/Slightly Late', 'Moderately Late', 'Very Late Payers']
559
+ customer_stats['Payment Behavior'] = np.select(conditions, values, default='Unknown')
560
+ else:
561
+ # If we don't have vs due date, create relative groups
562
+ median_pay_time = customer_stats[f"{payment_col} mean"].median()
563
+ conditions = [
564
+ (customer_stats[f"{payment_col} mean"] < 0.6 * median_pay_time), # Much faster than median
565
+ (customer_stats[f"{payment_col} mean"] >= 0.6 * median_pay_time) & (customer_stats[f"{payment_col} mean"] < 0.9 * median_pay_time), # Faster than median
566
+ (customer_stats[f"{payment_col} mean"] >= 0.9 * median_pay_time) & (customer_stats[f"{payment_col} mean"] <= 1.1 * median_pay_time), # Around median
567
+ (customer_stats[f"{payment_col} mean"] > 1.1 * median_pay_time) & (customer_stats[f"{payment_col} mean"] <= 1.5 * median_pay_time), # Slower than median
568
+ (customer_stats[f"{payment_col} mean"] > 1.5 * median_pay_time) # Much slower than median
569
+ ]
570
+ values = ['Much Faster Payers', 'Faster Payers', 'Average Payers', 'Slower Payers', 'Much Slower Payers']
571
+ customer_stats['Payment Behavior'] = np.select(conditions, values, default='Unknown')
572
+
573
+ # Count customers in each group
574
+ behavior_counts = customer_stats['Payment Behavior'].value_counts().reset_index()
575
+ behavior_counts.columns = ['Payment Behavior', 'Number of Customers']
576
+
577
+ # Create tabs for different views
578
+ tab1, tab2 = st.tabs(["Customer Groups", "Individual Customers"])
579
+
580
+ with tab1:
581
+ st.write("#### Customer Payment Behavior Groups")
582
+
583
+ # Create a pie chart showing distribution of customer behavior
584
+ fig_pie = px.pie(
585
+ behavior_counts,
586
+ values='Number of Customers',
587
+ names='Payment Behavior',
588
+ title="Distribution of Customer Payment Behavior"
589
+ )
590
+ st.plotly_chart(fig_pie, use_container_width=True)
591
+
592
+ # Show statistics for each behavior group
593
+ behavior_group_stats = customer_stats.groupby('Payment Behavior').agg({
594
+ f"{payment_col} mean": 'mean',
595
+ f"{payment_col} count": 'sum',
596
+ customer_col: 'count'
597
+ }).reset_index()
598
+
599
+ behavior_group_stats.columns = ['Payment Behavior', 'Avg Days to Payment', 'Total Invoices', 'Customer Count']
600
+ behavior_group_stats["Avg Days to Payment"] = behavior_group_stats["Avg Days to Payment"].round(1)
601
+
602
+ # Add amount statistics if available
603
+ if amount_col and amount_col in cleaned_df.columns and f"{amount_col} sum" in customer_stats.columns:
604
+ amount_by_behavior = customer_stats.groupby('Payment Behavior')[f"{amount_col} sum"].sum().reset_index()
605
+ behavior_group_stats = behavior_group_stats.merge(amount_by_behavior, on='Payment Behavior')
606
+ behavior_group_stats.rename(columns={f"{amount_col} sum": "Total Amount"}, inplace=True)
607
+
608
+ st.dataframe(behavior_group_stats, use_container_width=True)
609
+
610
+ # Get AI explanation
611
+ largest_group = behavior_counts.loc[behavior_counts['Number of Customers'].idxmax()]['Payment Behavior']
612
+ prompt = f"""Based on this customer payment behavior analysis:
613
+ - Largest customer group: {largest_group} ({behavior_counts['Number of Customers'].max()} customers)
614
+ - Total customer segments: {len(behavior_counts)}
615
+
616
+ Please explain in simple terms what this tells us about our customer base and their payment habits.
617
+ Keep your explanation short (3-4 sentences), non-technical, and focused on what this means for the business.
618
+ """
619
+
620
+ with st.expander("💡 What does this mean for my business?", expanded=True):
621
+ explanation = ask_gemini(prompt)
622
+ st.markdown(explanation)
623
+
624
+ with tab2:
625
+ st.write("#### Individual Customer Payment Behavior")
626
+ st.write("Search for specific customers or sort by payment behavior:")
627
+
628
+ # Prepare the customer table
629
+ customer_display = customer_stats.copy()
630
+ customer_display = customer_display.rename(columns={
631
+ f"{payment_col} mean": "Avg Days to Payment",
632
+ f"{payment_col} median": "Median Days",
633
+ f"{payment_col} min": "Min Days",
634
+ f"{payment_col} max": "Max Days",
635
+ f"{payment_col} count": "Invoice Count"
636
+ })
637
+
638
+ if amount_col and amount_col in cleaned_df.columns:
639
+ customer_display = customer_display.rename(columns={
640
+ f"{amount_col} mean": "Avg Amount",
641
+ f"{amount_col} sum": "Total Amount"
642
+ })
643
+
644
+ # Round numeric columns
645
+ numeric_cols = customer_display.select_dtypes(include=[np.number]).columns
646
+ customer_display[numeric_cols] = customer_display[numeric_cols].round(2)
647
+
648
+ # Allow filtering
649
+ selected_behavior = st.multiselect(
650
+ "Filter by payment behavior:",
651
+ options=customer_display['Payment Behavior'].unique(),
652
+ default=None
653
+ )
654
+
655
+ if selected_behavior:
656
+ filtered_customers = customer_display[customer_display['Payment Behavior'].isin(selected_behavior)]
657
+ else:
658
+ filtered_customers = customer_display
659
+
660
+ st.dataframe(filtered_customers, use_container_width=True)
661
+
662
+ # --- Step 3.4: Payment Predictions ---
663
+ if selected_analysis == "Payment Predictions":
664
+ st.subheader("🔮 Payment Time Prediction Model")
665
+ cleaned_df = st.session_state.cleaned_df
666
+ payment_col = colmap.get('target_col')
667
+
668
+ # Check if we have the necessary data
669
+ if not payment_col or not payment_col in cleaned_df.columns:
670
+ st.warning("No payment timing column available. Please check your column mappings.")
671
+ else:
672
+ st.write("This model helps you predict when customers will pay based on invoice characteristics.")
673
+
674
+ # Identify potential predictor variables
675
+ numeric_cols = cleaned_df.select_dtypes(include=np.number).columns.tolist()
676
+ categorical_cols = cleaned_df.select_dtypes(include=['object', 'category']).columns.tolist()
677
+
678
+ # Remove the target variable from predictors
679
+ if payment_col in numeric_cols:
680
+ numeric_cols.remove(payment_col)
681
+
682
+ # Prepare predictor variables
683
+ potential_predictors = []
684
+
685
+ # Add amount if available
686
+ amount_col = colmap.get('amount')
687
+ if amount_col and amount_col in cleaned_df.columns:
688
+ potential_predictors.append(amount_col)
689
+
690
+ # Add customer_id if available
691
+ customer_col = colmap.get('customer_id')
692
+ if customer_col and customer_col in cleaned_df.columns and len(cleaned_df[customer_col].unique()) < 100:
693
+ potential_predictors.append(customer_col)
694
+
695
+ # Add revenue_type and payment_method if available
696
+ for key in ['revenue_type', 'payment_method']:
697
+ col = colmap.get(key)
698
+ if col and col in cleaned_df.columns:
699
+ potential_predictors.append(col)
700
+
701
+ # Add other numeric columns that might be useful
702
+ for col in numeric_cols:
703
+ if col not in potential_predictors and col != payment_col and 'date' not in col.lower():
704
+ potential_predictors.append(col)
705
+
706
+ if len(potential_predictors) < 1:
707
+ st.warning("Not enough predictor variables available for modeling. Please identify more columns in your data.")
708
+ else:
709
+ # Let user select predictor variables
710
+ st.write("#### Select variables to use for prediction")
711
+ selected_predictors = st.multiselect(
712
+ "Choose which factors might influence payment timing:",
713
+ options=potential_predictors,
714
+ default=potential_predictors[:min(3, len(potential_predictors))] # Default to first 3
715
+ )
716
+
717
+ if len(selected_predictors) < 1:
718
+ st.warning("Please select at least one predictor variable.")
719
+ else:
720
+ # Prepare data for modeling
721
+ X = cleaned_df[selected_predictors].copy()
722
+ y = cleaned_df[payment_col].copy()
723
+
724
+ # Process categorical variables
725
+ X_processed = pd.DataFrame()
726
+
727
+ for col in X.columns:
728
+ if col in categorical_cols:
729
+ # One-hot encode categorical variables
730
+ dummies = pd.get_dummies(X[col], prefix=col, drop_first=True)
731
+ X_processed = pd.concat([X_processed, dummies], axis=1)
732
+ else:
733
+ # Keep numeric columns as is
734
+ X_processed[col] = X[col]
735
+
736
+ # Check if we have any data after processing
737
+ if X_processed.shape[1] == 0:
738
+ st.warning("No usable predictor variables after processing. Please select different variables.")
739
+ else:
740
+ # Build and train the model
741
+ with st.spinner("Training prediction model..."):
742
+ try:
743
+ # Fill any remaining missing values with median
744
+ X_processed = X_processed.fillna(X_processed.median())
745
+
746
+ # Train a Random Forest model
747
+ model = RandomForestRegressor(n_estimators=100, random_state=42)
748
+ model.fit(X_processed, y)
749
+
750
+ # Get feature importances
751
+ feature_imp = pd.DataFrame({
752
+ 'Feature': X_processed.columns,
753
+ 'Importance': model.feature_importances_
754
+ }).sort_values('Importance', ascending=False)
755
+
756
+ st.success("✅ Payment prediction model trained!")
757
+
758
+ # Show feature importances
759
+ st.write("#### Factors that influence payment timing")
760
+ fig_imp = px.bar(
761
+ feature_imp.head(10),
762
+ x='Importance',
763
+ y='Feature',
764
+ orientation='h',
765
+ title="Which factors best predict payment timing",
766
+ labels={'Importance': 'Importance Score'},
767
+ )
768
+ st.plotly_chart(fig_imp, use_container_width=True)
769
+
770
+ # Get AI explanation of feature importance
771
+ top_features = ", ".join(feature_imp.head(3)['Feature'].tolist())
772
+ prompt = f"""Based on this payment prediction model analysis:
773
+ - Top predictive factors: {top_features}
774
+
775
+ Please explain in simple terms what this tells us about what influences payment timing.
776
+ Keep your explanation short (3-4 sentences), non-technical, and focused on what this means for the business.
777
+ """
778
+
779
+ with st.expander("💡 What influences payment timing?", expanded=True):
780
+ explanation = ask_gemini(prompt)
781
+ st.markdown(explanation)
782
+
783
+ # Create prediction interface
784
+ st.write("#### Predict payment timing for new invoices")
785
+ st.write("Enter values for a new invoice to predict when it will be paid:")
786
+
787
+ # Create input widgets for each predictor
788
+ new_invoice_data = {}
789
+ for col in selected_predictors:
790
+ if col in categorical_cols:
791
+ options = cleaned_df[col].unique().tolist()
792
+ new_invoice_data[col] = st.selectbox(f"Select {col}:", options=options)
793
+ else:
794
+ min_val = cleaned_df[col].min()
795
+ max_val = cleaned_df[col].max()
796
+ step = (max_val - min_val) / 100
797
+ new_invoice_data[col] = st.slider(f"Set {col}:", min_value=float(min_val), max_value=float(max_val), step=float(step))
798
+
799
+ # Make prediction when button is clicked
800
+ if st.button("Predict Payment Timing"):
801
+ # Prepare the input data in the same format as the training data
802
+ X_new = pd.DataFrame([new_invoice_data])
803
+ X_new_processed = pd.DataFrame()
804
+
805
+ # Process the input data the same way as training data
806
+ for col in X_new.columns:
807
+ if col in categorical_cols:
808
+ dummies = pd.get_dummies(X_new[col], prefix=col, drop_first=True)
809
+ X_new_processed = pd.concat([X_new_processed, dummies], axis=1)
810
+ else:
811
+ X_new_processed[col] = X_new[col]
812
+
813
+ # Add missing columns that were in the training data
814
+ for col in X_processed.columns:
815
+ if col not in X_new_processed.columns:
816
+ X_new_processed[col] = 0
817
+
818
+ # Keep only the columns used during training
819
+ X_new_processed = X_new_processed[X_processed.columns]
820
+
821
+ # Make prediction
822
+ prediction = model.predict(X_new_processed)[0]
823
+
824
+ # Display prediction
825
+ st.success(f"Predicted payment timing: **{prediction:.1f} days**")
826
+
827
+ # Add interpretation if we're predicting vs due date
828
+ if colmap.get('payment_timing_vs_due') == payment_col:
829
+ if prediction < 0:
830
+ st.info(f"This invoice is predicted to be paid **{abs(prediction):.1f} days before** the due date.")
831
+ elif prediction == 0:
832
+ st.info("This invoice is predicted to be paid exactly on the due date.")
833
+ else:
834
+ st.info(f"This invoice is predicted to be paid **{prediction:.1f} days after** the due date.")
835
+ except Exception as e:
836
+ st.error(f"Error building prediction model: {e}")
837
+
838
+ # --- Step 3.5: Summary Report ---
839
+ if selected_analysis == "Summary Report":
840
+ st.subheader("📝 Payment Analysis Summary Report")
841
+ cleaned_df = st.session_state.cleaned_df
842
+ payment_col = colmap.get('target_col')
843
+
844
+ if not payment_col or not payment_col in cleaned_df.columns:
845
+ st.warning("No payment timing column available. Please check your column mappings.")
846
+ else:
847
+ # Overall payment statistics
848
+ st.write("### Overall Payment Performance")
849
+ mean_days = cleaned_df[payment_col].mean()
850
+ median_days = cleaned_df[payment_col].median()
851
+
852
+ # Calculate on-time percentage if we have vs due date
853
+ if colmap.get('payment_timing_vs_due') == payment_col:
854
+ ontime_pct = (cleaned_df[payment_col] <= 0).mean() * 100
855
+ late_pct = 100 - ontime_pct
856
+
857
+ # Display metrics in columns
858
+ col1, col2, col3 = st.columns(3)
859
+ with col1:
860
+ st.metric("Average Days to Payment", f"{mean_days:.1f}")
861
+ with col2:
862
+ st.metric("Median Days to Payment", f"{median_days:.1f}")
863
+ with col3:
864
+ if colmap.get('payment_timing_vs_due') == payment_col:
865
+ st.metric("% Paid On Time", f"{ontime_pct:.1f}%")
866
+ else:
867
+ st.metric("Total Invoices", f"{len(cleaned_df)}")
868
+
869
+ # Summary visualization
870
+ st.write("### Payment Timeline")
871
+
872
+ # Create a histogram of payment distribution
873
+ fig_summary = px.histogram(
874
+ cleaned_df,
875
+ x=payment_col,
876
+ nbins=30,
877
+ title="Distribution of Payment Timing",
878
+ color_discrete_sequence=['#3366CC']
879
+ )
880
+
881
+ # Add markers for key statistics
882
+ fig_summary.add_vline(x=mean_days, line_dash="dash", line_color="red", annotation_text=f"Mean: {mean_days:.1f}")
883
+ fig_summary.add_vline(x=median_days, line_dash="dash", line_color="green", annotation_text=f"Median: {median_days:.1f}")
884
+
885
+ if colmap.get('payment_timing_vs_due') == payment_col:
886
+ fig_summary.add_vline(x=0, line_dash="solid", line_color="black", annotation_text="Due Date")
887
+
888
+ st.plotly_chart(fig_summary, use_container_width=True)
889
+
890
+ # Generate a summary report with key findings
891
+ st.write("### Key Findings")
892
+
893
+ # Gather key data points for the AI summary
894
+ summary_data = {
895
+ "mean_days": mean_days,
896
+ "median_days": median_days,
897
+ "min_days": cleaned_df[payment_col].min(),
898
+ "max_days": cleaned_df[payment_col].max(),
899
+ "invoice_count": len(cleaned_df)
900
+ }
901
+
902
+ # Add on-time percentages if available
903
+ if colmap.get('payment_timing_vs_due') == payment_col:
904
+ summary_data["ontime_pct"] = ontime_pct
905
+ summary_data["late_pct"] = late_pct
906
+
907
+ # Add category information if available
908
+ category_insights = []
909
+ for cat_key in ['revenue_type', 'payment_method']:
910
+ cat_col = colmap.get(cat_key)
911
+ if cat_col and cat_col in cleaned_df.columns:
912
+ # Calculate best and worst categories
913
+ cat_avg = cleaned_df.groupby(cat_col)[payment_col].mean()
914
+ if not cat_avg.empty:
915
+ best_cat = cat_avg.idxmin()
916
+ worst_cat = cat_avg.idxmax()
917
+ best_days = cat_avg.min()
918
+ worst_days = cat_avg.max()
919
+
920
+ category_insights.append(f"Best {cat_key}: {best_cat} ({best_days:.1f} days)")
921
+ category_insights.append(f"Worst {cat_key}: {worst_cat} ({worst_days:.1f} days)")
922
+
923
+ # Build prompt for AI summary
924
+ prompt = f"""Based on this payment data analysis:
925
+ - Average days to payment: {summary_data['mean_days']:.1f}
926
+ - Median days to payment: {summary_data['median_days']:.1f}
927
+ - Range: {summary_data['min_days']:.1f} to {summary_data['max_days']:.1f} days
928
+ - Total invoices analyzed: {summary_data['invoice_count']}
929
+ """
930
+
931
+ if 'ontime_pct' in summary_data:
932
+ prompt += f"- On-time payments: {summary_data['ontime_pct']:.1f}%\n"
933
+ prompt += f"- Late payments: {summary_data['late_pct']:.1f}%\n"
934
+
935
+ for insight in category_insights:
936
+ prompt += f"- {insight}\n"
937
+
938
+ prompt += """\nPlease provide a clear, concise 3-4 sentence summary of what this payment data tells us about the business.
939
+ Focus on the most important insights that would be valuable for financial decision-making.
940
+ Use simple, non-technical language that anyone in the business could understand.
941
+ """
942
+
943
+ with st.expander("💡 Summary of Key Findings", expanded=True):
944
+ final_summary = ask_gemini(prompt)
945
+ st.markdown(final_summary)
946
+
947
+ # Recommendations section
948
+ st.write("### Recommendations")
949
+
950
+ # Build prompt for AI recommendations
951
+ rec_prompt = prompt + "\n\nBased on this data, please provide 3-4 specific, actionable recommendations to improve payment collection. Each recommendation should be 1-2 sentences and focused on practical actions the business can take."
952
+
953
+ recommendations = ask_gemini(rec_prompt)
954
+ st.markdown(recommendations)