Update app.py

app.py CHANGED
@@ -6,6 +6,7 @@ import plotly.graph_objects as go
 import google.generativeai as genai
 import os
 import warnings
+from sklearn.ensemble import RandomForestRegressor
 from dotenv import load_dotenv
 load_dotenv()
 
@@ -25,7 +26,7 @@ def configure_gemini():
         if not GEMINI_API_KEY:
             st.error("⚠️ AI helper not available (missing API key).")
             return None
-        genai.configure(api_key=
+        genai.configure(api_key="AIzaSyBAIYRTzVJZyLEcPpIyyc0Ceb4b04WmVY0")
         model = genai.GenerativeModel('gemini-1.5-flash')
         return model
     except Exception as e:
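The hunk above hardcodes the key as a string literal, even though the file already loads `python-dotenv` and the function checks a `GEMINI_API_KEY` value first. A minimal sketch of the environment-based alternative that setup implies (hypothetical, not part of this commit):

```python
import os

import google.generativeai as genai
import streamlit as st
from dotenv import load_dotenv

load_dotenv()  # pull variables from a local .env file into os.environ
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")  # assumed to be defined in .env

def configure_gemini():
    """Configure Gemini from the environment instead of a hardcoded literal."""
    try:
        if not GEMINI_API_KEY:
            st.error("⚠️ AI helper not available (missing API key).")
            return None
        genai.configure(api_key=GEMINI_API_KEY)
        return genai.GenerativeModel('gemini-1.5-flash')
    except Exception as e:
        st.error(f"Could not configure Gemini: {e}")
        return None
```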
@@ -291,4 +292,663 @@ if st.session_state.analysis_step >= 2:
     # Fill in missing categories
     categorical_cols = df_processed.select_dtypes(exclude=[np.number, 'datetime64[ns]']).columns
     for col in categorical_cols:
-        if df_processed[col].isnull().any():
+        if df_processed[col].isnull().any():
+            try:
+                mode_val = df_processed[col].mode()[0]
+                df_processed[col] = df_processed[col].fillna(mode_val)
+            except IndexError:
+                df_processed[col] = df_processed[col].fillna('Unknown')
+    st.write("- Filled in missing categories with most common values")
+
+    # Store cleaned data for future use
+    st.session_state.cleaned_df = df_processed
+    st.session_state.analysis_step = max(st.session_state.analysis_step, 3)
+    st.success("✅ Data prepared successfully!")
+
+    # Now show the payment timing overview
+    if selected_analysis == "Payment Timing Overview":
+        st.subheader("📊 Payment Timing Overview")
+        cleaned_df = st.session_state.cleaned_df
+        payment_col = colmap.get('target_col')
+
+        if payment_col and payment_col in cleaned_df.columns:
+            # Create a layout with columns
+            col1, col2 = st.columns(2)
+
+            with col1:
+                # Create a histogram showing payment timing distribution
+                st.write("#### Distribution of Payment Timing")
+                fig_hist = px.histogram(
+                    cleaned_df,
+                    x=payment_col,
+                    nbins=20,
+                    title=f"How quickly payments are made",
+                    labels={payment_col: "Days to Payment"},
+                    color_discrete_sequence=['#3366CC'],
+                )
+                fig_hist.update_layout(
+                    xaxis_title="Days to Payment",
+                    yaxis_title="Number of Invoices",
+                    showlegend=False
+                )
+
+                # Add a vertical line for average
+                mean_days = cleaned_df[payment_col].mean()
+                fig_hist.add_vline(
+                    x=mean_days,
+                    line_dash="dash",
+                    line_color="red",
+                    annotation_text=f"Average: {mean_days:.1f} days",
+                    annotation_position="top right"
+                )
+
+                # Add a vertical line for on-time (0 days)
+                if colmap.get('payment_timing_vs_due'):
+                    fig_hist.add_vline(
+                        x=0,
+                        line_dash="dash",
+                        line_color="green",
+                        annotation_text="Due Date",
+                        annotation_position="top left"
+                    )
+
+                st.plotly_chart(fig_hist, use_container_width=True)
+
+            with col2:
+                # Create a box plot showing the spread of payment timings
+                st.write("#### Payment Timing Statistics")
+                fig_box = px.box(
+                    cleaned_df,
+                    y=payment_col,
+                    title="Range of Payment Timings",
+                    points="all",
+                    labels={payment_col: "Days to Payment"},
+                    color_discrete_sequence=['#3366CC'],
+                )
+                fig_box.update_layout(
+                    yaxis_title="Days to Payment",
+                    showlegend=False
+                )
+                st.plotly_chart(fig_box, use_container_width=True)
+
+            # Payment timing stats
+            st.write("#### Key Payment Statistics")
+            col1, col2, col3, col4 = st.columns(4)
+            with col1:
+                st.metric("Average Days", f"{cleaned_df[payment_col].mean():.1f}")
+            with col2:
+                st.metric("Median Days", f"{cleaned_df[payment_col].median():.1f}")
+            with col3:
+                early_percent = (cleaned_df[payment_col] <= 0).mean() * 100 if 'payment_timing_vs_due' in colmap.values() else None
+                if early_percent is not None:
+                    st.metric("% Paid On Time", f"{early_percent:.1f}%")
+                else:
+                    st.metric("Minimum Days", f"{cleaned_df[payment_col].min():.1f}")
+            with col4:
+                st.metric("Maximum Days", f"{cleaned_df[payment_col].max():.1f}")
+
+            # Get AI explanation if payment_col is days vs due date
+            if colmap.get('payment_timing_vs_due') == payment_col:
+                # Ask AI for explanation of payment patterns
+                prompt = f"""Based on this payment data summary:
+- Average days to payment: {cleaned_df[payment_col].mean():.1f}
+- Median days to payment: {cleaned_df[payment_col].median():.1f}
+- % paid on time: {early_percent:.1f}%
+- Maximum days late: {cleaned_df[payment_col].max():.1f}
+
+Please explain in simple terms what this tells us about how customers are paying their invoices.
+Keep your explanation short (3-4 sentences), non-technical, and focused on what this means for the business.
+"""
+
+                with st.expander("💡 What does this mean for my business?", expanded=True):
+                    explanation = ask_gemini(prompt)
+                    st.markdown(explanation)
+
+        else:
+            st.error("No payment timing column available. Please check your column mappings.")
+
+    # --- Step 3.2: Payment Patterns by Category ---
+    if selected_analysis == "Payment Patterns by Category":
+        st.subheader("📊 Payment Patterns by Category")
+        cleaned_df = st.session_state.cleaned_df
+        payment_col = colmap.get('target_col')
+
+        # Define possible category columns and let user select which to analyze
+        category_cols = []
+        for key in ['revenue_type', 'payment_method', 'customer_id']:
+            col = colmap.get(key)
+            if col and col in cleaned_df.columns:
+                category_cols.append(col)
+
+        if not category_cols:
+            st.warning("No category columns were identified. Please go back to column mapping and identify at least one of: Revenue Type, Payment Method, or Customer ID.")
+        else:
+            # Let user select which category to analyze
+            selected_category = st.selectbox(
+                "Select category to analyze:",
+                options=category_cols,
+                key="category_selector"
+            )
+
+            if selected_category and payment_col and payment_col in cleaned_df.columns:
+                # Limit categories to top 10 by frequency to avoid cluttered charts
+                top_categories = cleaned_df[selected_category].value_counts().nlargest(10).index
+                filtered_df = cleaned_df[cleaned_df[selected_category].isin(top_categories)]
+
+                # Create layout with columns
+                col1, col2 = st.columns(2)
+
+                with col1:
+                    # Box plot showing payment timing by category
+                    st.write(f"#### Payment Timing by {selected_category}")
+                    fig_category_box = px.box(
+                        filtered_df,
+                        x=selected_category,
+                        y=payment_col,
+                        color=selected_category,
+                        title=f"How different {selected_category} categories pay",
+                        labels={payment_col: "Days to Payment"},
+                    )
+                    fig_category_box.update_layout(
+                        xaxis_title=selected_category,
+                        yaxis_title="Days to Payment",
+                        xaxis={'categoryorder':'total descending'}
+                    )
+                    st.plotly_chart(fig_category_box, use_container_width=True)
+
+                with col2:
+                    # Bar chart showing average payment time by category
+                    st.write(f"#### Average Payment Time by {selected_category}")
+                    category_avg = filtered_df.groupby(selected_category)[payment_col].mean().reset_index()
+                    category_avg = category_avg.sort_values(payment_col)
+
+                    fig_category_bar = px.bar(
+                        category_avg,
+                        x=selected_category,
+                        y=payment_col,
+                        color=selected_category,
+                        title=f"Average days to payment by {selected_category}",
+                        labels={payment_col: "Average Days to Payment"},
+                    )
+                    fig_category_bar.update_layout(
+                        xaxis_title=selected_category,
+                        yaxis_title="Average Days to Payment",
+                        showlegend=False
+                    )
+                    st.plotly_chart(fig_category_bar, use_container_width=True)
+
+                # Calculate statistics by category
+                category_stats = filtered_df.groupby(selected_category).agg({
+                    payment_col: ['mean', 'median', 'count'],
+                })
+                category_stats.columns = [' '.join(col).strip() for col in category_stats.columns.values]
+                category_stats = category_stats.reset_index().rename(
+                    columns={f"{payment_col} mean": "Avg Days",
+                             f"{payment_col} median": "Median Days",
+                             f"{payment_col} count": "Count"}
+                )
+                category_stats["Avg Days"] = category_stats["Avg Days"].round(1)
+                category_stats["Median Days"] = category_stats["Median Days"].round(1)
+
+                # Show the statistics table
+                st.write(f"#### Statistics by {selected_category}")
+                st.dataframe(category_stats, use_container_width=True)
+
+                # Get AI explanation based on category
+                fastest_category = category_stats.loc[category_stats["Avg Days"].idxmin()][selected_category]
+                slowest_category = category_stats.loc[category_stats["Avg Days"].idxmax()][selected_category]
+                avg_diff = category_stats["Avg Days"].max() - category_stats["Avg Days"].min()
+
+                prompt = f"""Based on this payment data analysis by {selected_category}:
+- Fastest paying category: {fastest_category} (average {category_stats['Avg Days'].min():.1f} days)
+- Slowest paying category: {slowest_category} (average {category_stats['Avg Days'].max():.1f} days)
+- Difference between fastest and slowest: {avg_diff:.1f} days
+
+Please explain in simple terms what this tells us about how different {selected_category} categories are paying.
+Keep your explanation short (3-4 sentences), non-technical, and focused on what this means for the business.
+"""
+
+                with st.expander("💡 What does this pattern mean?", expanded=True):
+                    explanation = ask_gemini(prompt)
+                    st.markdown(explanation)
+
+    # --- Step 3.3: Customer Groups ---
+    if selected_analysis == "Customer Groups":
+        st.subheader("👥 Customer Payment Behavior Groups")
+        cleaned_df = st.session_state.cleaned_df
+        payment_col = colmap.get('target_col')
+        amount_col = colmap.get('amount')
+        customer_col = colmap.get('customer_id')
+
+        if not customer_col or not customer_col in cleaned_df.columns:
+            st.warning("Please identify a Customer ID column in the column mapping step to see customer groups.")
+        elif not payment_col or not payment_col in cleaned_df.columns:
+            st.warning("No payment timing column available. Please check your column mappings.")
+        else:
+            # Create customer-level summary
+            st.write("Analyzing customer payment patterns...")
+
+            # Group by customer and calculate statistics
+            customer_stats = cleaned_df.groupby(customer_col).agg({
+                payment_col: ['mean', 'median', 'min', 'max', 'count'],
+            })
+            customer_stats.columns = [' '.join(col).strip() for col in customer_stats.columns.values]
+            customer_stats = customer_stats.reset_index()
+
+            # Add amount stats if available
+            if amount_col and amount_col in cleaned_df.columns:
+                amount_stats = cleaned_df.groupby(customer_col).agg({
+                    amount_col: ['mean', 'sum'],
+                })
+                amount_stats.columns = [' '.join(col).strip() for col in amount_stats.columns.values]
+                amount_stats = amount_stats.reset_index()
+                customer_stats = customer_stats.merge(amount_stats, on=customer_col)
+
+            # Define payment behavior groups based on mean payment time
+            # Only apply if we have payment timing vs due date
+            if colmap.get('payment_timing_vs_due') == payment_col:
+                # Create behavior groups
+                conditions = [
+                    (customer_stats[f"{payment_col} mean"] < -5),  # Very early (>5 days before due)
+                    (customer_stats[f"{payment_col} mean"] >= -5) & (customer_stats[f"{payment_col} mean"] < 0),  # Early (0-5 days before due)
+                    (customer_stats[f"{payment_col} mean"] >= 0) & (customer_stats[f"{payment_col} mean"] < 15),  # On time to slightly late (0-15 days)
+                    (customer_stats[f"{payment_col} mean"] >= 15) & (customer_stats[f"{payment_col} mean"] < 30),  # Moderately late (15-30 days)
+                    (customer_stats[f"{payment_col} mean"] >= 30)  # Very late (>30 days)
+                ]
+                values = ['Very Early Payers', 'Early Payers', 'On-time/Slightly Late', 'Moderately Late', 'Very Late Payers']
+                customer_stats['Payment Behavior'] = np.select(conditions, values, default='Unknown')
+            else:
+                # If we don't have vs due date, create relative groups
+                median_pay_time = customer_stats[f"{payment_col} mean"].median()
+                conditions = [
+                    (customer_stats[f"{payment_col} mean"] < 0.6 * median_pay_time),  # Much faster than median
+                    (customer_stats[f"{payment_col} mean"] >= 0.6 * median_pay_time) & (customer_stats[f"{payment_col} mean"] < 0.9 * median_pay_time),  # Faster than median
+                    (customer_stats[f"{payment_col} mean"] >= 0.9 * median_pay_time) & (customer_stats[f"{payment_col} mean"] <= 1.1 * median_pay_time),  # Around median
+                    (customer_stats[f"{payment_col} mean"] > 1.1 * median_pay_time) & (customer_stats[f"{payment_col} mean"] <= 1.5 * median_pay_time),  # Slower than median
+                    (customer_stats[f"{payment_col} mean"] > 1.5 * median_pay_time)  # Much slower than median
+                ]
+                values = ['Much Faster Payers', 'Faster Payers', 'Average Payers', 'Slower Payers', 'Much Slower Payers']
+                customer_stats['Payment Behavior'] = np.select(conditions, values, default='Unknown')
+
+            # Count customers in each group
+            behavior_counts = customer_stats['Payment Behavior'].value_counts().reset_index()
+            behavior_counts.columns = ['Payment Behavior', 'Number of Customers']
+
+            # Create tabs for different views
+            tab1, tab2 = st.tabs(["Customer Groups", "Individual Customers"])
+
+            with tab1:
+                st.write("#### Customer Payment Behavior Groups")
+
+                # Create a pie chart showing distribution of customer behavior
+                fig_pie = px.pie(
+                    behavior_counts,
+                    values='Number of Customers',
+                    names='Payment Behavior',
+                    title="Distribution of Customer Payment Behavior"
+                )
+                st.plotly_chart(fig_pie, use_container_width=True)
+
+                # Show statistics for each behavior group
+                behavior_group_stats = customer_stats.groupby('Payment Behavior').agg({
+                    f"{payment_col} mean": 'mean',
+                    f"{payment_col} count": 'sum',
+                    customer_col: 'count'
+                }).reset_index()
+
+                behavior_group_stats.columns = ['Payment Behavior', 'Avg Days to Payment', 'Total Invoices', 'Customer Count']
+                behavior_group_stats["Avg Days to Payment"] = behavior_group_stats["Avg Days to Payment"].round(1)
+
+                # Add amount statistics if available
+                if amount_col and amount_col in cleaned_df.columns and f"{amount_col} sum" in customer_stats.columns:
+                    amount_by_behavior = customer_stats.groupby('Payment Behavior')[f"{amount_col} sum"].sum().reset_index()
+                    behavior_group_stats = behavior_group_stats.merge(amount_by_behavior, on='Payment Behavior')
+                    behavior_group_stats.rename(columns={f"{amount_col} sum": "Total Amount"}, inplace=True)
+
+                st.dataframe(behavior_group_stats, use_container_width=True)
+
+                # Get AI explanation
+                largest_group = behavior_counts.loc[behavior_counts['Number of Customers'].idxmax()]['Payment Behavior']
+                prompt = f"""Based on this customer payment behavior analysis:
+- Largest customer group: {largest_group} ({behavior_counts['Number of Customers'].max()} customers)
+- Total customer segments: {len(behavior_counts)}
+
+Please explain in simple terms what this tells us about our customer base and their payment habits.
+Keep your explanation short (3-4 sentences), non-technical, and focused on what this means for the business.
+"""
+
+                with st.expander("💡 What does this mean for my business?", expanded=True):
+                    explanation = ask_gemini(prompt)
+                    st.markdown(explanation)
+
+            with tab2:
+                st.write("#### Individual Customer Payment Behavior")
+                st.write("Search for specific customers or sort by payment behavior:")
+
+                # Prepare the customer table
+                customer_display = customer_stats.copy()
+                customer_display = customer_display.rename(columns={
+                    f"{payment_col} mean": "Avg Days to Payment",
+                    f"{payment_col} median": "Median Days",
+                    f"{payment_col} min": "Min Days",
+                    f"{payment_col} max": "Max Days",
+                    f"{payment_col} count": "Invoice Count"
+                })
+
+                if amount_col and amount_col in cleaned_df.columns:
+                    customer_display = customer_display.rename(columns={
+                        f"{amount_col} mean": "Avg Amount",
+                        f"{amount_col} sum": "Total Amount"
+                    })
+
+                # Round numeric columns
+                numeric_cols = customer_display.select_dtypes(include=[np.number]).columns
+                customer_display[numeric_cols] = customer_display[numeric_cols].round(2)
+
+                # Allow filtering
+                selected_behavior = st.multiselect(
+                    "Filter by payment behavior:",
+                    options=customer_display['Payment Behavior'].unique(),
+                    default=None
+                )
+
+                if selected_behavior:
+                    filtered_customers = customer_display[customer_display['Payment Behavior'].isin(selected_behavior)]
+                else:
+                    filtered_customers = customer_display
+
+                st.dataframe(filtered_customers, use_container_width=True)
+
+    # --- Step 3.4: Payment Predictions ---
+    if selected_analysis == "Payment Predictions":
+        st.subheader("🔮 Payment Time Prediction Model")
+        cleaned_df = st.session_state.cleaned_df
+        payment_col = colmap.get('target_col')
+
+        # Check if we have the necessary data
+        if not payment_col or not payment_col in cleaned_df.columns:
+            st.warning("No payment timing column available. Please check your column mappings.")
+        else:
+            st.write("This model helps you predict when customers will pay based on invoice characteristics.")
+
+            # Identify potential predictor variables
+            numeric_cols = cleaned_df.select_dtypes(include=np.number).columns.tolist()
+            categorical_cols = cleaned_df.select_dtypes(include=['object', 'category']).columns.tolist()
+
+            # Remove the target variable from predictors
+            if payment_col in numeric_cols:
+                numeric_cols.remove(payment_col)
+
+            # Prepare predictor variables
+            potential_predictors = []
+
+            # Add amount if available
+            amount_col = colmap.get('amount')
+            if amount_col and amount_col in cleaned_df.columns:
+                potential_predictors.append(amount_col)
+
+            # Add customer_id if available
+            customer_col = colmap.get('customer_id')
+            if customer_col and customer_col in cleaned_df.columns and len(cleaned_df[customer_col].unique()) < 100:
+                potential_predictors.append(customer_col)
+
+            # Add revenue_type and payment_method if available
+            for key in ['revenue_type', 'payment_method']:
+                col = colmap.get(key)
+                if col and col in cleaned_df.columns:
+                    potential_predictors.append(col)
+
+            # Add other numeric columns that might be useful
+            for col in numeric_cols:
+                if col not in potential_predictors and col != payment_col and 'date' not in col.lower():
+                    potential_predictors.append(col)
+
+            if len(potential_predictors) < 1:
+                st.warning("Not enough predictor variables available for modeling. Please identify more columns in your data.")
+            else:
+                # Let user select predictor variables
+                st.write("#### Select variables to use for prediction")
+                selected_predictors = st.multiselect(
+                    "Choose which factors might influence payment timing:",
+                    options=potential_predictors,
+                    default=potential_predictors[:min(3, len(potential_predictors))]  # Default to first 3
+                )
+
+                if len(selected_predictors) < 1:
+                    st.warning("Please select at least one predictor variable.")
+                else:
+                    # Prepare data for modeling
+                    X = cleaned_df[selected_predictors].copy()
+                    y = cleaned_df[payment_col].copy()
+
+                    # Process categorical variables
+                    X_processed = pd.DataFrame()
+
+                    for col in X.columns:
+                        if col in categorical_cols:
+                            # One-hot encode categorical variables
+                            dummies = pd.get_dummies(X[col], prefix=col, drop_first=True)
+                            X_processed = pd.concat([X_processed, dummies], axis=1)
+                        else:
+                            # Keep numeric columns as is
+                            X_processed[col] = X[col]
+
+                    # Check if we have any data after processing
+                    if X_processed.shape[1] == 0:
+                        st.warning("No usable predictor variables after processing. Please select different variables.")
+                    else:
+                        # Build and train the model
+                        with st.spinner("Training prediction model..."):
+                            try:
+                                # Fill any remaining missing values with median
+                                X_processed = X_processed.fillna(X_processed.median())
+
+                                # Train a Random Forest model
+                                model = RandomForestRegressor(n_estimators=100, random_state=42)
+                                model.fit(X_processed, y)
+
+                                # Get feature importances
+                                feature_imp = pd.DataFrame({
+                                    'Feature': X_processed.columns,
+                                    'Importance': model.feature_importances_
+                                }).sort_values('Importance', ascending=False)
+
+                                st.success("✅ Payment prediction model trained!")
+
+                                # Show feature importances
+                                st.write("#### Factors that influence payment timing")
+                                fig_imp = px.bar(
+                                    feature_imp.head(10),
+                                    x='Importance',
+                                    y='Feature',
+                                    orientation='h',
+                                    title="Which factors best predict payment timing",
+                                    labels={'Importance': 'Importance Score'},
+                                )
+                                st.plotly_chart(fig_imp, use_container_width=True)
+
+                                # Get AI explanation of feature importance
+                                top_features = ", ".join(feature_imp.head(3)['Feature'].tolist())
+                                prompt = f"""Based on this payment prediction model analysis:
+- Top predictive factors: {top_features}
+
+Please explain in simple terms what this tells us about what influences payment timing.
+Keep your explanation short (3-4 sentences), non-technical, and focused on what this means for the business.
+"""
+
+                                with st.expander("💡 What influences payment timing?", expanded=True):
+                                    explanation = ask_gemini(prompt)
+                                    st.markdown(explanation)
+
+                                # Create prediction interface
+                                st.write("#### Predict payment timing for new invoices")
+                                st.write("Enter values for a new invoice to predict when it will be paid:")
+
+                                # Create input widgets for each predictor
+                                new_invoice_data = {}
+                                for col in selected_predictors:
+                                    if col in categorical_cols:
+                                        options = cleaned_df[col].unique().tolist()
+                                        new_invoice_data[col] = st.selectbox(f"Select {col}:", options=options)
+                                    else:
+                                        min_val = cleaned_df[col].min()
+                                        max_val = cleaned_df[col].max()
+                                        step = (max_val - min_val) / 100
+                                        new_invoice_data[col] = st.slider(f"Set {col}:", min_value=float(min_val), max_value=float(max_val), step=float(step))
+
+                                # Make prediction when button is clicked
+                                if st.button("Predict Payment Timing"):
+                                    # Prepare the input data in the same format as the training data
+                                    X_new = pd.DataFrame([new_invoice_data])
+                                    X_new_processed = pd.DataFrame()
+
+                                    # Process the input data the same way as training data
+                                    for col in X_new.columns:
+                                        if col in categorical_cols:
+                                            dummies = pd.get_dummies(X_new[col], prefix=col, drop_first=True)
+                                            X_new_processed = pd.concat([X_new_processed, dummies], axis=1)
+                                        else:
+                                            X_new_processed[col] = X_new[col]
+
+                                    # Add missing columns that were in the training data
+                                    for col in X_processed.columns:
+                                        if col not in X_new_processed.columns:
+                                            X_new_processed[col] = 0
+
+                                    # Keep only the columns used during training
+                                    X_new_processed = X_new_processed[X_processed.columns]
+
+                                    # Make prediction
+                                    prediction = model.predict(X_new_processed)[0]
+
+                                    # Display prediction
+                                    st.success(f"Predicted payment timing: **{prediction:.1f} days**")
+
+                                    # Add interpretation if we're predicting vs due date
+                                    if colmap.get('payment_timing_vs_due') == payment_col:
+                                        if prediction < 0:
+                                            st.info(f"This invoice is predicted to be paid **{abs(prediction):.1f} days before** the due date.")
+                                        elif prediction == 0:
+                                            st.info("This invoice is predicted to be paid exactly on the due date.")
+                                        else:
+                                            st.info(f"This invoice is predicted to be paid **{prediction:.1f} days after** the due date.")
+                            except Exception as e:
+                                st.error(f"Error building prediction model: {e}")
+
+    # --- Step 3.5: Summary Report ---
+    if selected_analysis == "Summary Report":
+        st.subheader("📝 Payment Analysis Summary Report")
+        cleaned_df = st.session_state.cleaned_df
+        payment_col = colmap.get('target_col')
+
+        if not payment_col or not payment_col in cleaned_df.columns:
+            st.warning("No payment timing column available. Please check your column mappings.")
+        else:
+            # Overall payment statistics
+            st.write("### Overall Payment Performance")
+            mean_days = cleaned_df[payment_col].mean()
+            median_days = cleaned_df[payment_col].median()
+
+            # Calculate on-time percentage if we have vs due date
+            if colmap.get('payment_timing_vs_due') == payment_col:
+                ontime_pct = (cleaned_df[payment_col] <= 0).mean() * 100
+                late_pct = 100 - ontime_pct
+
+            # Display metrics in columns
+            col1, col2, col3 = st.columns(3)
+            with col1:
+                st.metric("Average Days to Payment", f"{mean_days:.1f}")
+            with col2:
+                st.metric("Median Days to Payment", f"{median_days:.1f}")
+            with col3:
+                if colmap.get('payment_timing_vs_due') == payment_col:
+                    st.metric("% Paid On Time", f"{ontime_pct:.1f}%")
+                else:
+                    st.metric("Total Invoices", f"{len(cleaned_df)}")
+
+            # Summary visualization
+            st.write("### Payment Timeline")
+
+            # Create a histogram of payment distribution
+            fig_summary = px.histogram(
+                cleaned_df,
+                x=payment_col,
+                nbins=30,
+                title="Distribution of Payment Timing",
+                color_discrete_sequence=['#3366CC']
+            )
+
+            # Add markers for key statistics
+            fig_summary.add_vline(x=mean_days, line_dash="dash", line_color="red", annotation_text=f"Mean: {mean_days:.1f}")
+            fig_summary.add_vline(x=median_days, line_dash="dash", line_color="green", annotation_text=f"Median: {median_days:.1f}")
+
+            if colmap.get('payment_timing_vs_due') == payment_col:
+                fig_summary.add_vline(x=0, line_dash="solid", line_color="black", annotation_text="Due Date")
+
+            st.plotly_chart(fig_summary, use_container_width=True)
+
+            # Generate a summary report with key findings
+            st.write("### Key Findings")
+
+            # Gather key data points for the AI summary
+            summary_data = {
+                "mean_days": mean_days,
+                "median_days": median_days,
+                "min_days": cleaned_df[payment_col].min(),
+                "max_days": cleaned_df[payment_col].max(),
+                "invoice_count": len(cleaned_df)
+            }
+
+            # Add on-time percentages if available
+            if colmap.get('payment_timing_vs_due') == payment_col:
+                summary_data["ontime_pct"] = ontime_pct
+                summary_data["late_pct"] = late_pct
+
+            # Add category information if available
+            category_insights = []
+            for cat_key in ['revenue_type', 'payment_method']:
+                cat_col = colmap.get(cat_key)
+                if cat_col and cat_col in cleaned_df.columns:
+                    # Calculate best and worst categories
+                    cat_avg = cleaned_df.groupby(cat_col)[payment_col].mean()
+                    if not cat_avg.empty:
+                        best_cat = cat_avg.idxmin()
+                        worst_cat = cat_avg.idxmax()
+                        best_days = cat_avg.min()
+                        worst_days = cat_avg.max()
+
+                        category_insights.append(f"Best {cat_key}: {best_cat} ({best_days:.1f} days)")
+                        category_insights.append(f"Worst {cat_key}: {worst_cat} ({worst_days:.1f} days)")
+
+            # Build prompt for AI summary
+            prompt = f"""Based on this payment data analysis:
+- Average days to payment: {summary_data['mean_days']:.1f}
+- Median days to payment: {summary_data['median_days']:.1f}
+- Range: {summary_data['min_days']:.1f} to {summary_data['max_days']:.1f} days
+- Total invoices analyzed: {summary_data['invoice_count']}
+"""
+
+            if 'ontime_pct' in summary_data:
+                prompt += f"- On-time payments: {summary_data['ontime_pct']:.1f}%\n"
+                prompt += f"- Late payments: {summary_data['late_pct']:.1f}%\n"
+
+            for insight in category_insights:
+                prompt += f"- {insight}\n"
+
+            prompt += """\nPlease provide a clear, concise 3-4 sentence summary of what this payment data tells us about the business.
+Focus on the most important insights that would be valuable for financial decision-making.
+Use simple, non-technical language that anyone in the business could understand.
+"""
+
+            with st.expander("💡 Summary of Key Findings", expanded=True):
+                final_summary = ask_gemini(prompt)
+                st.markdown(final_summary)
+
+            # Recommendations section
+            st.write("### Recommendations")
+
+            # Build prompt for AI recommendations
+            rec_prompt = prompt + "\n\nBased on this data, please provide 3-4 specific, actionable recommendations to improve payment collection. Each recommendation should be 1-2 sentences and focused on practical actions the business can take."
+
+            recommendations = ask_gemini(rec_prompt)
+            st.markdown(recommendations)