noumanjavaid commited on
Commit
50aedaa
·
verified ·
1 Parent(s): d6bc3d3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +662 -2
app.py CHANGED
@@ -6,6 +6,7 @@ import plotly.graph_objects as go
6
  import google.generativeai as genai
7
  import os
8
  import warnings
 
9
  from dotenv import load_dotenv
10
  load_dotenv()
11
 
@@ -25,7 +26,7 @@ def configure_gemini():
25
  if not GEMINI_API_KEY:
26
  st.error("⚠️ AI helper not available (missing API key).")
27
  return None
28
- genai.configure(api_key=GEMINI_API_KEY)
29
  model = genai.GenerativeModel('gemini-1.5-flash')
30
  return model
31
  except Exception as e:
@@ -291,4 +292,663 @@ if st.session_state.analysis_step >= 2:
291
  # Fill in missing categories
292
  categorical_cols = df_processed.select_dtypes(exclude=[np.number, 'datetime64[ns]']).columns
293
  for col in categorical_cols:
294
- if df_processed[col].isnull().any():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  import google.generativeai as genai
7
  import os
8
  import warnings
9
+ from sklearn.ensemble import RandomForestRegressor
10
  from dotenv import load_dotenv
11
  load_dotenv()
12
 
 
26
  if not GEMINI_API_KEY:
27
  st.error("⚠️ AI helper not available (missing API key).")
28
  return None
29
+ genai.configure(api_key=GEMINI_API_KEY)  # SECURITY: never commit a literal API key — read it from the environment; the previously committed key must be revoked/rotated
30
  model = genai.GenerativeModel('gemini-1.5-flash')
31
  return model
32
  except Exception as e:
 
292
  # Fill in missing categories
293
  categorical_cols = df_processed.select_dtypes(exclude=[np.number, 'datetime64[ns]']).columns
294
  for col in categorical_cols:
295
+ if df_processed[col].isnull().any():
296
+ try:
297
+ mode_val = df_processed[col].mode()[0]
298
+ df_processed[col] = df_processed[col].fillna(mode_val)
299
+ except IndexError:
300
+ df_processed[col] = df_processed[col].fillna('Unknown')
301
+ st.write("- Filled in missing categories with most common values")
302
+
303
+ # Store cleaned data for future use
304
+ st.session_state.cleaned_df = df_processed
305
+ st.session_state.analysis_step = max(st.session_state.analysis_step, 3)
306
+ st.success("✅ Data prepared successfully!")
307
+
308
+ # Now show the payment timing overview
309
+ if selected_analysis == "Payment Timing Overview":
310
+ st.subheader("📊 Payment Timing Overview")
311
+ cleaned_df = st.session_state.cleaned_df
312
+ payment_col = colmap.get('target_col')
313
+
314
+ if payment_col and payment_col in cleaned_df.columns:
315
+ # Create a layout with columns
316
+ col1, col2 = st.columns(2)
317
+
318
+ with col1:
319
+ # Create a histogram showing payment timing distribution
320
+ st.write("#### Distribution of Payment Timing")
321
+ fig_hist = px.histogram(
322
+ cleaned_df,
323
+ x=payment_col,
324
+ nbins=20,
325
+ title=f"How quickly payments are made",
326
+ labels={payment_col: "Days to Payment"},
327
+ color_discrete_sequence=['#3366CC'],
328
+ )
329
+ fig_hist.update_layout(
330
+ xaxis_title="Days to Payment",
331
+ yaxis_title="Number of Invoices",
332
+ showlegend=False
333
+ )
334
+
335
+ # Add a vertical line for average
336
+ mean_days = cleaned_df[payment_col].mean()
337
+ fig_hist.add_vline(
338
+ x=mean_days,
339
+ line_dash="dash",
340
+ line_color="red",
341
+ annotation_text=f"Average: {mean_days:.1f} days",
342
+ annotation_position="top right"
343
+ )
344
+
345
+ # Add a vertical line for on-time (0 days)
346
+ if colmap.get('payment_timing_vs_due'):
347
+ fig_hist.add_vline(
348
+ x=0,
349
+ line_dash="dash",
350
+ line_color="green",
351
+ annotation_text="Due Date",
352
+ annotation_position="top left"
353
+ )
354
+
355
+ st.plotly_chart(fig_hist, use_container_width=True)
356
+
357
+ with col2:
358
+ # Create a box plot showing the spread of payment timings
359
+ st.write("#### Payment Timing Statistics")
360
+ fig_box = px.box(
361
+ cleaned_df,
362
+ y=payment_col,
363
+ title="Range of Payment Timings",
364
+ points="all",
365
+ labels={payment_col: "Days to Payment"},
366
+ color_discrete_sequence=['#3366CC'],
367
+ )
368
+ fig_box.update_layout(
369
+ yaxis_title="Days to Payment",
370
+ showlegend=False
371
+ )
372
+ st.plotly_chart(fig_box, use_container_width=True)
373
+
374
+ # Payment timing stats
375
+ st.write("#### Key Payment Statistics")
376
+ col1, col2, col3, col4 = st.columns(4)
377
+ with col1:
378
+ st.metric("Average Days", f"{cleaned_df[payment_col].mean():.1f}")
379
+ with col2:
380
+ st.metric("Median Days", f"{cleaned_df[payment_col].median():.1f}")
381
+ with col3:
382
+ early_percent = (cleaned_df[payment_col] <= 0).mean() * 100 if 'payment_timing_vs_due' in colmap.values() else None
383
+ if early_percent is not None:
384
+ st.metric("% Paid On Time", f"{early_percent:.1f}%")
385
+ else:
386
+ st.metric("Minimum Days", f"{cleaned_df[payment_col].min():.1f}")
387
+ with col4:
388
+ st.metric("Maximum Days", f"{cleaned_df[payment_col].max():.1f}")
389
+
390
+ # Get AI explanation if payment_col is days vs due date
391
+ if colmap.get('payment_timing_vs_due') == payment_col:
392
+ # Ask AI for explanation of payment patterns
393
+ prompt = f"""Based on this payment data summary:
394
+ - Average days to payment: {cleaned_df[payment_col].mean():.1f}
395
+ - Median days to payment: {cleaned_df[payment_col].median():.1f}
396
+ - % paid on time: {early_percent:.1f}%
397
+ - Maximum days late: {cleaned_df[payment_col].max():.1f}
398
+
399
+ Please explain in simple terms what this tells us about how customers are paying their invoices.
400
+ Keep your explanation short (3-4 sentences), non-technical, and focused on what this means for the business.
401
+ """
402
+
403
+ with st.expander("💡 What does this mean for my business?", expanded=True):
404
+ explanation = ask_gemini(prompt)
405
+ st.markdown(explanation)
406
+
407
+ else:
408
+ st.error("No payment timing column available. Please check your column mappings.")
409
+
410
+ # --- Step 3.2: Payment Patterns by Category ---
411
+ if selected_analysis == "Payment Patterns by Category":
412
+ st.subheader("📊 Payment Patterns by Category")
413
+ cleaned_df = st.session_state.cleaned_df
414
+ payment_col = colmap.get('target_col')
415
+
416
+ # Define possible category columns and let user select which to analyze
417
+ category_cols = []
418
+ for key in ['revenue_type', 'payment_method', 'customer_id']:
419
+ col = colmap.get(key)
420
+ if col and col in cleaned_df.columns:
421
+ category_cols.append(col)
422
+
423
+ if not category_cols:
424
+ st.warning("No category columns were identified. Please go back to column mapping and identify at least one of: Revenue Type, Payment Method, or Customer ID.")
425
+ else:
426
+ # Let user select which category to analyze
427
+ selected_category = st.selectbox(
428
+ "Select category to analyze:",
429
+ options=category_cols,
430
+ key="category_selector"
431
+ )
432
+
433
+ if selected_category and payment_col and payment_col in cleaned_df.columns:
434
+ # Limit categories to top 10 by frequency to avoid cluttered charts
435
+ top_categories = cleaned_df[selected_category].value_counts().nlargest(10).index
436
+ filtered_df = cleaned_df[cleaned_df[selected_category].isin(top_categories)]
437
+
438
+ # Create layout with columns
439
+ col1, col2 = st.columns(2)
440
+
441
+ with col1:
442
+ # Box plot showing payment timing by category
443
+ st.write(f"#### Payment Timing by {selected_category}")
444
+ fig_category_box = px.box(
445
+ filtered_df,
446
+ x=selected_category,
447
+ y=payment_col,
448
+ color=selected_category,
449
+ title=f"How different {selected_category} categories pay",
450
+ labels={payment_col: "Days to Payment"},
451
+ )
452
+ fig_category_box.update_layout(
453
+ xaxis_title=selected_category,
454
+ yaxis_title="Days to Payment",
455
+ xaxis={'categoryorder':'total descending'}
456
+ )
457
+ st.plotly_chart(fig_category_box, use_container_width=True)
458
+
459
+ with col2:
460
+ # Bar chart showing average payment time by category
461
+ st.write(f"#### Average Payment Time by {selected_category}")
462
+ category_avg = filtered_df.groupby(selected_category)[payment_col].mean().reset_index()
463
+ category_avg = category_avg.sort_values(payment_col)
464
+
465
+ fig_category_bar = px.bar(
466
+ category_avg,
467
+ x=selected_category,
468
+ y=payment_col,
469
+ color=selected_category,
470
+ title=f"Average days to payment by {selected_category}",
471
+ labels={payment_col: "Average Days to Payment"},
472
+ )
473
+ fig_category_bar.update_layout(
474
+ xaxis_title=selected_category,
475
+ yaxis_title="Average Days to Payment",
476
+ showlegend=False
477
+ )
478
+ st.plotly_chart(fig_category_bar, use_container_width=True)
479
+
480
+ # Calculate statistics by category
481
+ category_stats = filtered_df.groupby(selected_category).agg({
482
+ payment_col: ['mean', 'median', 'count'],
483
+ })
484
+ category_stats.columns = [' '.join(col).strip() for col in category_stats.columns.values]
485
+ category_stats = category_stats.reset_index().rename(
486
+ columns={f"{payment_col} mean": "Avg Days",
487
+ f"{payment_col} median": "Median Days",
488
+ f"{payment_col} count": "Count"}
489
+ )
490
+ category_stats["Avg Days"] = category_stats["Avg Days"].round(1)
491
+ category_stats["Median Days"] = category_stats["Median Days"].round(1)
492
+
493
+ # Show the statistics table
494
+ st.write(f"#### Statistics by {selected_category}")
495
+ st.dataframe(category_stats, use_container_width=True)
496
+
497
+ # Get AI explanation based on category
498
+ fastest_category = category_stats.loc[category_stats["Avg Days"].idxmin()][selected_category]
499
+ slowest_category = category_stats.loc[category_stats["Avg Days"].idxmax()][selected_category]
500
+ avg_diff = category_stats["Avg Days"].max() - category_stats["Avg Days"].min()
501
+
502
+ prompt = f"""Based on this payment data analysis by {selected_category}:
503
+ - Fastest paying category: {fastest_category} (average {category_stats['Avg Days'].min():.1f} days)
504
+ - Slowest paying category: {slowest_category} (average {category_stats['Avg Days'].max():.1f} days)
505
+ - Difference between fastest and slowest: {avg_diff:.1f} days
506
+
507
+ Please explain in simple terms what this tells us about how different {selected_category} categories are paying.
508
+ Keep your explanation short (3-4 sentences), non-technical, and focused on what this means for the business.
509
+ """
510
+
511
+ with st.expander("💡 What does this pattern mean?", expanded=True):
512
+ explanation = ask_gemini(prompt)
513
+ st.markdown(explanation)
514
+
515
+ # --- Step 3.3: Customer Groups ---
516
+ if selected_analysis == "Customer Groups":
517
+ st.subheader("👥 Customer Payment Behavior Groups")
518
+ cleaned_df = st.session_state.cleaned_df
519
+ payment_col = colmap.get('target_col')
520
+ amount_col = colmap.get('amount')
521
+ customer_col = colmap.get('customer_id')
522
+
523
+ if not customer_col or not customer_col in cleaned_df.columns:
524
+ st.warning("Please identify a Customer ID column in the column mapping step to see customer groups.")
525
+ elif not payment_col or not payment_col in cleaned_df.columns:
526
+ st.warning("No payment timing column available. Please check your column mappings.")
527
+ else:
528
+ # Create customer-level summary
529
+ st.write("Analyzing customer payment patterns...")
530
+
531
+ # Group by customer and calculate statistics
532
+ customer_stats = cleaned_df.groupby(customer_col).agg({
533
+ payment_col: ['mean', 'median', 'min', 'max', 'count'],
534
+ })
535
+ customer_stats.columns = [' '.join(col).strip() for col in customer_stats.columns.values]
536
+ customer_stats = customer_stats.reset_index()
537
+
538
+ # Add amount stats if available
539
+ if amount_col and amount_col in cleaned_df.columns:
540
+ amount_stats = cleaned_df.groupby(customer_col).agg({
541
+ amount_col: ['mean', 'sum'],
542
+ })
543
+ amount_stats.columns = [' '.join(col).strip() for col in amount_stats.columns.values]
544
+ amount_stats = amount_stats.reset_index()
545
+ customer_stats = customer_stats.merge(amount_stats, on=customer_col)
546
+
547
+ # Define payment behavior groups based on mean payment time
548
+ # Only apply if we have payment timing vs due date
549
+ if colmap.get('payment_timing_vs_due') == payment_col:
550
+ # Create behavior groups
551
+ conditions = [
552
+ (customer_stats[f"{payment_col} mean"] < -5), # Very early (>5 days before due)
553
+ (customer_stats[f"{payment_col} mean"] >= -5) & (customer_stats[f"{payment_col} mean"] < 0), # Early (0-5 days before due)
554
+ (customer_stats[f"{payment_col} mean"] >= 0) & (customer_stats[f"{payment_col} mean"] < 15), # On time to slightly late (0-15 days)
555
+ (customer_stats[f"{payment_col} mean"] >= 15) & (customer_stats[f"{payment_col} mean"] < 30), # Moderately late (15-30 days)
556
+ (customer_stats[f"{payment_col} mean"] >= 30) # Very late (>30 days)
557
+ ]
558
+ values = ['Very Early Payers', 'Early Payers', 'On-time/Slightly Late', 'Moderately Late', 'Very Late Payers']
559
+ customer_stats['Payment Behavior'] = np.select(conditions, values, default='Unknown')
560
+ else:
561
+ # If we don't have vs due date, create relative groups
562
+ median_pay_time = customer_stats[f"{payment_col} mean"].median()
563
+ conditions = [
564
+ (customer_stats[f"{payment_col} mean"] < 0.6 * median_pay_time), # Much faster than median
565
+ (customer_stats[f"{payment_col} mean"] >= 0.6 * median_pay_time) & (customer_stats[f"{payment_col} mean"] < 0.9 * median_pay_time), # Faster than median
566
+ (customer_stats[f"{payment_col} mean"] >= 0.9 * median_pay_time) & (customer_stats[f"{payment_col} mean"] <= 1.1 * median_pay_time), # Around median
567
+ (customer_stats[f"{payment_col} mean"] > 1.1 * median_pay_time) & (customer_stats[f"{payment_col} mean"] <= 1.5 * median_pay_time), # Slower than median
568
+ (customer_stats[f"{payment_col} mean"] > 1.5 * median_pay_time) # Much slower than median
569
+ ]
570
+ values = ['Much Faster Payers', 'Faster Payers', 'Average Payers', 'Slower Payers', 'Much Slower Payers']
571
+ customer_stats['Payment Behavior'] = np.select(conditions, values, default='Unknown')
572
+
573
+ # Count customers in each group
574
+ behavior_counts = customer_stats['Payment Behavior'].value_counts().reset_index()
575
+ behavior_counts.columns = ['Payment Behavior', 'Number of Customers']
576
+
577
+ # Create tabs for different views
578
+ tab1, tab2 = st.tabs(["Customer Groups", "Individual Customers"])
579
+
580
+ with tab1:
581
+ st.write("#### Customer Payment Behavior Groups")
582
+
583
+ # Create a pie chart showing distribution of customer behavior
584
+ fig_pie = px.pie(
585
+ behavior_counts,
586
+ values='Number of Customers',
587
+ names='Payment Behavior',
588
+ title="Distribution of Customer Payment Behavior"
589
+ )
590
+ st.plotly_chart(fig_pie, use_container_width=True)
591
+
592
+ # Show statistics for each behavior group
593
+ behavior_group_stats = customer_stats.groupby('Payment Behavior').agg({
594
+ f"{payment_col} mean": 'mean',
595
+ f"{payment_col} count": 'sum',
596
+ customer_col: 'count'
597
+ }).reset_index()
598
+
599
+ behavior_group_stats.columns = ['Payment Behavior', 'Avg Days to Payment', 'Total Invoices', 'Customer Count']
600
+ behavior_group_stats["Avg Days to Payment"] = behavior_group_stats["Avg Days to Payment"].round(1)
601
+
602
+ # Add amount statistics if available
603
+ if amount_col and amount_col in cleaned_df.columns and f"{amount_col} sum" in customer_stats.columns:
604
+ amount_by_behavior = customer_stats.groupby('Payment Behavior')[f"{amount_col} sum"].sum().reset_index()
605
+ behavior_group_stats = behavior_group_stats.merge(amount_by_behavior, on='Payment Behavior')
606
+ behavior_group_stats.rename(columns={f"{amount_col} sum": "Total Amount"}, inplace=True)
607
+
608
+ st.dataframe(behavior_group_stats, use_container_width=True)
609
+
610
+ # Get AI explanation
611
+ largest_group = behavior_counts.loc[behavior_counts['Number of Customers'].idxmax()]['Payment Behavior']
612
+ prompt = f"""Based on this customer payment behavior analysis:
613
+ - Largest customer group: {largest_group} ({behavior_counts['Number of Customers'].max()} customers)
614
+ - Total customer segments: {len(behavior_counts)}
615
+
616
+ Please explain in simple terms what this tells us about our customer base and their payment habits.
617
+ Keep your explanation short (3-4 sentences), non-technical, and focused on what this means for the business.
618
+ """
619
+
620
+ with st.expander("💡 What does this mean for my business?", expanded=True):
621
+ explanation = ask_gemini(prompt)
622
+ st.markdown(explanation)
623
+
624
+ with tab2:
625
+ st.write("#### Individual Customer Payment Behavior")
626
+ st.write("Search for specific customers or sort by payment behavior:")
627
+
628
+ # Prepare the customer table
629
+ customer_display = customer_stats.copy()
630
+ customer_display = customer_display.rename(columns={
631
+ f"{payment_col} mean": "Avg Days to Payment",
632
+ f"{payment_col} median": "Median Days",
633
+ f"{payment_col} min": "Min Days",
634
+ f"{payment_col} max": "Max Days",
635
+ f"{payment_col} count": "Invoice Count"
636
+ })
637
+
638
+ if amount_col and amount_col in cleaned_df.columns:
639
+ customer_display = customer_display.rename(columns={
640
+ f"{amount_col} mean": "Avg Amount",
641
+ f"{amount_col} sum": "Total Amount"
642
+ })
643
+
644
+ # Round numeric columns
645
+ numeric_cols = customer_display.select_dtypes(include=[np.number]).columns
646
+ customer_display[numeric_cols] = customer_display[numeric_cols].round(2)
647
+
648
+ # Allow filtering
649
+ selected_behavior = st.multiselect(
650
+ "Filter by payment behavior:",
651
+ options=customer_display['Payment Behavior'].unique(),
652
+ default=None
653
+ )
654
+
655
+ if selected_behavior:
656
+ filtered_customers = customer_display[customer_display['Payment Behavior'].isin(selected_behavior)]
657
+ else:
658
+ filtered_customers = customer_display
659
+
660
+ st.dataframe(filtered_customers, use_container_width=True)
661
+
662
+ # --- Step 3.4: Payment Predictions ---
663
+ if selected_analysis == "Payment Predictions":
664
+ st.subheader("🔮 Payment Time Prediction Model")
665
+ cleaned_df = st.session_state.cleaned_df
666
+ payment_col = colmap.get('target_col')
667
+
668
+ # Check if we have the necessary data
669
+ if not payment_col or not payment_col in cleaned_df.columns:
670
+ st.warning("No payment timing column available. Please check your column mappings.")
671
+ else:
672
+ st.write("This model helps you predict when customers will pay based on invoice characteristics.")
673
+
674
+ # Identify potential predictor variables
675
+ numeric_cols = cleaned_df.select_dtypes(include=np.number).columns.tolist()
676
+ categorical_cols = cleaned_df.select_dtypes(include=['object', 'category']).columns.tolist()
677
+
678
+ # Remove the target variable from predictors
679
+ if payment_col in numeric_cols:
680
+ numeric_cols.remove(payment_col)
681
+
682
+ # Prepare predictor variables
683
+ potential_predictors = []
684
+
685
+ # Add amount if available
686
+ amount_col = colmap.get('amount')
687
+ if amount_col and amount_col in cleaned_df.columns:
688
+ potential_predictors.append(amount_col)
689
+
690
+ # Add customer_id if available
691
+ customer_col = colmap.get('customer_id')
692
+ if customer_col and customer_col in cleaned_df.columns and len(cleaned_df[customer_col].unique()) < 100:
693
+ potential_predictors.append(customer_col)
694
+
695
+ # Add revenue_type and payment_method if available
696
+ for key in ['revenue_type', 'payment_method']:
697
+ col = colmap.get(key)
698
+ if col and col in cleaned_df.columns:
699
+ potential_predictors.append(col)
700
+
701
+ # Add other numeric columns that might be useful
702
+ for col in numeric_cols:
703
+ if col not in potential_predictors and col != payment_col and 'date' not in col.lower():
704
+ potential_predictors.append(col)
705
+
706
+ if len(potential_predictors) < 1:
707
+ st.warning("Not enough predictor variables available for modeling. Please identify more columns in your data.")
708
+ else:
709
+ # Let user select predictor variables
710
+ st.write("#### Select variables to use for prediction")
711
+ selected_predictors = st.multiselect(
712
+ "Choose which factors might influence payment timing:",
713
+ options=potential_predictors,
714
+ default=potential_predictors[:min(3, len(potential_predictors))] # Default to first 3
715
+ )
716
+
717
+ if len(selected_predictors) < 1:
718
+ st.warning("Please select at least one predictor variable.")
719
+ else:
720
+ # Prepare data for modeling
721
+ X = cleaned_df[selected_predictors].copy()
722
+ y = cleaned_df[payment_col].copy()
723
+
724
+ # Process categorical variables
725
+ X_processed = pd.DataFrame()
726
+
727
+ for col in X.columns:
728
+ if col in categorical_cols:
729
+ # One-hot encode categorical variables
730
+ dummies = pd.get_dummies(X[col], prefix=col, drop_first=True)
731
+ X_processed = pd.concat([X_processed, dummies], axis=1)
732
+ else:
733
+ # Keep numeric columns as is
734
+ X_processed[col] = X[col]
735
+
736
+ # Check if we have any data after processing
737
+ if X_processed.shape[1] == 0:
738
+ st.warning("No usable predictor variables after processing. Please select different variables.")
739
+ else:
740
+ # Build and train the model
741
+ with st.spinner("Training prediction model..."):
742
+ try:
743
+ # Fill any remaining missing values with median
744
+ X_processed = X_processed.fillna(X_processed.median())
745
+
746
+ # Train a Random Forest model
747
+ model = RandomForestRegressor(n_estimators=100, random_state=42)
748
+ model.fit(X_processed, y)
749
+
750
+ # Get feature importances
751
+ feature_imp = pd.DataFrame({
752
+ 'Feature': X_processed.columns,
753
+ 'Importance': model.feature_importances_
754
+ }).sort_values('Importance', ascending=False)
755
+
756
+ st.success("✅ Payment prediction model trained!")
757
+
758
+ # Show feature importances
759
+ st.write("#### Factors that influence payment timing")
760
+ fig_imp = px.bar(
761
+ feature_imp.head(10),
762
+ x='Importance',
763
+ y='Feature',
764
+ orientation='h',
765
+ title="Which factors best predict payment timing",
766
+ labels={'Importance': 'Importance Score'},
767
+ )
768
+ st.plotly_chart(fig_imp, use_container_width=True)
769
+
770
+ # Get AI explanation of feature importance
771
+ top_features = ", ".join(feature_imp.head(3)['Feature'].tolist())
772
+ prompt = f"""Based on this payment prediction model analysis:
773
+ - Top predictive factors: {top_features}
774
+
775
+ Please explain in simple terms what this tells us about what influences payment timing.
776
+ Keep your explanation short (3-4 sentences), non-technical, and focused on what this means for the business.
777
+ """
778
+
779
+ with st.expander("💡 What influences payment timing?", expanded=True):
780
+ explanation = ask_gemini(prompt)
781
+ st.markdown(explanation)
782
+
783
+ # Create prediction interface
784
+ st.write("#### Predict payment timing for new invoices")
785
+ st.write("Enter values for a new invoice to predict when it will be paid:")
786
+
787
+ # Create input widgets for each predictor
788
+ new_invoice_data = {}
789
+ for col in selected_predictors:
790
+ if col in categorical_cols:
791
+ options = cleaned_df[col].unique().tolist()
792
+ new_invoice_data[col] = st.selectbox(f"Select {col}:", options=options)
793
+ else:
794
+ min_val = cleaned_df[col].min()
795
+ max_val = cleaned_df[col].max()
796
+ step = (max_val - min_val) / 100
797
+ new_invoice_data[col] = st.slider(f"Set {col}:", min_value=float(min_val), max_value=float(max_val), step=float(step))
798
+
799
+ # Make prediction when button is clicked
800
+ if st.button("Predict Payment Timing"):
801
+ # Prepare the input data in the same format as the training data
802
+ X_new = pd.DataFrame([new_invoice_data])
803
+ X_new_processed = pd.DataFrame()
804
+
805
+ # Process the input data the same way as training data
806
+ for col in X_new.columns:
807
+ if col in categorical_cols:
808
+ dummies = pd.get_dummies(X_new[col], prefix=col, drop_first=True)
809
+ X_new_processed = pd.concat([X_new_processed, dummies], axis=1)
810
+ else:
811
+ X_new_processed[col] = X_new[col]
812
+
813
+ # Add missing columns that were in the training data
814
+ for col in X_processed.columns:
815
+ if col not in X_new_processed.columns:
816
+ X_new_processed[col] = 0
817
+
818
+ # Keep only the columns used during training
819
+ X_new_processed = X_new_processed[X_processed.columns]
820
+
821
+ # Make prediction
822
+ prediction = model.predict(X_new_processed)[0]
823
+
824
+ # Display prediction
825
+ st.success(f"Predicted payment timing: **{prediction:.1f} days**")
826
+
827
+ # Add interpretation if we're predicting vs due date
828
+ if colmap.get('payment_timing_vs_due') == payment_col:
829
+ if prediction < 0:
830
+ st.info(f"This invoice is predicted to be paid **{abs(prediction):.1f} days before** the due date.")
831
+ elif prediction == 0:
832
+ st.info("This invoice is predicted to be paid exactly on the due date.")
833
+ else:
834
+ st.info(f"This invoice is predicted to be paid **{prediction:.1f} days after** the due date.")
835
+ except Exception as e:
836
+ st.error(f"Error building prediction model: {e}")
837
+
838
+ # --- Step 3.5: Summary Report ---
839
+ if selected_analysis == "Summary Report":
840
+ st.subheader("📝 Payment Analysis Summary Report")
841
+ cleaned_df = st.session_state.cleaned_df
842
+ payment_col = colmap.get('target_col')
843
+
844
+ if not payment_col or not payment_col in cleaned_df.columns:
845
+ st.warning("No payment timing column available. Please check your column mappings.")
846
+ else:
847
+ # Overall payment statistics
848
+ st.write("### Overall Payment Performance")
849
+ mean_days = cleaned_df[payment_col].mean()
850
+ median_days = cleaned_df[payment_col].median()
851
+
852
+ # Calculate on-time percentage if we have vs due date
853
+ if colmap.get('payment_timing_vs_due') == payment_col:
854
+ ontime_pct = (cleaned_df[payment_col] <= 0).mean() * 100
855
+ late_pct = 100 - ontime_pct
856
+
857
+ # Display metrics in columns
858
+ col1, col2, col3 = st.columns(3)
859
+ with col1:
860
+ st.metric("Average Days to Payment", f"{mean_days:.1f}")
861
+ with col2:
862
+ st.metric("Median Days to Payment", f"{median_days:.1f}")
863
+ with col3:
864
+ if colmap.get('payment_timing_vs_due') == payment_col:
865
+ st.metric("% Paid On Time", f"{ontime_pct:.1f}%")
866
+ else:
867
+ st.metric("Total Invoices", f"{len(cleaned_df)}")
868
+
869
+ # Summary visualization
870
+ st.write("### Payment Timeline")
871
+
872
+ # Create a histogram of payment distribution
873
+ fig_summary = px.histogram(
874
+ cleaned_df,
875
+ x=payment_col,
876
+ nbins=30,
877
+ title="Distribution of Payment Timing",
878
+ color_discrete_sequence=['#3366CC']
879
+ )
880
+
881
+ # Add markers for key statistics
882
+ fig_summary.add_vline(x=mean_days, line_dash="dash", line_color="red", annotation_text=f"Mean: {mean_days:.1f}")
883
+ fig_summary.add_vline(x=median_days, line_dash="dash", line_color="green", annotation_text=f"Median: {median_days:.1f}")
884
+
885
+ if colmap.get('payment_timing_vs_due') == payment_col:
886
+ fig_summary.add_vline(x=0, line_dash="solid", line_color="black", annotation_text="Due Date")
887
+
888
+ st.plotly_chart(fig_summary, use_container_width=True)
889
+
890
+ # Generate a summary report with key findings
891
+ st.write("### Key Findings")
892
+
893
+ # Gather key data points for the AI summary
894
+ summary_data = {
895
+ "mean_days": mean_days,
896
+ "median_days": median_days,
897
+ "min_days": cleaned_df[payment_col].min(),
898
+ "max_days": cleaned_df[payment_col].max(),
899
+ "invoice_count": len(cleaned_df)
900
+ }
901
+
902
+ # Add on-time percentages if available
903
+ if colmap.get('payment_timing_vs_due') == payment_col:
904
+ summary_data["ontime_pct"] = ontime_pct
905
+ summary_data["late_pct"] = late_pct
906
+
907
+ # Add category information if available
908
+ category_insights = []
909
+ for cat_key in ['revenue_type', 'payment_method']:
910
+ cat_col = colmap.get(cat_key)
911
+ if cat_col and cat_col in cleaned_df.columns:
912
+ # Calculate best and worst categories
913
+ cat_avg = cleaned_df.groupby(cat_col)[payment_col].mean()
914
+ if not cat_avg.empty:
915
+ best_cat = cat_avg.idxmin()
916
+ worst_cat = cat_avg.idxmax()
917
+ best_days = cat_avg.min()
918
+ worst_days = cat_avg.max()
919
+
920
+ category_insights.append(f"Best {cat_key}: {best_cat} ({best_days:.1f} days)")
921
+ category_insights.append(f"Worst {cat_key}: {worst_cat} ({worst_days:.1f} days)")
922
+
923
+ # Build prompt for AI summary
924
+ prompt = f"""Based on this payment data analysis:
925
+ - Average days to payment: {summary_data['mean_days']:.1f}
926
+ - Median days to payment: {summary_data['median_days']:.1f}
927
+ - Range: {summary_data['min_days']:.1f} to {summary_data['max_days']:.1f} days
928
+ - Total invoices analyzed: {summary_data['invoice_count']}
929
+ """
930
+
931
+ if 'ontime_pct' in summary_data:
932
+ prompt += f"- On-time payments: {summary_data['ontime_pct']:.1f}%\n"
933
+ prompt += f"- Late payments: {summary_data['late_pct']:.1f}%\n"
934
+
935
+ for insight in category_insights:
936
+ prompt += f"- {insight}\n"
937
+
938
+ prompt += """\nPlease provide a clear, concise 3-4 sentence summary of what this payment data tells us about the business.
939
+ Focus on the most important insights that would be valuable for financial decision-making.
940
+ Use simple, non-technical language that anyone in the business could understand.
941
+ """
942
+
943
+ with st.expander("💡 Summary of Key Findings", expanded=True):
944
+ final_summary = ask_gemini(prompt)
945
+ st.markdown(final_summary)
946
+
947
+ # Recommendations section
948
+ st.write("### Recommendations")
949
+
950
+ # Build prompt for AI recommendations
951
+ rec_prompt = prompt + "\n\nBased on this data, please provide 3-4 specific, actionable recommendations to improve payment collection. Each recommendation should be 1-2 sentences and focused on practical actions the business can take."
952
+
953
+ recommendations = ask_gemini(rec_prompt)
954
+ st.markdown(recommendations)