GMARTINEZMILLA committed on
Commit
ad618df
·
1 Parent(s): 9d618d8

feat: updated app.py

Browse files
Files changed (1) hide show
  1. app.py +332 -589
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import streamlit as st
2
  import pandas as pd
3
  import plotly.express as px
4
- import plotly.graph_objects as go
5
  import matplotlib.pyplot as plt
6
  import numpy as np
7
  import lightgbm as lgb
@@ -12,76 +11,84 @@ from sklearn.metrics import mean_absolute_error, mean_squared_error
12
  # Page configuration
13
  st.set_page_config(page_title="Customer Insights App", page_icon=":bar_chart:")
14
 
15
- # Load CSV files
16
  df = pd.read_csv("df_clean.csv")
17
  nombres_proveedores = pd.read_csv("nombres_proveedores.csv", sep=';')
18
  euros_proveedor = pd.read_csv("euros_proveedor.csv", sep=',')
19
  ventas_clientes = pd.read_csv("ventas_clientes.csv", sep=',')
20
- customer_clusters = pd.read_csv('predicts/customer_clusters.csv')
21
- df_agg_2024 = pd.read_csv('predicts/df_agg_2024.csv')
22
 
23
  # Ensure customer codes are strings
24
  df['CLIENTE'] = df['CLIENTE'].astype(str)
25
  nombres_proveedores['codigo'] = nombres_proveedores['codigo'].astype(str)
26
  euros_proveedor['CLIENTE'] = euros_proveedor['CLIENTE'].astype(str)
27
- customer_clusters['cliente_id'] = customer_clusters['cliente_id'].astype(str)
28
  fieles_df = pd.read_csv("clientes_relevantes.csv")
29
  cestas = pd.read_csv("cestas.csv")
30
  productos = pd.read_csv("productos.csv")
31
  df_agg_2024['cliente_id'] = df_agg_2024['cliente_id'].astype(str)
32
 
33
- # Convert columns in euros_proveedor to numeric
34
  for col in euros_proveedor.columns:
35
  if col != 'CLIENTE':
36
  euros_proveedor[col] = pd.to_numeric(euros_proveedor[col], errors='coerce')
37
 
38
- # Check for NaN values in euros_proveedor
39
  if euros_proveedor.isna().any().any():
40
  st.warning("Some values in euros_proveedor couldn't be converted to numbers. Please review the input data.")
41
 
42
- # Ignore the last two columns in df
43
  df = df.iloc[:, :-2]
44
 
45
  # Function to get supplier name
46
  def get_supplier_name(code):
47
- code = str(code)
48
  name = nombres_proveedores[nombres_proveedores['codigo'] == code]['nombre'].values
49
  return name[0] if len(name) > 0 else code
50
 
51
- # Function to create radar chart using Plotly
52
  def radar_chart(categories, values, amounts, title):
53
- # Ensure the first and last data points are the same to close the radar chart
54
- values.append(values[0])
55
- amounts.append(amounts[0])
56
- categories.append(categories[0])
57
-
58
- fig = px.line_polar(
59
- r=values,
60
- theta=categories,
61
- line_close=True,
62
- labels={'r': '% Units Sold', 'theta': 'Manufacturers'},
63
- title=title
64
- )
65
 
66
- # Adding a second trace for the spend amounts
67
- fig.add_trace(
68
- go.Scatterpolar(
69
- r=amounts,
70
- theta=categories,
71
- mode='lines+markers',
72
- name="Spend ()",
73
- line=dict(color='blue', width=2)
74
- )
75
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
- fig.update_traces(fill='toself') # Fill the radar chart area
78
- fig.update_layout(
79
- polar=dict(
80
- radialaxis=dict(visible=True, range=[0, 1])
81
- ),
82
- showlegend=True
83
- )
84
-
85
  return fig
86
 
87
  # Main page design
@@ -114,632 +121,368 @@ elif page == "Customer Analysis":
114
 
115
  if st.button("Calcular"):
116
  if customer_code:
 
117
  customer_match = customer_clusters[customer_clusters['cliente_id'] == customer_code]
118
 
119
  if not customer_match.empty:
120
  cluster = customer_match['cluster_id'].values[0]
121
  st.write(f"Customer {customer_code} belongs to cluster {cluster}")
122
 
123
- # Load the corresponding model
124
  model_path = f'models/modelo_cluster_{cluster}.txt'
125
  gbm = lgb.Booster(model_file=model_path)
126
  st.write(f"Loaded model for cluster {cluster}")
127
 
 
 
 
 
 
 
 
128
  # Load predict data for that cluster
129
  predict_data = pd.read_csv(f'predicts/predict_cluster_{cluster}.csv')
 
 
130
  predict_data['cliente_id'] = predict_data['cliente_id'].astype(str)
 
 
 
 
131
 
132
  # Filter for the specific customer
133
- customer_data = predict_data[predict_data['cliente_id'] == customer_code]
 
 
 
 
 
 
 
 
 
134
 
135
  if not customer_data.empty:
 
136
  lag_features = [f'precio_total_lag_{lag}' for lag in range(1, 25)]
137
  features = lag_features + ['mes', 'marca_id_encoded', 'año', 'cluster_id']
 
 
138
  X_predict = customer_data[features]
139
 
140
  # Convert categorical features to 'category' dtype
141
  categorical_features = ['mes', 'marca_id_encoded', 'cluster_id']
142
  for feature in categorical_features:
143
  X_predict[feature] = X_predict[feature].astype('category')
144
-
 
 
 
 
 
 
145
  # Make Prediction for the selected customer
146
  y_pred = gbm.predict(X_predict, num_iteration=gbm.best_iteration)
147
-
148
- # Results DataFrame
 
 
 
 
 
149
  results = customer_data[['cliente_id', 'marca_id_encoded', 'fecha_mes']].copy()
150
  results['ventas_predichas'] = y_pred
 
 
 
 
 
151
 
152
- # Load actual sales data
153
- actual_sales = df_agg_2024[df_agg_2024['cliente_id'] == customer_code]
 
 
 
 
154
  if not actual_sales.empty:
155
- results = results.merge(
156
- actual_sales[['cliente_id', 'marca_id_encoded', 'fecha_mes', 'precio_total']],
157
- on=['cliente_id', 'marca_id_encoded', 'fecha_mes'],
158
- how='left'
159
- )
160
  results.rename(columns={'precio_total': 'ventas_reales'}, inplace=True)
161
  results['ventas_reales'].fillna(0, inplace=True)
162
-
163
- # Calculate error metrics
 
 
 
164
  valid_results = results.dropna(subset=['ventas_reales'])
165
  if not valid_results.empty:
166
  mae = mean_absolute_error(valid_results['ventas_reales'], valid_results['ventas_predichas'])
167
  mape = np.mean(np.abs((valid_results['ventas_reales'] - valid_results['ventas_predichas']) / valid_results['ventas_reales'])) * 100
168
  rmse = np.sqrt(mean_squared_error(valid_results['ventas_reales'], valid_results['ventas_predichas']))
169
 
 
170
  st.write(f"MAE: {mae:.2f}")
171
  st.write(f"MAPE: {mape:.2f}%")
172
  st.write(f"RMSE: {rmse:.2f}")
173
 
174
- # Plot radar chart
175
- top_units = df[df["CLIENTE"] == str(customer_code)].iloc[:, 1:].T
176
- top_sales = euros_proveedor[euros_proveedor["CLIENTE"] == str(customer_code)].iloc[:, 1:].T
177
-
178
- combined_top = pd.concat([top_units, top_sales]).index.unique()[:20]
179
-
180
- combined_data = pd.DataFrame({
181
- 'units': top_units.loc[combined_top, top_units.columns[0]],
182
- 'sales': top_sales.loc[combined_top, top_sales.columns[0]]
183
- }).fillna(0)
184
-
185
- manufacturers = [get_supplier_name(m) for m in combined_data.index]
186
- values = combined_data['units'].tolist()
187
- amounts = combined_data['sales'].tolist()
188
-
189
- fig = radar_chart(manufacturers, values, amounts, f'Radar Chart for Customer {customer_code}')
190
- st.plotly_chart(fig)
191
-
192
- # Articles Recommendations Page
193
- elif page == "Articles Recommendations":
194
- st.title("Articles Recommendations")
195
-
196
- st.markdown("""
197
- Get tailored recommendations for your customers based on their basket.
198
- """)
199
-
200
- partial_code = st.text_input("Enter part of Customer Code for Recommendations (or leave empty to see all)")
201
- if partial_code:
202
- filtered_customers = df[df['CLIENTE'].str.contains(partial_code)]
203
- else:
204
- filtered_customers = df
205
- customer_list = filtered_customers['CLIENTE'].unique()
206
- customer_code = st.selectbox("Select Customer Code for Recommendations", [""] + list(customer_list))
207
-
208
- if customer_code:
209
- option = st.selectbox("Select Recommendation Type", ["Select an option", "By Purchase History", "By Current Basket"])
210
-
211
- if option == "By Current Basket":
212
- st.write("Select the items and assign quantities for the basket:")
213
-
214
- available_articles = productos['ARTICULO'].unique()
215
- selected_articles = st.multiselect("Select Articles", available_articles)
216
-
217
- quantities = {article: st.number_input(f"Quantity for {article}", min_value=0, step=1) for article in selected_articles}
218
-
219
- if st.button("Calcular"):
220
- new_basket = [f"{article} x{quantities[article]}" for article in selected_articles if quantities[article] > 0]
221
-
222
- if new_basket:
223
- def recomienda(new_basket):
224
- tfidf = TfidfVectorizer()
225
- tfidf_matrix = tfidf.fit_transform(cestas['Cestas'])
226
- new_basket_tfidf = tfidf.transform([' '.join(new_basket)])
227
- similarities = cosine_similarity(new_basket_tfidf, tfidf_matrix)
228
- similar_indices = similarities.argsort()[0][-3:]
229
-
230
- recommendations_count = {}
231
- total_similarity = 0
232
-
233
- for idx in similar_indices:
234
- sim_score = similarities[0][idx]
235
- total_similarity += sim_score
236
- products = cestas.iloc[idx]['Cestas'].split()
237
-
238
- for product in products:
239
- if product not in new_basket:
240
- recommendations_count[product] = recommendations_count.get(product, 0) + sim_score
241
-
242
- recommendations_with_prob = [(prod, score / total_similarity) for prod, score in recommendations_count.items()]
243
- recommendations_with_prob.sort(key=lambda x: x[1], reverse=True)
244
-
245
- recommendations_df = pd.DataFrame({
246
- 'ARTICULO': [r[0] for r in recommendations_with_prob],
247
- 'PROBABILIDAD': [r[1] for r in recommendations_with_prob]
248
- })
249
- return recommendations_df
250
-
251
- recommendations_df = recomienda(new_basket)
252
- st.dataframe(recommendations_df)
253
- else:
254
- st.warning("Please select at least one article and set its quantity.")
255
-
256
-
257
-
258
- # import streamlit as st
259
- # import pandas as pd
260
- # import plotly.express as px
261
- # import matplotlib.pyplot as plt
262
- # import numpy as np
263
- # import lightgbm as lgb
264
- # from sklearn.feature_extraction.text import TfidfVectorizer
265
- # from sklearn.metrics.pairwise import cosine_similarity
266
- # from sklearn.metrics import mean_absolute_error, mean_squared_error
267
-
268
- # # Page configuration
269
- # st.set_page_config(page_title="Customer Insights App", page_icon=":bar_chart:")
270
-
271
- # # Load CSV files at the top
272
- # df = pd.read_csv("df_clean.csv")
273
- # nombres_proveedores = pd.read_csv("nombres_proveedores.csv", sep=';')
274
- # euros_proveedor = pd.read_csv("euros_proveedor.csv", sep=',')
275
- # ventas_clientes = pd.read_csv("ventas_clientes.csv", sep=',')
276
- # customer_clusters = pd.read_csv('predicts/customer_clusters.csv') # Load the customer clusters here
277
- # df_agg_2024 = pd.read_csv('predicts/df_agg_2024.csv')
278
-
279
- # # Ensure customer codes are strings
280
- # df['CLIENTE'] = df['CLIENTE'].astype(str)
281
- # nombres_proveedores['codigo'] = nombres_proveedores['codigo'].astype(str)
282
- # euros_proveedor['CLIENTE'] = euros_proveedor['CLIENTE'].astype(str)
283
- # customer_clusters['cliente_id'] = customer_clusters['cliente_id'].astype(str) # Ensure customer IDs are strings
284
- # fieles_df = pd.read_csv("clientes_relevantes.csv")
285
- # cestas = pd.read_csv("cestas.csv")
286
- # productos = pd.read_csv("productos.csv")
287
- # df_agg_2024['cliente_id'] = df_agg_2024['cliente_id'].astype(str)
288
-
289
- # # Convert all columns except 'CLIENTE' to float in euros_proveedor
290
- # for col in euros_proveedor.columns:
291
- # if col != 'CLIENTE':
292
- # euros_proveedor[col] = pd.to_numeric(euros_proveedor[col], errors='coerce')
293
-
294
- # # Check for NaN values after conversion
295
- # if euros_proveedor.isna().any().any():
296
- # st.warning("Some values in euros_proveedor couldn't be converted to numbers. Please review the input data.")
297
-
298
- # # Ignore the last two columns of df
299
- # df = df.iloc[:, :-2]
300
-
301
- # # Function to get supplier name
302
- # def get_supplier_name(code):
303
- # code = str(code) # Ensure code is a string
304
- # name = nombres_proveedores[nombres_proveedores['codigo'] == code]['nombre'].values
305
- # return name[0] if len(name) > 0 else code
306
-
307
- # # Function to create radar chart with square root transformation
308
- # def radar_chart(categories, values, amounts, title):
309
- # N = len(categories)
310
- # angles = [n / float(N) * 2 * np.pi for n in range(N)]
311
- # angles += angles[:1]
312
-
313
- # fig, ax = plt.subplots(figsize=(12, 12), subplot_kw=dict(projection='polar'))
314
-
315
- # # Apply square root transformation
316
- # sqrt_values = np.sqrt(values)
317
- # sqrt_amounts = np.sqrt(amounts)
318
-
319
- # max_sqrt_value = max(sqrt_values)
320
- # normalized_values = [v / max_sqrt_value for v in sqrt_values]
321
-
322
- # # Adjust scaling for spend values
323
- # max_sqrt_amount = max(sqrt_amounts)
324
- # scaling_factor = 0.7 # Adjust this value to control how much the spend values are scaled up
325
- # normalized_amounts = [min((a / max_sqrt_amount) * scaling_factor, 1.0) for a in sqrt_amounts]
326
-
327
- # normalized_values += normalized_values[:1]
328
- # ax.plot(angles, normalized_values, 'o-', linewidth=2, color='#FF69B4', label='% Units (sqrt)')
329
- # ax.fill(angles, normalized_values, alpha=0.25, color='#FF69B4')
330
-
331
- # normalized_amounts += normalized_amounts[:1]
332
- # ax.plot(angles, normalized_amounts, 'o-', linewidth=2, color='#4B0082', label='% Spend (sqrt)')
333
- # ax.fill(angles, normalized_amounts, alpha=0.25, color='#4B0082')
334
-
335
- # ax.set_xticks(angles[:-1])
336
- # ax.set_xticklabels(categories, size=8, wrap=True)
337
- # ax.set_ylim(0, 1)
338
-
339
- # circles = np.linspace(0, 1, 5)
340
- # for circle in circles:
341
- # ax.plot(angles, [circle]*len(angles), '--', color='gray', alpha=0.3, linewidth=0.5)
342
-
343
- # ax.set_yticklabels([])
344
- # ax.spines['polar'].set_visible(False)
345
-
346
- # plt.title(title, size=16, y=1.1)
347
- # plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
348
-
349
- # return fig
350
-
351
- # # Main page design
352
- # st.title("Welcome to Customer Insights App")
353
- # st.markdown("""
354
- # This app helps businesses analyze customer behaviors and provide personalized recommendations based on purchase history.
355
- # Use the tools below to dive deeper into your customer data.
356
- # """)
357
-
358
- # # Navigation menu
359
- # page = st.selectbox("Select the tool you want to use", ["", "Customer Analysis", "Articles Recommendations"])
360
-
361
- # # Home Page
362
- # if page == "":
363
- # st.markdown("## Welcome to the Customer Insights App")
364
- # st.write("Use the dropdown menu to navigate between the different sections.")
365
-
366
- # # Customer Analysis Page
367
- # elif page == "Customer Analysis":
368
- # st.title("Customer Analysis")
369
- # st.markdown("Use the tools below to explore your customer data.")
370
-
371
- # partial_code = st.text_input("Enter part of Customer Code (or leave empty to see all)")
372
- # if partial_code:
373
- # filtered_customers = df[df['CLIENTE'].str.contains(partial_code)]
374
- # else:
375
- # filtered_customers = df
376
- # customer_list = filtered_customers['CLIENTE'].unique()
377
- # customer_code = st.selectbox("Select Customer Code", customer_list)
378
-
379
- # if st.button("Calcular"):
380
- # if customer_code:
381
- # # Find Customer's Cluster
382
- # customer_match = customer_clusters[customer_clusters['cliente_id'] == customer_code]
383
-
384
- # if not customer_match.empty:
385
- # cluster = customer_match['cluster_id'].values[0]
386
- # st.write(f"Customer {customer_code} belongs to cluster {cluster}")
387
-
388
- # # Load the Corresponding Model
389
- # model_path = f'models/modelo_cluster_{cluster}.txt'
390
- # gbm = lgb.Booster(model_file=model_path)
391
- # st.write(f"Loaded model for cluster {cluster}")
392
-
393
- # # Inspect the model
394
- # st.write("### Model Information:")
395
- # st.write(f"Number of trees: {gbm.num_trees()}")
396
- # st.write(f"Number of features: {gbm.num_feature()}")
397
- # st.write("Feature names:")
398
- # st.write(gbm.feature_name())
399
-
400
- # # Load predict data for that cluster
401
- # predict_data = pd.read_csv(f'predicts/predict_cluster_{cluster}.csv')
402
-
403
- # # Convert cliente_id to string
404
- # predict_data['cliente_id'] = predict_data['cliente_id'].astype(str)
405
-
406
- # st.write("### Predict Data DataFrame:")
407
- # st.write(predict_data.head())
408
- # st.write(f"Shape: {predict_data.shape}")
409
-
410
- # # Filter for the specific customer
411
- # customer_code_str = str(customer_code)
412
- # customer_data = predict_data[predict_data['cliente_id'] == customer_code_str]
413
-
414
- # # Add debug statements
415
- # st.write(f"Unique customer IDs in predict data: {predict_data['cliente_id'].unique()}")
416
- # st.write(f"Customer code we're looking for: {customer_code_str}")
417
-
418
- # st.write("### Customer Data:")
419
- # st.write(customer_data.head())
420
- # st.write(f"Shape: {customer_data.shape}")
421
-
422
- # if not customer_data.empty:
423
- # # Define features consistently with the training process
424
- # lag_features = [f'precio_total_lag_{lag}' for lag in range(1, 25)]
425
- # features = lag_features + ['mes', 'marca_id_encoded', 'año', 'cluster_id']
426
-
427
- # # Prepare data for prediction
428
- # X_predict = customer_data[features]
429
-
430
- # # Convert categorical features to 'category' dtype
431
- # categorical_features = ['mes', 'marca_id_encoded', 'cluster_id']
432
- # for feature in categorical_features:
433
- # X_predict[feature] = X_predict[feature].astype('category')
434
-
435
- # st.write("### Features for Prediction:")
436
- # st.write(X_predict.head())
437
- # st.write(f"Shape: {X_predict.shape}")
438
- # st.write("Data types:")
439
- # st.write(X_predict.dtypes)
440
-
441
- # # Make Prediction for the selected customer
442
- # y_pred = gbm.predict(X_predict, num_iteration=gbm.best_iteration)
443
- # st.write("### Prediction Results:")
444
- # st.write(f"Type of y_pred: {type(y_pred)}")
445
- # st.write(f"Shape of y_pred: {y_pred.shape}")
446
- # st.write("First few predictions:")
447
- # st.write(y_pred[:5])
448
-
449
- # # Reassemble the results
450
- # results = customer_data[['cliente_id', 'marca_id_encoded', 'fecha_mes']].copy()
451
- # results['ventas_predichas'] = y_pred
452
- # st.write("### Results DataFrame:")
453
- # st.write(results.head())
454
- # st.write(f"Shape: {results.shape}")
455
-
456
- # st.write(f"Predicted total sales for Customer {customer_code}: {results['ventas_predichas'].sum():.2f}")
457
 
458
- # # Load actual data
459
- # actual_sales = df_agg_2024[df_agg_2024['cliente_id'] == customer_code_str]
460
- # st.write("### Actual Sales DataFrame:")
461
- # st.write(actual_sales.head())
462
- # st.write(f"Shape: {actual_sales.shape}")
463
-
464
- # if not actual_sales.empty:
465
- # results = results.merge(actual_sales[['cliente_id', 'marca_id_encoded', 'fecha_mes', 'precio_total']],
466
- # on=['cliente_id', 'marca_id_encoded', 'fecha_mes'],
467
- # how='left')
468
- # results.rename(columns={'precio_total': 'ventas_reales'}, inplace=True)
469
- # results['ventas_reales'].fillna(0, inplace=True)
470
- # st.write("### Final Results DataFrame:")
471
- # st.write(results.head())
472
- # st.write(f"Shape: {results.shape}")
473
-
474
- # # Calculate metrics only for non-null actual sales
475
- # valid_results = results.dropna(subset=['ventas_reales'])
476
- # if not valid_results.empty:
477
- # mae = mean_absolute_error(valid_results['ventas_reales'], valid_results['ventas_predichas'])
478
- # mape = np.mean(np.abs((valid_results['ventas_reales'] - valid_results['ventas_predichas']) / valid_results['ventas_reales'])) * 100
479
- # rmse = np.sqrt(mean_squared_error(valid_results['ventas_reales'], valid_results['ventas_predichas']))
480
 
481
- # st.write(f"Actual total sales for Customer {customer_code}: {valid_results['ventas_reales'].sum():.2f}")
482
- # st.write(f"MAE: {mae:.2f}")
483
- # st.write(f"MAPE: {mape:.2f}%")
484
- # st.write(f"RMSE: {rmse:.2f}")
485
 
486
- # # Analysis of results
487
- # threshold_good = 100 # You may want to adjust this threshold
488
- # if mae < threshold_good:
489
- # st.success(f"Customer {customer_code} is performing well based on the predictions.")
490
- # else:
491
- # st.warning(f"Customer {customer_code} is not performing well based on the predictions.")
492
- # else:
493
- # st.warning(f"No actual sales data found for customer {customer_code} in df_agg_2024.")
494
 
495
- # st.write("### Debug Information for Radar Chart:")
496
- # st.write(f"Shape of customer_data: {customer_data.shape}")
497
- # st.write(f"Shape of euros_proveedor: {euros_proveedor.shape}")
498
 
499
- # # Get percentage of units sold for each manufacturer
500
- # customer_df = df[df["CLIENTE"] == str(customer_code)] # Get the customer data
501
- # all_manufacturers = customer_df.iloc[:, 1:].T # Exclude CLIENTE column (manufacturers are in columns)
502
- # all_manufacturers.index = all_manufacturers.index.astype(str)
503
 
504
- # # Get total sales for each manufacturer from euros_proveedor
505
- # customer_euros = euros_proveedor[euros_proveedor["CLIENTE"] == str(customer_code)]
506
- # sales_data = customer_euros.iloc[:, 1:].T # Exclude CLIENTE column
507
- # sales_data.index = sales_data.index.astype(str)
508
 
509
- # # Remove the 'CLIENTE' row from sales_data to avoid issues with mixed types
510
- # sales_data_filtered = sales_data.drop(index='CLIENTE', errors='ignore')
511
 
512
- # # Ensure all values are numeric
513
- # sales_data_filtered = sales_data_filtered.apply(pd.to_numeric, errors='coerce')
514
- # all_manufacturers = all_manufacturers.apply(pd.to_numeric, errors='coerce')
515
 
516
- # # Sort manufacturers by percentage of units and get top 10
517
- # top_units = all_manufacturers.sort_values(by=all_manufacturers.columns[0], ascending=False).head(10)
518
 
519
- # # Sort manufacturers by total sales and get top 10
520
- # top_sales = sales_data_filtered.sort_values(by=sales_data_filtered.columns[0], ascending=False).head(10)
521
 
522
- # # Combine top manufacturers from both lists and get up to 20 unique manufacturers
523
- # combined_top = pd.concat([top_units, top_sales]).index.unique()[:20]
 
 
 
 
524
 
525
- # # Filter out manufacturers that are not present in both datasets
526
- # combined_top = [m for m in combined_top if m in all_manufacturers.index and m in sales_data_filtered.index]
527
 
528
- # st.write(f"Number of combined top manufacturers: {len(combined_top)}")
 
529
 
530
- # if combined_top:
531
- # # Create a DataFrame with combined data for these top manufacturers
532
- # combined_data = pd.DataFrame({
533
- # 'units': all_manufacturers.loc[combined_top, all_manufacturers.columns[0]],
534
- # 'sales': sales_data_filtered.loc[combined_top, sales_data_filtered.columns[0]]
535
- # }).fillna(0)
536
 
537
- # # Sort by units, then by sales
538
- # combined_data_sorted = combined_data.sort_values(by=['units', 'sales'], ascending=False)
 
539
 
540
- # # Filter out manufacturers with 0 units
541
- # non_zero_manufacturers = combined_data_sorted[combined_data_sorted['units'] > 0]
 
542
 
543
- # # If we have less than 3 non-zero manufacturers, add some zero-value ones
544
- # if len(non_zero_manufacturers) < 3:
545
- # zero_manufacturers = combined_data_sorted[combined_data_sorted['units'] == 0].head(3 - len(non_zero_manufacturers))
546
- # manufacturers_to_show = pd.concat([non_zero_manufacturers, zero_manufacturers])
547
- # else:
548
- # manufacturers_to_show = non_zero_manufacturers
549
 
550
- # values = manufacturers_to_show['units'].tolist()
551
- # amounts = manufacturers_to_show['sales'].tolist()
552
- # manufacturers = [get_supplier_name(m) for m in manufacturers_to_show.index]
553
 
554
- # st.write(f"### Results for top {len(manufacturers)} manufacturers:")
555
- # for manufacturer, value, amount in zip(manufacturers, values, amounts):
556
- # st.write(f"{manufacturer} = {value:.2f}% of units, €{amount:.2f} total sales")
557
 
558
- # if manufacturers: # Only create the chart if we have data
559
- # fig = radar_chart(manufacturers, values, amounts, f'Radar Chart for Top {len(manufacturers)} Manufacturers of Customer {customer_code}')
560
- # st.pyplot(fig)
561
- # else:
562
- # st.warning("No data available to create the radar chart.")
563
 
564
- # else:
565
- # st.warning("No combined top manufacturers found.")
 
 
566
 
567
- # # Ensure codigo_cliente in ventas_clientes is a string
568
- # ventas_clientes['codigo_cliente'] = ventas_clientes['codigo_cliente'].astype(str).str.strip()
569
-
570
- # # Ensure customer_code is a string and strip any spaces
571
- # customer_code = str(customer_code).strip()
572
-
573
- # if customer_code in ventas_clientes['codigo_cliente'].unique():
574
- # st.write(f"Customer {customer_code} found in ventas_clientes")
575
- # else:
576
- # st.write(f"Customer {customer_code} not found in ventas_clientes")
577
-
578
- # # Customer sales 2021-2024 (if data exists)
579
- # sales_columns = ['VENTA_2021', 'VENTA_2022', 'VENTA_2023']
580
- # if all(col in ventas_clientes.columns for col in sales_columns):
581
- # customer_sales_data = ventas_clientes[ventas_clientes['codigo_cliente'] == customer_code]
582
 
583
- # if not customer_sales_data.empty:
584
- # customer_sales = customer_sales_data[sales_columns].values[0]
585
- # years = ['2021', '2022', '2023']
586
 
587
- # fig_sales = px.line(x=years, y=customer_sales, markers=True, title=f'Sales Over the Years for Customer {customer_code}')
588
- # fig_sales.update_layout(xaxis_title="Year", yaxis_title="Sales")
589
- # st.plotly_chart(fig_sales)
590
- # else:
591
- # st.warning(f"No historical sales data found for customer {customer_code}")
592
- # else:
593
- # st.warning("Sales data for 2021-2023 not available in the dataset.")
594
- # else:
595
- # st.warning(f"No data found for customer {customer_code}. Please check the code.")
596
- # else:
597
- # st.warning("Please select a customer.")
598
 
599
 
600
- # # Customer Recommendations Page
601
- # elif page == "Articles Recommendations":
602
- # st.title("Articles Recommendations")
603
 
604
- # st.markdown("""
605
- # Get tailored recommendations for your customers based on their basket.
606
- # """)
607
 
608
- # # Campo input para cliente
609
- # partial_code = st.text_input("Enter part of Customer Code for Recommendations (or leave empty to see all)")
610
- # if partial_code:
611
- # filtered_customers = df[df['CLIENTE'].str.contains(partial_code)]
612
- # else:
613
- # filtered_customers = df
614
- # customer_list = filtered_customers['CLIENTE'].unique()
615
- # customer_code = st.selectbox("Select Customer Code for Recommendations", [""] + list(customer_list))
616
-
617
- # # Definición de la función recomienda
618
- # def recomienda(new_basket):
619
- # # Calcular la matriz TF-IDF
620
- # tfidf = TfidfVectorizer()
621
- # tfidf_matrix = tfidf.fit_transform(cestas['Cestas'])
622
-
623
- # # Convertir la nueva cesta en formato TF-IDF
624
- # new_basket_str = ' '.join(new_basket)
625
- # new_basket_tfidf = tfidf.transform([new_basket_str])
626
-
627
- # # Comparar la nueva cesta con las anteriores
628
- # similarities = cosine_similarity(new_basket_tfidf, tfidf_matrix)
629
-
630
- # # Obtener los índices de las cestas más similares
631
- # similar_indices = similarities.argsort()[0][-3:] # Las 3 más similares
632
-
633
- # # Crear un diccionario para contar las recomendaciones
634
- # recommendations_count = {}
635
- # total_similarity = 0
636
-
637
- # # Recomendar productos de cestas similares
638
- # for idx in similar_indices:
639
- # sim_score = similarities[0][idx]
640
- # total_similarity += sim_score
641
- # products = cestas.iloc[idx]['Cestas'].split()
642
-
643
- # for product in products:
644
- # if product.strip() not in new_basket: # Evitar recomendar lo que ya está en la cesta
645
- # if product.strip() in recommendations_count:
646
- # recommendations_count[product.strip()] += sim_score
647
- # else:
648
- # recommendations_count[product.strip()] = sim_score
649
 
650
- # # Calcular la probabilidad relativa de cada producto recomendado
651
- # recommendations_with_prob = []
652
- # if total_similarity > 0: # Verificar que total_similarity no sea cero
653
- # recommendations_with_prob = [(product, score / total_similarity) for product, score in recommendations_count.items()]
654
- # else:
655
- # print("No se encontraron similitudes suficientes para calcular probabilidades.")
656
-
657
- # recommendations_with_prob.sort(key=lambda x: x[1], reverse=True) # Ordenar por puntuación
658
-
659
- # # Crear un nuevo DataFrame para almacenar las recomendaciones con descripciones y probabilidades
660
- # recommendations_df = pd.DataFrame(columns=['ARTICULO', 'DESCRIPCION', 'PROBABILIDAD'])
661
-
662
- # # Agregar las recomendaciones al DataFrame usando pd.concat
663
- # for product, prob in recommendations_with_prob:
664
- # # Buscar la descripción en el DataFrame de productos
665
- # description = productos.loc[productos['ARTICULO'] == product, 'DESCRIPCION']
666
- # if not description.empty:
667
- # # Crear un nuevo DataFrame temporal para la recomendación
668
- # temp_df = pd.DataFrame({
669
- # 'ARTICULO': [product],
670
- # 'DESCRIPCION': [description.values[0]], # Obtener el primer valor encontrado
671
- # 'PROBABILIDAD': [prob]
672
- # })
673
- # # Concatenar el DataFrame temporal al DataFrame de recomendaciones
674
- # recommendations_df = pd.concat([recommendations_df, temp_df], ignore_index=True)
675
-
676
- # return recommendations_df
677
-
678
- # # Comprobar si el cliente está en el CSV de fieles
679
- # is_fiel = customer_code in fieles_df['Cliente'].astype(str).values
680
-
681
- # if customer_code:
682
- # if is_fiel:
683
- # st.write(f"### Customer {customer_code} is a loyal customer.")
684
- # option = st.selectbox("Select Recommendation Type", ["Select an option", "By Purchase History", "By Current Basket"])
685
-
686
- # if option == "By Purchase History":
687
- # st.warning("Option not available... aún")
688
- # elif option == "By Current Basket":
689
- # st.write("Select the items and assign quantities for the basket:")
690
-
691
- # # Mostrar lista de artículos disponibles
692
- # available_articles = productos['ARTICULO'].unique()
693
- # selected_articles = st.multiselect("Select Articles", available_articles)
694
-
695
- # # Crear inputs para ingresar las cantidades de cada artículo seleccionado
696
- # quantities = {}
697
- # for article in selected_articles:
698
- # quantities[article] = st.number_input(f"Quantity for {article}", min_value=0, step=1)
699
-
700
- # if st.button("Calcular"): # Añadimos el botón "Calcular"
701
- # # Crear una lista de artículos basada en la selección
702
- # new_basket = [f"{article} x{quantities[article]}" for article in selected_articles if quantities[article] > 0]
703
-
704
- # if new_basket:
705
- # # Procesar la lista para recomendar
706
- # recommendations_df = recomienda(new_basket)
707
-
708
- # if not recommendations_df.empty:
709
- # st.write("### Recommendations based on the current basket:")
710
- # st.dataframe(recommendations_df)
711
- # else:
712
- # st.warning("No recommendations found for the provided basket.")
713
- # else:
714
- # st.warning("Please select at least one article and set its quantity.")
715
- # else:
716
- # st.write(f"### Customer {customer_code} is not a loyal customer.")
717
- # st.write("Select items and assign quantities for the basket:")
718
 
719
- # # Mostrar lista de artículos disponibles
720
- # available_articles = productos['ARTICULO'].unique()
721
- # selected_articles = st.multiselect("Select Articles", available_articles)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
722
 
723
- # # Crear inputs para ingresar las cantidades de cada artículo seleccionado
724
- # quantities = {}
725
- # for article in selected_articles:
726
- # quantities[article] = st.number_input(f"Quantity for {article}", min_value=0, step=1)
727
 
728
- # if st.button("Calcular"): # Añadimos el botón "Calcular"
729
- # # Crear una lista de artículos basada en la selección
730
- # new_basket = [f"{article} x{quantities[article]}" for article in selected_articles if quantities[article] > 0]
731
 
732
- # if new_basket:
733
- # # Procesar la lista para recomendar
734
- # recommendations_df = recomienda(new_basket)
735
 
736
- # if not recommendations_df.empty:
737
- # st.write("### Recommendations based on the current basket:")
738
- # st.dataframe(recommendations_df)
739
- # else:
740
- # st.warning("No recommendations found for the provided basket.")
741
- # else:
742
- # st.warning("Please select at least one article and set its quantity.")
743
 
744
 
745
  # Customer Analysis Page
 
1
  import streamlit as st
2
  import pandas as pd
3
  import plotly.express as px
 
4
  import matplotlib.pyplot as plt
5
  import numpy as np
6
  import lightgbm as lgb
 
11
  # Page configuration
12
  st.set_page_config(page_title="Customer Insights App", page_icon=":bar_chart:")
13
 
14
+ # Load CSV files at the top
15
  df = pd.read_csv("df_clean.csv")
16
  nombres_proveedores = pd.read_csv("nombres_proveedores.csv", sep=';')
17
  euros_proveedor = pd.read_csv("euros_proveedor.csv", sep=',')
18
  ventas_clientes = pd.read_csv("ventas_clientes.csv", sep=',')
19
+ customer_clusters = pd.read_csv('predicts/customer_clusters.csv') # Load the customer clusters here
20
+ df_agg_2024 = pd.read_csv('predicts/df_agg_2024.csv')
21
 
22
  # Ensure customer codes are strings
23
  df['CLIENTE'] = df['CLIENTE'].astype(str)
24
  nombres_proveedores['codigo'] = nombres_proveedores['codigo'].astype(str)
25
  euros_proveedor['CLIENTE'] = euros_proveedor['CLIENTE'].astype(str)
26
+ customer_clusters['cliente_id'] = customer_clusters['cliente_id'].astype(str) # Ensure customer IDs are strings
27
  fieles_df = pd.read_csv("clientes_relevantes.csv")
28
  cestas = pd.read_csv("cestas.csv")
29
  productos = pd.read_csv("productos.csv")
30
  df_agg_2024['cliente_id'] = df_agg_2024['cliente_id'].astype(str)
31
 
32
+ # Convert all columns except 'CLIENTE' to float in euros_proveedor
33
  for col in euros_proveedor.columns:
34
  if col != 'CLIENTE':
35
  euros_proveedor[col] = pd.to_numeric(euros_proveedor[col], errors='coerce')
36
 
37
+ # Check for NaN values after conversion
38
  if euros_proveedor.isna().any().any():
39
  st.warning("Some values in euros_proveedor couldn't be converted to numbers. Please review the input data.")
40
 
41
+ # Ignore the last two columns of df
42
  df = df.iloc[:, :-2]
43
 
44
  # Function to get supplier name
45
  def get_supplier_name(code):
46
+ code = str(code) # Ensure code is a string
47
  name = nombres_proveedores[nombres_proveedores['codigo'] == code]['nombre'].values
48
  return name[0] if len(name) > 0 else code
49
 
50
+ # Function to create radar chart with square root transformation
51
  def radar_chart(categories, values, amounts, title):
52
+ N = len(categories)
53
+ angles = [n / float(N) * 2 * np.pi for n in range(N)]
54
+ angles += angles[:1]
 
 
 
 
 
 
 
 
 
55
 
56
+ fig, ax = plt.subplots(figsize=(12, 12), subplot_kw=dict(projection='polar'))
57
+
58
+ # Apply square root transformation
59
+ sqrt_values = np.sqrt(values)
60
+ sqrt_amounts = np.sqrt(amounts)
61
+
62
+ max_sqrt_value = max(sqrt_values)
63
+ normalized_values = [v / max_sqrt_value for v in sqrt_values]
64
+
65
+ # Adjust scaling for spend values
66
+ max_sqrt_amount = max(sqrt_amounts)
67
+ scaling_factor = 0.7 # Adjust this value to control how much the spend values are scaled up
68
+ normalized_amounts = [min((a / max_sqrt_amount) * scaling_factor, 1.0) for a in sqrt_amounts]
69
+
70
+ normalized_values += normalized_values[:1]
71
+ ax.plot(angles, normalized_values, 'o-', linewidth=2, color='#FF69B4', label='% Units (sqrt)')
72
+ ax.fill(angles, normalized_values, alpha=0.25, color='#FF69B4')
73
+
74
+ normalized_amounts += normalized_amounts[:1]
75
+ ax.plot(angles, normalized_amounts, 'o-', linewidth=2, color='#4B0082', label='% Spend (sqrt)')
76
+ ax.fill(angles, normalized_amounts, alpha=0.25, color='#4B0082')
77
+
78
+ ax.set_xticks(angles[:-1])
79
+ ax.set_xticklabels(categories, size=8, wrap=True)
80
+ ax.set_ylim(0, 1)
81
+
82
+ circles = np.linspace(0, 1, 5)
83
+ for circle in circles:
84
+ ax.plot(angles, [circle]*len(angles), '--', color='gray', alpha=0.3, linewidth=0.5)
85
+
86
+ ax.set_yticklabels([])
87
+ ax.spines['polar'].set_visible(False)
88
+
89
+ plt.title(title, size=16, y=1.1)
90
+ plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
91
 
 
 
 
 
 
 
 
 
92
  return fig
93
 
94
  # Main page design
 
121
 
122
  if st.button("Calcular"):
123
  if customer_code:
124
+ # Find Customer's Cluster
125
  customer_match = customer_clusters[customer_clusters['cliente_id'] == customer_code]
126
 
127
  if not customer_match.empty:
128
  cluster = customer_match['cluster_id'].values[0]
129
  st.write(f"Customer {customer_code} belongs to cluster {cluster}")
130
 
131
+ # Load the Corresponding Model
132
  model_path = f'models/modelo_cluster_{cluster}.txt'
133
  gbm = lgb.Booster(model_file=model_path)
134
  st.write(f"Loaded model for cluster {cluster}")
135
 
136
+ # Inspect the model
137
+ st.write("### Model Information:")
138
+ st.write(f"Number of trees: {gbm.num_trees()}")
139
+ st.write(f"Number of features: {gbm.num_feature()}")
140
+ st.write("Feature names:")
141
+ st.write(gbm.feature_name())
142
+
143
  # Load predict data for that cluster
144
  predict_data = pd.read_csv(f'predicts/predict_cluster_{cluster}.csv')
145
+
146
+ # Convert cliente_id to string
147
  predict_data['cliente_id'] = predict_data['cliente_id'].astype(str)
148
+
149
+ st.write("### Predict Data DataFrame:")
150
+ st.write(predict_data.head())
151
+ st.write(f"Shape: {predict_data.shape}")
152
 
153
  # Filter for the specific customer
154
+ customer_code_str = str(customer_code)
155
+ customer_data = predict_data[predict_data['cliente_id'] == customer_code_str]
156
+
157
+ # Add debug statements
158
+ st.write(f"Unique customer IDs in predict data: {predict_data['cliente_id'].unique()}")
159
+ st.write(f"Customer code we're looking for: {customer_code_str}")
160
+
161
+ st.write("### Customer Data:")
162
+ st.write(customer_data.head())
163
+ st.write(f"Shape: {customer_data.shape}")
164
 
165
  if not customer_data.empty:
166
+ # Define features consistently with the training process
167
  lag_features = [f'precio_total_lag_{lag}' for lag in range(1, 25)]
168
  features = lag_features + ['mes', 'marca_id_encoded', 'año', 'cluster_id']
169
+
170
+ # Prepare data for prediction
171
  X_predict = customer_data[features]
172
 
173
  # Convert categorical features to 'category' dtype
174
  categorical_features = ['mes', 'marca_id_encoded', 'cluster_id']
175
  for feature in categorical_features:
176
  X_predict[feature] = X_predict[feature].astype('category')
177
+
178
+ st.write("### Features for Prediction:")
179
+ st.write(X_predict.head())
180
+ st.write(f"Shape: {X_predict.shape}")
181
+ st.write("Data types:")
182
+ st.write(X_predict.dtypes)
183
+
184
  # Make Prediction for the selected customer
185
  y_pred = gbm.predict(X_predict, num_iteration=gbm.best_iteration)
186
+ st.write("### Prediction Results:")
187
+ st.write(f"Type of y_pred: {type(y_pred)}")
188
+ st.write(f"Shape of y_pred: {y_pred.shape}")
189
+ st.write("First few predictions:")
190
+ st.write(y_pred[:5])
191
+
192
+ # Reassemble the results
193
  results = customer_data[['cliente_id', 'marca_id_encoded', 'fecha_mes']].copy()
194
  results['ventas_predichas'] = y_pred
195
+ st.write("### Results DataFrame:")
196
+ st.write(results.head())
197
+ st.write(f"Shape: {results.shape}")
198
+
199
+ st.write(f"Predicted total sales for Customer {customer_code}: {results['ventas_predichas'].sum():.2f}")
200
 
201
+ # Load actual data
202
+ actual_sales = df_agg_2024[df_agg_2024['cliente_id'] == customer_code_str]
203
+ st.write("### Actual Sales DataFrame:")
204
+ st.write(actual_sales.head())
205
+ st.write(f"Shape: {actual_sales.shape}")
206
+
207
  if not actual_sales.empty:
208
+ results = results.merge(actual_sales[['cliente_id', 'marca_id_encoded', 'fecha_mes', 'precio_total']],
209
+ on=['cliente_id', 'marca_id_encoded', 'fecha_mes'],
210
+ how='left')
 
 
211
  results.rename(columns={'precio_total': 'ventas_reales'}, inplace=True)
212
  results['ventas_reales'].fillna(0, inplace=True)
213
+ st.write("### Final Results DataFrame:")
214
+ st.write(results.head())
215
+ st.write(f"Shape: {results.shape}")
216
+
217
+ # Calculate metrics only for non-null actual sales
218
  valid_results = results.dropna(subset=['ventas_reales'])
219
  if not valid_results.empty:
220
  mae = mean_absolute_error(valid_results['ventas_reales'], valid_results['ventas_predichas'])
221
  mape = np.mean(np.abs((valid_results['ventas_reales'] - valid_results['ventas_predichas']) / valid_results['ventas_reales'])) * 100
222
  rmse = np.sqrt(mean_squared_error(valid_results['ventas_reales'], valid_results['ventas_predichas']))
223
 
224
+ st.write(f"Actual total sales for Customer {customer_code}: {valid_results['ventas_reales'].sum():.2f}")
225
  st.write(f"MAE: {mae:.2f}")
226
  st.write(f"MAPE: {mape:.2f}%")
227
  st.write(f"RMSE: {rmse:.2f}")
228
 
229
+ # Analysis of results
230
+ threshold_good = 100 # You may want to adjust this threshold
231
+ if mae < threshold_good:
232
+ st.success(f"Customer {customer_code} is performing well based on the predictions.")
233
+ else:
234
+ st.warning(f"Customer {customer_code} is not performing well based on the predictions.")
235
+ else:
236
+ st.warning(f"No actual sales data found for customer {customer_code} in df_agg_2024.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
 
238
+ st.write("### Debug Information for Radar Chart:")
239
+ st.write(f"Shape of customer_data: {customer_data.shape}")
240
+ st.write(f"Shape of euros_proveedor: {euros_proveedor.shape}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
 
242
+ # Get percentage of units sold for each manufacturer
243
+ customer_df = df[df["CLIENTE"] == str(customer_code)] # Get the customer data
244
+ all_manufacturers = customer_df.iloc[:, 1:].T # Exclude CLIENTE column (manufacturers are in columns)
245
+ all_manufacturers.index = all_manufacturers.index.astype(str)
246
 
247
+ # Get total sales for each manufacturer from euros_proveedor
248
+ customer_euros = euros_proveedor[euros_proveedor["CLIENTE"] == str(customer_code)]
249
+ sales_data = customer_euros.iloc[:, 1:].T # Exclude CLIENTE column
250
+ sales_data.index = sales_data.index.astype(str)
 
 
 
 
251
 
252
+ # Remove the 'CLIENTE' row from sales_data to avoid issues with mixed types
253
+ sales_data_filtered = sales_data.drop(index='CLIENTE', errors='ignore')
 
254
 
255
+ # Ensure all values are numeric
256
+ sales_data_filtered = sales_data_filtered.apply(pd.to_numeric, errors='coerce')
257
+ all_manufacturers = all_manufacturers.apply(pd.to_numeric, errors='coerce')
 
258
 
259
+ # Sort manufacturers by percentage of units and get top 10
260
+ top_units = all_manufacturers.sort_values(by=all_manufacturers.columns[0], ascending=False).head(10)
 
 
261
 
262
+ # Sort manufacturers by total sales and get top 10
263
+ top_sales = sales_data_filtered.sort_values(by=sales_data_filtered.columns[0], ascending=False).head(10)
264
 
265
+ # Combine top manufacturers from both lists and get up to 20 unique manufacturers
266
+ combined_top = pd.concat([top_units, top_sales]).index.unique()[:20]
 
267
 
268
+ # Filter out manufacturers that are not present in both datasets
269
+ combined_top = [m for m in combined_top if m in all_manufacturers.index and m in sales_data_filtered.index]
270
 
271
+ st.write(f"Number of combined top manufacturers: {len(combined_top)}")
 
272
 
273
+ if combined_top:
274
+ # Create a DataFrame with combined data for these top manufacturers
275
+ combined_data = pd.DataFrame({
276
+ 'units': all_manufacturers.loc[combined_top, all_manufacturers.columns[0]],
277
+ 'sales': sales_data_filtered.loc[combined_top, sales_data_filtered.columns[0]]
278
+ }).fillna(0)
279
 
280
+ # Sort by units, then by sales
281
+ combined_data_sorted = combined_data.sort_values(by=['units', 'sales'], ascending=False)
282
 
283
+ # Filter out manufacturers with 0 units
284
+ non_zero_manufacturers = combined_data_sorted[combined_data_sorted['units'] > 0]
285
 
286
+ # If we have less than 3 non-zero manufacturers, add some zero-value ones
287
+ if len(non_zero_manufacturers) < 3:
288
+ zero_manufacturers = combined_data_sorted[combined_data_sorted['units'] == 0].head(3 - len(non_zero_manufacturers))
289
+ manufacturers_to_show = pd.concat([non_zero_manufacturers, zero_manufacturers])
290
+ else:
291
+ manufacturers_to_show = non_zero_manufacturers
292
 
293
+ values = manufacturers_to_show['units'].tolist()
294
+ amounts = manufacturers_to_show['sales'].tolist()
295
+ manufacturers = [get_supplier_name(m) for m in manufacturers_to_show.index]
296
 
297
+ st.write(f"### Results for top {len(manufacturers)} manufacturers:")
298
+ for manufacturer, value, amount in zip(manufacturers, values, amounts):
299
+ st.write(f"{manufacturer} = {value:.2f}% of units, €{amount:.2f} total sales")
300
 
301
+ if manufacturers: # Only create the chart if we have data
302
+ fig = radar_chart(manufacturers, values, amounts, f'Radar Chart for Top {len(manufacturers)} Manufacturers of Customer {customer_code}')
303
+ st.pyplot(fig)
304
+ else:
305
+ st.warning("No data available to create the radar chart.")
 
306
 
307
+ else:
308
+ st.warning("No combined top manufacturers found.")
 
309
 
310
+ # Ensure codigo_cliente in ventas_clientes is a string
311
+ ventas_clientes['codigo_cliente'] = ventas_clientes['codigo_cliente'].astype(str).str.strip()
 
312
 
313
+ # Ensure customer_code is a string and strip any spaces
314
+ customer_code = str(customer_code).strip()
 
 
 
315
 
316
+ if customer_code in ventas_clientes['codigo_cliente'].unique():
317
+ st.write(f"Customer {customer_code} found in ventas_clientes")
318
+ else:
319
+ st.write(f"Customer {customer_code} not found in ventas_clientes")
320
 
321
+ # Customer sales 2021-2024 (if data exists)
322
+ sales_columns = ['VENTA_2021', 'VENTA_2022', 'VENTA_2023']
323
+ if all(col in ventas_clientes.columns for col in sales_columns):
324
+ customer_sales_data = ventas_clientes[ventas_clientes['codigo_cliente'] == customer_code]
 
 
 
 
 
 
 
 
 
 
 
325
 
326
+ if not customer_sales_data.empty:
327
+ customer_sales = customer_sales_data[sales_columns].values[0]
328
+ years = ['2021', '2022', '2023']
329
 
330
+ fig_sales = px.line(x=years, y=customer_sales, markers=True, title=f'Sales Over the Years for Customer {customer_code}')
331
+ fig_sales.update_layout(xaxis_title="Year", yaxis_title="Sales")
332
+ st.plotly_chart(fig_sales)
333
+ else:
334
+ st.warning(f"No historical sales data found for customer {customer_code}")
335
+ else:
336
+ st.warning("Sales data for 2021-2023 not available in the dataset.")
337
+ else:
338
+ st.warning(f"No data found for customer {customer_code}. Please check the code.")
339
+ else:
340
+ st.warning("Please select a customer.")
341
 
342
 
343
+ # Customer Recommendations Page
344
+ elif page == "Articles Recommendations":
345
+ st.title("Articles Recommendations")
346
 
347
+ st.markdown("""
348
+ Get tailored recommendations for your customers based on their basket.
349
+ """)
350
 
351
+ # Campo input para cliente
352
+ partial_code = st.text_input("Enter part of Customer Code for Recommendations (or leave empty to see all)")
353
+ if partial_code:
354
+ filtered_customers = df[df['CLIENTE'].str.contains(partial_code)]
355
+ else:
356
+ filtered_customers = df
357
+ customer_list = filtered_customers['CLIENTE'].unique()
358
+ customer_code = st.selectbox("Select Customer Code for Recommendations", [""] + list(customer_list))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359
 
360
+ # Definición de la función recomienda
361
def recomienda(new_basket):
    """Recommend articles for a basket from the most similar past baskets.

    The baskets in ``cestas['Cestas']`` are vectorised with TF-IDF and the
    three closest to the new basket (cosine similarity) are selected. Every
    article in those baskets that is not already in the new basket gets a
    score equal to the summed similarity of the baskets containing it,
    divided by the total similarity of the three baskets.

    Args:
        new_basket: List of basket entries. The callers in this file pass
            strings of the form ``"<article> x<quantity>"``; plain article
            codes are accepted too.

    Returns:
        A DataFrame with columns ``ARTICULO``, ``DESCRIPCION`` and
        ``PROBABILIDAD``, sorted by descending probability. It is empty when
        no similar basket is found or no recommended article has a
        description in ``productos``.
    """
    result_columns = ['ARTICULO', 'DESCRIPCION', 'PROBABILIDAD']

    # Strip the " x<quantity>" suffix so entries can be compared with the
    # bare article codes stored in the historical baskets.
    basket_codes = [_article_code(entry) for entry in new_basket]
    already_in_basket = set(basket_codes)

    # Fit TF-IDF on the historical baskets and project the new basket.
    vectorizer = TfidfVectorizer()
    history_matrix = vectorizer.fit_transform(cestas['Cestas'])
    basket_vector = vectorizer.transform([' '.join(basket_codes)])

    # Similarity of the new basket to every historical basket; keep the top 3.
    similarities = cosine_similarity(basket_vector, history_matrix)[0]
    closest = similarities.argsort()[-3:]

    scores = {}
    total_similarity = 0
    for idx in closest:
        similarity = similarities[idx]
        total_similarity += similarity
        # dict.fromkeys de-duplicates while keeping order, so an article
        # repeated inside one basket is counted once and the resulting
        # probability cannot exceed 1.
        for product in dict.fromkeys(cestas.iloc[idx]['Cestas'].split()):
            if product not in already_in_basket:
                scores[product] = scores.get(product, 0) + similarity

    # Nothing resembles the basket: the callers show their own warning when
    # the returned frame is empty.
    if total_similarity <= 0:
        return pd.DataFrame(columns=result_columns)

    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)

    # Compare codes as strings: basket tokens are text, while the CSV column
    # may have been parsed as numbers.
    article_codes = productos['ARTICULO'].astype(str)
    rows = []
    for product, score in ranked:
        descriptions = productos.loc[article_codes == product, 'DESCRIPCION']
        if not descriptions.empty:
            rows.append({
                'ARTICULO': product,
                'DESCRIPCION': descriptions.values[0],  # first match wins
                'PROBABILIDAD': score / total_similarity,
            })

    return pd.DataFrame(rows, columns=result_columns)


def _article_code(entry):
    """Return the article code of a basket entry, dropping a trailing ' x<qty>'."""
    text = str(entry).strip()
    code, separator, quantity = text.rpartition(' x')
    if separator and quantity.isdigit():
        return code.strip()
    return text
420
+
421
+ # Comprobar si el cliente está en el CSV de fieles
422
+ is_fiel = customer_code in fieles_df['Cliente'].astype(str).values
 
 
 
 
 
423
 
424
+ if customer_code:
425
+ if is_fiel:
426
+ st.write(f"### Customer {customer_code} is a loyal customer.")
427
+ option = st.selectbox("Select Recommendation Type", ["Select an option", "By Purchase History", "By Current Basket"])
428
+
429
+ if option == "By Purchase History":
430
+ st.warning("Option not available... aún")
431
+ elif option == "By Current Basket":
432
+ st.write("Select the items and assign quantities for the basket:")
433
+
434
+ # Mostrar lista de artículos disponibles
435
+ available_articles = productos['ARTICULO'].unique()
436
+ selected_articles = st.multiselect("Select Articles", available_articles)
437
+
438
+ # Crear inputs para ingresar las cantidades de cada artículo seleccionado
439
+ quantities = {}
440
+ for article in selected_articles:
441
+ quantities[article] = st.number_input(f"Quantity for {article}", min_value=0, step=1)
442
+
443
+ if st.button("Calcular"): # Añadimos el botón "Calcular"
444
+ # Crear una lista de artículos basada en la selección
445
+ new_basket = [f"{article} x{quantities[article]}" for article in selected_articles if quantities[article] > 0]
446
+
447
+ if new_basket:
448
+ # Procesar la lista para recomendar
449
+ recommendations_df = recomienda(new_basket)
450
+
451
+ if not recommendations_df.empty:
452
+ st.write("### Recommendations based on the current basket:")
453
+ st.dataframe(recommendations_df)
454
+ else:
455
+ st.warning("No recommendations found for the provided basket.")
456
+ else:
457
+ st.warning("Please select at least one article and set its quantity.")
458
+ else:
459
+ st.write(f"### Customer {customer_code} is not a loyal customer.")
460
+ st.write("Select items and assign quantities for the basket:")
461
+
462
+ # Mostrar lista de artículos disponibles
463
+ available_articles = productos['ARTICULO'].unique()
464
+ selected_articles = st.multiselect("Select Articles", available_articles)
465
 
466
+ # Crear inputs para ingresar las cantidades de cada artículo seleccionado
467
+ quantities = {}
468
+ for article in selected_articles:
469
+ quantities[article] = st.number_input(f"Quantity for {article}", min_value=0, step=1)
470
 
471
+ if st.button("Calcular"): # Añadimos el botón "Calcular"
472
+ # Crear una lista de artículos basada en la selección
473
+ new_basket = [f"{article} x{quantities[article]}" for article in selected_articles if quantities[article] > 0]
474
 
475
+ if new_basket:
476
+ # Procesar la lista para recomendar
477
+ recommendations_df = recomienda(new_basket)
478
 
479
+ if not recommendations_df.empty:
480
+ st.write("### Recommendations based on the current basket:")
481
+ st.dataframe(recommendations_df)
482
+ else:
483
+ st.warning("No recommendations found for the provided basket.")
484
+ else:
485
+ st.warning("Please select at least one article and set its quantity.")
486
 
487
 
488
  # Customer Analysis Page