SantanuBanerjee committed
Commit 6a89968 · verified · 1 Parent(s): 95831be

Update app.py

Files changed (1): app.py (+38 -41)

app.py CHANGED
@@ -10,7 +10,7 @@ import gradio as gr
 import pandas as pd
 
 def data_pre_processing(file_responses):
-    console_messages.append("Starting data pre-processing...")
+    consoleMessage_and_Print("Starting data pre-processing...")
     # Financial Weights can be anything (ultimately the row-wise weights are aggregated and the corresponding fractions are obtained from that rows' total tax payed)
 
     try: # Define the columns to be processed
@@ -54,8 +54,7 @@ def data_pre_processing(file_responses):
         initial_dataset_2['Financial_Weight'] = file_responses['Personal_TaxDirection_2_TaxWeightageAllocated'] * file_responses['Latest estimated Tax payment?'] / file_responses['TotalWeightageAllocated']
         initial_dataset_3['Financial_Weight'] = file_responses['Personal_TaxDirection_3_TaxWeightageAllocated'] * file_responses['Latest estimated Tax payment?'] / file_responses['TotalWeightageAllocated']
 
-        # Removing useless rows
-        # Drop rows where Problem_Description is NaN or an empty string
+        # Removing useless rows # Drop rows where Problem_Description is NaN or an empty string
         initial_dataset_1 = initial_dataset_1.dropna(subset=['Problem_Description'], axis=0)
         initial_dataset_2 = initial_dataset_2.dropna(subset=['Problem_Description'], axis=0)
         initial_dataset_3 = initial_dataset_3.dropna(subset=['Problem_Description'], axis=0)
@@ -65,19 +64,15 @@ def data_pre_processing(file_responses):
         initial_dataset_2['Problem_Description'] = initial_dataset_2['Problem_Description'].astype(str)
         initial_dataset_3['Problem_Description'] = initial_dataset_3['Problem_Description'].astype(str)
 
-        # Merging the Datasets
-        # Vertically concatenating (merging) the 3 DataFrames
+        # Merging the Datasets # Vertically concatenating (merging) the 3 DataFrames
         merged_dataset = pd.concat([initial_dataset_1, initial_dataset_2, initial_dataset_3], ignore_index=True)
-
 
         # Different return can be used to check the processing
-        console_messages.append("Data pre-processing completed.")
-        # return file_responses
+        consoleMessage_and_Print("Data pre-processing completed.")
         return merged_dataset
 
     except Exception as e:
-        console_messages.append(f"Error during data pre-processing: {str(e)}")
-        # return str(e), console_messages
+        consoleMessage_and_Print(f"Error during data pre-processing: {str(e)}")
         return None
 
 
@@ -201,7 +196,7 @@ def extract_problem_domains(df,
                             text_column='Processed_ProblemDescription_forDomainExtraction',
                             cluster_range=(6, 10),
                             top_words=7):
-    console_messages.append("Extracting Problem Domains...")
+    consoleMessage_and_Print("Extracting Problem Domains...")
 
     # Sentence Transformers approach
     model = SentenceTransformer('all-mpnet-base-v2')
@@ -232,8 +227,7 @@ def extract_problem_domains(df,
     df["Problem_Cluster"] = cluster_labels
     df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
 
-    # console_messages.append("Returning from Problem Domain Extraction function.")
-    console_messages.append("Problem Domain Extraction completed.")
+    consoleMessage_and_Print("Problem Domain Extraction completed. Returning from Problem Domain Extraction function.")
     return df, optimal_n_clusters, cluster_representations
 
 
@@ -282,13 +276,13 @@ def text_processing_for_location(text):
 def extract_location_clusters(df,
                               text_column1='Processed_LocationText_forClustering', # Extracted through NLP
                               text_column2='Geographical_Location', # User Input
-                              cluster_range=(1, 5),
+                              cluster_range=(2, 5),
                               top_words=3):
     # Combine the two text columns
     text_column = "Combined_Location_Text"
     df[text_column] = df[text_column1] + ' ' + df[text_column2]
 
-    console_messages.append("Extracting Location Clusters...")
+    consoleMessage_and_Print("Extracting Location Clusters...")
 
     # Sentence Transformers approach for embeddings
     model = SentenceTransformer('all-mpnet-base-v2')
@@ -320,7 +314,7 @@ def extract_location_clusters(df,
     df['Location_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
 
     df = df.drop(text_column, axis=1)
-    console_messages.append("Location Clustering completed.")
+    consoleMessage_and_Print("Location Clustering completed.")
     return df, optimal_n_clusters, cluster_representations
 
 
@@ -408,17 +402,14 @@ def generate_project_proposal(prompt):
 
 
 def create_project_proposals(budget_cluster_df, problem_cluster_df, location_clusters, problem_clusters):
-    print("\n Starting function: create_project_proposals")
-    console_messages.append("\n Starting function: create_project_proposals")
+    consoleMessage_and_Print("\n Starting function: create_project_proposals")
    proposals = {}
 
    for loc in budget_cluster_df.index:
-        print("\n loc: ", loc)
-        console_messages.append(f"\n loc: {loc}")
+        consoleMessage_and_Print(f"\n loc: {loc}")
 
        for prob in budget_cluster_df.columns:
-            console_messages.append(f"\n prob: {prob}")
-            print("\n prob: ", prob)
+            consoleMessage_and_Print(f"\n prob: {prob}")
 
            location = ", ".join([item.strip() for item in location_clusters[loc] if item]) # Clean and join
            problem_domain = ", ".join([item.strip() for item in problem_clusters[prob] if item]) # Clean and join
@@ -432,7 +423,7 @@ def create_project_proposals(budget_cluster_df, problem_cluster_df, location_clu
            # Check if problem_descriptions is valid (not NaN and not an empty list)
            if isinstance(problem_descriptions, list) and problem_descriptions:
                # print(f"\nGenerating proposal for location: {location}, problem domain: {problem_domain}")
-                print(f"Generating PP")
+                consoleMessage_and_Print(f"Generating PP")
 
                # Prepare the prompt
                # problems_summary = "; \n".join(problem_descriptions) # Join all problem descriptions
@@ -531,17 +522,17 @@ def create_project_proposals(budget_cluster_df, problem_cluster_df, location_clu
 
 
 def nlp_pipeline(original_df):
-    console_messages.append("Starting NLP pipeline...")
+    consoleMessage_and_Print("Starting NLP pipeline...")
 
    # Data Preprocessing
    processed_df = data_pre_processing(original_df) # merged_dataset
 
    # Starting the Pipeline for Domain Extraction
-    console_messages.append("Executing Text processing function for Domain identification")
+    consoleMessage_and_Print("Executing Text processing function for Domain identification")
    # Apply the text_processing_for_domain function to the DataFrame
    processed_df['Processed_ProblemDescription_forDomainExtraction'] = processed_df['Problem_Description'].apply(text_processing_for_domain)
 
-    console_messages.append("Removing entries which could not be allocated to any Problem Domain")
+    consoleMessage_and_Print("Removing entries which could not be allocated to any Problem Domain")
    # processed_df = processed_df.dropna(subset=['Processed_ProblemDescription_forDomainExtraction'], axis=0)
    # Drop rows where 'Processed_ProblemDescription_forDomainExtraction' contains empty arrays
    processed_df = processed_df[processed_df['Processed_ProblemDescription_forDomainExtraction'].apply(lambda x: len(x) > 0)]
@@ -549,13 +540,13 @@ def nlp_pipeline(original_df):
    # Domain Clustering
    try:
        processed_df, optimal_n_clusters, problem_clusters = extract_problem_domains(processed_df)
-        console_messages.append(f"Optimal clusters for Domain extraction: {optimal_n_clusters}")
+        consoleMessage_and_Print(f"Optimal clusters for Domain extraction: {optimal_n_clusters}")
    except Exception as e:
-        console_messages.append(f"Error in extract_problem_domains: {str(e)}")
-    console_messages.append("NLP pipeline for Problem Domain extraction completed.")
+        consoleMessage_and_Print(f"Error in extract_problem_domains: {str(e)}")
+    consoleMessage_and_Print("NLP pipeline for Problem Domain extraction completed.")
 
 
-    console_messages.append("Starting NLP pipeline for Location extraction with text processing.")
+    consoleMessage_and_Print("Starting NLP pipeline for Location extraction with text processing.")
 
    # Apply the text_processing_for_location function to the DataFrame
    processed_df['Processed_LocationText_forClustering'] = processed_df['Problem_Description'].apply(text_processing_for_location)
@@ -564,10 +555,10 @@ def nlp_pipeline(original_df):
    # Location Clustering
    try:
        processed_df, optimal_n_clusters, location_clusters = extract_location_clusters(processed_df)
-        console_messages.append(f"Optimal clusters for Location extraction: {optimal_n_clusters}")
+        consoleMessage_and_Print(f"Optimal clusters for Location extraction: {optimal_n_clusters}")
    except Exception as e:
-        console_messages.append(f"Error in extract_location_clusters: {str(e)}")
-    console_messages.append("NLP pipeline for location extraction completed.")
+        consoleMessage_and_Print(f"Error in extract_location_clusters: {str(e)}")
+    consoleMessage_and_Print("NLP pipeline for location extraction completed.")
 
 
    # Create cluster dataframes
@@ -585,8 +576,7 @@ def nlp_pipeline(original_df):
    # print("\n problem_clusters_2: ", problem_clusters)
    project_proposals = create_project_proposals(budget_cluster_df, problem_cluster_df, location_clusters, problem_clusters)
 
-    console_messages.append("NLP pipeline completed.")
-    print("NLP pipeline completed.")
+    consoleMessage_and_Print("NLP pipeline completed.")
    return processed_df, budget_cluster_df, problem_cluster_df, project_proposals, location_clusters, problem_clusters
 
 
@@ -597,8 +587,15 @@ def nlp_pipeline(original_df):
 
 
 console_messages = []
+def consoleMessage_and_Print(some_text = ""):
+    console_messages.append(some_text)
+    print(some_text)
+
+
+
+
 def process_excel(file):
-    console_messages.append("Processing starts. Reading the uploaded Excel file...")
+    consoleMessage_and_Print("Processing starts. Reading the uploaded Excel file...")
    # Ensure the file path is correct
    file_path = file.name if hasattr(file, 'name') else file
    # Read the Excel file
@@ -606,7 +603,7 @@ def process_excel(file):
 
    try:
        # Process the DataFrame
-        console_messages.append("Processing the DataFrame...")
+        consoleMessage_and_Print("Processing the DataFrame...")
        processed_df, budget_cluster_df, problem_cluster_df, project_proposals, location_clusters, problem_clusters = nlp_pipeline(df)
        # processed_df, budget_cluster_df, problem_cluster_df, location_clusters, problem_clusters = nlp_pipeline(df)
 
@@ -628,25 +625,25 @@ def process_excel(file):
        # if isinstance(location_clusters, pd.DataFrame):
        #     location_clusters.to_excel(writer, sheet_name='Location_Clusters', index=False)
        # else:
-        #     console_messages.append("Converting Location Clusters to df")
+        #     consoleMessage_and_Print("Converting Location Clusters to df")
        #     pd.DataFrame(location_clusters).to_excel(writer, sheet_name='Location_Clusters', index=False)
 
        # if isinstance(problem_clusters, pd.DataFrame):
        #     problem_clusters.to_excel(writer, sheet_name='Problem_Clusters', index=False)
        # else:
-        #     console_messages.append("Converting Problem Clusters to df")
+        #     consoleMessage_and_Print("Converting Problem Clusters to df")
        #     pd.DataFrame(problem_clusters).to_excel(writer, sheet_name='Problem_Clusters', index=False)
 
 
-        console_messages.append("Processing completed. Ready for download.")
+        consoleMessage_and_Print("Processing completed. Ready for download.")
        return output_filename, "\n".join(console_messages) # Return the processed DataFrame as Excel file
 
    except Exception as e:
        # return str(e) # Return the error message
        # error_message = f"Error processing file: {str(e)}"
        # print(error_message) # Log the error
-        console_messages.append(f"Error during processing: {str(e)}")
+        consoleMessage_and_Print(f"Error during processing: {str(e)}")
        # return error_message, "Santanu Banerjee" # Return the error message to the user
        return None, "\n".join(console_messages)
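The substance of this commit is a logging refactor: the paired console_messages.append(...) / print(...) calls scattered through the pipeline are collapsed into a single helper, consoleMessage_and_Print, so the stdout log and the message string returned to the Gradio UI can no longer drift apart. A minimal standalone sketch of the pattern (the helper mirrors the one added in the diff; the demo calls at the end are illustrative, not part of app.py):

console_messages = []  # module-level log, as in app.py

def consoleMessage_and_Print(some_text=""):
    # Record a status message for the UI and echo it to stdout in one call.
    console_messages.append(some_text)
    print(some_text)

consoleMessage_and_Print("Starting data pre-processing...")
consoleMessage_and_Print("Data pre-processing completed.")
print("\n".join(console_messages))  # what process_excel returns alongside the output file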
 
 
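A side note on ordering: consoleMessage_and_Print is defined near the bottom of app.py, just above process_excel, yet the diff adds calls to it inside functions defined much earlier, such as data_pre_processing. This is safe because Python resolves module-level names when a function is called, not when it is defined, and nothing calls the helper until the Gradio callback runs, after the module has fully loaded. A tiny demonstration (hypothetical names):

def earlier():
    later()  # looked up in module globals only when earlier() runs

def later():
    print("ok")

earlier()  # prints "ok": later() exists by the time the call happens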
 
 
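The one change beyond logging is in extract_location_clusters, where cluster_range moves from (1, 5) to (2, 5). The selection code that consumes this range is outside the hunks shown, so the following is an assumption: if the range is swept with KMeans and each candidate k is scored with scikit-learn's silhouette_score (a common way to pick optimal_n_clusters), then k = 1 is invalid, since silhouette requires at least two distinct labels and raises a ValueError otherwise. A hedged sketch of such a sweep under that assumption:

# Hypothetical reconstruction of a cluster sweep; the actual selection logic
# in app.py is not visible in this diff. With k = 1, silhouette_score raises
# ValueError, which would explain the move to cluster_range=(2, 5).
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

embeddings = np.random.rand(40, 8)  # stand-in for the sentence-transformer vectors

for k in range(2, 6):  # sweeping the new (2, 5) range inclusively
    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(embeddings)
    print(k, round(silhouette_score(embeddings, labels), 3))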
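One caveat the commit leaves untouched: console_messages is module-level and never cleared, so in a long-lived Gradio session the log returned by process_excel accumulates messages from every earlier upload. A hypothetical per-run reset (not part of this commit) would be a one-line addition at the top of the handler:

def process_excel(file):
    console_messages.clear()  # hypothetical: start each run with a fresh log
    consoleMessage_and_Print("Processing starts. Reading the uploaded Excel file...")
    # ... rest identical to process_excel in app.py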