Update app.py
app.py CHANGED
@@ -10,7 +10,7 @@ import gradio as gr
 import pandas as pd
 
 def data_pre_processing(file_responses):
-
+    consoleMessage_and_Print("Starting data pre-processing...")
     # Financial Weights can be anything (ultimately the row-wise weights are aggregated and the corresponding fractions are obtained from that rows' total tax payed)
 
     try: # Define the columns to be processed
@@ -54,8 +54,7 @@ def data_pre_processing(file_responses):
         initial_dataset_2['Financial_Weight'] = file_responses['Personal_TaxDirection_2_TaxWeightageAllocated'] * file_responses['Latest estimated Tax payment?'] / file_responses['TotalWeightageAllocated']
         initial_dataset_3['Financial_Weight'] = file_responses['Personal_TaxDirection_3_TaxWeightageAllocated'] * file_responses['Latest estimated Tax payment?'] / file_responses['TotalWeightageAllocated']
 
-        # Removing useless rows
-        # Drop rows where Problem_Description is NaN or an empty string
+        # Removing useless rows # Drop rows where Problem_Description is NaN or an empty string
         initial_dataset_1 = initial_dataset_1.dropna(subset=['Problem_Description'], axis=0)
         initial_dataset_2 = initial_dataset_2.dropna(subset=['Problem_Description'], axis=0)
         initial_dataset_3 = initial_dataset_3.dropna(subset=['Problem_Description'], axis=0)
@@ -65,19 +64,15 @@ def data_pre_processing(file_responses):
         initial_dataset_2['Problem_Description'] = initial_dataset_2['Problem_Description'].astype(str)
         initial_dataset_3['Problem_Description'] = initial_dataset_3['Problem_Description'].astype(str)
 
-        # Merging the Datasets
-        # Vertically concatenating (merging) the 3 DataFrames
+        # Merging the Datasets # Vertically concatenating (merging) the 3 DataFrames
         merged_dataset = pd.concat([initial_dataset_1, initial_dataset_2, initial_dataset_3], ignore_index=True)
-
 
         # Different return can be used to check the processing
-
-        # return file_responses
+        consoleMessage_and_Print("Data pre-processing completed.")
         return merged_dataset
 
     except Exception as e:
-
-        # return str(e), console_messages
+        consoleMessage_and_Print(f"Error during data pre-processing: {str(e)}")
         return None
 
 
@@ -201,7 +196,7 @@ def extract_problem_domains(df,
                             text_column='Processed_ProblemDescription_forDomainExtraction',
                             cluster_range=(6, 10),
                             top_words=7):
-
+    consoleMessage_and_Print("Extracting Problem Domains...")
 
     # Sentence Transformers approach
     model = SentenceTransformer('all-mpnet-base-v2')
@@ -232,8 +227,7 @@ def extract_problem_domains(df,
     df["Problem_Cluster"] = cluster_labels
     df['Problem_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
 
-
-    console_messages.append("Problem Domain Extraction completed.")
+    consoleMessage_and_Print("Problem Domain Extraction completed. Returning from Problem Domain Extraction function.")
     return df, optimal_n_clusters, cluster_representations
 
 
@@ -282,13 +276,13 @@ def text_processing_for_location(text):
 def extract_location_clusters(df,
                               text_column1='Processed_LocationText_forClustering', # Extracted through NLP
                               text_column2='Geographical_Location', # User Input
-                              cluster_range=(
+                              cluster_range=(2, 5),
                               top_words=3):
     # Combine the two text columns
     text_column = "Combined_Location_Text"
     df[text_column] = df[text_column1] + ' ' + df[text_column2]
 
-
+    consoleMessage_and_Print("Extracting Location Clusters...")
 
     # Sentence Transformers approach for embeddings
     model = SentenceTransformer('all-mpnet-base-v2')
@@ -320,7 +314,7 @@ def extract_location_clusters(df,
     df['Location_Category_Words'] = [cluster_representations[label] for label in cluster_labels]
 
     df = df.drop(text_column, axis=1)
-
+    consoleMessage_and_Print("Location Clustering completed.")
     return df, optimal_n_clusters, cluster_representations
 
 
@@ -408,17 +402,14 @@ def generate_project_proposal(prompt):
 
 
 def create_project_proposals(budget_cluster_df, problem_cluster_df, location_clusters, problem_clusters):
-
-    console_messages.append("\n Starting function: create_project_proposals")
+    consoleMessage_and_Print("\n Starting function: create_project_proposals")
     proposals = {}
 
     for loc in budget_cluster_df.index:
-
-        console_messages.append(f"\n loc: {loc}")
+        consoleMessage_and_Print(f"\n loc: {loc}")
 
         for prob in budget_cluster_df.columns:
-
-            print("\n prob: ", prob)
+            consoleMessage_and_Print(f"\n prob: {prob}")
 
             location = ", ".join([item.strip() for item in location_clusters[loc] if item]) # Clean and join
             problem_domain = ", ".join([item.strip() for item in problem_clusters[prob] if item]) # Clean and join
@@ -432,7 +423,7 @@ def create_project_proposals(budget_cluster_df, problem_cluster_df, location_clu
             # Check if problem_descriptions is valid (not NaN and not an empty list)
             if isinstance(problem_descriptions, list) and problem_descriptions:
                 # print(f"\nGenerating proposal for location: {location}, problem domain: {problem_domain}")
-
+                consoleMessage_and_Print(f"Generating PP")
 
                 # Prepare the prompt
                 # problems_summary = "; \n".join(problem_descriptions) # Join all problem descriptions
@@ -531,17 +522,17 @@ def create_project_proposals(budget_cluster_df, problem_cluster_df, location_clu
 
 
 def nlp_pipeline(original_df):
-
+    consoleMessage_and_Print("Starting NLP pipeline...")
 
     # Data Preprocessing
     processed_df = data_pre_processing(original_df) # merged_dataset
 
     # Starting the Pipeline for Domain Extraction
-
+    consoleMessage_and_Print("Executing Text processing function for Domain identification")
     # Apply the text_processing_for_domain function to the DataFrame
     processed_df['Processed_ProblemDescription_forDomainExtraction'] = processed_df['Problem_Description'].apply(text_processing_for_domain)
 
-
+    consoleMessage_and_Print("Removing entries which could not be allocated to any Problem Domain")
     # processed_df = processed_df.dropna(subset=['Processed_ProblemDescription_forDomainExtraction'], axis=0)
     # Drop rows where 'Processed_ProblemDescription_forDomainExtraction' contains empty arrays
     processed_df = processed_df[processed_df['Processed_ProblemDescription_forDomainExtraction'].apply(lambda x: len(x) > 0)]
@@ -549,13 +540,13 @@ def nlp_pipeline(original_df):
     # Domain Clustering
     try:
         processed_df, optimal_n_clusters, problem_clusters = extract_problem_domains(processed_df)
-
+        consoleMessage_and_Print(f"Optimal clusters for Domain extraction: {optimal_n_clusters}")
     except Exception as e:
-
-
+        consoleMessage_and_Print(f"Error in extract_problem_domains: {str(e)}")
+    consoleMessage_and_Print("NLP pipeline for Problem Domain extraction completed.")
 
 
-
+    consoleMessage_and_Print("Starting NLP pipeline for Location extraction with text processing.")
 
     # Apply the text_processing_for_location function to the DataFrame
     processed_df['Processed_LocationText_forClustering'] = processed_df['Problem_Description'].apply(text_processing_for_location)
@@ -564,10 +555,10 @@ def nlp_pipeline(original_df):
     # Location Clustering
     try:
         processed_df, optimal_n_clusters, location_clusters = extract_location_clusters(processed_df)
-
+        consoleMessage_and_Print(f"Optimal clusters for Location extraction: {optimal_n_clusters}")
     except Exception as e:
-
-
+        consoleMessage_and_Print(f"Error in extract_location_clusters: {str(e)}")
+    consoleMessage_and_Print("NLP pipeline for location extraction completed.")
 
 
     # Create cluster dataframes
@@ -585,8 +576,7 @@ def nlp_pipeline(original_df):
     # print("\n problem_clusters_2: ", problem_clusters)
     project_proposals = create_project_proposals(budget_cluster_df, problem_cluster_df, location_clusters, problem_clusters)
 
-
-    print("NLP pipeline completed.")
+    consoleMessage_and_Print("NLP pipeline completed.")
     return processed_df, budget_cluster_df, problem_cluster_df, project_proposals, location_clusters, problem_clusters
 
 
@@ -597,8 +587,15 @@ def nlp_pipeline(original_df):
 
 
 console_messages = []
+def consoleMessage_and_Print(some_text = ""):
+    console_messages.append(some_text)
+    print(some_text)
+
+
+
+
 def process_excel(file):
-
+    consoleMessage_and_Print("Processing starts. Reading the uploaded Excel file...")
     # Ensure the file path is correct
     file_path = file.name if hasattr(file, 'name') else file
     # Read the Excel file
@@ -606,7 +603,7 @@ def process_excel(file):
 
     try:
         # Process the DataFrame
-
+        consoleMessage_and_Print("Processing the DataFrame...")
         processed_df, budget_cluster_df, problem_cluster_df, project_proposals, location_clusters, problem_clusters = nlp_pipeline(df)
         # processed_df, budget_cluster_df, problem_cluster_df, location_clusters, problem_clusters = nlp_pipeline(df)
 
@@ -628,25 +625,25 @@ def process_excel(file):
         # if isinstance(location_clusters, pd.DataFrame):
         #     location_clusters.to_excel(writer, sheet_name='Location_Clusters', index=False)
         # else:
-        #
+        #     consoleMessage_and_Print("Converting Location Clusters to df")
         #     pd.DataFrame(location_clusters).to_excel(writer, sheet_name='Location_Clusters', index=False)
 
         # if isinstance(problem_clusters, pd.DataFrame):
         #     problem_clusters.to_excel(writer, sheet_name='Problem_Clusters', index=False)
         # else:
-        #
+        #     consoleMessage_and_Print("Converting Problem Clusters to df")
         #     pd.DataFrame(problem_clusters).to_excel(writer, sheet_name='Problem_Clusters', index=False)
 
 
 
-
+        consoleMessage_and_Print("Processing completed. Ready for download.")
         return output_filename, "\n".join(console_messages) # Return the processed DataFrame as Excel file
 
     except Exception as e:
         # return str(e) # Return the error message
         # error_message = f"Error processing file: {str(e)}"
         # print(error_message) # Log the error
-
+        consoleMessage_and_Print(f"Error during processing: {str(e)}")
        # return error_message, "Santanu Banerjee" # Return the error message to the user
         return None, "\n".join(console_messages)
 
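The core of the commit is visible in the last hunks: a consoleMessage_and_Print helper is added next to the module-level console_messages list, and the scattered print(...) and console_messages.append(...) calls throughout the pipeline are replaced with it, so every status message both reaches the Space's container log and can be joined and returned to the Gradio UI. Below is a minimal, self-contained sketch of that pattern; demo_pipeline is a hypothetical stand-in for the app's nlp_pipeline, not code from this diff.

# Sketch of the dual-logging pattern introduced by this commit.
console_messages = []  # module-level buffer, as in app.py

def consoleMessage_and_Print(some_text=""):
    """Append a status message to the buffer and echo it to stdout."""
    console_messages.append(some_text)
    print(some_text)

def demo_pipeline():
    # Hypothetical stand-in for nlp_pipeline(): log progress, catch errors.
    consoleMessage_and_Print("Starting demo pipeline...")
    try:
        result = sum(range(10))  # placeholder for the real NLP work
        consoleMessage_and_Print(f"Pipeline result: {result}")
    except Exception as e:
        consoleMessage_and_Print(f"Error during processing: {str(e)}")
    consoleMessage_and_Print("Demo pipeline completed.")
    return "\n".join(console_messages)  # what process_excel returns to the UI

if __name__ == "__main__":
    print(demo_pipeline())

Note that in app.py the helper is defined below the functions that call it; this still works because Python resolves names at call time, so the definition only needs to exist before the Gradio callback actually runs. One caveat of the pattern: the module-level console_messages list is never cleared in this diff, so messages would accumulate across successive uploads within the same process.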