Update app.py
app.py CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 import pandas as pd
 
-def data_pre_processing(file_responses, console_messages):
+def data_pre_processing(file_responses):
     console_messages.append("Starting data pre-processing...")
     # Financial Weights can be anything (ultimately the row-wise weights are aggregated and the corresponding fractions are obtained from that row's total tax paid)
 
@@ -65,12 +65,12 @@ def data_pre_processing(file_responses, console_messages):
         # Different return can be used to check the processing
         console_messages.append("Data pre-processing completed.")
         # return file_responses
-        return merged_dataset, console_messages
+        return merged_dataset
 
     except Exception as e:
         console_messages.append(f"Error during data pre-processing: {str(e)}")
         # return str(e), console_messages
-        return None, console_messages
+        return None
 
 
 
@@ -146,6 +146,8 @@ nltk.download('averaged_perceptron_tagger')
 
 
 def text_processing_for_domain(text):
+    console_messages.append("Entering Text processing function for Domain identification")
+
     # Text Cleaning
     text = re.sub(r'[^\w\s]', '', text)
     text = re.sub(r'\d+', '', text)
@@ -174,7 +176,8 @@ def text_processing_for_domain(text):
     inputs = tokenizer(lemmatized_text, return_tensors="pt", truncation=False, padding=True)
     with torch.no_grad():
         outputs = model(**inputs)
-
+
+    console_messages.append("Exiting Text processing function for Domain identification")
     return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
 
 
@@ -226,7 +229,6 @@ import numpy as np
 
 
 def extract_problem_domains(df,
-                            console_messages,
                             text_column='Problem_Description',
                             cluster_range=(10, 50),
                             top_words=17,
@@ -354,11 +356,11 @@ def extract_problem_domains(df,
 
 
 # def nlp_pipeline(original_df):
-def nlp_pipeline(original_df, console_messages):
+def nlp_pipeline(original_df):
     console_messages.append("Starting NLP pipeline...")
 
     # Data Preprocessing
-    processed_df, console_messages = data_pre_processing(original_df, console_messages)
+    processed_df = data_pre_processing(original_df) # merged_dataset
 
 
     # Starting the Pipeline for Domain Extraction
@@ -368,18 +370,18 @@ def nlp_pipeline(original_df, console_messages):
 
     # Domain Clustering
     try:
-        domain_df, optimal_n_clusters = extract_problem_domains(processed_df, console_messages)
+        domain_df, optimal_n_clusters = extract_problem_domains(processed_df)
         # print(f"Optimal clusters: {optimal_clusters}")
         # print(result_df.head())
         # console_messages.append(f"Optimal clusters: {optimal_n_clusters}")
 
         console_messages.append("NLP pipeline completed.")
-        return domain_df, console_messages
+        return domain_df
     except Exception as e:
         # print(f"Error in extract_problem_domains: {e}")
         console_messages.append(f"Error in extract_problem_domains: {str(e)}")
-        return processed_df, console_messages
-        # return domain_df, console_messages
+        return processed_df
+        # return domain_df
 
 
     # problem_clusters, problem_model = perform_clustering(processed_df['Problem_Description'], n_clusters=10)
@@ -388,9 +390,8 @@ def nlp_pipeline(original_df, console_messages):
 
 
 
-
+console_messages = []
 def process_excel(file):
-    console_messages = []
     console_messages.append("Processing starts. Reading the uploaded Excel file...")
     # Ensure the file path is correct
     file_path = file.name if hasattr(file, 'name') else file
@@ -400,7 +401,7 @@ def process_excel(file):
     try:
         # Process the DataFrame
         console_messages.append("Processing the DataFrame...")
-        result_df, console_messages = nlp_pipeline(df, console_messages)
+        result_df = nlp_pipeline(df)
 
         # output_file = "Output_ProjectProposals.xlsx"
         output_file = "Output_Proposals.xlsx"
@@ -445,7 +446,7 @@ interface = gr.Interface(
 
     outputs=[
         gr.File(label="Download the processed Excel File containing the ** Project Proposals ** for each Location~Problem paired combination"), # File download output
-        gr.Textbox(label="Console Messages", lines=
+        gr.Textbox(label="Console Messages", lines=100, interactive=False) # Console messages output
     ],
 
 
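The comment at the top of data_pre_processing only gestures at the weighting scheme. As a hypothetical illustration of what "row-wise weights are aggregated and the corresponding fractions are obtained from that row's total tax paid" could look like in pandas (all column names here are invented, not taken from app.py):

import pandas as pd

# Invented columns: two financial weights per row plus that row's total tax paid.
df = pd.DataFrame({
    "Weight_A": [3, 1],
    "Weight_B": [1, 1],
    "Total_Tax_Paid": [1000, 500],
})

weight_cols = ["Weight_A", "Weight_B"]
row_totals = df[weight_cols].sum(axis=1)             # aggregate the row-wise weights
fractions = df[weight_cols].div(row_totals, axis=0)  # each weight as a fraction of its row's total
allocations = fractions.mul(df["Total_Tax_Paid"], axis=0)
print(allocations)  # row 0 -> A: 750.0, B: 250.0; row 1 -> A: 250.0, B: 250.0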
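Taken together, the commit replaces explicit console_messages threading (parameter in, value out) with a single module-level list that every stage appends to and that process_excel finally hands to the Textbox output. A minimal self-contained sketch of that pattern, with placeholder stage bodies (the real pre-processing, NLP, and clustering logic is elided; only the structure mirrors app.py):

import gradio as gr
import pandas as pd

# One module-level log shared by every stage, as in the updated app.py.
console_messages = []

def data_pre_processing(file_responses):
    console_messages.append("Starting data pre-processing...")
    return file_responses  # placeholder for the real merging logic

def nlp_pipeline(original_df):
    console_messages.append("Starting NLP pipeline...")
    processed_df = data_pre_processing(original_df)
    console_messages.append("NLP pipeline completed.")
    return processed_df

def process_excel(file):
    console_messages.clear()  # not in the commit: without this, logs accumulate across runs
    file_path = file.name if hasattr(file, 'name') else file
    df = pd.read_excel(file_path)
    result_df = nlp_pipeline(df)
    output_file = "Output_Proposals.xlsx"
    result_df.to_excel(output_file, index=False)
    return output_file, "\n".join(console_messages)

interface = gr.Interface(
    fn=process_excel,
    inputs=gr.File(label="Upload Excel file"),
    outputs=[
        gr.File(label="Download the processed Excel file"),
        gr.Textbox(label="Console Messages", lines=100, interactive=False),
    ],
)

if __name__ == "__main__":
    interface.launch()

A global list is the simplest option, but it is shared across concurrent users of the Space; per-request state (for example, returning the messages as the old signatures did, or gr.State) would avoid interleaved logs.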