SantanuBanerjee committed on
Commit
a2fcce4
·
verified ·
1 Parent(s): ba1e210

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -15
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import gradio as gr
2
  import pandas as pd
3
 
4
- def data_pre_processing(file_responses, console_messages):
5
  console_messages.append("Starting data pre-processing...")
6
  # Financial Weights can be anything (ultimately the row-wise weights are aggregated and the corresponding fractions are obtained from that rows' total tax payed)
7
 
@@ -65,12 +65,12 @@ def data_pre_processing(file_responses, console_messages):
65
  # Different return can be used to check the processing
66
  console_messages.append("Data pre-processing completed.")
67
  # return file_responses
68
- return merged_dataset, console_messages
69
 
70
  except Exception as e:
71
  console_messages.append(f"Error during data pre-processing: {str(e)}")
72
  # return str(e), console_messages
73
- return None, console_messages
74
 
75
 
76
 
@@ -146,6 +146,8 @@ nltk.download('averaged_perceptron_tagger')
146
 
147
 
148
  def text_processing_for_domain(text):
 
 
149
  # Text Cleaning
150
  text = re.sub(r'[^\w\s]', '', text)
151
  text = re.sub(r'\d+', '', text)
@@ -174,7 +176,8 @@ def text_processing_for_domain(text):
174
  inputs = tokenizer(lemmatized_text, return_tensors="pt", truncation=False, padding=True)
175
  with torch.no_grad():
176
  outputs = model(**inputs)
177
-
 
178
  return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
179
 
180
 
@@ -226,7 +229,6 @@ import numpy as np
226
 
227
 
228
  def extract_problem_domains(df,
229
- console_messages,
230
  text_column='Problem_Description',
231
  cluster_range=(10, 50),
232
  top_words=17,
@@ -354,11 +356,11 @@ def extract_problem_domains(df,
354
 
355
 
356
  # def nlp_pipeline(original_df):
357
- def nlp_pipeline(original_df, console_messages):
358
  console_messages.append("Starting NLP pipeline...")
359
 
360
  # Data Preprocessing
361
- processed_df, console_messages = data_pre_processing(original_df, console_messages) # merged_dataset
362
 
363
 
364
  # Starting the Pipeline for Domain Extraction
@@ -368,18 +370,18 @@ def nlp_pipeline(original_df, console_messages):
368
 
369
  # Domain Clustering
370
  try:
371
- domain_df, optimal_n_clusters = extract_problem_domains(processed_df, console_messages)
372
  # print(f"Optimal clusters: {optimal_clusters}")
373
  # print(result_df.head())
374
  # console_messages.append(f"Optimal clusters: {optimal_n_clusters}")
375
 
376
  console_messages.append("NLP pipeline completed.")
377
- return domain_df, console_messages
378
  except Exception as e:
379
  # print(f"Error in extract_problem_domains: {e}")
380
  console_messages.append(f"Error in extract_problem_domains: {str(e)}")
381
- return processed_df, console_messages
382
- # return domain_df, console_messages
383
 
384
 
385
  # problem_clusters, problem_model = perform_clustering(processed_df['Problem_Description'], n_clusters=10)
@@ -388,9 +390,8 @@ def nlp_pipeline(original_df, console_messages):
388
 
389
 
390
 
391
-
392
  def process_excel(file):
393
- console_messages = []
394
  console_messages.append("Processing starts. Reading the uploaded Excel file...")
395
  # Ensure the file path is correct
396
  file_path = file.name if hasattr(file, 'name') else file
@@ -400,7 +401,7 @@ def process_excel(file):
400
  try:
401
  # Process the DataFrame
402
  console_messages.append("Processing the DataFrame...")
403
- result_df, console_messages = nlp_pipeline(df, console_messages)
404
 
405
  # output_file = "Output_ProjectProposals.xlsx"
406
  output_file = "Output_Proposals.xlsx"
@@ -445,7 +446,7 @@ interface = gr.Interface(
445
 
446
  outputs=[
447
  gr.File(label="Download the processed Excel File containing the ** Project Proposals ** for each Location~Problem paired combination"), # File download output
448
- gr.Textbox(label="Console Messages", lines=10, interactive=False) # Console messages output
449
  ],
450
 
451
 
 
1
  import gradio as gr
2
  import pandas as pd
3
 
4
+ def data_pre_processing(file_responses):
5
  console_messages.append("Starting data pre-processing...")
6
  # Financial Weights can be anything (ultimately the row-wise weights are aggregated and the corresponding fractions are obtained from that rows' total tax payed)
7
 
 
65
  # Different return can be used to check the processing
66
  console_messages.append("Data pre-processing completed.")
67
  # return file_responses
68
+ return merged_dataset
69
 
70
  except Exception as e:
71
  console_messages.append(f"Error during data pre-processing: {str(e)}")
72
  # return str(e), console_messages
73
+ return None
74
 
75
 
76
 
 
146
 
147
 
148
  def text_processing_for_domain(text):
149
+ console_messages.append("Entering Text processing function for Domain identification")
150
+
151
  # Text Cleaning
152
  text = re.sub(r'[^\w\s]', '', text)
153
  text = re.sub(r'\d+', '', text)
 
176
  inputs = tokenizer(lemmatized_text, return_tensors="pt", truncation=False, padding=True)
177
  with torch.no_grad():
178
  outputs = model(**inputs)
179
+
180
+ console_messages.append("Exiting Text processing function for Domain identification")
181
  return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
182
 
183
 
 
229
 
230
 
231
  def extract_problem_domains(df,
 
232
  text_column='Problem_Description',
233
  cluster_range=(10, 50),
234
  top_words=17,
 
356
 
357
 
358
  # def nlp_pipeline(original_df):
359
+ def nlp_pipeline(original_df):
360
  console_messages.append("Starting NLP pipeline...")
361
 
362
  # Data Preprocessing
363
+ processed_df = data_pre_processing(original_df) # merged_dataset
364
 
365
 
366
  # Starting the Pipeline for Domain Extraction
 
370
 
371
  # Domain Clustering
372
  try:
373
+ domain_df, optimal_n_clusters = extract_problem_domains(processed_df)
374
  # print(f"Optimal clusters: {optimal_clusters}")
375
  # print(result_df.head())
376
  # console_messages.append(f"Optimal clusters: {optimal_n_clusters}")
377
 
378
  console_messages.append("NLP pipeline completed.")
379
+ return domain_df
380
  except Exception as e:
381
  # print(f"Error in extract_problem_domains: {e}")
382
  console_messages.append(f"Error in extract_problem_domains: {str(e)}")
383
+ return processed_df
384
+ # return domain_df
385
 
386
 
387
  # problem_clusters, problem_model = perform_clustering(processed_df['Problem_Description'], n_clusters=10)
 
390
 
391
 
392
 
393
+ console_messages = []
394
  def process_excel(file):
 
395
  console_messages.append("Processing starts. Reading the uploaded Excel file...")
396
  # Ensure the file path is correct
397
  file_path = file.name if hasattr(file, 'name') else file
 
401
  try:
402
  # Process the DataFrame
403
  console_messages.append("Processing the DataFrame...")
404
+ result_df = nlp_pipeline(df)
405
 
406
  # output_file = "Output_ProjectProposals.xlsx"
407
  output_file = "Output_Proposals.xlsx"
 
446
 
447
  outputs=[
448
  gr.File(label="Download the processed Excel File containing the ** Project Proposals ** for each Location~Problem paired combination"), # File download output
449
+ gr.Textbox(label="Console Messages", lines=100, interactive=False) # Console messages output
450
  ],
451
 
452