Update app.py
app.py CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 import pandas as pd
 
-def data_pre_processing(file_responses, console_messages):
+def data_pre_processing(file_responses):
     console_messages.append("Starting data pre-processing...")
     # Financial Weights can be anything (ultimately the row-wise weights are aggregated and the corresponding fractions are obtained from that row's total tax paid)
 
@@ -65,12 +65,12 @@ def data_pre_processing(file_responses, console_messages):
         # Different return can be used to check the processing
         console_messages.append("Data pre-processing completed.")
         # return file_responses
-        return merged_dataset, console_messages
+        return merged_dataset
 
     except Exception as e:
         console_messages.append(f"Error during data pre-processing: {str(e)}")
         # return str(e), console_messages
-        return None, console_messages
+        return None
 
 
 
@@ -146,6 +146,8 @@ nltk.download('averaged_perceptron_tagger')
 
 
 def text_processing_for_domain(text):
+    console_messages.append("Entering Text processing function for Domain identification")
+
     # Text Cleaning
     text = re.sub(r'[^\w\s]', '', text)
     text = re.sub(r'\d+', '', text)
@@ -174,7 +176,8 @@ def text_processing_for_domain(text):
     inputs = tokenizer(lemmatized_text, return_tensors="pt", truncation=False, padding=True)
     with torch.no_grad():
         outputs = model(**inputs)
-
+
+    console_messages.append("Exiting Text processing function for Domain identification")
     return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
 
 
@@ -226,7 +229,6 @@ import numpy as np
 
 
 def extract_problem_domains(df,
-                            console_messages,
                             text_column='Problem_Description',
                             cluster_range=(10, 50),
                             top_words=17,
@@ -354,11 +356,11 @@ def extract_problem_domains(df,
 
 
 # def nlp_pipeline(original_df):
-def nlp_pipeline(original_df, console_messages):
+def nlp_pipeline(original_df):
     console_messages.append("Starting NLP pipeline...")
 
     # Data Preprocessing
-    processed_df, console_messages = data_pre_processing(original_df, console_messages)
+    processed_df = data_pre_processing(original_df) # merged_dataset
 
 
     # Starting the Pipeline for Domain Extraction
@@ -368,18 +370,18 @@ def nlp_pipeline(original_df, console_messages):
 
     # Domain Clustering
     try:
-        domain_df, optimal_n_clusters = extract_problem_domains(processed_df, console_messages)
+        domain_df, optimal_n_clusters = extract_problem_domains(processed_df)
         # print(f"Optimal clusters: {optimal_clusters}")
         # print(result_df.head())
         # console_messages.append(f"Optimal clusters: {optimal_n_clusters}")
 
         console_messages.append("NLP pipeline completed.")
-        return domain_df, console_messages
+        return domain_df
     except Exception as e:
         # print(f"Error in extract_problem_domains: {e}")
         console_messages.append(f"Error in extract_problem_domains: {str(e)}")
-        return processed_df, console_messages
-        # return domain_df, console_messages
+        return processed_df
+        # return domain_df
 
 
     # problem_clusters, problem_model = perform_clustering(processed_df['Problem_Description'], n_clusters=10)
@@ -388,9 +390,8 @@ def nlp_pipeline(original_df, console_messages):
 
 
 
-
+console_messages = []
 def process_excel(file):
-    console_messages = []
     console_messages.append("Processing starts. Reading the uploaded Excel file...")
     # Ensure the file path is correct
     file_path = file.name if hasattr(file, 'name') else file
@@ -400,7 +401,7 @@ def process_excel(file):
     try:
         # Process the DataFrame
         console_messages.append("Processing the DataFrame...")
-        result_df, console_messages = nlp_pipeline(df, console_messages)
+        result_df = nlp_pipeline(df)
 
         # output_file = "Output_ProjectProposals.xlsx"
         output_file = "Output_Proposals.xlsx"
@@ -445,7 +446,7 @@ interface = gr.Interface(
 
     outputs=[
         gr.File(label="Download the processed Excel File containing the ** Project Proposals ** for each Location~Problem paired combination"), # File download output
-        gr.Textbox(label="Console Messages", lines=
+        gr.Textbox(label="Console Messages", lines=100, interactive=False) # Console messages output
     ],
 
 
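The comment at the top of data_pre_processing only gestures at the weighting scheme. As a hypothetical illustration of what "row-wise weights are aggregated and the corresponding fractions are obtained from that row's total tax paid" could look like in pandas (all column names here are invented, not taken from app.py):

import pandas as pd

# Invented columns: two financial weights per row plus that row's total tax paid.
df = pd.DataFrame({
    "Weight_A": [3, 1],
    "Weight_B": [1, 1],
    "Total_Tax_Paid": [1000, 500],
})

weight_cols = ["Weight_A", "Weight_B"]
row_totals = df[weight_cols].sum(axis=1)             # aggregate the row-wise weights
fractions = df[weight_cols].div(row_totals, axis=0)  # each weight as a fraction of its row's total
allocations = fractions.mul(df["Total_Tax_Paid"], axis=0)
print(allocations)  # row 0 -> A: 750.0, B: 250.0; row 1 -> A: 250.0, B: 250.0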
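Taken together, the commit replaces explicit console_messages threading (parameter in, value out) with a single module-level list that every stage appends to and that process_excel finally hands to the Textbox output. A minimal self-contained sketch of that pattern, with placeholder stage bodies (the real pre-processing, NLP, and clustering logic is elided; only the structure mirrors app.py):

import gradio as gr
import pandas as pd

# One module-level log shared by every stage, as in the updated app.py.
console_messages = []

def data_pre_processing(file_responses):
    console_messages.append("Starting data pre-processing...")
    return file_responses  # placeholder for the real merging logic

def nlp_pipeline(original_df):
    console_messages.append("Starting NLP pipeline...")
    processed_df = data_pre_processing(original_df)
    console_messages.append("NLP pipeline completed.")
    return processed_df

def process_excel(file):
    console_messages.clear()  # not in the commit: without this, logs accumulate across runs
    file_path = file.name if hasattr(file, 'name') else file
    df = pd.read_excel(file_path)
    result_df = nlp_pipeline(df)
    output_file = "Output_Proposals.xlsx"
    result_df.to_excel(output_file, index=False)
    return output_file, "\n".join(console_messages)

interface = gr.Interface(
    fn=process_excel,
    inputs=gr.File(label="Upload Excel file"),
    outputs=[
        gr.File(label="Download the processed Excel file"),
        gr.Textbox(label="Console Messages", lines=100, interactive=False),
    ],
)

if __name__ == "__main__":
    interface.launch()

A global list is the simplest option, but it is shared across concurrent users of the Space; per-request state (for example, returning the messages as the old signatures did, or gr.State) would avoid interleaved logs.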