SantanuBanerjee committed on
Commit
b7709fc
·
verified ·
1 Parent(s): 7386d73

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -13
app.py CHANGED
@@ -159,16 +159,17 @@ from transformers import pipeline
159
  # Load a summarization model
160
  summarizer = pipeline("summarization")
161
 
162
-
163
-
164
- def text_processing_for_domain(unsummarized_text):
165
  try:
166
  # Summarization
167
- text = summarizer(unsummarized_text, max_length=70, min_length=30, do_sample=False)[0]['summary_text']
 
168
  except Exception as e:
169
  print(f"Summarization failed: {e}")
170
- text = unsummarized_text
 
171
 
 
172
  # Text Cleaning
173
  text = re.sub(r'[^\w\s]', '', text)
174
  text = re.sub(r'\d+', '', text)
@@ -194,13 +195,31 @@ def text_processing_for_domain(unsummarized_text):
194
  lemmatized_text = ' '.join([token.lemma_ for token in doc])
195
 
196
  return lemmatized_text # Return the cleaned and lemmatized text
 
 
 
 
 
 
 
197
 
198
- # # Apply Hugging Face Transformers
199
- # inputs = tokenizer(lemmatized_text, return_tensors="pt", truncation=False, padding=True)
200
- # with torch.no_grad():
201
- # outputs = model(**inputs)
202
-
203
- # return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
 
 
 
 
 
 
 
 
 
 
 
204
 
205
 
206
 
@@ -393,8 +412,10 @@ def nlp_pipeline(original_df):
393
  processed_df['Processed_ProblemDescription_forDomainExtraction'] = processed_df['Problem_Description'].apply(text_processing_for_domain)
394
 
395
  console_messages.append("Removing entries which could not be allocated to any Problem Domain")
396
- processed_df = processed_df.dropna(subset=['Processed_ProblemDescription_forDomainExtraction'], axis=0)
397
-
 
 
398
 
399
 
400
  # Domain Clustering
 
159
  # Load a summarization model
160
  summarizer = pipeline("summarization")
161
 
162
def Summarized_text(passed_text):
    """Summarize ``passed_text`` with the module-level ``summarizer`` pipeline.

    Returns the ``summary_text`` field of the first result. If anything goes
    wrong while summarizing, the failure is printed and the input text is
    returned unchanged so that callers always get a usable value back.
    """
    try:
        # Deterministic decoding (do_sample=False) with the length bounds
        # max_length=70 / min_length=30 passed straight to the pipeline.
        results = summarizer(passed_text, max_length=70, min_length=30, do_sample=False)
        summary = results[0]['summary_text']
    except Exception as err:
        # Best-effort: summarization is optional, so fall back to the input.
        print(f"Summarization failed: {err}")
        return passed_text
    return summary
170
+ ###### Will uncomment Summarization during final deployment... as it takes a lot of time
171
 
172
+ def Lemmatize_text(text):
173
  # Text Cleaning
174
  text = re.sub(r'[^\w\s]', '', text)
175
  text = re.sub(r'\d+', '', text)
 
195
  lemmatized_text = ' '.join([token.lemma_ for token in doc])
196
 
197
  return lemmatized_text # Return the cleaned and lemmatized text
198
+
199
+
200
+ from random import random
201
def text_processing_for_domain(text, *, summarize=False):
    """Build the text used for problem-domain extraction from one description.

    The input is lemmatized with ``Lemmatize_text``; when ``summarize`` is
    true it is also summarized with ``Summarized_text`` and the two strings
    are joined with a single space.

    Args:
        text: Raw problem description. Non-string values (``None``, NaN from
            a missing cell, numbers) are never passed on to the text helpers.
        summarize: Also run the summarization model. Off by default because
            summarization takes a lot of time; meant to be switched on for
            final deployment.

    Returns:
        The processed text, or the fail-safe phrase
        ``"Sustainability and Longevity"`` when nothing usable was produced.
    """
    fail_safe = "Sustainability and Longevity"  # Default FailSafe

    # This function is applied to a DataFrame column, where missing cells can
    # arrive as None/NaN. The regex clean-up at the top of Lemmatize_text
    # raises TypeError on non-strings, so short-circuit to the fail-safe.
    if not isinstance(text, str):
        return fail_safe

    summarized_text = Summarized_text(text) if summarize else ""
    # Lemmatize the original text, not the summary.
    lemmatized_text = Lemmatize_text(text)

    if lemmatized_text and summarized_text:
        # NOTE(review): the join order is chosen at random on every call, so
        # the output is not reproducible between runs once summarization is
        # enabled -- confirm this is intended.
        if random() > 0.5:
            return summarized_text + " " + lemmatized_text
        return lemmatized_text + " " + summarized_text

    # At most one of the two is non-empty here; use the fail-safe when both
    # are empty.
    return summarized_text or lemmatized_text or fail_safe
223
 
224
 
225
 
 
412
  processed_df['Processed_ProblemDescription_forDomainExtraction'] = processed_df['Problem_Description'].apply(text_processing_for_domain)
413
 
414
  console_messages.append("Removing entries which could not be allocated to any Problem Domain")
415
+ # processed_df = processed_df.dropna(subset=['Processed_ProblemDescription_forDomainExtraction'], axis=0)
416
+ # Drop rows where 'Processed_ProblemDescription_forDomainExtraction' is empty (zero-length text)
417
+ processed_df = processed_df[processed_df['Processed_ProblemDescription_forDomainExtraction'].apply(lambda x: len(x) > 0)]
418
+
419
 
420
 
421
  # Domain Clustering