Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -159,16 +159,17 @@ from transformers import pipeline
|
|
159 |
# Load a summarization model
|
160 |
summarizer = pipeline("summarization")
|
161 |
|
162 |
-
|
163 |
-
|
164 |
-
def text_processing_for_domain(unsummarized_text):
|
165 |
try:
|
166 |
# Summarization
|
167 |
-
|
|
|
168 |
except Exception as e:
|
169 |
print(f"Summarization failed: {e}")
|
170 |
-
|
|
|
171 |
|
|
|
172 |
# Text Cleaning
|
173 |
text = re.sub(r'[^\w\s]', '', text)
|
174 |
text = re.sub(r'\d+', '', text)
|
@@ -194,13 +195,31 @@ def text_processing_for_domain(unsummarized_text):
|
|
194 |
lemmatized_text = ' '.join([token.lemma_ for token in doc])
|
195 |
|
196 |
return lemmatized_text # Return the cleaned and lemmatized text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
197 |
|
198 |
-
#
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
204 |
|
205 |
|
206 |
|
@@ -393,8 +412,10 @@ def nlp_pipeline(original_df):
|
|
393 |
processed_df['Processed_ProblemDescription_forDomainExtraction'] = processed_df['Problem_Description'].apply(text_processing_for_domain)
|
394 |
|
395 |
console_messages.append("Removing entries which could not be allocated to any Problem Domain")
|
396 |
-
processed_df = processed_df.dropna(subset=['Processed_ProblemDescription_forDomainExtraction'], axis=0)
|
397 |
-
|
|
|
|
|
398 |
|
399 |
|
400 |
# Domain Clustering
|
|
|
159 |
# Load a summarization model
|
160 |
summarizer = pipeline("summarization")
|
161 |
|
162 |
+
def Summarized_text(passed_text):
    """Summarize *passed_text* using the module-level `summarizer` pipeline.

    Falls back to returning the input text unchanged when summarization
    fails, so callers always receive usable text.
    """
    try:
        # Summarization: bounded output length keeps downstream work fast.
        result = summarizer(
            passed_text, max_length=70, min_length=30, do_sample=False
        )
        return result[0]['summary_text']
    except Exception as e:
        # Best-effort: report the failure and hand back the raw text
        # rather than crashing the whole processing pipeline.
        print(f"Summarization failed: {e}")
        return passed_text
###### Will uncomment Summarization during final deployment... as it takes a lot of time
|
171 |
|
172 |
+
def Lemmatize_text(text):
|
173 |
# Text Cleaning
|
174 |
text = re.sub(r'[^\w\s]', '', text)
|
175 |
text = re.sub(r'\d+', '', text)
|
|
|
195 |
lemmatized_text = ' '.join([token.lemma_ for token in doc])
|
196 |
|
197 |
return lemmatized_text # Return the cleaned and lemmatized text
|
198 |
+
|
199 |
+
|
200 |
+
from random import random
|
201 |
+
def text_processing_for_domain(text):
    """Prepare a problem description for domain extraction.

    Lemmatizes (and, when enabled, summarizes) *text* and returns the
    processed string. A fixed default phrase is returned when every stage
    yields nothing, so downstream domain clustering always gets a
    non-empty input.
    """
    # Summarization is disabled for now — presumably too slow for dev runs;
    # the commented call is re-enabled for final deployment (see note above
    # Summarized_text).
    summarized_text = ""
    # summarized_text = Summarized_text(text)

    # Lemmatize the original (un-summarized) text.
    lemmatized_text = Lemmatize_text(text)

    if summarized_text and lemmatized_text:
        # Deterministic order. The previous version flipped a random coin
        # to decide which part came first, which made the pipeline
        # non-reproducible for identical inputs.
        return summarized_text + " " + lemmatized_text
    if summarized_text:
        return summarized_text
    if lemmatized_text:
        return lemmatized_text
    return "Sustainability and Longevity"  # Default FailSafe
|
223 |
|
224 |
|
225 |
|
|
|
412 |
processed_df['Processed_ProblemDescription_forDomainExtraction'] = processed_df['Problem_Description'].apply(text_processing_for_domain)
|
413 |
|
414 |
console_messages.append("Removing entries which could not be allocated to any Problem Domain")
|
415 |
+
# processed_df = processed_df.dropna(subset=['Processed_ProblemDescription_forDomainExtraction'], axis=0)
|
416 |
+
# Drop rows where 'Processed_ProblemDescription_forDomainExtraction' contains empty arrays
|
417 |
+
processed_df = processed_df[processed_df['Processed_ProblemDescription_forDomainExtraction'].apply(lambda x: len(x) > 0)]
|
418 |
+
|
419 |
|
420 |
|
421 |
# Domain Clustering
|