Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -154,7 +154,6 @@ nltk.download('averaged_perceptron_tagger')
|
|
154 |
|
155 |
|
156 |
def text_processing_for_domain(text):
|
157 |
-
console_messages.append("Entering Text processing function for Domain identification")
|
158 |
|
159 |
# Text Cleaning
|
160 |
text = re.sub(r'[^\w\s]', '', text)
|
@@ -179,14 +178,17 @@ def text_processing_for_domain(text):
|
|
179 |
# Lemmatize tokens using SpaCy
|
180 |
doc = nlp(' '.join(tokens))
|
181 |
lemmatized_text = ' '.join([token.lemma_ for token in doc])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
|
183 |
-
# Apply Hugging Face Transformers
|
184 |
-
inputs = tokenizer(lemmatized_text, return_tensors="pt", truncation=False, padding=True)
|
185 |
-
with torch.no_grad():
|
186 |
-
outputs = model(**inputs)
|
187 |
|
188 |
-
console_messages.append("Exiting Text processing function for Domain identification")
|
189 |
-
return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
|
190 |
|
191 |
|
192 |
|
@@ -237,10 +239,10 @@ import numpy as np
|
|
237 |
|
238 |
|
239 |
def extract_problem_domains(df,
|
240 |
-
|
241 |
-
text_column='Problem_Description',
|
242 |
cluster_range=(10, 50),
|
243 |
-
top_words=
|
244 |
method='sentence_transformers'
|
245 |
# method='tfidf_kmeans'
|
246 |
):
|
@@ -373,8 +375,10 @@ def nlp_pipeline(original_df):
|
|
373 |
|
374 |
|
375 |
# Starting the Pipeline for Domain Extraction
|
|
|
376 |
# Apply the text_processing_for_domain function to the DataFrame
|
377 |
processed_df['Processed_ProblemDescription_forDomainExtraction'] = processed_df['Problem_Description'].apply(text_processing_for_domain)
|
|
|
378 |
|
379 |
|
380 |
# Domain Clustering
|
|
|
154 |
|
155 |
|
156 |
def text_processing_for_domain(text):
|
|
|
157 |
|
158 |
# Text Cleaning
|
159 |
text = re.sub(r'[^\w\s]', '', text)
|
|
|
178 |
# Lemmatize tokens using SpaCy
|
179 |
doc = nlp(' '.join(tokens))
|
180 |
lemmatized_text = ' '.join([token.lemma_ for token in doc])
|
181 |
+
|
182 |
+
return lemmatized_text # Return the cleaned and lemmatized text
|
183 |
+
|
184 |
+
# # Apply Hugging Face Transformers
|
185 |
+
# inputs = tokenizer(lemmatized_text, return_tensors="pt", truncation=False, padding=True)
|
186 |
+
# with torch.no_grad():
|
187 |
+
# outputs = model(**inputs)
|
188 |
+
|
189 |
+
# return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
|
190 |
|
|
|
|
|
|
|
|
|
191 |
|
|
|
|
|
192 |
|
193 |
|
194 |
|
|
|
239 |
|
240 |
|
241 |
def extract_problem_domains(df,
|
242 |
+
text_column='Processed_ProblemDescription_forDomainExtraction',
|
243 |
+
# text_column='Problem_Description',
|
244 |
cluster_range=(10, 50),
|
245 |
+
top_words=30,
|
246 |
method='sentence_transformers'
|
247 |
# method='tfidf_kmeans'
|
248 |
):
|
|
|
375 |
|
376 |
|
377 |
# Starting the Pipeline for Domain Extraction
|
378 |
+
console_messages.append("Entering Text processing function for Domain identification")
|
379 |
# Apply the text_processing_for_domain function to the DataFrame
|
380 |
processed_df['Processed_ProblemDescription_forDomainExtraction'] = processed_df['Problem_Description'].apply(text_processing_for_domain)
|
381 |
+
console_messages.append("Exiting Text processing function for Domain identification")
|
382 |
|
383 |
|
384 |
# Domain Clustering
|