SantanuBanerjee committed on
Commit
8c34617
·
verified ·
1 Parent(s): 2b24dfb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -10
app.py CHANGED
@@ -154,7 +154,6 @@ nltk.download('averaged_perceptron_tagger')
154
 
155
 
156
  def text_processing_for_domain(text):
157
- console_messages.append("Entering Text processing function for Domain identification")
158
 
159
  # Text Cleaning
160
  text = re.sub(r'[^\w\s]', '', text)
@@ -179,14 +178,17 @@ def text_processing_for_domain(text):
179
  # Lemmatize tokens using SpaCy
180
  doc = nlp(' '.join(tokens))
181
  lemmatized_text = ' '.join([token.lemma_ for token in doc])
 
 
 
 
 
 
 
 
 
182
 
183
- # Apply Hugging Face Transformers
184
- inputs = tokenizer(lemmatized_text, return_tensors="pt", truncation=False, padding=True)
185
- with torch.no_grad():
186
- outputs = model(**inputs)
187
 
188
- console_messages.append("Exiting Text processing function for Domain identification")
189
- return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
190
 
191
 
192
 
@@ -237,10 +239,10 @@ import numpy as np
237
 
238
 
239
  def extract_problem_domains(df,
240
- # text_column='Processed_ProblemDescription_forDomainExtraction',
241
- text_column='Problem_Description',
242
  cluster_range=(10, 50),
243
- top_words=17,
244
  method='sentence_transformers'
245
  # method='tfidf_kmeans'
246
  ):
@@ -373,8 +375,10 @@ def nlp_pipeline(original_df):
373
 
374
 
375
  # Starting the Pipeline for Domain Extraction
 
376
  # Apply the text_processing_for_domain function to the DataFrame
377
  processed_df['Processed_ProblemDescription_forDomainExtraction'] = processed_df['Problem_Description'].apply(text_processing_for_domain)
 
378
 
379
 
380
  # Domain Clustering
 
154
 
155
 
156
  def text_processing_for_domain(text):
 
157
 
158
  # Text Cleaning
159
  text = re.sub(r'[^\w\s]', '', text)
 
178
  # Lemmatize tokens using SpaCy
179
  doc = nlp(' '.join(tokens))
180
  lemmatized_text = ' '.join([token.lemma_ for token in doc])
181
+
182
+ return lemmatized_text # Return the cleaned and lemmatized text
183
+
184
+ # # Apply Hugging Face Transformers
185
+ # inputs = tokenizer(lemmatized_text, return_tensors="pt", truncation=False, padding=True)
186
+ # with torch.no_grad():
187
+ # outputs = model(**inputs)
188
+
189
+ # return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
190
 
 
 
 
 
191
 
 
 
192
 
193
 
194
 
 
239
 
240
 
241
  def extract_problem_domains(df,
242
+ text_column='Processed_ProblemDescription_forDomainExtraction',
243
+ # text_column='Problem_Description',
244
  cluster_range=(10, 50),
245
+ top_words=30,
246
  method='sentence_transformers'
247
  # method='tfidf_kmeans'
248
  ):
 
375
 
376
 
377
  # Starting the Pipeline for Domain Extraction
378
+ console_messages.append("Entering Text processing function for Domain identification")
379
  # Apply the text_processing_for_domain function to the DataFrame
380
  processed_df['Processed_ProblemDescription_forDomainExtraction'] = processed_df['Problem_Description'].apply(text_processing_for_domain)
381
+ console_messages.append("Exiting Text processing function for Domain identification")
382
 
383
 
384
  # Domain Clustering