Spaces:

SantanuBanerjee
/

TaxDirection

Sleeping

App Files Files Community

SantanuBanerjee committed on Aug 4, 2024

Commit

b7709fc

verified ·

1 Parent(s): 7386d73

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -13

app.py CHANGED Viewed

@@ -159,16 +159,17 @@ from transformers import pipeline
 # Load a summarization model
 summarizer = pipeline("summarization")
-def text_processing_for_domain(unsummarized_text):
     try:
         # Summarization
-        text = summarizer(unsummarized_text, max_length=70, min_length=30, do_sample=False)[0]['summary_text']
     except Exception as e:
         print(f"Summarization failed: {e}")
-        text = unsummarized_text
     # Text Cleaning
     text = re.sub(r'[^\w\s]', '', text)
     text = re.sub(r'\d+', '', text)
@@ -194,13 +195,31 @@ def text_processing_for_domain(unsummarized_text):
     lemmatized_text = ' '.join([token.lemma_ for token in doc])
     return lemmatized_text  # Return the cleaned and lemmatized text
-    # # Apply Hugging Face Transformers
-    # inputs = tokenizer(lemmatized_text, return_tensors="pt", truncation=False, padding=True)
-    # with torch.no_grad():
-    #     outputs = model(**inputs)
-    # return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
@@ -393,8 +412,10 @@ def nlp_pipeline(original_df):
     processed_df['Processed_ProblemDescription_forDomainExtraction'] = processed_df['Problem_Description'].apply(text_processing_for_domain)
     console_messages.append("Removing entries which could not be allocated to any Problem Domain")
-    processed_df = processed_df.dropna(subset=['Processed_ProblemDescription_forDomainExtraction'], axis=0)
     # Domain Clustering

 # Load a summarization model
 summarizer = pipeline("summarization")
def Summarized_text(passed_text):
    """Return a summary of ``passed_text``, or the text itself on failure.

    The module-level ``summarizer`` is called with ``max_length=70``,
    ``min_length=30`` and ``do_sample=False``; the ``'summary_text'`` entry
    of its first result is returned. If anything in that step raises, the
    error is printed and the unmodified input is handed back instead.
    """
    try:
        results = summarizer(
            passed_text,
            max_length=70,
            min_length=30,
            do_sample=False,
        )
        summary = results[0]['summary_text']
    except Exception as error:
        # Best-effort: a failed summarization must not stop processing.
        print(f"Summarization failed: {error}")
        return passed_text
    return summary
    # NOTE: summarization takes a long time, so its use is meant to stay
    # commented out until final deployment.
+def Lemmatize_text(text):
     # Text Cleaning
     text = re.sub(r'[^\w\s]', '', text)
     text = re.sub(r'\d+', '', text)
     lemmatized_text = ' '.join([token.lemma_ for token in doc])
     return lemmatized_text  # Return the cleaned and lemmatized text
+from random import random
def text_processing_for_domain(text):
    """Build the text used for problem-domain extraction from one description.

    The input is lemmatized with ``Lemmatize_text``. Summarization is
    currently switched off (``summarized_text`` stays empty), so in practice
    the lemmatized text is what gets returned. When both a summary and a
    lemmatized text are available they are joined with two spaces, in an
    order chosen at random.

    Args:
        text: The raw problem description. Anything that is not a ``str``
            (for example ``None`` or NaN from a missing cell) is treated
            as missing.

    Returns:
        The combined/lemmatized text, or the fail-safe label
        ``"Sustainability and Longevity"`` when no usable text is produced.
    """
    fallback = "Sustainability and Longevity"  # Default FailSafe

    # Non-string values cannot go through the regex-based cleaning in
    # Lemmatize_text (re.sub raises TypeError), so use the fail-safe
    # instead of crashing the whole run.
    if not isinstance(text, str):
        return fallback

    # Summarization is disabled for now; re-enable by restoring the call:
    # summarized_text = Summarized_text(text)
    summarized_text = ""

    # Lemmatize the original (unsummarized) text.
    lemmatized_text = Lemmatize_text(text)

    if lemmatized_text and summarized_text:
        # Join both parts; which one comes first is picked at random.
        if random() > 0.5:
            return summarized_text + "  " + lemmatized_text
        return lemmatized_text + "  " + summarized_text

    # At most one part is non-empty here: prefer the summary, then the
    # lemmatized text, and fall back to the default label otherwise.
    return summarized_text or lemmatized_text or fallback
     processed_df['Processed_ProblemDescription_forDomainExtraction'] = processed_df['Problem_Description'].apply(text_processing_for_domain)
     console_messages.append("Removing entries which could not be allocated to any Problem Domain")
+    # processed_df = processed_df.dropna(subset=['Processed_ProblemDescription_forDomainExtraction'], axis=0)
+    # Drop rows where 'Processed_ProblemDescription_forDomainExtraction' is empty (zero length)
+    processed_df = processed_df[processed_df['Processed_ProblemDescription_forDomainExtraction'].apply(lambda x: len(x) > 0)]
     # Domain Clustering