pendar02 committed on
Commit
770037f
·
verified ·
1 Parent(s): b5db7e8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -18
app.py CHANGED
@@ -121,23 +121,52 @@ def preprocess_text(text):
121
 
122
  return formatted_text
123
 
124
def generate_summary(text, model, tokenizer):
    """Generate summary for single abstract.

    Returns the original text for missing/short abstracts, otherwise a
    beam-search summary; falls back to the input when the model output
    is essentially a copy of it.
    """
    # Guard clauses: nothing usable to summarize.
    if not isinstance(text, str) or not text.strip():
        return "No abstract available to summarize."

    word_count = len(text.split())
    if word_count < 50:
        # Very short abstracts are returned verbatim.
        return text

    formatted_text = preprocess_text(text)

    # Generation bounds scale with the input size.
    dynamic_max = min(150, word_count + 50)
    dynamic_min = min(50, word_count)

    encoded = tokenizer(formatted_text, return_tensors="pt", max_length=1024, truncation=True)
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    summary_ids = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=dynamic_max,
        min_length=dynamic_min,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True,
        no_repeat_ngram_size=3,
    )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Fall back to the original text when the "summary" is essentially a copy
    # (identical, or covering >90% of the input's word count).
    if summary.lower() == text.lower() or len(summary.split()) / word_count > 0.9:
        return text
    return summary
165
 
@@ -242,7 +277,10 @@ def main():
242
  progress_bar = st.progress(0)
243
 
244
  for idx, abstract in enumerate(df['Abstract']):
245
- summary = generate_summary(abstract, model, tokenizer)
 
 
 
246
  summaries.append(summary)
247
  progress_bar.progress((idx + 1) / len(df))
248
 
 
121
 
122
  return formatted_text
123
 
124
def post_process_summary(summary):
    """Clean up and improve summary coherence.

    Splits the model output into rough sentences (on '.'), removes known
    repetition artifacts, normalizes whitespace, re-capitalizes each
    sentence, and rejoins them with proper punctuation.

    NOTE(review): splitting on '.' also breaks on decimals and
    abbreviations (e.g. "3.5", "et al.") — acceptable for typical model
    output, but confirm against real summaries.

    Args:
        summary: Raw decoded model output (may be empty/None).

    Returns:
        The cleaned summary string; falsy inputs are returned unchanged.
    """
    if not summary:
        return summary

    # Split into rough sentences and drop empty fragments.
    sentences = [s.strip() for s in summary.split('.')]
    sentences = [s for s in sentences if s]

    processed_sentences = []
    for sentence in sentences:
        # Remove redundant words/phrases the model tends to emit.
        sentence = sentence.replace(" and and ", " and ")
        sentence = sentence.replace("appointment and appointment", "appointment")

        # Fix common grammatical issues seen in outputs.
        sentence = sentence.replace("Cancers distress", "Cancer distress")

        # Collapse any run of whitespace into single spaces.
        # (Bug fix: the previous replace(" ", " ") swapped a space for a
        # space — a no-op that never removed double spaces.)
        sentence = " ".join(sentence.split())

        # Capitalize only the first letter. str.capitalize() would also
        # lower-case the remainder and destroy acronyms such as "DNA".
        if sentence:
            sentence = sentence[0].upper() + sentence[1:]
            processed_sentences.append(sentence)

    # Join sentences with proper spacing and terminal punctuation.
    cleaned_summary = '. '.join(processed_sentences)
    if cleaned_summary and not cleaned_summary.endswith('.'):
        cleaned_summary += '.'

    return cleaned_summary
157
+
158
def improve_summary_generation(text, model, tokenizer):
    """Enhanced version of generate_summary with better parameters and post-processing.

    Args:
        text: Abstract text to summarize.
        model: Seq2seq summarization model (must provide .generate and .device).
        tokenizer: Tokenizer matching the model.

    Returns:
        The cleaned summary, or the original text when the abstract is very
        short or the model output is essentially a copy of the input.
    """
    if not isinstance(text, str) or not text.strip():
        return "No abstract available to summarize."

    word_count = len(text.split())
    if word_count < 50:
        # Very short abstracts are returned as-is; summarizing would add noise.
        return text

    formatted_text = preprocess_text(text)

    inputs = tokenizer(formatted_text, return_tensors="pt", max_length=1024, truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Beam search with length bounds scaled to the input.
    # Bug fix: the previous call also passed "temperature" and "top_p",
    # which are ignored (transformers emits a warning) unless
    # do_sample=True is set; they are dropped here so generation stays
    # deterministic. repetition_penalty DOES apply under beam search and
    # is kept.
    summary_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=min(200, word_count + 50),
        min_length=min(50, word_count),
        num_beams=5,
        length_penalty=1.5,
        early_stopping=True,
        no_repeat_ngram_size=3,
        repetition_penalty=1.2,
    )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Clean up repetition/capitalization artifacts in the raw output.
    summary = post_process_summary(summary)

    # Return the original if the summary is too similar to it.
    # (word_count >= 50 on this path, so the division is safe.)
    if summary.lower() == text.lower() or len(summary.split()) / word_count > 0.9:
        return text

    return summary
200
 
 
277
  progress_bar = st.progress(0)
278
 
279
  for idx, abstract in enumerate(df['Abstract']):
280
+ # Replace this line
281
+ # summary = generate_summary(abstract, model, tokenizer)
282
+ # With this line
283
+ summary = improve_summary_generation(abstract, model, tokenizer)
284
  summaries.append(summary)
285
  progress_bar.progress((idx + 1) / len(df))
286