Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -151,11 +151,18 @@ def post_process_summary(summary):
|
|
151 |
return cleaned_summary
|
152 |
|
153 |
def improve_summary_generation(text, model, tokenizer):
|
|
|
|
|
|
|
|
|
154 |
# Add a more specific prompt
|
155 |
formatted_text = (
|
156 |
-
"Summarize
|
157 |
-
"1
|
158 |
-
"
|
|
|
|
|
|
|
159 |
)
|
160 |
|
161 |
# Adjust generation parameters
|
@@ -179,11 +186,76 @@ def improve_summary_generation(text, model, tokenizer):
|
|
179 |
|
180 |
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
|
181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
def post_process_summary(summary):
|
183 |
"""Enhanced post-processing to catch common errors"""
|
184 |
if not summary:
|
185 |
return summary
|
186 |
-
|
187 |
# Remove contradictory age statements
|
188 |
age_statements = []
|
189 |
lines = summary.split('.')
|
@@ -199,8 +271,21 @@ def post_process_summary(summary):
|
|
199 |
seen_content = set()
|
200 |
unique_lines = []
|
201 |
for line in cleaned_lines:
|
202 |
-
|
203 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
204 |
seen_content.add(line_core)
|
205 |
unique_lines.append(line)
|
206 |
|
@@ -208,7 +293,13 @@ def post_process_summary(summary):
|
|
208 |
cleaned_summary = '. '.join(s.strip() for s in unique_lines if s.strip())
|
209 |
if cleaned_summary and not cleaned_summary.endswith('.'):
|
210 |
cleaned_summary += '.'
|
211 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
return cleaned_summary
|
213 |
|
214 |
def generate_focused_summary(question, abstracts, model, tokenizer):
|
|
|
151 |
return cleaned_summary
|
152 |
|
153 |
def improve_summary_generation(text, model, tokenizer):
|
154 |
+
"""Generate improved summary with better prompt and validation"""
|
155 |
+
if not isinstance(text, str) or not text.strip():
|
156 |
+
return "No abstract available to summarize."
|
157 |
+
|
158 |
# Add a more specific prompt
|
159 |
formatted_text = (
|
160 |
+
"Summarize this medical research paper following this structure exactly:\n"
|
161 |
+
"1. Background and objectives\n"
|
162 |
+
"2. Methods\n"
|
163 |
+
"3. Key findings with specific numbers/percentages\n"
|
164 |
+
"4. Main conclusions\n"
|
165 |
+
"Original text: " + preprocess_text(text)
|
166 |
)
|
167 |
|
168 |
# Adjust generation parameters
|
|
|
186 |
|
187 |
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
|
188 |
|
189 |
+
# Post-process the summary
|
190 |
+
processed_summary = post_process_summary(summary)
|
191 |
+
|
192 |
+
# Validate the summary
|
193 |
+
if not validate_summary(processed_summary, text):
|
194 |
+
# If validation fails, try one more time with different parameters
|
195 |
+
with torch.no_grad():
|
196 |
+
summary_ids = model.generate(
|
197 |
+
**{
|
198 |
+
"input_ids": inputs["input_ids"],
|
199 |
+
"attention_mask": inputs["attention_mask"],
|
200 |
+
"max_length": 200,
|
201 |
+
"min_length": 50,
|
202 |
+
"num_beams": 4,
|
203 |
+
"length_penalty": 2.0,
|
204 |
+
"no_repeat_ngram_size": 4,
|
205 |
+
"temperature": 0.8,
|
206 |
+
"repetition_penalty": 2.0
|
207 |
+
}
|
208 |
+
)
|
209 |
+
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
|
210 |
+
processed_summary = post_process_summary(summary)
|
211 |
+
|
212 |
+
return processed_summary
|
213 |
+
|
214 |
+
def validate_summary(summary, original_text):
    """Validate a generated summary against the original text.

    Runs a series of cheap heuristic checks and returns False as soon as
    one fails:
      * both inputs must be non-empty
      * at most one numeric age mention ("NN years") — more than one is
        treated as a likely contradictory-age hallucination
      * no duplicated sentences beyond a single repeat
      * plausible length: at least 20 words and no more than 80% of the
        original's word count
      * none of the known repetition patterns ("mean ... mean", etc.)
        may appear

    Parameters
    ----------
    summary : str
        The model-generated summary to validate.
    original_text : str
        The source text the summary was generated from.

    Returns
    -------
    bool
        True if every check passes, False otherwise.
    """
    import re

    # Don't validate empty summaries (or validate against an empty source).
    if not summary or not original_text:
        return False

    # Lowercase once; several checks below reuse it.
    lowered = summary.lower()

    # Check for age inconsistencies: multiple "NN years" mentions usually
    # indicate the model contradicting itself about patient age.
    age_mentions = re.findall(r'(\d+\.?\d*)\s*years?', lowered)
    if len(age_mentions) > 1:  # multiple age mentions
        return False

    # Check for repetitive sentences. Compare only non-empty fragments so
    # the empty tail produced by a trailing '.' does not inflate the
    # duplicate count (the previous code counted len(sentences) including
    # that empty tail, making the threshold inconsistent).
    sentences = [s.strip().lower() for s in summary.split('.') if s.strip()]
    if len(sentences) - len(set(sentences)) > 1:  # more than one duplicate
        return False

    # Check the summary isn't too short or too long relative to the source.
    summary_words = len(summary.split())
    original_words = len(original_text.split())
    if summary_words < 20 or summary_words > original_words * 0.8:
        return False

    # Known repetition patterns: each pattern already encodes the error
    # (the same statistical keyword occurring twice), so a single match is
    # enough to reject. The previous `len(re.findall(...)) > 1` test could
    # effectively never fire: the greedy `.*` folds all repeats into one
    # non-overlapping match. re.DOTALL lets the pattern span newlines.
    error_patterns = [
        r'mean.*mean',
        r'median.*median',
        r'results.*results',
        r'conclusion.*conclusion',
        r'significance.*significance',
    ]

    for pattern in error_patterns:
        if re.search(pattern, lowered, re.DOTALL):
            return False

    return True
|
253 |
+
|
254 |
def post_process_summary(summary):
|
255 |
"""Enhanced post-processing to catch common errors"""
|
256 |
if not summary:
|
257 |
return summary
|
258 |
+
|
259 |
# Remove contradictory age statements
|
260 |
age_statements = []
|
261 |
lines = summary.split('.')
|
|
|
271 |
seen_content = set()
|
272 |
unique_lines = []
|
273 |
for line in cleaned_lines:
|
274 |
+
# Skip empty lines
|
275 |
+
if not line.strip():
|
276 |
+
continue
|
277 |
+
|
278 |
+
# Normalize for comparison
|
279 |
+
line_core = ' '.join(sorted(line.lower().split()))
|
280 |
+
|
281 |
+
# Check for near-duplicates
|
282 |
+
duplicate = False
|
283 |
+
for seen in seen_content:
|
284 |
+
if line_core in seen or seen in line_core:
|
285 |
+
duplicate = True
|
286 |
+
break
|
287 |
+
|
288 |
+
if not duplicate:
|
289 |
seen_content.add(line_core)
|
290 |
unique_lines.append(line)
|
291 |
|
|
|
293 |
cleaned_summary = '. '.join(s.strip() for s in unique_lines if s.strip())
|
294 |
if cleaned_summary and not cleaned_summary.endswith('.'):
|
295 |
cleaned_summary += '.'
|
296 |
+
|
297 |
+
# Additional cleaning
|
298 |
+
cleaned_summary = cleaned_summary.replace(" and and ", " and ")
|
299 |
+
cleaned_summary = cleaned_summary.replace("results showed", "")
|
300 |
+
cleaned_summary = cleaned_summary.replace("results indicated", "")
|
301 |
+
cleaned_summary = cleaned_summary.replace(" ", " ")
|
302 |
+
|
303 |
return cleaned_summary
|
304 |
|
305 |
def generate_focused_summary(question, abstracts, model, tokenizer):
|