pendar02 committed on
Commit
7ab41f7
·
verified ·
1 Parent(s): 234816f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -76
app.py CHANGED
@@ -118,121 +118,115 @@ def preprocess_text(text):
118
  return formatted_text
119
 
120
def post_process_summary(summary):
    """Clean up and improve summary coherence.

    Splits the model output into sentences, removes redundant phrases,
    collapses runs of whitespace, capitalizes each sentence, and re-joins
    with terminal punctuation.

    Args:
        summary: Raw summary text from the model (may be empty/None).

    Returns:
        The cleaned summary string, or the input unchanged if it is falsy.
    """
    if not summary:
        return summary

    # Split into sentences and drop empty fragments
    sentences = [s.strip() for s in summary.split('.')]
    sentences = [s for s in sentences if s]

    processed_sentences = []
    for sentence in sentences:
        # Remove redundant words/phrases
        sentence = sentence.replace(" and and ", " and ")
        sentence = sentence.replace("appointment and appointment", "appointment")

        # Fix common grammatical issues
        sentence = sentence.replace("Cancers distress", "Cancer distress")
        # BUG FIX: the previous code did sentence.replace(" ", " ") — a no-op
        # (single space replaced by single space) that never removed double
        # spaces as its comment claimed. Collapse whitespace runs for real.
        sentence = re.sub(r"\s{2,}", " ", sentence)

        # Capitalize first letter of each sentence
        sentence = sentence.capitalize()

        # Add to processed sentences if not empty
        if sentence.strip():
            processed_sentences.append(sentence)

    # Join sentences with proper spacing and punctuation
    cleaned_summary = '. '.join(processed_sentences)
    if cleaned_summary and not cleaned_summary.endswith('.'):
        cleaned_summary += '.'

    return cleaned_summary
153
 
154
def improve_summary_generation(text, model, tokenizer):
    """Generate improved summary with better prompt and validation"""
    if not isinstance(text, str) or not text.strip():
        return "No abstract available to summarize."

    # Structured prompt steering the model toward a four-part summary
    prompt = (
        "Summarize this medical research paper following this structure exactly:\n"
        "1. Background and objectives\n"
        "2. Methods\n"
        "3. Key findings with specific numbers/percentages\n"
        "4. Main conclusions\n"
        "Original text: " + preprocess_text(text)
    )

    encoded = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
    encoded = {key: tensor.to(model.device) for key, tensor in encoded.items()}

    def _run_generation(beams, len_penalty, ngram, temp, rep_penalty):
        # One beam-search decoding pass with the given parameters.
        with torch.no_grad():
            ids = model.generate(
                input_ids=encoded["input_ids"],
                attention_mask=encoded["attention_mask"],
                max_length=200,
                min_length=50,
                num_beams=beams,
                length_penalty=len_penalty,
                no_repeat_ngram_size=ngram,
                temperature=temp,
                repetition_penalty=rep_penalty,
            )
        return tokenizer.decode(ids[0], skip_special_tokens=True)

    # First attempt with the default decoding parameters
    processed_summary = post_process_summary(_run_generation(5, 1.5, 3, 0.7, 1.5))

    # If validation fails, retry once with alternative decoding parameters
    if not validate_summary(processed_summary, text):
        processed_summary = post_process_summary(_run_generation(4, 2.0, 4, 0.8, 2.0))

    return processed_summary
214
 
215
def validate_summary(summary, original_text):
    """Validate summary content against original text"""
    # Reject summaries that mention more than one distinct age value
    lowered = summary.lower()
    if len(re.findall(r'(\d+\.?\d*)\s*years?', lowered)) > 1:
        return False

    # Reject when more than one split fragment duplicates another
    # (empty fragments count toward the total but not the unique set)
    fragments = summary.split('.')
    distinct = {frag.strip().lower() for frag in fragments if frag.strip()}
    if len(fragments) - len(distinct) > 1:
        return False

    # Length sanity check relative to the original text
    n_summary = len(summary.split())
    n_original = len(original_text.split())
    return not (n_summary < 20 or n_summary > n_original * 0.8)
235
 
 
236
  def generate_focused_summary(question, abstracts, model, tokenizer):
237
  """Generate focused summary based on question"""
238
  # Preprocess each abstract
 
118
  return formatted_text
119
 
120
def post_process_summary(summary):
    """Clean up and improve summary coherence.

    Splits the model output into sentences, collapses duplicated phrases,
    capitalizes each sentence, drops duplicate sentences, and re-joins with
    a terminal period.

    Args:
        summary: Raw summary text from the model (may be empty/None).

    Returns:
        The cleaned summary string, or the input unchanged if it is falsy.
    """
    if not summary:
        return summary

    # Split into sentences
    sentences = [s.strip() for s in summary.split('.')]
    sentences = [s for s in sentences if s]  # Remove empty sentences

    # Correct common issues
    processed_sentences = []
    for sentence in sentences:
        # BUG FIX: a single regex replaced "appointment and appointment"
        # with "and", losing the word "appointment" entirely. Collapse each
        # duplication to its correct reduced form instead.
        sentence = re.sub(r"\band and\b", "and", sentence)
        sentence = re.sub(r"\bappointment and appointment\b", "appointment", sentence)

        # Ensure first letter capitalization
        sentence = sentence.capitalize()

        # Avoid duplicate sentences
        if sentence not in processed_sentences:
            processed_sentences.append(sentence)

    # Join sentences with proper punctuation
    cleaned_summary = '. '.join(processed_sentences)
    if not cleaned_summary:
        return cleaned_summary
    return cleaned_summary if cleaned_summary.endswith('.') else cleaned_summary + '.'
 
 
146
 
147
def improve_summary_generation(text, model, tokenizer):
    """Generate improved summary with better prompt and validation.

    Builds a structured prompt, runs beam-search generation, post-processes
    the output, and retries once with alternate decoding parameters if the
    result fails validation.

    Args:
        text: The abstract text to summarize.
        model: A seq2seq language model exposing `.generate()` and `.device`.
        tokenizer: The matching tokenizer.

    Returns:
        The processed summary string, a fixed message for empty input, or an
        error message string if generation raises.
    """
    if not isinstance(text, str) or not text.strip():
        return "No abstract available to summarize."

    # Add a structured prompt for summarization
    formatted_text = (
        "Summarize this biomedical research abstract into the following structure:\n"
        "1. Background and Objectives\n"
        "2. Methods\n"
        "3. Key Findings (include any percentages or numbers)\n"
        "4. Conclusions\n"
        f"Abstract:\n{text.strip()}"
    )

    # Prepare input tokens
    inputs = tokenizer(formatted_text, return_tensors="pt", max_length=1024, truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Generate summary with adjusted parameters
    try:
        with torch.no_grad():
            summary_ids = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=300,  # Increased for more detailed summaries
                min_length=100,  # Ensure summaries are not too short
                num_beams=5,
                length_penalty=1.5,
                no_repeat_ngram_size=3,
                temperature=0.7,
                repetition_penalty=1.3,
            )
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    except Exception as e:
        return f"Error in generation: {str(e)}"

    # BUG FIX: the previous version returned post_process_summary(summary)
    # here unconditionally, which made the validation/retry branch below
    # unreachable dead code that also referenced an undefined
    # `processed_summary`. Assign and fall through instead.
    processed_summary = post_process_summary(summary)

    # Validate the summary; on failure, retry with alternate parameters
    if not validate_summary(processed_summary, text):
        try:
            with torch.no_grad():
                summary_ids = model.generate(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    max_length=250,
                    min_length=50,
                    num_beams=4,
                    length_penalty=2.0,
                    no_repeat_ngram_size=4,
                    temperature=0.8,
                    repetition_penalty=1.5,
                )
            summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
            processed_summary = post_process_summary(summary)
        except Exception as e:
            # Mirror the first pass's error handling for consistency
            return f"Error in generation: {str(e)}"

    return processed_summary
208
 
209
def validate_summary(summary, original_text):
    """Validate summary content against original text.

    Checks length bounds, the presence of the four required section
    headings, and rejects summaries containing repeated sentences.

    Args:
        summary: The candidate summary text.
        original_text: The source text the summary was generated from.

    Returns:
        True if the summary passes all checks, False otherwise.
    """
    # Check for common validation points
    if not summary or len(summary.split()) < 20:
        return False  # Too short
    if len(summary.split()) > len(original_text.split()) * 0.8:
        return False  # Too long

    # Ensure structure is maintained (e.g., headings are present)
    lowered = summary.lower()
    required_sections = ["background and objectives", "methods", "key findings", "conclusions"]
    if not all(section in lowered for section in required_sections):
        return False

    # Ensure no repetitive sentences. BUG FIX: the previous raw split('.')
    # comparison never caught real duplicates (the leading space after
    # ". " made fragments unequal) and false-positived on two empty
    # fragments (e.g. ".."). Normalize and drop empties before comparing.
    sentences = [s.strip().lower() for s in summary.split('.')]
    sentences = [s for s in sentences if s]
    if len(sentences) != len(set(sentences)):
        return False

    return True
228
 
229
+
230
  def generate_focused_summary(question, abstracts, model, tokenizer):
231
  """Generate focused summary based on question"""
232
  # Preprocess each abstract