Spaces:

pendar02
/

biomedical

Sleeping

App Files Files Community

pendar02 committed on Jan 12

Commit

0d0c8c3

verified ·

1 Parent(s): 7bd75d7

Update app.py

Browse files

Files changed (1) hide show

app.py +143 -92

app.py CHANGED Viewed

@@ -117,122 +117,173 @@ def preprocess_text(text):
     return formatted_text
-def post_process_summary(summary):
-    """Clean up and improve summary coherence"""
-    if not summary:
-        return summary
-    # Split into sentences
-    sentences = [s.strip() for s in summary.split('.')]
-    sentences = [s for s in sentences if s]  # Remove empty sentences
-    # Fix common issues
-    processed_sentences = []
-    for i, sentence in enumerate(sentences):
-        # Remove redundant words/phrases
-        sentence = sentence.replace(" and and ", " and ")
-        sentence = sentence.replace("appointment and appointment", "appointment")
-        # Fix common grammatical issues
-        sentence = sentence.replace("Cancers distress", "Cancer distress")
-        sentence = sentence.replace("  ", " ")  # Remove double spaces
-        # Capitalize first letter of each sentence
-        sentence = sentence.capitalize()
-        # Add to processed sentences if not empty
-        if sentence.strip():
-            processed_sentences.append(sentence)
-    # Join sentences with proper spacing and punctuation
-    cleaned_summary = '. '.join(processed_sentences)
-    if cleaned_summary and not cleaned_summary.endswith('.'):
-        cleaned_summary += '.'
-    return cleaned_summary
 def improve_summary_generation(text, model, tokenizer):
-    """Generate improved summary with better prompt and validation"""
     if not isinstance(text, str) or not text.strip():
         return "No abstract available to summarize."
-    # Add a more specific prompt
     formatted_text = (
-        "Summarize this medical research paper following this structure exactly:\n"
-        "1. Background and objectives\n"
-        "2. Methods\n"
-        "3. Key findings with specific numbers/percentages\n"
-        "4. Main conclusions\n"
         "Original text: " + preprocess_text(text)
     )
-    # Adjust generation parameters
     inputs = tokenizer(formatted_text, return_tensors="pt", max_length=1024, truncation=True)
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
-    with torch.no_grad():
-        summary_ids = model.generate(
-            **{
-                "input_ids": inputs["input_ids"],
-                "attention_mask": inputs["attention_mask"],
-                "max_length": 200,
-                "min_length": 50,
-                "num_beams": 5,
-                "length_penalty": 1.5,
-                "no_repeat_ngram_size": 3,
-                "temperature": 0.7,
-                "repetition_penalty": 1.5
-            }
-        )
-    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-    # Post-process the summary
-    processed_summary = post_process_summary(summary)
-    # Validate the summary
-    if not validate_summary(processed_summary, text):
-        # If validation fails, try one more time with different parameters
-        with torch.no_grad():
-            summary_ids = model.generate(
-                **{
-                    "input_ids": inputs["input_ids"],
-                    "attention_mask": inputs["attention_mask"],
-                    "max_length": 200,
-                    "min_length": 50,
-                    "num_beams": 4,
-                    "length_penalty": 2.0,
-                    "no_repeat_ngram_size": 4,
-                    "temperature": 0.8,
-                    "repetition_penalty": 2.0
-                }
-            )
-        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-        processed_summary = post_process_summary(summary)
-    return processed_summary
 def validate_summary(summary, original_text):
-    """Validate summary content against original text"""
-    # Check for age inconsistencies
-    age_mentions = re.findall(r'(\d+\.?\d*)\s*years?', summary.lower())
-    if len(age_mentions) > 1:  # Multiple age mentions
         return False
-    # Check for repetitive sentences
-    sentences = summary.split('.')
-    unique_sentences = set(s.strip().lower() for s in sentences if s.strip())
-    if len(sentences) - len(unique_sentences) > 1:  # More than one duplicate
         return False
-    # Check summary isn't too long or too short compared to original
-    summary_words = len(summary.split())
-    original_words = len(original_text.split())
-    if summary_words < 20 or summary_words > original_words * 0.8:
         return False
     return True
 def generate_focused_summary(question, abstracts, model, tokenizer):
     """Generate focused summary based on question"""
     # Preprocess each abstract

     return formatted_text
 def improve_summary_generation(text, model, tokenizer):
+    """Generate improved summary with better prompt engineering and validation"""
     if not isinstance(text, str) or not text.strip():
         return "No abstract available to summarize."
+    # Create a more structured prompt that enforces accurate reporting
     formatted_text = (
+        "Summarize this medical research paper accurately and concisely. "
+        "Include only factual information from the text. "
+        "Structure the summary as follows:\n"
+        "1. OBJECTIVE: State the main purpose and study population\n"
+        "2. METHODS: Describe key methodological elements\n"
+        "3. RESULTS: Report specific findings with exact numbers/percentages\n"
+        "4. CONCLUSION: State main implications\n\n"
         "Original text: " + preprocess_text(text)
     )
+    # First attempt with conservative parameters
+    summary = generate_summary_attempt(formatted_text, model, tokenizer,
+                                     conservative_params=True)
+    # Validate the generated summary
+    if not validate_summary(summary, text):
+        # If validation fails, try again with different parameters
+        summary = generate_summary_attempt(formatted_text, model, tokenizer,
+                                         conservative_params=False)
+    return post_process_summary(summary)
def generate_summary_attempt(formatted_text, model, tokenizer, conservative_params=True):
    """Generate one summary candidate for an already-formatted prompt.

    Args:
        formatted_text: Full prompt (instructions plus source text).
        model: Object exposing ``device`` and ``generate(**kwargs)``
            (presumably a Hugging Face seq2seq model -- confirm at the caller).
        tokenizer: Callable that encodes text to tensors and offers
            ``decode``; must match ``model``.
        conservative_params: ``True`` selects the first-attempt beam settings
            (5 beams, milder length/repetition penalties); ``False`` selects
            the retry settings (4 beams, stronger penalties).

    Returns:
        The decoded text of the first returned sequence, with special tokens
        skipped.
    """
    # The prompt is truncated to 1024 tokens before being moved to the
    # model's device.
    encoded = tokenizer(formatted_text, return_tensors="pt", max_length=1024, truncation=True)
    encoded = {key: tensor.to(model.device) for key, tensor in encoded.items()}

    params = {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "max_length": 250,  # Increased for better coverage
        "min_length": 100,  # Increased to ensure comprehensive summary
        "early_stopping": True,
        "no_repeat_ngram_size": 3,
    }

    # NOTE: ``temperature`` and ``top_p`` were previously passed here, but
    # they only take effect when ``do_sample=True``. With plain beam search
    # they are ignored (and trigger a warning in transformers), so they are
    # omitted; the generated output is unchanged.
    if conservative_params:
        params.update(num_beams=5, length_penalty=1.5, repetition_penalty=1.5)
    else:
        params.update(num_beams=4, length_penalty=2.0, repetition_penalty=2.0)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        summary_ids = model.generate(**params)

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
 def validate_summary(summary, original_text):
+    """Enhanced validation of summary content"""
+    if not summary or not original_text:
         return False
+    # Extract numerical values from both texts
+    original_numbers = set(re.findall(r'(\d+(?:\.\d+)?)\s*%', original_text))
+    summary_numbers = set(re.findall(r'(\d+(?:\.\d+)?)\s*%', summary))
+    # Check if key percentages are preserved
+    if not summary_numbers.issubset(original_numbers):
         return False
+    # Check for contradictions in methodology statements
+    methods_original = extract_methods(original_text)
+    methods_summary = extract_methods(summary)
+    if methods_summary and not any(m in original_text.lower() for m in methods_summary):
         return False
+    # Verify no hallucinated content
+    sentences = summary.split('.')
+    for sentence in sentences:
+        # Check if key claims in summary are supported by original
+        if sentence.strip() and not is_supported_by_original(sentence, original_text):
+            return False
+    return True
def extract_methods(text):
    """Return methodology phrases found in ``text``.

    A phrase is one of the keywords ``study``, ``survey``, ``analysis``,
    ``trial`` or ``experiment`` starting at a word boundary, followed by
    whitespace and the next word. Matching is done on the lower-cased text.

    Args:
        text: Text to scan; a falsy value yields an empty list.

    Returns:
        A list of lower-cased matches, grouped by keyword in the order listed
        above (all ``study`` matches first, then ``survey``, and so on).
    """
    if not text:
        return []

    method_keywords = ('study', 'survey', 'analysis', 'trial', 'experiment')
    lowered = text.lower()  # lower-case once instead of once per keyword

    methods = []
    for keyword in method_keywords:
        # \b keeps the keyword from matching inside a longer word
        # (e.g. "multianalysis tool" must not produce "analysis tool").
        methods.extend(re.findall(fr'\b{keyword}\s+\w+', lowered))
    return methods
def is_supported_by_original(claim, original):
    """Tell whether every part of ``claim`` finds support in ``original``.

    The claim is lower-cased, a few boilerplate phrases are deleted, and the
    remainder is cut at each ``' and '``. Each non-empty piece must be
    accepted by ``has_supporting_evidence`` against the lower-cased original.

    Args:
        claim: One statement taken from the summary.
        original: The source text.

    Returns:
        ``True`` if all non-empty pieces are supported (including the case
        where no piece remains), ``False`` as soon as one piece is not.
    """
    # Drop filler wording that carries no factual content.
    stripped_claim = re.sub(
        r'(this study|the study|results show|we found that)', '', claim.lower()
    ).strip()

    fragments = (piece.strip() for piece in stripped_claim.split(' and '))

    # all() stops at the first unsupported fragment, like an early return.
    return all(
        has_supporting_evidence(fragment, original.lower())
        for fragment in fragments
        if fragment
    )
def has_supporting_evidence(phrase, original):
    """Check whether some sentence of ``original`` covers most of ``phrase``.

    Both inputs are split on whitespace into word sets, and ``original`` is
    first split into sentences on ``'.'``. Surrounding punctuation is
    stripped from every word so that ``"diabetes,"`` and ``"diabetes"``
    count as the same word. The phrase is supported when at least 70% of its
    distinct words occur together in a single sentence.

    Args:
        phrase: Phrase to look for; the caller is expected to pass it
            lower-cased, since no case folding happens here.
        original: Source text, lower-cased by the caller for the same reason.

    Returns:
        ``True`` if a supporting sentence exists, or if ``phrase`` contains
        no words at all (nothing to contradict); ``False`` otherwise.
    """
    # Punctuation that may cling to a token after whitespace splitting.
    edge_punctuation = ',;:!?()[]{}"\''

    def _words(text):
        # Distinct tokens with edge punctuation removed; empties dropped.
        return {token.strip(edge_punctuation) for token in text.split()} - {''}

    phrase_words = _words(phrase)
    if not phrase_words:
        return True

    required = len(phrase_words) * 0.7
    return any(
        len(phrase_words & _words(sentence)) >= required
        for sentence in original.split('.')
    )
def post_process_summary(summary):
    """Tidy a generated summary while keeping its section structure.

    Lines are grouped into sections: a line containing one of the markers
    ``OBJECTIVE:``, ``METHODS:``, ``RESULTS:`` or ``CONCLUSION:`` (in any
    letter case) starts a new section, and following non-blank lines are
    appended to it. Within each section, whitespace runs are collapsed,
    spaces before ``%`` are removed, and a missing space is inserted after a
    period or comma that sits between a word and a number. Sections are
    joined with newlines and the result is terminated with a period unless
    it already ends in ``.``, ``!`` or ``?``.

    Args:
        summary: Raw summary text; falsy values are returned unchanged.

    Returns:
        The cleaned summary string.
    """
    if not summary:
        return summary

    markers = ('OBJECTIVE:', 'METHODS:', 'RESULTS:', 'CONCLUSION:')

    # Group lines into sections based on the structured format.
    sections = []
    current_section = []
    for raw_line in summary.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        if any(marker in line.upper() for marker in markers):
            if current_section:
                sections.append(' '.join(current_section))
            current_section = [line]
        else:
            current_section.append(line)
    if current_section:
        sections.append(' '.join(current_section))

    cleaned_sections = []
    for section in sections:
        section = re.sub(r'\s+', ' ', section)  # collapse whitespace runs
        section = re.sub(r'(\d+)\s*%', r'\1%', section)  # "12 %" -> "12%"
        # Insert the missing space in "word.5" / "word,5". The look-behind
        # restricts this to punctuation that follows a letter or a closing
        # bracket, so decimals ("12.5"), thousands separators ("1,000") and
        # leading-dot values ("p=.05") are left intact.
        section = re.sub(r'(?<=[^\W\d_]|[)\]])([.,])\s*(?=\d)', r'\1 ', section)
        cleaned_sections.append(section)

    final_summary = '\n'.join(cleaned_sections)

    # Ensure terminal punctuation without doubling it up ("works!." etc.).
    if final_summary and not final_summary.endswith(('.', '!', '?')):
        final_summary += '.'
    return final_summary
 def generate_focused_summary(question, abstracts, model, tokenizer):
     """Generate focused summary based on question"""
     # Preprocess each abstract