Spaces:

pendar02
/

biomedical

Sleeping

App Files Files Community

pendar02 committed on Jan 12

Commit

005d6b8

verified ·

1 Parent(s): 054584c

Update app.py

Browse files

Files changed (1) hide show

app.py +245 -145

app.py CHANGED Viewed

@@ -107,19 +107,57 @@ def verify_facts(summary, original_text):
     def extract_numbers(text):
         return set(re.findall(r'(\d+\.?\d*)%?', text))
     original_numbers = extract_numbers(original_text)
     summary_numbers = extract_numbers(summary)
-    # Check if all numbers from original are in summary
-    missing_numbers = original_numbers - summary_numbers
-    # Extract key phrases indicating relationships
     relationship_patterns = [
         r'associated with',
         r'predicted',
         r'correlated with',
         r'relationship between',
-        r'linked to'
     ]
     def extract_relationships(text):
@@ -127,7 +165,6 @@ def verify_facts(summary, original_text):
         for pattern in relationship_patterns:
             matches = re.finditer(pattern, text.lower())
             for match in matches:
-                # Get surrounding context
                 start = max(0, match.start() - 50)
                 end = min(len(text), match.end() + 50)
                 relationships.append(text[start:end].strip())
@@ -139,13 +176,16 @@ def verify_facts(summary, original_text):
     # Check for contradictions
     def find_contradictions(summary, original):
         contradictions = []
-        # Common contradiction patterns
         neg_patterns = [
             (r'no association', r'associated with'),
             (r'did not predict', r'predicted'),
             (r'was not significant', r'was significant'),
             (r'decreased', r'increased'),
-            (r'lower', r'higher')
         ]
         for pos, neg in neg_patterns:
@@ -157,176 +197,236 @@ def verify_facts(summary, original_text):
     contradictions = find_contradictions(summary, original_text)
     return {
-        'missing_numbers': missing_numbers,
         'missing_relationships': original_relationships - summary_relationships,
         'contradictions': contradictions,
-        'is_valid': len(missing_numbers) == 0 and len(contradictions) == 0
     }
 def preprocess_text(text):
     """Preprocess text to add appropriate formatting before summarization"""
     if not isinstance(text, str) or not text.strip():
         return text
-    # Split text into sentences (basic implementation)
-    sentences = [s.strip() for s in text.replace('. ', '.\n').split('\n')]
-    # Remove empty sentences and extra whitespace
-    sentences = [re.sub(r'\s+', ' ', s).strip() for s in sentences if s.strip()]
-    # Join with proper line breaks
-    formatted_text = '\n'.join(sentences)
-    return formatted_text
-def post_process_summary(summary):
-    """Enhanced post-processing for better structure and completeness"""
-    if not summary:
-        return summary
-    # Split into sections
-    sections = summary.split('\n')
-    processed_sections = []
-    for section in sections:
-        if not section.strip():
-            continue
-        # Remove redundant section headers
-        section = re.sub(r'^(Background and objectives|Methods|Results|Conclusions):\s*', '', section)
-        # Split into sentences
-        sentences = [s.strip() for s in section.split('.')]
-        sentences = [s for s in sentences if s]
-        processed_sentences = []
-        for i, sentence in enumerate(sentences):
-            # Fix common issues
-            sentence = re.sub(r'\s+', ' ', sentence)  # Fix spacing
-            sentence = re.sub(r'(\d+)\s*%', r'\1%', sentence)  # Fix percentage formatting
-            sentence = re.sub(r'\(\s*([Nn])\s*=\s*(\d+)\s*\)', r'(n=\2)', sentence)  # Fix sample size formatting
-            # Fix common phrase issues
-            sentence = sentence.replace(" and and ", " and ")
-            sentence = sentence.replace("appointment and appointment", "appointment")
-            sentence = sentence.replace("Cancers distress", "Cancer distress")
-            # Remove redundant phrases
-            sentence = re.sub(r'(?i)the aim of (the|this) study was to', '', sentence)
-            sentence = re.sub(r'(?i)this study aimed to', '', sentence)
-            # Capitalize first letter
-            sentence = sentence.capitalize()
-            if sentence.strip():
-                processed_sentences.append(sentence)
-        if processed_sentences:
-            section = '. '.join(processed_sentences)
-            if not section.endswith('.'):
-                section += '.'
-            processed_sections.append(section)
-    # Ensure key sections are present
-    required_sections = ['Background and objectives', 'Methods', 'Key findings', 'Conclusions']
-    final_sections = []
-    for i, section in enumerate(processed_sections):
-        if i < len(required_sections):
-            final_sections.append(f"{required_sections[i]}: {section}")
-        else:
-            final_sections.append(section)
-    return '\n\n'.join(final_sections)
-def improve_summary_generation(text, model, tokenizer):
     """Generate improved summary with better prompt and validation"""
     if not isinstance(text, str) or not text.strip():
         return "No abstract available to summarize."
-    # Add a more specific prompt with strict guidelines
     formatted_text = (
-        "Generate a precise summary of this medical research paper following these strict guidelines:\n"
-        "1. Background and objectives: State ONLY the actual study purpose and population - no assumptions\n"
-        "2. Methods: Include ONLY methods explicitly mentioned in the text\n"
-        "3. Key findings: Report ALL numerical results and statistical relationships\n"
-        "4. Conclusions: State ONLY conclusions directly supported by the reported results\n\n"
-        "Requirements:\n"
-        "- Include ALL percentages and numbers from the original text\n"
-        "- Do not repeat section headers\n"
-        "- Do not make claims beyond what's explicitly stated\n"
-        "- Maintain the original meaning without contradiction\n"
-        "- Do not introduce new information\n\n"
-        "Original text: " + preprocess_text(text)
     )
-    # Tokenize input
     inputs = tokenizer(formatted_text, return_tensors="pt", max_length=1024, truncation=True)
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
-    def generate_attempt(temperature, num_beams, length_penalty):
-        with torch.no_grad():
-            return model.generate(
-                **{
-                    "input_ids": inputs["input_ids"],
-                    "attention_mask": inputs["attention_mask"],
-                    "max_length": 300,  # Increased to ensure all facts are included
-                    "min_length": 100,  # Increased to encourage more complete summaries
-                    "num_beams": num_beams,
-                    "length_penalty": length_penalty,
-                    "no_repeat_ngram_size": 3,
-                    "temperature": temperature,
-                    "repetition_penalty": 2.0,  # Increased to reduce repetition
-                    "do_sample": True  # Enable sampling for more diverse outputs
-                }
-            )
-    # Try different parameter combinations until we get a valid summary
     parameter_combinations = [
-        {"temperature": 0.7, "num_beams": 5, "length_penalty": 1.5},
-        {"temperature": 0.5, "num_beams": 8, "length_penalty": 2.0},
-        {"temperature": 0.3, "num_beams": 10, "length_penalty": 2.5}
     ]
     best_summary = None
-    best_verification = None
-    for params in parameter_combinations:
-        summary_ids = generate_attempt(**params)
-        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-        processed_summary = post_process_summary(summary)
-        # Verify facts in the summary
-        verification = verify_facts(processed_summary, text)
-        if verification['is_valid']:
-            return processed_summary
-        # Keep track of best attempt
-        if best_verification is None or \
-           len(verification['missing_numbers']) < len(best_verification['missing_numbers']):
-            best_summary = processed_summary
-            best_verification = verification
-    # If no perfect summary was generated, use the best attempt
-    # Add missing information if necessary
-    if best_verification and best_verification['missing_numbers']:
-        # Attempt to add missing numerical information
-        additional_info = []
-        original_sentences = text.split('.')
-        for num in best_verification['missing_numbers']:
-            # Find sentences containing the missing number
-            for sentence in original_sentences:
-                if str(num) in sentence:
-                    additional_info.append(sentence.strip())
-                    break
-        if additional_info:
-            best_summary += "\n\nAdditional key findings: " + ". ".join(additional_info) + "."
     return best_summary
 def validate_summary(summary, original_text):
     """Validate summary content against original text"""
     # Perform fact verification

     def extract_numbers(text):
+        """Return the set of numeric literals found in *text*, as strings.
+
+        A decimal point is captured only when digits follow it, so a number
+        that ends a sentence ("n was 5.") yields "5" rather than "5." and
+        compares equal to the same number written elsewhere. A trailing "%"
+        is never part of the captured value.
+        """
+        return set(re.findall(r'\d+(?:\.\d+)?', text))
+    # Extract statistical significance statements
+    def extract_significance(text):
+        """Collect context snippets around statistical-significance statements.
+
+        Each regex hit contributes the text from 50 characters before the
+        match to 50 characters after it (clipped to the string bounds and
+        stripped). Returns the snippets as a set.
+        """
+        significance_regexes = (
+            r'[pP][\s-]value.*?(?:=|was|of)\s*([<>]?\s*\d+\.?\d*)',
+            r'significant(?:ly)?\s+(?:difference|increase|decrease|change|association)',
+            r'statistical(?:ly)?\s+significant',
+            r'[pP]\s*[<>]\s*\d+\.?\d*',
+        )
+        snippets = set()
+        for regex in significance_regexes:
+            for hit in re.finditer(regex, text, re.IGNORECASE):
+                # Widen the match by 50 characters on each side for context
+                lo = max(0, hit.start() - 50)
+                hi = min(len(text), hit.end() + 50)
+                snippets.add(text[lo:hi].strip())
+        return snippets
     original_numbers = extract_numbers(original_text)
     summary_numbers = extract_numbers(summary)
+    original_significance = extract_significance(original_text)
+    summary_significance = extract_significance(summary)
+    # Check for temporal sequence preservation
+    def extract_temporal_markers(text):
+        """List the temporal markers found in *text*.
+
+        Matches are grouped by pattern (durations after "after/following/
+        within", then "at N weeks/months/years", then study-phase words),
+        each group in order of appearance. Duplicates are kept.
+        """
+        marker_regexes = (
+            r'(?:after|following|within)\s+(\d+)\s*(?:weeks?|months?|years?)',
+            r'at\s+(\d+)\s*(?:weeks?|months?|years?)',
+            r'(?:baseline|initial|follow-up|final)',
+        )
+        found = []
+        for regex in marker_regexes:
+            found.extend(hit.group() for hit in re.finditer(regex, text, re.IGNORECASE))
+        return found
+    original_sequence = extract_temporal_markers(original_text)
+    summary_sequence = extract_temporal_markers(summary)
+    # Extract relationships
     relationship_patterns = [
         r'associated with',
         r'predicted',
         r'correlated with',
         r'relationship between',
+        r'linked to',
+        r'impact(ed)? on',
+        r'effect(ed)? on',
+        r'influenced?',
+        r'dependent on'
     ]
     def extract_relationships(text):
         for pattern in relationship_patterns:
             matches = re.finditer(pattern, text.lower())
             for match in matches:
                 start = max(0, match.start() - 50)
                 end = min(len(text), match.end() + 50)
                 relationships.append(text[start:end].strip())
     # Check for contradictions
     def find_contradictions(summary, original):
         contradictions = []
         neg_patterns = [
             (r'no association', r'associated with'),
             (r'did not predict', r'predicted'),
             (r'was not significant', r'was significant'),
             (r'decreased', r'increased'),
+            (r'lower', r'higher'),
+            (r'negative', r'positive'),
+            (r'no effect', r'had effect'),
+            (r'no difference', r'difference'),
+            (r'no change', r'changed')
         ]
         for pos, neg in neg_patterns:
     contradictions = find_contradictions(summary, original_text)
+    # Check for internal consistency
+    def check_internal_consistency(summary):
+        """Report statements inside *summary* that contradict each other.
+
+        Calls find_contradictions with the summary as both arguments and
+        turns every result into an "Internal contradiction: ..." message.
+        NOTE(review): assumes find_contradictions yields 2-item (pos, neg)
+        entries -- its body is not visible in this diff; confirm.
+        """
+        inconsistencies = []
+        # Check for contradicting statements within the summary
+        for pos, neg in find_contradictions(summary, summary):
+            inconsistencies.append(f"Internal contradiction: {pos} vs {neg}")
+        return inconsistencies
+    internal_inconsistencies = check_internal_consistency(summary)
     return {
+        'missing_numbers': original_numbers - summary_numbers,
+        'incorrect_numbers': summary_numbers - original_numbers,
+        'missing_significance': original_significance - summary_significance,
         'missing_relationships': original_relationships - summary_relationships,
+        'temporal_sequence_preserved': all(marker in ' '.join(summary_sequence) for marker in original_sequence),
         'contradictions': contradictions,
+        'internal_inconsistencies': internal_inconsistencies,
+        'is_valid': (len(original_numbers - summary_numbers) == 0 and
+                    len(contradictions) == 0 and
+                    len(internal_inconsistencies) == 0)
     }
 def preprocess_text(text):
+    """Normalize an abstract before summarization, one sentence per line.
+
+    Collapses whitespace, breaks the text after sentence punctuation, and
+    standardizes sample-size "(n=X)", percentage "X%" and p-value "p<X"
+    spelling. Non-string or blank input is returned unchanged.
+    """
     if not isinstance(text, str) or not text.strip():
         return text
+    # Standardize spacing and line breaks
+    text = re.sub(r'\s+', ' ', text)
+    text = text.replace('. ', '.\n')
+    # Break after sentence punctuation only when whitespace separates it from
+    # the next capital letter; a zero-width match here would also split
+    # tokens such as "p.Val600Glu" or "U.S.A" in the middle.
+    text = re.sub(r'(?<=[.!?])\s+(?=[A-Z])', '\n', text)
+    text = re.sub(r'\(\s*([Nn])\s*=\s*(\d+)\s*\)', r'(n=\2)', text)  # Standardize sample size format
+    text = re.sub(r'(\d+)\s*%', r'\1%', text)  # Fix percentage format
+    text = re.sub(r'([Pp])\s*([<>])\s*(\d)', r'\1\2\3', text)  # Fix p-value format
+    # Drop empty lines and surrounding whitespace from each sentence
+    return '\n'.join(line.strip() for line in text.split('\n') if line.strip())
+def improve_summary_generation(text, model, tokenizer, max_attempts=3):
+    """Generate a structured summary of *text*, keeping the best-scoring candidate.
+
+    Builds an instruction prompt around the preprocessed text, then runs up to
+    ``max_attempts`` rounds over a list of decoding settings. Each candidate is
+    post-processed and scored with ``score_summary``; the first candidate
+    scoring above 0.8 is returned immediately, otherwise the highest-scoring
+    one seen. Beam count and length penalty are raised between rounds.
+
+    Args:
+        text: Abstract to summarize.
+        model: Object exposing ``generate(...)`` and ``device``.
+        tokenizer: Callable tokenizer that also provides ``decode``.
+        max_attempts: Number of rounds over the decoding settings.
+
+    Returns:
+        The best summary string, a fixed message when *text* is not a
+        non-blank string, or ``None`` when ``max_attempts`` is below 1.
+    """
     if not isinstance(text, str) or not text.strip():
         return "No abstract available to summarize."
     formatted_text = (
+        "Summarize this medical research paper, strictly following these rules:\n\n"
+        "1. Background and objectives:\n"
+        "   - State ONLY the main purpose and study population\n"
+        "   - Include sample size if mentioned (format as n=X)\n"
+        "   - No methodology details here\n\n"
+        "2. Methods:\n"
+        "   - List the specific procedures and measurements used\n"
+        "   - Include timeframes and follow-up periods\n"
+        "   - No results here\n\n"
+        "3. Key findings:\n"
+        "   - Report ALL numerical results (%, numbers, p-values)\n"
+        "   - Include ALL statistical relationships\n"
+        "   - Present findings in chronological order\n\n"
+        "4. Conclusions:\n"
+        "   - State ONLY conclusions directly supported by the results\n"
+        "   - Include practical implications if mentioned\n"
+        "   - No new information\n\n"
+        "Important:\n"
+        "- Keep each section separate and clearly labeled\n"
+        "- Use exact numbers from the text\n"
+        "- Maintain original relationships between variables\n"
+        "- No speculation or external information\n\n"
+        "Original text:\n" + preprocess_text(text)
     )
     inputs = tokenizer(formatted_text, return_tensors="pt", max_length=1024, truncation=True)
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
     parameter_combinations = [
+        {"temperature": 0.1, "num_beams": 12, "length_penalty": 2.0, "top_k": 50},
+        {"temperature": 0.05, "num_beams": 15, "length_penalty": 2.5, "top_k": 30},
+        {"temperature": 0.0, "num_beams": 20, "length_penalty": 3.0, "top_k": 10}
     ]
     best_summary = None
+    # Scores have no lower bound, so start below any possible value; a fixed
+    # floor such as -1 would discard every candidate of a badly scoring batch
+    # and make the function return None.
+    best_score = float('-inf')
+    for _ in range(max_attempts):
+        for params in parameter_combinations:
+            sampling = params["temperature"] > 0.0
+            generation_kwargs = {
+                "input_ids": inputs["input_ids"],
+                "attention_mask": inputs["attention_mask"],
+                "max_length": 300,
+                "min_length": 100,
+                "num_beams": params["num_beams"],
+                "length_penalty": params["length_penalty"],
+                "no_repeat_ngram_size": 3,
+                "repetition_penalty": 2.5,
+                "do_sample": sampling,
+            }
+            if sampling:
+                # temperature/top_k only take effect when sampling; leaving
+                # them out of the deterministic run avoids handing generate()
+                # a temperature of 0.0.
+                generation_kwargs["temperature"] = params["temperature"]
+                generation_kwargs["top_k"] = params["top_k"]
+            with torch.no_grad():
+                summary_ids = model.generate(**generation_kwargs)
+            summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+            processed_summary = post_process_summary(summary)
+            score = score_summary(processed_summary, text)
+            if score > best_score:
+                best_summary = processed_summary
+                best_score = score
+            if score > 0.8:  # Good enough threshold
+                return best_summary
+        # Widen the beam search and favor longer outputs for the next round
+        parameter_combinations = [
+            {**params,
+             "num_beams": params["num_beams"] + 5,
+             "length_penalty": params["length_penalty"] + 0.5}
+            for params in parameter_combinations
+        ]
     return best_summary
+def score_summary(summary, original_text):
+    """Score summary quality against the original text.
+
+    Starts at 1.0 and subtracts penalties for failed fact verification,
+    missing or unsupported numbers, lost significance statements, broken
+    temporal order, contradictions, and missing, empty or very short
+    sections. The result is not clamped and can be negative.
+
+    Args:
+        summary: Candidate summary; sections are expected to start a line
+            with one of the four section names.
+        original_text: Text the summary was generated from.
+
+    Returns:
+        The score as a float; higher is better.
+    """
+    score = 1.0
+    # Verify facts. NOTE(review): relies on verify_facts returning the keys
+    # read below -- confirm against its return dict.
+    verification = verify_facts(summary, original_text)
+    if not verification['is_valid']:
+        score -= 0.3
+    # Check numbers
+    score -= 0.1 * len(verification['missing_numbers'])
+    score -= 0.2 * len(verification['incorrect_numbers'])
+    # Check statistical significance preservation
+    if verification['missing_significance']:
+        score -= 0.1
+    # Check temporal sequence
+    if not verification['temporal_sequence_preserved']:
+        score -= 0.1
+    # Check for contradictions and inconsistencies
+    score -= 0.2 * len(verification['contradictions'])
+    score -= 0.2 * len(verification['internal_inconsistencies'])
+    # Check section structure and content
+    required_sections = ['Background and objectives', 'Methods', 'Key findings', 'Conclusions']
+    section_content = {}
+    current_section = None
+    for line in (summary or '').split('\n'):
+        stripped = line.strip()
+        lowered = stripped.lower()
+        # A header must start the line; a substring test would file
+        # "Key findings: both methods ..." under Methods.
+        header = next((name for name in required_sections
+                       if lowered.startswith(name.lower())), None)
+        if header is not None:
+            current_section = header
+            section_content.setdefault(header, [])
+            # Text after the label belongs to the section: post_process_summary
+            # writes each section as a single "Name: content." line.
+            stripped = stripped[len(header):].lstrip(': ').strip()
+        if current_section and stripped:
+            section_content[current_section].append(stripped)
+    for section in required_sections:
+        if section not in section_content:
+            score -= 0.15  # Missing section
+        elif not section_content[section]:
+            score -= 0.1   # Empty section
+        elif len(' '.join(section_content[section]).split()) < 10:
+            score -= 0.05  # Too short
+    # Without this return the caller would receive None and fail on comparison
+    return score
+def post_process_summary(summary):
+    """Reformat a generated summary into the four labelled sections.
+
+    Collapses repeated headers, removes stray "Objective(s):" / "Results:"
+    labels and echoed prompt instructions, then emits one
+    "<Section>: <content>." paragraph per non-empty section in the fixed
+    order Background and objectives, Methods, Key findings, Conclusions.
+    Text before the first recognized header is dropped. When no labelled
+    section is found at all, the cleaned text is returned as-is instead of
+    an empty string.
+
+    Args:
+        summary: Raw decoded model output.
+
+    Returns:
+        The reformatted summary; falsy input is returned unchanged.
+    """
+    if not summary:
+        return summary
+    section_names = ['Background and objectives', 'Methods', 'Key findings', 'Conclusions']
+    # Clean up section headers
+    summary = re.sub(r'(?i)background and objectives:?\s*background and objectives:?',
+                    'Background and objectives:', summary)
+    summary = re.sub(r'(?i)methods:?\s*methods:?', 'Methods:', summary)
+    summary = re.sub(r'(?i)(key )?findings:?\s*(key )?findings:?', 'Key findings:', summary)
+    summary = re.sub(r'(?i)conclusions?:?\s*conclusions?:?', 'Conclusions:', summary)
+    summary = re.sub(r'(?i)materials and methods:?', 'Methods:', summary)
+    # Remove stray labels only in their "Label:" form. Matching the bare words
+    # would eat the tail of "Background and objectives:" and delete the word
+    # "results" from ordinary sentences.
+    summary = re.sub(r'(?i)(?<!background and )\bobjectives?:\s*', '', summary)
+    summary = re.sub(r'(?i)\bresults:\s*', '', summary)
+    # Remove instruction artifacts
+    summary = re.sub(r'(?i)state only|include only|report all|no assumptions', '', summary)
+    # Split into alternating header / content pieces
+    pieces = re.split(r'(?i)(Background and objectives:|Methods:|Key findings:|Conclusions:)', summary)
+    canonical = {name.lower(): name for name in section_names}
+    collected = {name: [] for name in section_names}
+    current_section = None
+    for piece in pieces:
+        piece = piece.strip()
+        if not piece:
+            continue
+        # Headers come out of the split with their colon and in whatever case
+        # the model used, so normalize before looking them up.
+        label = piece[:-1].strip().lower() if piece.endswith(':') else None
+        if label in canonical:
+            current_section = canonical[label]
+        elif current_section:
+            # Append so a section that appears twice keeps all of its text
+            collected[current_section].append(piece)
+    # Build final summary
+    final_sections = []
+    for name in section_names:
+        content = ' '.join(collected[name])
+        content = re.sub(r'\s+', ' ', content)  # Fix spacing
+        content = re.sub(r'\.+', '.', content)  # Fix multiple periods
+        content = content.strip('.: ')  # Remove trailing periods and spaces
+        if content:
+            final_sections.append(f"{name}: {content}.")
+    if not final_sections:
+        # No labelled section recognized: keep the text rather than lose it
+        return re.sub(r'\s+', ' ', summary).strip()
+    return '\n\n'.join(final_sections)
 def validate_summary(summary, original_text):
     """Validate summary content against original text"""
     # Perform fact verification