Spaces:

pendar02
/

biomedical

Sleeping

App Files Files Community

pendar02 committed on Jan 12

Commit

0f40536

verified ·

1 Parent(s): 005d6b8

Update app.py

Browse files

Files changed (1) hide show

app.py +229 -299

app.py CHANGED Viewed

@@ -27,17 +27,112 @@ if 'processing_started' not in st.session_state:
 if 'focused_summary_generated' not in st.session_state:
     st.session_state.focused_summary_generated = False
 def load_model(model_type):
     """Load appropriate model based on type with proper memory management"""
     try:
-        # Clear any existing cached data
         gc.collect()
         torch.cuda.empty_cache()
-        device = "cpu"  # Force CPU usage
         if model_type == "summarize":
-            # Load the new fine-tuned model directly
             model = AutoModelForSeq2SeqLM.from_pretrained(
                 "pendar02/bart-large-pubmedd",
                 cache_dir="./models",
@@ -48,7 +143,7 @@ def load_model(model_type):
                 "pendar02/bart-large-pubmedd",
                 cache_dir="./models"
             )
-        else:  # question_focused
             base_model = AutoModelForSeq2SeqLM.from_pretrained(
                 "GanjinZero/biobart-base",
                 cache_dir="./models",
@@ -73,7 +168,6 @@ def load_model(model_type):
         raise
 def cleanup_model(model, tokenizer):
-    """Properly cleanup model resources"""
     try:
         del model
         del tokenizer
@@ -82,15 +176,12 @@ def cleanup_model(model, tokenizer):
     except Exception:
         pass
-@st.cache_data
 def process_excel(uploaded_file):
-    """Process uploaded Excel file"""
     try:
         df = pd.read_excel(uploaded_file)
         required_columns = ['Abstract', 'Article Title', 'Authors',
                           'Source Title', 'Publication Year', 'DOI', 'Times Cited, All Databases']
-        # Check required columns
         missing_columns = [col for col in required_columns if col not in df.columns]
         if missing_columns:
             st.error(f"Missing required columns: {', '.join(missing_columns)}")
@@ -101,305 +192,144 @@ def process_excel(uploaded_file):
         st.error(f"Error processing file: {str(e)}")
         return None
def verify_facts(summary, original_text):
    """Heuristically check that key facts in the summary match the original text.

    All checks are regex based; they compare surface strings, not meaning.

    Args:
        summary: Generated summary text.
        original_text: Source text the summary was produced from.

    Returns:
        dict with the keys:
            'missing_numbers': numbers present in the original but not the summary.
            'incorrect_numbers': numbers present in the summary but not the original.
            'missing_significance': significance statements (with up to 50
                characters of context on each side) found only in the original.
            'missing_relationships': relationship phrases (same context window)
                found only in the original.
            'temporal_sequence_preserved': True when every temporal marker of the
                original also occurs among the summary's markers.
            'contradictions': messages for opposing phrases across the two texts.
            'internal_inconsistencies': messages for opposing phrases that both
                occur inside the summary.
            'is_valid': True when no number is missing and both message lists
                are empty.
    """
    significance_patterns = [
        r'[pP][\s-]value.*?(?:=|was|of)\s*([<>]?\s*\d+\.?\d*)',
        r'significant(?:ly)?\s+(?:difference|increase|decrease|change|association)',
        r'statistical(?:ly)?\s+significant',
        r'[pP]\s*[<>]\s*\d+\.?\d*'
    ]
    temporal_patterns = [
        r'(?:after|following|within)\s+(\d+)\s*(?:weeks?|months?|years?)',
        r'at\s+(\d+)\s*(?:weeks?|months?|years?)',
        r'(?:baseline|initial|follow-up|final)'
    ]
    relationship_patterns = [
        r'associated with',
        r'predicted',
        r'correlated with',
        r'relationship between',
        r'linked to',
        r'impact(ed)? on',
        r'effect(ed)? on',
        r'influenced?',
        r'dependent on'
    ]
    # (negated/opposite phrase, affirmative phrase)
    opposing_pairs = [
        (r'no association', r'associated with'),
        (r'did not predict', r'predicted'),
        (r'was not significant', r'was significant'),
        (r'decreased', r'increased'),
        (r'lower', r'higher'),
        (r'negative', r'positive'),
        (r'no effect', r'had effect'),
        (r'no difference', r'difference'),
        (r'no change', r'changed')
    ]

    def extract_numbers(text):
        # Only keep a decimal point when digits follow it, so a number that
        # ends a sentence ("... was 5.") still compares equal to "5".
        return set(re.findall(r'\d+(?:\.\d+)?', text))

    def extract_with_context(patterns, text, window=50):
        # Collect each match together with its surrounding context.
        found = set()
        for pattern in patterns:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                start = max(0, match.start() - window)
                end = min(len(text), match.end() + window)
                found.add(text[start:end].strip())
        return found

    def extract_temporal_markers(text):
        # Lower-cased so "Baseline" in one text matches "baseline" in the other.
        return [
            match.group().lower()
            for pattern in temporal_patterns
            for match in re.finditer(pattern, text, re.IGNORECASE)
        ]

    def contains(pattern, text, excluding=None):
        # Blank out the negated phrase first so that e.g. "no difference" is
        # not also counted as an occurrence of "difference".
        if excluding is not None:
            text = re.sub(excluding, ' ', text)
        return re.search(pattern, text) is not None

    def opposing_claims(first, second):
        # Return the (negated, affirmed) pairs where the two texts disagree.
        first, second = first.lower(), second.lower()
        clashes = []
        for negated, affirmed in opposing_pairs:
            if (contains(negated, first) and contains(affirmed, second, excluding=negated)) or \
               (contains(affirmed, first, excluding=negated) and contains(negated, second)):
                clashes.append((negated, affirmed))
        return clashes

    original_numbers = extract_numbers(original_text)
    summary_numbers = extract_numbers(summary)
    missing_numbers = original_numbers - summary_numbers

    original_significance = extract_with_context(significance_patterns, original_text)
    summary_significance = extract_with_context(significance_patterns, summary)

    original_relationships = extract_with_context(relationship_patterns, original_text)
    summary_relationships = extract_with_context(relationship_patterns, summary)

    original_sequence = extract_temporal_markers(original_text)
    summary_markers = ' '.join(extract_temporal_markers(summary))

    contradictions = [
        f"Contradiction found: {negated} vs {affirmed}"
        for negated, affirmed in opposing_claims(summary, original_text)
    ]
    # Comparing the summary with itself flags opposing phrases that co-occur.
    internal_inconsistencies = [
        f"Internal contradiction: {negated} vs {affirmed}"
        for negated, affirmed in opposing_claims(summary, summary)
    ]

    return {
        'missing_numbers': missing_numbers,
        'incorrect_numbers': summary_numbers - original_numbers,
        'missing_significance': original_significance - summary_significance,
        'missing_relationships': original_relationships - summary_relationships,
        'temporal_sequence_preserved': all(marker in summary_markers for marker in original_sequence),
        'contradictions': contradictions,
        'internal_inconsistencies': internal_inconsistencies,
        'is_valid': (not missing_numbers and
                     not contradictions and
                     not internal_inconsistencies)
    }
def preprocess_text(text):
    """Normalise abstract text into one cleaned sentence per line.

    Non-string or whitespace-only input is returned unchanged. Otherwise
    whitespace is collapsed, sentence ends become line breaks, and sample-size,
    percentage and p-value notations are tightened.
    """
    is_blank = not isinstance(text, str) or not text.strip()
    if is_blank:
        return text

    # Collapse whitespace first, then break after every ". ".
    normalized = re.sub(r'\s+', ' ', text).replace('. ', '.\n')

    # Applied in this exact order: sentence breaks, (n=X), N%, p<X.
    cleanup_rules = (
        (r'(?<=[.!?])\s*(?=[A-Z])', '\n'),
        (r'\(\s*([Nn])\s*=\s*(\d+)\s*\)', r'(n=\2)'),
        (r'(\d+)\s*%', r'\1%'),
        (r'([Pp])\s*([<>])\s*(\d)', r'\1\2\3'),
    )
    for pattern, replacement in cleanup_rules:
        normalized = re.sub(pattern, replacement, normalized)

    # Drop empty lines and surrounding blanks.
    pieces = (piece.strip() for piece in normalized.split('\n'))
    return '\n'.join(piece for piece in pieces if piece)
 def improve_summary_generation(text, model, tokenizer, max_attempts=3):
     """Generate improved summary with better prompt and validation"""
     if not isinstance(text, str) or not text.strip():
         return "No abstract available to summarize."
-    formatted_text = (
-        "Summarize this medical research paper, strictly following these rules:\n\n"
-        "1. Background and objectives:\n"
-        "   - State ONLY the main purpose and study population\n"
-        "   - Include sample size if mentioned (format as n=X)\n"
-        "   - No methodology details here\n\n"
-        "2. Methods:\n"
-        "   - List the specific procedures and measurements used\n"
-        "   - Include timeframes and follow-up periods\n"
-        "   - No results here\n\n"
-        "3. Key findings:\n"
-        "   - Report ALL numerical results (%, numbers, p-values)\n"
-        "   - Include ALL statistical relationships\n"
-        "   - Present findings in chronological order\n\n"
-        "4. Conclusions:\n"
-        "   - State ONLY conclusions directly supported by the results\n"
-        "   - Include practical implications if mentioned\n"
-        "   - No new information\n\n"
-        "Important:\n"
-        "- Keep each section separate and clearly labeled\n"
-        "- Use exact numbers from the text\n"
-        "- Maintain original relationships between variables\n"
-        "- No speculation or external information\n\n"
-        "Original text:\n" + preprocess_text(text)
-    )
-    inputs = tokenizer(formatted_text, return_tensors="pt", max_length=1024, truncation=True)
-    inputs = {k: v.to(model.device) for k, v in inputs.items()}
-    parameter_combinations = [
-        {"temperature": 0.1, "num_beams": 12, "length_penalty": 2.0, "top_k": 50},
-        {"temperature": 0.05, "num_beams": 15, "length_penalty": 2.5, "top_k": 30},
-        {"temperature": 0.0, "num_beams": 20, "length_penalty": 3.0, "top_k": 10}
-    ]
-    best_summary = None
-    best_score = -1
-    attempts = 0
-    while attempts < max_attempts:
-        for params in parameter_combinations:
-            with torch.no_grad():
-                summary_ids = model.generate(
-                    **{
-                        "input_ids": inputs["input_ids"],
-                        "attention_mask": inputs["attention_mask"],
-                        "max_length": 300,
-                        "min_length": 100,
-                        "num_beams": params["num_beams"],
-                        "length_penalty": params["length_penalty"],
-                        "no_repeat_ngram_size": 3,
-                        "temperature": params["temperature"],
-                        "top_k": params["top_k"],
-                        "repetition_penalty": 2.5,
-                        "do_sample": params["temperature"] > 0.0
-                    }
-                )
-            summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-            processed_summary = post_process_summary(summary)
-            score = score_summary(processed_summary, text)
-            if score > best_score:
-                best_summary = processed_summary
-                best_score = score
-            if score > 0.8:  # Good enough threshold
-                return best_summary
-        attempts += 1
-        # Adjust parameters for next attempt if needed
         parameter_combinations = [
-            {**params,
-             "num_beams": params["num_beams"] + 5,
-             "length_penalty": params["length_penalty"] + 0.5}
-            for params in parameter_combinations
         ]
-    return best_summary
def score_summary(summary, original_text):
    """Score summary quality based on multiple factors.

    Starts at 1.0 and subtracts penalties for failed fact verification
    (via ``verify_facts``) and for missing, empty or very short sections.
    The result is not clamped, so it can be negative.

    Args:
        summary: Post-processed summary text.
        original_text: Text the summary was generated from.

    Returns:
        float score; higher is better.
    """
    score = 1.0
    verification = verify_facts(summary, original_text)
    if not verification['is_valid']:
        score -= 0.3
    # Numbers
    if verification['missing_numbers']:
        score -= 0.1 * len(verification['missing_numbers'])
    if verification['incorrect_numbers']:
        score -= 0.2 * len(verification['incorrect_numbers'])
    # Statistical significance preservation
    if verification['missing_significance']:
        score -= 0.1
    # Temporal sequence
    if not verification['temporal_sequence_preserved']:
        score -= 0.1
    # Contradictions and inconsistencies
    if verification['contradictions']:
        score -= 0.2 * len(verification['contradictions'])
    if verification['internal_inconsistencies']:
        score -= 0.2 * len(verification['internal_inconsistencies'])

    # Section structure and content
    required_sections = ['Background and objectives', 'Methods', 'Key findings', 'Conclusions']
    section_content = {}
    current_section = None
    for line in summary.split('\n'):
        lowered = line.lower()
        header_end = -1
        for section in required_sections:
            position = lowered.find(section.lower())
            if position != -1:
                current_section = section
                header_end = position + len(section)
                break
        if header_end != -1:
            # Summaries are emitted as "Section: content." on a single line,
            # so the text after the header belongs to that section.
            remainder = line[header_end:].strip(' :')
            section_content[current_section] = [remainder] if remainder else []
        elif current_section and line.strip():
            section_content[current_section].append(line.strip())

    for section in required_sections:
        if section not in section_content:
            score -= 0.15  # Missing section
        elif not section_content[section]:
            score -= 0.1   # Empty section
        elif len(' '.join(section_content[section]).split()) < 10:
            score -= 0.05  # Too short
    # The original never returned the score, so callers always received None.
    return score
 def post_process_summary(summary):
     """Enhanced post-processing focused on maintaining structure and removing artifacts"""
     if not summary:
         return summary
     # Clean up section headers
-    summary = re.sub(r'(?i)background and objectives:?\s*background and objectives:?',
-                    'Background and objectives:', summary)
-    summary = re.sub(r'(?i)methods:?\s*methods:?', 'Methods:', summary)
-    summary = re.sub(r'(?i)(key )?findings:?\s*(key )?findings:?', 'Key findings:', summary)
-    summary = re.sub(r'(?i)conclusions?:?\s*conclusions?:?', 'Conclusions:', summary)
-    summary = re.sub(r'(?i)materials and methods:?', 'Methods:', summary)
-    summary = re.sub(r'(?i)objectives?:?', '', summary)
-    summary = re.sub(r'(?i)results:?', '', summary)
-    # Remove instruction artifacts
-    summary = re.sub(r'(?i)state only|include only|report all|no assumptions', '', summary)
-    # Split into sections and clean each
     sections = re.split(r'(?i)(Background and objectives:|Methods:|Key findings:|Conclusions:)', summary)
     sections = [s.strip() for s in sections if s.strip()]
-    # Reorganize into proper sections
     organized_sections = {
         'Background and objectives': '',
         'Methods': '',
@@ -412,21 +342,21 @@ def post_process_summary(summary):
         if item in organized_sections:
             current_section = item
         elif current_section:
-            organized_sections[current_section] = item.strip()
     # Build final summary
     final_sections = []
     for section, content in organized_sections.items():
         if content:
-            # Clean up the content
-            content = re.sub(r'\s+', ' ', content)  # Fix spacing
-            content = re.sub(r'\.+', '.', content)  # Fix multiple periods
-            content = content.strip('.: ')  # Remove trailing periods and spaces
-            # Add to final sections
-            final_sections.append(f"{section}: {content}.")
     return '\n\n'.join(final_sections)
 def validate_summary(summary, original_text):
     """Validate summary content against original text"""
     # Perform fact verification

 if 'focused_summary_generated' not in st.session_state:
     st.session_state.focused_summary_generated = False
def extract_biomedical_facts(text):
    """Extract biomedical-specific facts and measurements.

    Scans ``text`` with a fixed set of regular expressions and groups the
    matched substrings by category.

    Args:
        text: Abstract or summary text. Anything that is not a non-empty
            string yields empty result lists.

    Returns:
        dict with the keys 'p_values', 'measurements', 'demographics',
        'statistical_measures' and 'timeframes'. Each value is a list of the
        matched substrings, ordered by pattern and then by position in the
        text (duplicates are kept).
    """
    # A leading "(?i)" makes the whole pattern case-insensitive. The
    # statistical abbreviations (CI, OR, HR, RR, SD) are matched
    # case-sensitively behind a word boundary so that ordinary words are not
    # picked up ("for 5" used to match as "or 5"); their spelled-out forms
    # stay case-insensitive through the scoped "(?i:...)" group.
    fact_patterns = {
        'p_values': [
            # \b keeps a trailing "p" of another token ("BP < 140") out.
            r'(?i)\bp[\s-]*(?:value)?[\s-]*[=<>]\s*\.?\d+\.?\d*e?-?\d*',  # p = 0.001, p<.05, p < 1e-6
            r'(?i)\bp[\s-]*(?:value)?[\s-]*(?:was|of|is|were)\s*\.?\d+\.?\d*e?-?\d*'  # p value was 0.001
        ],
        'measurements': [
            # Longer units first: regex alternation takes the first match,
            # so "mm" ahead of "mmHg" would truncate "120 mmHg" to "120 mm".
            r'(?i)\d+\.?\d*\s*(?:mmHg|mmol|g/dl|mg|kg|ml|µg|ng|mm|cm|µl)',  # Units
            r'(?i)\d+\.?\d*\s*(?:weeks?|months?|years?|hours?|days?)',  # Time units
            r'(?i)\d+\.?\d*\s*(?:percentage|percent|%)'  # Percentages
        ],
        'demographics': [
            r'(?i)(?:mean|median)\s*age[\s:]*(?:was|of|=)?\s*\d+\.?\d*',
            r'(?i)\d+\.?\d*%?\s*(?:men|women|male|female)',
            r'(?i)\d+\.?\d*%?\s*of\s*(?:patients|participants|subjects)',
            r'(?i)n\s*=\s*\d+',  # Sample size
            r'(?i)aged?\s*\d+[-–]\d+\s*(?:years?|yrs?)?'  # Age range
        ],
        'statistical_measures': [
            r'\b(?:CI|(?i:confidence interval))[\s:]*(?:\d+\.?\d*%?)?\s*[-–]\s*(?:\d+\.?\d*%?)',  # 95% CI: 1.2-3.4
            r'\b(?:OR|(?i:odds ratio))[\s:]*(?:\d+\.?\d*)',  # OR: 1.5
            r'\b(?:HR|(?i:hazard ratio))[\s:]*(?:\d+\.?\d*)',  # HR: 2.1
            r'\b(?:RR|(?i:relative risk))[\s:]*(?:\d+\.?\d*)',  # RR: 1.3
            r'\b(?:SD|(?i:standard deviation))[\s:]*[±]?\s*\d+\.?\d*'  # SD: ±2.1
        ],
        'timeframes': [
            r'(?i)(?:followed|monitored|observed|tracked)\s*(?:for|over|during)\s*\d+\.?\d*\s*(?:weeks?|months?|years?)',
            r'(?i)(?:follow-up|duration)\s*(?:of|was|=)\s*\d+\.?\d*\s*(?:weeks?|months?|years?)',
            r'(?i)\d+[-–]\s*(?:week|month|year)\s*(?:follow-up|period|duration)'
        ]
    }
    facts = {category: [] for category in fact_patterns}
    if not isinstance(text, str) or not text:
        return facts
    for category, patterns in fact_patterns.items():
        for pattern in patterns:
            facts[category].extend(match.group() for match in re.finditer(pattern, text))
    return facts
def identify_abstract_structure(text):
    """Identify the structure of the biomedical abstract.

    Looks for common section keywords as whole words (case-insensitive,
    optional plural "s").

    Args:
        text: Abstract text. Non-string input is treated as unstructured.

    Returns:
        The string "unstructured" when no section keyword is found, otherwise
        a list with the detected section names, a subset of
        ['background', 'methods', 'results', 'conclusions'] in that order.
    """
    if not isinstance(text, str):
        return "unstructured"
    # Common section headers in biomedical abstracts
    section_patterns = {
        'background': r'(?:background|introduction|objective|purpose|aim)',
        'methods': r'(?:methods|materials|design|study design|procedure)',
        'results': r'(?:results|findings|outcome)',
        'conclusions': r'(?:conclusion|discussion|summary|implications)'
    }
    # Word boundaries stop keywords from matching inside other words
    # (e.g. "aim" inside "claimed"). One pass covers both the "any header?"
    # check and the per-section detection.
    present_sections = [
        section
        for section, pattern in section_patterns.items()
        if re.search(rf"\b{pattern}s?\b", text, re.IGNORECASE)
    ]
    return present_sections if present_sections else "unstructured"
 def load_model(model_type):
     """Load appropriate model based on type with proper memory management"""
     try:
         gc.collect()
         torch.cuda.empty_cache()
+        device = "cpu"
         if model_type == "summarize":
             model = AutoModelForSeq2SeqLM.from_pretrained(
                 "pendar02/bart-large-pubmedd",
                 cache_dir="./models",
                 "pendar02/bart-large-pubmedd",
                 cache_dir="./models"
             )
+        else:
             base_model = AutoModelForSeq2SeqLM.from_pretrained(
                 "GanjinZero/biobart-base",
                 cache_dir="./models",
         raise
 def cleanup_model(model, tokenizer):
     try:
         del model
         del tokenizer
     except Exception:
         pass
 def process_excel(uploaded_file):
     try:
         df = pd.read_excel(uploaded_file)
         required_columns = ['Abstract', 'Article Title', 'Authors',
                           'Source Title', 'Publication Year', 'DOI', 'Times Cited, All Databases']
         missing_columns = [col for col in required_columns if col not in df.columns]
         if missing_columns:
             st.error(f"Missing required columns: {', '.join(missing_columns)}")
         st.error(f"Error processing file: {str(e)}")
         return None
 def improve_summary_generation(text, model, tokenizer, max_attempts=3):
     """Generate improved summary with better prompt and validation"""
     if not isinstance(text, str) or not text.strip():
         return "No abstract available to summarize."
+    try:
+        # Identify abstract structure and extract facts
+        structure = identify_abstract_structure(text)
+        facts = extract_biomedical_facts(text)
+        # Build prompt based on structure
+        if structure == "unstructured":
+            section_prompt = (
+                "Organize this unstructured biomedical abstract into clear sections:\n"
+                "1. Background/Objectives\n"
+                "2. Methods\n"
+                "3. Results\n"
+                "4. Conclusions\n\n"
+            )
+        else:
+            section_prompt = "Summarize while maintaining these sections:\n"
+            for section in structure:
+                section_prompt += f"- {section.capitalize()}\n"
+        formatted_text = (
+            f"{section_prompt}\n"
+            "Requirements:\n"
+            "- Include ALL statistical findings (p-values, CIs, ORs)\n"
+            "- Preserve ALL demographic information\n"
+            "- Maintain ALL measurements and units\n"
+            "- Keep ALL timeframes and follow-up periods\n"
+            "- Report numerical results with original precision\n"
+            "- Preserve relationships between variables\n"
+            "- Maintain chronological order of findings\n\n"
+            "Original text:\n" + text
+        )
+        inputs = tokenizer(formatted_text, return_tensors="pt", max_length=1024, truncation=True)
+        inputs = {k: v.to(model.device) for k, v in inputs.items()}
         parameter_combinations = [
+            {"temperature": 0.1, "num_beams": 12, "length_penalty": 2.0, "top_k": 50},
+            {"temperature": 0.05, "num_beams": 15, "length_penalty": 2.5, "top_k": 30},
+            {"temperature": 0.0, "num_beams": 20, "length_penalty": 3.0, "top_k": 10}
         ]
+        best_summary = None
+        best_score = -1
+        attempts = 0
+        while attempts < max_attempts:
+            for params in parameter_combinations:
+                try:
+                    with torch.no_grad():
+                        summary_ids = model.generate(
+                            **{
+                                "input_ids": inputs["input_ids"],
+                                "attention_mask": inputs["attention_mask"],
+                                "max_length": 300,
+                                "min_length": 100,
+                                "num_beams": params["num_beams"],
+                                "length_penalty": params["length_penalty"],
+                                "no_repeat_ngram_size": 3,
+                                "temperature": params["temperature"],
+                                "top_k": params["top_k"],
+                                "repetition_penalty": 2.5,
+                                "do_sample": params["temperature"] > 0.0
+                            }
+                        )
+                    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+                    if not summary:
+                        continue
+                    processed_summary = post_process_summary(summary)
+                    if not processed_summary:
+                        continue
+                    # Validate biomedical content
+                    summary_facts = extract_biomedical_facts(processed_summary)
+                    missing_facts = {k: set(v) - set(summary_facts[k]) for k, v in facts.items()}
+                    # Calculate score
+                    score = 1.0
+                    for category, missing in missing_facts.items():
+                        if missing:
+                            score -= 0.1 * len(missing)
+                    if score > best_score:
+                        best_summary = processed_summary
+                        best_score = score
+                    if score > 0.8:
+                        return best_summary
+                except Exception as e:
+                    print(f"Error in generation attempt: {str(e)}")
+                    continue
+            attempts += 1
+            parameter_combinations = [
+                {**params,
+                 "num_beams": params["num_beams"] + 5,
+                 "length_penalty": params["length_penalty"] + 0.5}
+                for params in parameter_combinations
+            ]
+        return best_summary if best_summary is not None else "Unable to generate a satisfactory summary."
+    except Exception as e:
+        print(f"Error in summary generation: {str(e)}")
+        return "Error generating summary."
 def post_process_summary(summary):
     """Enhanced post-processing focused on maintaining structure and removing artifacts"""
     if not summary:
         return summary
     # Clean up section headers
+    header_mappings = {
+        r'(?i)background.*objectives?:?': 'Background and objectives:',
+        r'(?i)(materials?\s*and\s*)?methods?:?': 'Methods:',
+        r'(?i)(key\s*)?findings?:?|results?:?': 'Key findings:',
+        r'(?i)conclusions?:?': 'Conclusions:',
+        r'(?i)(study\s*)?aims?:?|goals?:?|purpose:?': '',
+        r'(?i)objectives?:?': '',
+        r'(?i)outcomes?:?': '',
+        r'(?i)discussion:?': ''
+    }
+    for pattern, replacement in header_mappings.items():
+        summary = re.sub(pattern, replacement, summary)
+    # Split into sections and clean
     sections = re.split(r'(?i)(Background and objectives:|Methods:|Key findings:|Conclusions:)', summary)
     sections = [s.strip() for s in sections if s.strip()]
+    # Reorganize sections
     organized_sections = {
         'Background and objectives': '',
         'Methods': '',
         if item in organized_sections:
             current_section = item
         elif current_section:
+            # Clean up content
+            content = re.sub(r'\s+', ' ', item)  # Fix spacing
+            content = re.sub(r'\.+', '.', content)  # Fix multiple periods
+            content = content.strip('.: ')  # Remove trailing periods and spaces
+            organized_sections[current_section] = content
     # Build final summary
     final_sections = []
     for section, content in organized_sections.items():
         if content:
+            final_sections.append(f"{section} {content}.")
     return '\n\n'.join(final_sections)
+# The rest of your app.py code (main function, UI components, etc.) remains the same...
 def validate_summary(summary, original_text):
     """Validate summary content against original text"""
     # Perform fact verification