Spaces:

pendar02
/

biomedical

Sleeping

App Files Community

pendar02 committed on Jan 12

Commit

0a57b0f

verified ·

1 Parent(s): 0f40536

Update app.py

Browse files

Files changed (1) hide show

app.py +100 -225

app.py CHANGED Viewed

@@ -27,103 +27,52 @@ if 'processing_started' not in st.session_state:
 if 'focused_summary_generated' not in st.session_state:
     st.session_state.focused_summary_generated = False
-def extract_biomedical_facts(text):
-    """Pull biomedical facts and measurements out of free text.
-
-    Every regex below is applied case-insensitively to ``text``; the matched
-    substrings are collected per category, pattern by pattern, in the order
-    the patterns are listed. Duplicate matches are kept.
-
-    Returns a dict with the keys 'p_values', 'measurements', 'demographics',
-    'statistical_measures' and 'timeframes', each mapping to a list of the
-    matched strings (empty when nothing matched).
-    """
-    # Category -> regexes, listed in the order they are scanned.
-    pattern_table = (
-        ('p_values', (
-            r'[pP][\s-]*(?:value)?[\s-]*[=<>]\s*\.?\d+\.?\d*e?-?\d*',  # p = 0.001, p<.05, p < 1e-6
-            r'[pP][\s-]*(?:value)?[\s-]*(?:was|of|is|were)\s*\.?\d+\.?\d*e?-?\d*',  # p value was 0.001
-        )),
-        ('statistical_measures', (
-            r'(?:CI|confidence interval)[\s:]*(?:\d+\.?\d*%?)?\s*[-–]\s*(?:\d+\.?\d*%?)',  # 95% CI: 1.2-3.4
-            r'(?:OR|odds ratio)[\s:]*(?:\d+\.?\d*)',  # OR: 1.5
-            r'(?:HR|hazard ratio)[\s:]*(?:\d+\.?\d*)',  # HR: 2.1
-            r'(?:RR|relative risk)[\s:]*(?:\d+\.?\d*)',  # RR: 1.3
-            r'(?:SD|standard deviation)[\s:]*[±]?\s*\d+\.?\d*',  # SD: ±2.1
-        )),
-        ('measurements', (
-            r'\d+\.?\d*\s*(?:mg|kg|ml|mmol|µg|ng|mm|cm|µl|g/dl|mmHg)',  # Units
-            r'\d+\.?\d*\s*(?:weeks?|months?|years?|hours?|days?)',  # Time units
-            r'\d+\.?\d*\s*(?:%|percent|percentage)',  # Percentages
-        )),
-        ('demographics', (
-            r'(?:mean|median)\s*age[\s:]*(?:was|of|=)?\s*\d+\.?\d*',
-            r'(?:\d+\.?\d*%?\s*(?:men|women|male|female))',
-            r'(?:\d+\.?\d*%?\s*of\s*(?:patients|participants|subjects))',
-            r'(?:[Nn]\s*=\s*\d+)',  # Sample size
-            r'(?:aged?\s*\d+[-–]\d+\s*(?:years?|yrs?)?)',  # Age range
-        )),
-        ('timeframes', (
-            r'(?:followed|monitored|observed|tracked)\s*(?:for|over|during)\s*\d+\.?\d*\s*(?:weeks?|months?|years?)',
-            r'(?:follow-up|duration)\s*(?:of|was|=)\s*\d+\.?\d*\s*(?:weeks?|months?|years?)',
-            r'\d+[-–]\s*(?:week|month|year)\s*(?:follow-up|period|duration)',
-        )),
-    )
-    # All result keys exist up front, so a category without matches still
-    # maps to an empty list.
-    facts = {
-        'p_values': [],
-        'measurements': [],
-        'demographics': [],
-        'statistical_measures': [],
-        'timeframes': []
-    }
-    for category, regexes in pattern_table:
-        bucket = facts[category]
-        for regex in regexes:
-            for hit in re.finditer(regex, text, re.IGNORECASE):
-                bucket.append(hit.group())
-    return facts
-def identify_abstract_structure(text):
-    """Report which standard abstract sections ``text`` mentions.
-
-    Returns the string "unstructured" when no section keyword is found;
-    otherwise a list drawn from 'background', 'methods', 'results' and
-    'conclusions', in that order.
-    """
-    # Keyword alternatives that count as evidence for each section
-    section_patterns = {
-        'background': r'(?:background|introduction|objective|purpose|aim)',
-        'methods': r'(?:methods|materials|design|study design|procedure)',
-        'results': r'(?:results|findings|outcome)',
-        'conclusions': r'(?:conclusion|discussion|summary|implications)'
     }
-    # A keyword may carry an optional plural "s" and an optional colon. The
-    # search is case-insensitive and unanchored, so it matches anywhere in
-    # the text, not only at the start of a line.
-    found = [
-        name
-        for name, keywords in section_patterns.items()
-        if re.search(f"{keywords}s?:?", text, re.IGNORECASE)
-    ]
-    return found if found else "unstructured"
 def load_model(model_type):
     """Load appropriate model based on type with proper memory management"""
@@ -192,114 +141,47 @@ def process_excel(uploaded_file):
         st.error(f"Error processing file: {str(e)}")
         return None
-def improve_summary_generation(text, model, tokenizer, max_attempts=3):
     """Generate improved summary with better prompt and validation"""
     if not isinstance(text, str) or not text.strip():
         return "No abstract available to summarize."
     try:
-        # Identify abstract structure and extract facts
-        structure = identify_abstract_structure(text)
-        facts = extract_biomedical_facts(text)
-        # Build prompt based on structure
-        if structure == "unstructured":
-            section_prompt = (
-                "Organize this unstructured biomedical abstract into clear sections:\n"
-                "1. Background/Objectives\n"
-                "2. Methods\n"
-                "3. Results\n"
-                "4. Conclusions\n\n"
-            )
-        else:
-            section_prompt = "Summarize while maintaining these sections:\n"
-            for section in structure:
-                section_prompt += f"- {section.capitalize()}\n"
         formatted_text = (
-            f"{section_prompt}\n"
-            "Requirements:\n"
-            "- Include ALL statistical findings (p-values, CIs, ORs)\n"
-            "- Preserve ALL demographic information\n"
-            "- Maintain ALL measurements and units\n"
-            "- Keep ALL timeframes and follow-up periods\n"
-            "- Report numerical results with original precision\n"
-            "- Preserve relationships between variables\n"
-            "- Maintain chronological order of findings\n\n"
-            "Original text:\n" + text
         )
         inputs = tokenizer(formatted_text, return_tensors="pt", max_length=1024, truncation=True)
         inputs = {k: v.to(model.device) for k, v in inputs.items()}
-        parameter_combinations = [
-            {"temperature": 0.1, "num_beams": 12, "length_penalty": 2.0, "top_k": 50},
-            {"temperature": 0.05, "num_beams": 15, "length_penalty": 2.5, "top_k": 30},
-            {"temperature": 0.0, "num_beams": 20, "length_penalty": 3.0, "top_k": 10}
-        ]
-        best_summary = None
-        best_score = -1
-        attempts = 0
-        while attempts < max_attempts:
-            for params in parameter_combinations:
-                try:
-                    with torch.no_grad():
-                        summary_ids = model.generate(
-                            **{
-                                "input_ids": inputs["input_ids"],
-                                "attention_mask": inputs["attention_mask"],
-                                "max_length": 300,
-                                "min_length": 100,
-                                "num_beams": params["num_beams"],
-                                "length_penalty": params["length_penalty"],
-                                "no_repeat_ngram_size": 3,
-                                "temperature": params["temperature"],
-                                "top_k": params["top_k"],
-                                "repetition_penalty": 2.5,
-                                "do_sample": params["temperature"] > 0.0
-                            }
-                        )
-                    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-                    if not summary:
-                        continue
-                    processed_summary = post_process_summary(summary)
-                    if not processed_summary:
-                        continue
-                    # Validate biomedical content
-                    summary_facts = extract_biomedical_facts(processed_summary)
-                    missing_facts = {k: set(v) - set(summary_facts[k]) for k, v in facts.items()}
-                    # Calculate score
-                    score = 1.0
-                    for category, missing in missing_facts.items():
-                        if missing:
-                            score -= 0.1 * len(missing)
-                    if score > best_score:
-                        best_summary = processed_summary
-                        best_score = score
-                    if score > 0.8:
-                        return best_summary
-                except Exception as e:
-                    print(f"Error in generation attempt: {str(e)}")
-                    continue
-            attempts += 1
-            parameter_combinations = [
-                {**params,
-                 "num_beams": params["num_beams"] + 5,
-                 "length_penalty": params["length_penalty"] + 0.5}
-                for params in parameter_combinations
-            ]
-        return best_summary if best_summary is not None else "Unable to generate a satisfactory summary."
     except Exception as e:
         print(f"Error in summary generation: {str(e)}")
@@ -356,13 +238,12 @@ def post_process_summary(summary):
     return '\n\n'.join(final_sections)
-# The rest of your app.py code (main function, UI components, etc.) remains the same...
 def validate_summary(summary, original_text):
     """Validate summary content against original text"""
     # Perform fact verification
     verification = verify_facts(summary, original_text)
-    if not verification['is_valid']:
         return False
     # Check for age inconsistencies
@@ -386,34 +267,40 @@ def validate_summary(summary, original_text):
 def generate_focused_summary(question, abstracts, model, tokenizer):
     """Generate focused summary based on question"""
-    # Clean every abstract, then pack the question and all abstracts into a
-    # single prompt, with " [SEP] " between consecutive abstracts.
-    cleaned = list(map(preprocess_text, abstracts))
-    prompt = f"Question: {question} Abstracts: " + " [SEP] ".join(cleaned)
-    # The tokenizer truncates the prompt at 1024 tokens.
-    encoded = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
-    # Move each encoded tensor to the device the model lives on.
-    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}
-    generation_args = {
-        "input_ids": encoded["input_ids"],
-        "attention_mask": encoded["attention_mask"],
-        "max_length": 200,
-        "min_length": 50,
-        "num_beams": 4,
-        "length_penalty": 2.0,
-        "early_stopping": True
-    }
-    # Inference only, so gradient tracking is switched off.
-    with torch.no_grad():
-        output_ids = model.generate(**generation_args)
-    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
 def create_filter_controls(df, sort_column):
     """Create appropriate filter controls based on the selected column"""
     filtered_df = df.copy()
     if sort_column == 'Publication Year':
-        # Year range slider
         year_min = int(df['Publication Year'].min())
         year_max = int(df['Publication Year'].max())
         col1, col2 = st.columns(2)
@@ -433,7 +320,6 @@ def create_filter_controls(df, sort_column):
         ]
     elif sort_column == 'Authors':
-        # Multi-select for authors
         unique_authors = sorted(set(
             author.strip()
             for authors in df['Authors'].dropna()
@@ -451,7 +337,6 @@ def create_filter_controls(df, sort_column):
             ]
     elif sort_column == 'Source Title':
-        # Multi-select for source titles
         unique_sources = sorted(df['Source Title'].unique())
         selected_sources = st.multiselect(
             'Select Sources',
@@ -460,13 +345,7 @@ def create_filter_controls(df, sort_column):
         if selected_sources:
             filtered_df = filtered_df[filtered_df['Source Title'].isin(selected_sources)]
-    elif sort_column == 'Article Title':
-        # Only alphabetical sorting, no filtering
-        pass
     elif sort_column == 'Times Cited':
-        # Cited count range slider
         cited_min = int(df['Times Cited'].min())
         cited_max = int(df['Times Cited'].max())
         col1, col2 = st.columns(2)
@@ -490,19 +369,16 @@ def create_filter_controls(df, sort_column):
 def main():
     st.title("🔬 Biomedical Papers Analysis")
-    # File upload section
     uploaded_file = st.file_uploader(
         "Upload Excel file containing papers",
         type=['xlsx', 'xls'],
         help="File must contain: Abstract, Article Title, Authors, Source Title, Publication Year, DOI"
     )
-    # Question input - moved up but hidden initially
     question_container = st.empty()
     question = ""
     if uploaded_file is not None:
-        # Process Excel file
         if st.session_state.processed_data is None:
             with st.spinner("Processing file..."):
                 df = process_excel(uploaded_file)
@@ -513,15 +389,14 @@ def main():
             df = st.session_state.processed_data
             st.write(f"📊 Loaded {len(df)} papers with abstracts")
-            # Get question before processing
             with question_container:
                 question = st.text_input(
                     "Enter your research question (optional):",
-                    help="If provided, a question-focused summary will be generated after individual summaries"
                 )
             # Single button for both processes
-            if not st.session_state.get('processing_started', False):
                 if st.button("Start Analysis"):
                     st.session_state.processing_started = True

 if 'focused_summary_generated' not in st.session_state:
     st.session_state.focused_summary_generated = False
+def preprocess_text(text):
+    """Normalize whitespace and common statistical notation before summarization.
+
+    Args:
+        text: Abstract text. Non-string or blank values are returned unchanged.
+
+    Returns:
+        The cleaned string: whitespace runs collapsed to one space and the
+        ends trimmed, "54 %" -> "54%", "( N = 120 )" -> "(n=120)",
+        "p < .05" -> "p<.05".
+    """
+    if not isinstance(text, str) or not text.strip():
+        return text
+    # Collapse every run of whitespace (newlines and tabs included) and trim.
+    text = re.sub(r'\s+', ' ', text).strip()
+    # Attach the percent sign to its number.
+    text = re.sub(r'(\d+)\s*%', r'\1%', text)
+    # Canonical sample-size form; an upper-case N is lowered.
+    text = re.sub(r'\(\s*[Nn]\s*=\s*(\d+)\s*\)', r'(n=\1)', text)
+    # Tighten p-value comparisons. The \b keeps words that merely end in "p"
+    # ("temp > 5") intact. "=", "<=", ">=", "≤", "≥" and leading-decimal
+    # values (".05") are handled as well as plain "<" / ">".
+    text = re.sub(r'\b([Pp])\s*(<=|>=|[<>=≤≥])\s*(\.?\d)', r'\1\2\3', text)
+    return text
+def verify_facts(summary, original_text):
+    """Check that a summary adds no numbers or relationship terms absent from its source.
+
+    Args:
+        summary: Generated summary text.
+        original_text: Text the summary was produced from.
+
+    Returns:
+        A dict with three keys:
+        'is_valid' is True when every number and every relationship term
+        found in the summary also occurs in the original text;
+        'missing_numbers' is the set of numbers (as strings) found only in
+        the original; 'missing_relations' is the set of relationship terms
+        found only in the original.
+
+    Non-string inputs are treated as empty text.
+    """
+    relation_terms = (
+        'associated with', 'predicted', 'correlated',
+        'increased', 'decreased', 'significant'
+    )
+
+    def as_text(value):
+        return value if isinstance(value, str) else ''
+
+    def extract_numbers(text):
+        # The fractional part is consumed only when digits follow the dot,
+        # so a sentence-ending "42." is recorded as "42" and compares equal
+        # to "42" elsewhere. A trailing "%" is never part of the number.
+        return set(re.findall(r'\d+(?:\.\d+)?', text))
+
+    def extract_relationships(text):
+        # Plain case-insensitive substring test: "significant" is also found
+        # inside "insignificant", so negation is not detected here.
+        lowered = text.lower()
+        return {term for term in relation_terms if term in lowered}
+
+    original_text = as_text(original_text)
+    summary = as_text(summary)
+    # Get facts from both texts
+    original_numbers = extract_numbers(original_text)
+    summary_numbers = extract_numbers(summary)
+    original_relations = extract_relationships(original_text)
+    summary_relations = extract_relationships(summary)
+    return {
+        'is_valid': summary_numbers.issubset(original_numbers) and
+                   summary_relations.issubset(original_relations),
+        'missing_numbers': original_numbers - summary_numbers,
+        'missing_relations': original_relations - summary_relations
     }
 def load_model(model_type):
     """Load appropriate model based on type with proper memory management"""
         st.error(f"Error processing file: {str(e)}")
         return None
+def improve_summary_generation(text, model, tokenizer):
     """Generate improved summary with better prompt and validation"""
     if not isinstance(text, str) or not text.strip():
         return "No abstract available to summarize."
     try:
+        # Simplified prompt
         formatted_text = (
+            "Summarize this biomedical abstract into four sections:\n"
+            "1. Background/Objectives: State the main purpose and population\n"
+            "2. Methods: Describe what was done\n"
+            "3. Key findings: Include ALL numerical results and statistical relationships\n"
+            "4. Conclusions: State main implications\n\n"
+            "Important: Preserve all numbers, measurements, and statistical findings.\n\n"
+            "Text: " + preprocess_text(text)
         )
         inputs = tokenizer(formatted_text, return_tensors="pt", max_length=1024, truncation=True)
         inputs = {k: v.to(model.device) for k, v in inputs.items()}
+        # Single generation attempt with optimized parameters
+        with torch.no_grad():
+            summary_ids = model.generate(
+                **{
+                    "input_ids": inputs["input_ids"],
+                    "attention_mask": inputs["attention_mask"],
+                    "max_length": 300,
+                    "min_length": 100,
+                    "num_beams": 5,
+                    "length_penalty": 2.0,
+                    "no_repeat_ngram_size": 3,
+                    "temperature": 0.3,
+                    "repetition_penalty": 2.5
+                }
+            )
+        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+        if not summary:
+            return "Error: Could not generate summary."
+        return post_process_summary(summary)
     except Exception as e:
         print(f"Error in summary generation: {str(e)}")
     return '\n\n'.join(final_sections)
 def validate_summary(summary, original_text):
     """Validate summary content against original text"""
     # Perform fact verification
     verification = verify_facts(summary, original_text)
+    if not verification.get('is_valid', False):
         return False
     # Check for age inconsistencies
 def generate_focused_summary(question, abstracts, model, tokenizer):
+    """Generate one summary of several abstracts, focused on a research question.
+
+    Args:
+        question: The research question placed at the top of the prompt.
+        abstracts: Iterable of abstract texts; entries that are not
+            non-blank strings are skipped.
+        model: Model whose ``generate`` method produces the summary ids.
+        tokenizer: Tokenizer used to encode the prompt and decode the output.
+
+    Returns:
+        The decoded summary string. Returns
+        "No abstracts available to summarize." when no usable abstract is
+        left, and "Error generating focused summary." when any step raises.
+    """
+    try:
+        # Non-string entries (e.g. a missing abstract arriving as None/NaN,
+        # presumably possible from the uploaded sheet) would make str.join
+        # raise and fail the whole summary, so they are dropped first.
+        usable_abstracts = [
+            abstract for abstract in abstracts
+            if isinstance(abstract, str) and abstract.strip()
+        ]
+        if not usable_abstracts:
+            return "No abstracts available to summarize."
+        # Preprocess each abstract
+        formatted_abstracts = [preprocess_text(abstract) for abstract in usable_abstracts]
+        combined_input = (
+            f"Question: {question}\nSummarize these abstracts to answer the question:\n"
+            + "\n---\n".join(formatted_abstracts)
+        )
+        # The tokenizer truncates the prompt at 1024 tokens, so abstracts
+        # near the end of a long list may not reach the model.
+        inputs = tokenizer(combined_input, return_tensors="pt", max_length=1024, truncation=True)
+        inputs = {k: v.to(model.device) for k, v in inputs.items()}
+        with torch.no_grad():
+            summary_ids = model.generate(
+                input_ids=inputs["input_ids"],
+                attention_mask=inputs["attention_mask"],
+                max_length=300,
+                min_length=100,
+                num_beams=5,
+                length_penalty=2.0,
+                # NOTE(review): temperature is passed without do_sample;
+                # confirm it has any effect under beam search.
+                temperature=0.3,
+                repetition_penalty=2.5,
+            )
+        return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+    except Exception as e:
+        # Any failure is reported to the caller as a plain message
+        # instead of propagating.
+        print(f"Error in focused summary generation: {str(e)}")
+        return "Error generating focused summary."
 def create_filter_controls(df, sort_column):
     """Create appropriate filter controls based on the selected column"""
     filtered_df = df.copy()
     if sort_column == 'Publication Year':
         year_min = int(df['Publication Year'].min())
         year_max = int(df['Publication Year'].max())
         col1, col2 = st.columns(2)
         ]
     elif sort_column == 'Authors':
         unique_authors = sorted(set(
             author.strip()
             for authors in df['Authors'].dropna()
             ]
     elif sort_column == 'Source Title':
         unique_sources = sorted(df['Source Title'].unique())
         selected_sources = st.multiselect(
             'Select Sources',
         if selected_sources:
             filtered_df = filtered_df[filtered_df['Source Title'].isin(selected_sources)]
     elif sort_column == 'Times Cited':
         cited_min = int(df['Times Cited'].min())
         cited_max = int(df['Times Cited'].max())
         col1, col2 = st.columns(2)
 def main():
     st.title("🔬 Biomedical Papers Analysis")
     uploaded_file = st.file_uploader(
         "Upload Excel file containing papers",
         type=['xlsx', 'xls'],
         help="File must contain: Abstract, Article Title, Authors, Source Title, Publication Year, DOI"
     )
     question_container = st.empty()
     question = ""
     if uploaded_file is not None:
         if st.session_state.processed_data is None:
             with st.spinner("Processing file..."):
                 df = process_excel(uploaded_file)
             df = st.session_state.processed_data
             st.write(f"📊 Loaded {len(df)} papers with abstracts")
             with question_container:
                 question = st.text_input(
                     "Enter your research question (optional):",
+                    help="If provided, a focused summary will be generated after individual summaries"
                 )
             # Single button for both processes
+                if not st.session_state.get('processing_started', False):
                 if st.button("Start Analysis"):
                     st.session_state.processing_started = True