textmetric-stramlit-1

Sleeping

File size: 14,544 Bytes

feab938
 
 
 
 
 
 
 
 
 
 
 
 
9e1790e
feab938
9e1790e
 
8cc1285
 
 
 
 
 
 
 
feab938

import io
import json
import os
import re
import subprocess
import tempfile
import traceback
from collections import Counter
from typing import List, Dict, Any, Tuple

import fitz  # PyMuPDF
import language_tool_python
import streamlit as st
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams
       

# Set JAVA_HOME environment variable
# NOTE(review): presumably required by the Java-based LanguageTool backend
# that language_tool_python starts — confirm the path matches the deployment image.
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'

# Optional: Verify Java installation
try:
    # `java -version` reports on stderr, so stderr is folded into the captured
    # output. Undecodable bytes are replaced rather than raising.
    java_version = subprocess.check_output(
        ['java', '-version'], stderr=subprocess.STDOUT
    ).decode(errors="replace")
except (OSError, subprocess.CalledProcessError):
    # OSError: the `java` executable is missing or not runnable.
    # CalledProcessError: `java` ran but exited with a non-zero status.
    st.error("Java is not installed correctly.")
else:
    st.write(f"Java Version: {java_version}")

# ------------------------------
# Analysis Functions
# ------------------------------

def extract_pdf_text_by_page(file) -> List[str]:
    """Return the plain text of each page of a PDF, in page order, via PyMuPDF.

    The file object is rewound and read in full; the resulting bytes are
    opened as an in-memory PDF, which is closed again before returning.
    """
    file.seek(0)
    pdf_bytes = file.read()
    page_texts = []
    with fitz.open(stream=pdf_bytes, filetype="pdf") as document:
        for page in document:
            page_texts.append(page.get_text("text"))
    return page_texts

def extract_pdf_text(file) -> str:
    """Return the whole document's text as one string, extracted with pdfminer.

    The file object is rewound first; default layout-analysis parameters are used.
    """
    file.seek(0)
    layout_params = LAParams()
    document_text = extract_text(file, laparams=layout_params)
    return document_text

def check_text_presence(full_text: str, search_terms: List[str]) -> Dict[str, bool]:
    """Report, for each search term, whether it occurs in the text.

    Matching is a case-insensitive substring test. The returned dict is keyed
    by the terms exactly as given, in the order given.
    """
    # Lower-case the document once instead of once per search term.
    haystack = full_text.lower()
    return {term: term.lower() in haystack for term in search_terms}

def label_authors(full_text: str) -> str:
    """Prefix the presumed author line with 'Authors: ' if not already labeled.

    The author line is taken to be the first line that follows another line
    and is itself followed by a blank line (typically the line under the
    title). The text is returned unchanged when no such line exists, when the
    line is empty or whitespace-only, or when it already starts with an
    'Author:' / 'Authors:' label.

    Only the matched line is labeled; identical text elsewhere in the
    document is left alone.
    """
    author_line_regex = r"^(?:.*\n)(.*?)(?:\n\n)"
    match = re.search(author_line_regex, full_text, re.MULTILINE)
    if not match:
        return full_text

    raw_line = match.group(1)
    authors = raw_line.strip()
    # An empty capture must be skipped: str.replace("", ...) would insert the
    # label between every character of the document.
    if not authors:
        return full_text
    if re.match(r"Authors?:", authors, re.IGNORECASE):
        return full_text

    # Insert the label in front of the first non-blank character of the
    # matched line rather than replacing every occurrence of the string.
    insert_at = match.start(1) + (len(raw_line) - len(raw_line.lstrip()))
    return f"{full_text[:insert_at]}Authors: {full_text[insert_at:]}"

def check_metadata(full_text: str) -> Dict[str, Any]:
    """Summarize which metadata elements appear in the text.

    Returns a dict with three booleans (an e-mail-like token, an
    'Author(s):' label, a 'Keyword(s):' label) and ``word_count``, which is
    the number of whitespace-separated tokens or the string "Missing" when
    the text contains none.
    """
    def found(pattern: str, flags: int = 0) -> bool:
        return re.search(pattern, full_text, flags) is not None

    total_words = len(full_text.split())

    report: Dict[str, Any] = {}
    report["author_email"] = found(r'\b[\w.-]+?@\w+?\.\w+?\b')
    report["list_of_authors"] = found(r'Authors?:', re.IGNORECASE)
    report["keywords_list"] = found(r'Keywords?:', re.IGNORECASE)
    report["word_count"] = total_words if total_words else "Missing"
    return report

def check_disclosures(full_text: str) -> Dict[str, bool]:
    """Report which of the expected disclosure statements are named in the text.

    Each statement title is looked up through ``check_text_presence``; the
    result maps every title to whether it was found.
    """
    required_statements = (
        "author contributions statement",
        "conflict of interest statement",
        "ethics statement",
        "funding statement",
        "data access statement",
    )
    return check_text_presence(full_text, list(required_statements))

def check_figures_and_tables(full_text: str) -> Dict[str, bool]:
    """Flag whether figure/table mentions are followed by certain keywords.

    Each check is a case-insensitive search for 'Figure N' or 'Table N'
    followed, on the same line, by the word 'citation' or 'legend'.
    """
    keyword_checks = (
        ("figures_with_citations", r'Figure \d+.*?citation'),
        ("figures_legends", r'Figure \d+.*?legend'),
        ("tables_legends", r'Table \d+.*?legend'),
    )
    outcome = {}
    for key, pattern in keyword_checks:
        outcome[key] = re.search(pattern, full_text, re.IGNORECASE) is not None
    return outcome

def check_references(full_text: str) -> Dict[str, Any]:
    """Collect simple reference-related indicators from the text.

    - ``old_references``: a standalone four-digit number 1900-1999 occurs.
    - ``citations_in_abstract``: 'citation' or 'reference' appears as a whole
      word within the first 1000 characters.
    - ``reference_count``: number of square-bracketed spans on a single line.
    - ``self_citations``: the literal phrase 'Self-citation' occurs.
    """
    opening_excerpt = full_text[:1000]
    bracketed_spans = re.findall(r'\[.*?\]', full_text)

    has_old_year = re.search(r'\b19[0-9]{2}\b', full_text) is not None
    cites_early = re.search(r'\b(citation|reference)\b', opening_excerpt, re.IGNORECASE) is not None
    mentions_self_citation = re.search(r'Self-citation', full_text, re.IGNORECASE) is not None

    return {
        "old_references": has_old_year,
        "citations_in_abstract": cites_early,
        "reference_count": len(bracketed_spans),
        "self_citations": mentions_self_citation,
    }

def check_structure(full_text: str) -> Dict[str, bool]:
    """Check for IMRaD section names and a structured-abstract mention.

    ``imrad_structure`` requires all four section names to occur
    (case-sensitive); ``abstract_structure`` is a case-insensitive search for
    the phrase 'structured abstract'.
    """
    has_all_sections = True
    for heading in ("Introduction", "Methods", "Results", "Discussion"):
        if heading not in full_text:
            has_all_sections = False
            break

    mentions_structured_abstract = full_text.lower().find("structured abstract") != -1
    return {
        "imrad_structure": has_all_sections,
        "abstract_structure": mentions_structured_abstract,
    }

def check_language_issues(full_text: str) -> Dict[str, Any]:
    """Run LanguageTool (en-US) over the text and summarize what it reports.

    Returns a dict with:
        issues_count: number of matches reported.
        issues_per_1000: matches per 1000 whitespace-separated words
            (0 when the text has no words).
        failed: True when issues_per_1000 exceeds 20.
        matches: one plain dict per match, built from basic attributes so
            the result can be serialized.
    """
    language_tool = language_tool_python.LanguageTool('en-US')
    try:
        matches = language_tool.check(full_text)
    finally:
        # Shut the LanguageTool backend down explicitly, even when the check
        # raises, instead of relying on garbage collection.
        language_tool.close()

    word_count = len(full_text.split())
    issues_count = len(matches)
    issues_per_1000 = (issues_count / word_count) * 1000 if word_count else 0

    serializable_matches = [
        {
            "message": match.message,
            "replacements": match.replacements,
            "offset": match.offset,
            "errorLength": match.errorLength,
            "category": match.category,
            "ruleIssueType": match.ruleIssueType,
            "sentence": match.sentence
        }
        for match in matches
    ]

    return {
        "issues_count": issues_count,
        "issues_per_1000": issues_per_1000,
        "failed": issues_per_1000 > 20,
        "matches": serializable_matches
    }

def check_language(full_text: str) -> Dict[str, Any]:
    """Assemble the language-quality section of the report.

    Combines a case-insensitive search for 'plain language summary', a fixed
    placeholder for readability, and the output of ``check_language_issues``.
    """
    has_plain_summary = re.search(r'plain language summary', full_text, re.IGNORECASE) is not None
    issue_report = check_language_issues(full_text)
    return {
        "plain_language": has_plain_summary,
        "readability_issues": False,  # Not implemented yet; always reported as False
        "language_issues": issue_report,
    }

def check_figure_order(full_text: str) -> Dict[str, Any]:
    """Summarize which figure numbers are referenced in the text.

    Figure mentions are found case-insensitively as 'Fig', 'Fig.' or 'Figure'
    followed by a number.

    Returns a dict with:
        sequential_order: True when the distinct figure numbers, sorted,
            form a run with no gaps. The order in which figures are
            mentioned is NOT checked, and the run need not start at 1.
        figure_count: number of distinct figure numbers.
        missing_figures: sorted numbers in 1..max that are never mentioned,
            or None when no figure is mentioned at all.
        figure_order: the distinct figure numbers, sorted ascending.
        duplicate_references: numbers (as matched strings) mentioned more
            than once, in order of first mention.
        not_mentioned: numbers (as matched strings) mentioned exactly once,
            sorted numerically. The key name is historical.
    """
    figure_pattern = r'(?:Fig(?:ure)?\.?|Figure)\s*(\d+)'
    figure_references = re.findall(figure_pattern, full_text, re.IGNORECASE)
    figure_numbers = sorted({int(num) for num in figure_references})

    is_sequential = all(a + 1 == b for a, b in zip(figure_numbers, figure_numbers[1:]))

    if figure_numbers:
        cited = set(figure_numbers)
        # Built from a range so the result is always in ascending order.
        missing_figures = [n for n in range(1, figure_numbers[-1] + 1) if n not in cited]
    else:
        missing_figures = None

    reference_counts = Counter(figure_references)
    duplicates = [num for num, count in reference_counts.items() if count > 1]
    # Sorted numerically so the output does not depend on set iteration order
    # of strings, which varies between interpreter runs.
    not_mentioned = sorted(
        (num for num, count in reference_counts.items() if count == 1),
        key=int,
    )

    return {
        "sequential_order": is_sequential,
        "figure_count": len(figure_numbers),
        "missing_figures": missing_figures,
        "figure_order": figure_numbers,
        "duplicate_references": duplicates,
        "not_mentioned": not_mentioned
    }

def check_reference_order(full_text: str) -> Dict[str, Any]:
    """Check whether bracketed numeric citations such as [3] first appear in order.

    A citation counts as out of order when its number is more than one above
    the highest number seen so far; it is recorded as
    (1-based position among all citations, number). Numbers from 1 up to the
    highest cited number that never occur are reported as missing.
    """
    cited_numbers = [int(token) for token in re.findall(r'\[(\d+)\]', full_text)]

    highest_seen = 0
    jumps = []
    for position, number in enumerate(cited_numbers, start=1):
        if number > highest_seen + 1:
            jumps.append((position, number))
        if number > highest_seen:
            highest_seen = number

    expected_numbers = set(range(1, highest_seen + 1))
    never_cited = list(expected_numbers - set(cited_numbers))

    return {
        "max_reference": highest_seen,
        "out_of_order": jumps,
        "missing_references": never_cited,
        "is_ordered": not jumps and not never_cited,
    }

def check_reference_style(full_text: str) -> Dict[str, Any]:
    """Guess the citation style of the reference list and flag deviating entries.

    The reference section is everything after the last line consisting only
    of the heading 'References' (case-insensitive, optional trailing colon).
    If no such heading line exists, the first occurrence of the word
    'References' is used instead. The section is split into entries at line
    starts that look like '[n]', 'n. ' or '(Name, yyyy)'.

    Returns:
        When no section or no entries are found:
            {"style": "Unknown", "reason": <str>, "inconsistent_refs": []}
        Otherwise:
            majority_style: the most common detected style (may be "Unknown").
            inconsistent_refs: (1-based index, entry text, detected style)
                tuples; unrecognized entries come first, followed by entries
                whose recognized style differs from the majority.
            consistency: fraction of entries carrying the majority style.
    """
    # The section must run to the end of the text. Stopping at the first
    # newline followed by a non-space character would cut it off before the
    # first entry and leave nothing to classify.
    heading_lines = list(re.finditer(
        r'^[ \t]*References[ \t]*:?[ \t]*$',
        full_text,
        re.IGNORECASE | re.MULTILINE,
    ))
    if heading_lines:
        section_start = heading_lines[-1].end()
    else:
        inline_mention = re.search(r'References\b', full_text, re.IGNORECASE)
        if not inline_mention:
            return {"style": "Unknown", "reason": "References section not found", "inconsistent_refs": []}
        section_start = inline_mention.end()

    references_text = full_text[section_start:]
    reference_list = re.split(r'\n(?=\[\d+\]|\d+\.\s|\(\w+,\s*\d{4}\))', references_text)
    references = [ref.strip() for ref in reference_list if ref.strip()]

    styles = []
    inconsistent_refs = []
    # Patterns are tried in this order and the first hit wins.
    # NOTE(review): "Harvard" and "APA" share the same pattern, so an entry
    # is never classified as "APA" — confirm whether they should differ.
    patterns = {
        "IEEE": r'^\[\d+\]',
        "Harvard": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
        "APA": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
        "MLA": r'^[A-Z][a-z]+,\s[A-Z][a-z]+\.',
        "Vancouver": r'^\d+\.\s',
        "Chicago": r'^\d+\s[A-Z][a-z]+\s[A-Z]',
    }

    for i, ref in enumerate(references, 1):
        detected = next(
            (style for style, pattern in patterns.items() if re.match(pattern, ref)),
            "Unknown",
        )
        styles.append(detected)
        if detected == "Unknown":
            inconsistent_refs.append((i, ref, "Unknown"))

    if not styles:
        return {"style": "Unknown", "reason": "No references found", "inconsistent_refs": []}

    majority_style, majority_count = Counter(styles).most_common(1)[0]

    # Unknown entries were already recorded above; add only the entries whose
    # recognized style differs from the majority.
    for i, style in enumerate(styles, 1):
        if style != majority_style and style != "Unknown":
            inconsistent_refs.append((i, references[i - 1], style))

    consistency = majority_count / len(styles)

    return {
        "majority_style": majority_style,
        "inconsistent_refs": inconsistent_refs,
        "consistency": consistency
    }

# ------------------------------
# Annotation Functions
# ------------------------------

def highlight_text(page, words, text, annotation):
    """Highlight every occurrence of ``text`` on the page and attach a note.

    Occurrences are located with ``find_text_instances``. Each one receives a
    highlight annotation plus a text annotation, carrying ``annotation``,
    placed at the first two coordinates of the occurrence's rectangle.

    Returns True when at least one occurrence was annotated, else False.
    """
    occurrences = find_text_instances(words, text)
    for rect in occurrences:
        page.add_highlight_annot(rect).update()
        page.add_text_annot(rect[:2], annotation).update()
    return len(occurrences) > 0

def find_text_instances(words, text):
    """Find every run of consecutive page words that spells out ``text``.

    Args:
        words: sequence of word records, as produced by
            ``page.get_text("words")`` in this file; the first four items of
            a record are its rectangle coordinates and item 4 is the word.
        text: phrase to look for. It is split on whitespace and compared
            case-insensitively, word by word.

    Returns:
        A list of ``fitz.Rect`` objects, one per occurrence, each the union
        of the rectangles of the words in that occurrence. The list is empty
        when ``text`` contains no words.
    """
    text_words = text.lower().split()
    # An empty phrase would "match" at every index and then read one record
    # past the end of ``words``.
    if not text_words:
        return []

    span = len(text_words)
    # Lower-case each page word once instead of once per candidate position.
    page_words = [record[4].lower() for record in words]

    instances = []
    for i in range(len(words) - span + 1):
        if page_words[i:i + span] == text_words:
            inst = fitz.Rect(words[i][:4])
            for j in range(1, span):
                inst = inst | fitz.Rect(words[i + j][:4])
            instances.append(inst)
    return instances

def highlight_issues_in_pdf(file, inconsistent_refs: List[Tuple[int, str, str]], language_matches: List[Dict[str, Any]]) -> bytes:
    """Highlight inconsistent references and add notes for language issues in a single PDF.

    Args:
        file: seekable binary file object holding the PDF; it is rewound and
            read in full.
        inconsistent_refs: (reference number, reference text, detected style)
            tuples. Every occurrence of the text is highlighted on every page.
        language_matches: dicts with at least 'sentence', 'message' and
            'replacements'. Each distinct (sentence, note) pair is annotated
            only on the first page where the sentence is found.

    Returns:
        The annotated PDF as bytes, or b"" when annotation fails (the error
        and traceback are printed).
    """
    try:
        file.seek(0)

        # The notes do not depend on the page, so build them once up front.
        # The flagged style is the odd one out; the majority style is not
        # passed to this function, so the note refers to it generically.
        reference_notes = [
            (
                ref_text,
                f"Reference {ref_num}: Inconsistent style ({ref_style}). "
                "Should be consolidated to the majority reference style.",
            )
            for ref_num, ref_text, ref_style in (inconsistent_refs or [])
        ]
        language_notes = [
            (
                match['sentence'],
                f"{match['message']}\nSuggested correction: {match['replacements'][0] if match['replacements'] else 'No suggestion'}",
            )
            for match in (language_matches or [])
        ]

        # The context manager closes the document even if annotating raises.
        with fitz.open(stream=file.read(), filetype="pdf") as doc:
            added_notes = set()

            for page in doc:
                words = page.get_text("words")

                for ref_text, annotation_text in reference_notes:
                    highlight_text(page, words, ref_text, annotation_text)

                for issue_key in language_notes:
                    if issue_key in added_notes:
                        continue
                    issue_text, error_message = issue_key
                    if highlight_text(page, words, issue_text, error_message):
                        added_notes.add(issue_key)

            return doc.write()

    except Exception as e:
        print(f"An error occurred while annotating the PDF: {str(e)}")
        traceback.print_exc()
        return b""

# ------------------------------
# Main Analysis Function
# ------------------------------

def analyze_pdf(file) -> Tuple[Dict[str, Any], bytes]:
    """
    Analyze the uploaded PDF and return analysis results and annotated PDF bytes.

    The document text is extracted with pdfminer, passed through
    ``label_authors``, and then handed to every ``check_*`` function.

    Returns:
        Tuple containing:
            - Analysis results as a dictionary. If any step raises, this is
              instead a dictionary with "error" and "traceback" keys.
            - Annotated PDF as bytes; None when there was nothing to annotate
              or the analysis failed; b"" when annotation itself failed.
    """
    try:
        # The 'file' is a BytesIO object provided by Streamlit
        file.seek(0)
        # Only the full-document text is needed; the per-page PyMuPDF
        # extraction that used to run here was never used.
        full_text = extract_pdf_text(file)
        full_text = label_authors(full_text)

        # Perform analyses
        reference_style = check_reference_style(full_text)
        language = check_language(full_text)

        # Compile results
        results = {
            "metadata": check_metadata(full_text),
            "disclosures": check_disclosures(full_text),
            "figures_and_tables": check_figures_and_tables(full_text),
            "figure_order": check_figure_order(full_text),
            "references": check_references(full_text),
            "reference_order": check_reference_order(full_text),
            "reference_style": reference_style,
            "structure": check_structure(full_text),
            "language": language
        }

        # Handle annotations
        inconsistent_refs = reference_style.get("inconsistent_refs", [])
        language_matches = language.get("language_issues", {}).get("matches", [])

        if inconsistent_refs or language_matches:
            annotated_pdf_bytes = highlight_issues_in_pdf(file, inconsistent_refs, language_matches)
        else:
            annotated_pdf_bytes = None

        return results, annotated_pdf_bytes

    except Exception as e:
        # Deliberate catch-all at the UI boundary: the failure is returned to
        # the caller for display instead of crashing the app.
        error_message = {
            "error": str(e),
            "traceback": traceback.format_exc()
        }
        return error_message, None

# ------------------------------
# Streamlit Interface
# ------------------------------

def main():
    """Render the Streamlit page: upload a PDF, analyze it, and show the outcome.

    The analysis results are shown as JSON. An annotated PDF is offered for
    download when one was produced; otherwise the page distinguishes between
    "nothing to annotate", "annotation failed" and "analysis failed".
    """
    st.title("PDF Analyzer")
    st.write("Upload a PDF document to analyze its structure, references, language, and more.")

    uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
    if uploaded_file is None:
        return

    with st.spinner("Analyzing PDF..."):
        results, annotated_pdf = analyze_pdf(uploaded_file)

    # analyze_pdf reports a failure as a dict carrying an "error" key (and no
    # PDF); that must not be presented as a clean result.
    if annotated_pdf is None and "error" in results:
        st.error(f"Analysis failed: {results['error']}")
        st.json(results)
        return

    st.subheader("Analysis Results")
    st.json(results)

    if annotated_pdf:
        st.subheader("Download Annotated PDF")
        st.download_button(
            label="Download Annotated PDF",
            data=annotated_pdf,
            file_name="annotated.pdf",
            mime="application/pdf"
        )
    elif annotated_pdf is None:
        st.success("No issues found. No annotated PDF to download.")
    else:
        # Empty bytes: issues were found but the annotation step failed.
        st.warning("Issues were found, but the annotated PDF could not be generated.")

if __name__ == "__main__":
    main()