# Spaces: samyak152002/texmetrics-regex-checks-gradio-1 (status: Running; file size: 15,790 Bytes)

import gradio as gr
import PyPDF2
import re
import fitz
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams
import language_tool_python
from tqdm import tqdm
from typing import List, Dict, Any, Tuple
from collections import Counter
import json
import sys
import traceback
import io
import os
import tempfile

class PDFAnalyzer:
    """Run heuristic manuscript checks on a PDF and annotate problems in a copy.

    The text is extracted twice on construction: page by page with PyMuPDF
    (``pages_text``) and as a single string with pdfminer (``full_text``).
    All ``check_*`` methods are regex/substring heuristics over ``full_text``.
    """

    def __init__(self, file_path: str):
        """Extract the text of ``file_path`` and start a LanguageTool instance.

        Args:
            file_path: Path of the PDF to analyze.
        """
        self.file_path = file_path
        self.pages_text = self.extract_pdf_text_by_page()
        self.full_text = self.extract_pdf_text()
        self.language_tool = language_tool_python.LanguageTool('en-US')

    def extract_pdf_text_by_page(self) -> List[str]:
        """Extracts text from a PDF file, page by page, using PyMuPDF."""
        with fitz.open(self.file_path) as doc:
            return [page.get_text("text") for page in doc]

    def extract_pdf_text(self) -> str:
        """Extracts text from a PDF file using pdfminer."""
        return extract_text(self.file_path, laparams=LAParams())

    def check_text_presence(self, search_terms: List[str]) -> Dict[str, bool]:
        """Report, per term, whether it occurs in the text (case-insensitive).

        Args:
            search_terms: Phrases to look for.

        Returns:
            Mapping of each original term to ``True`` if it is a substring of
            the lower-cased document text.
        """
        # Lower-case the document once instead of once per term.
        haystack = self.full_text.lower()
        return {term: term.lower() in haystack for term in search_terms}

    def label_authors(self) -> str:
        """Return the text with 'Authors: ' prefixed to the presumed author line.

        The author line is taken to be the first line that follows another
        line and is itself followed by a blank line. The text is returned
        unchanged when no such line exists, when that line is empty, or when
        it already starts with an ``Author:``/``Authors:`` label.

        Returns:
            The (possibly) labelled document text; ``self.full_text`` is not
            modified by this method.
        """
        author_line_regex = r"^(?:.*\n)(.*?)(?:\n\n)"
        match = re.search(author_line_regex, self.full_text, re.MULTILINE)
        if not match:
            return self.full_text

        raw_line = match.group(1)
        authors = raw_line.strip()
        # An empty capture must be skipped: str.replace("", x) would insert x
        # between every character of the document.
        if not authors or re.match(r'authors?\s*:', authors, re.IGNORECASE):
            return self.full_text

        # Insert the label at the matched position only, so identical text
        # appearing later in the document is left alone.
        insert_at = match.start(1) + raw_line.index(authors)
        return f"{self.full_text[:insert_at]}Authors: {self.full_text[insert_at:]}"

    def check_metadata(self) -> Dict[str, Any]:
        """Check for metadata elements.

        Returns:
            Flags for an e-mail-like token, an ``Author(s):`` label and a
            ``Keyword(s):`` label, plus the whitespace-separated word count
            (the string ``"Missing"`` when the count is zero).
        """
        return {
            "author_email": bool(re.search(r'\b[\w.-]+?@\w+?\.\w+?\b', self.full_text)),
            "list_of_authors": bool(re.search(r'Authors?:', self.full_text, re.IGNORECASE)),
            "keywords_list": bool(re.search(r'Keywords?:', self.full_text, re.IGNORECASE)),
            "word_count": len(self.full_text.split()) or "Missing"
        }

    def check_disclosures(self) -> Dict[str, bool]:
        """Check for disclosure statements (case-insensitive phrase lookup)."""
        search_terms = [
            "author contributions statement",
            "conflict of interest statement",
            "ethics statement",
            "funding statement",
            "data access statement"
        ]
        return self.check_text_presence(search_terms)

    def check_figures_and_tables(self) -> Dict[str, bool]:
        """Check for figures and tables.

        Each flag is true when ``Figure N``/``Table N`` is followed on the
        same line by the word ``citation`` or ``legend`` respectively.
        """
        return {
            "figures_with_citations": bool(re.search(r'Figure \d+.*?citation', self.full_text, re.IGNORECASE)),
            "figures_legends": bool(re.search(r'Figure \d+.*?legend', self.full_text, re.IGNORECASE)),
            "tables_legends": bool(re.search(r'Table \d+.*?legend', self.full_text, re.IGNORECASE))
        }

    def check_references(self) -> Dict[str, Any]:
        """Check for references.

        Returns:
            ``old_references``: a four-digit 19xx number occurs anywhere;
            ``citations_in_abstract``: "citation"/"reference" occurs in the
            first 1000 characters; ``reference_count``: number of bracketed
            spans ``[...]``; ``self_citations``: the phrase "Self-citation"
            occurs.
        """
        return {
            "old_references": bool(re.search(r'\b19[0-9]{2}\b', self.full_text)),
            "citations_in_abstract": bool(re.search(r'\b(citation|reference)\b', self.full_text[:1000], re.IGNORECASE)),
            "reference_count": len(re.findall(r'\[.*?\]', self.full_text)),
            "self_citations": bool(re.search(r'Self-citation', self.full_text, re.IGNORECASE))
        }

    def check_structure(self) -> Dict[str, bool]:
        """Check document structure.

        ``imrad_structure`` requires the four section names to occur with
        exactly this capitalisation; ``abstract_structure`` looks for the
        phrase "structured abstract" case-insensitively.
        """
        return {
            "imrad_structure": all(section in self.full_text for section in ["Introduction", "Methods", "Results", "Discussion"]),
            "abstract_structure": "structured abstract" in self.full_text.lower()
        }

    def check_language_issues(self) -> Dict[str, Any]:
        """Check for issues with capitalization, hyphenation, punctuation, spacing, etc.

        Returns:
            The number of LanguageTool matches, the rate per 1000 words, a
            ``failed`` flag (rate above 20) and a JSON-serializable copy of
            every match.
        """
        matches = self.language_tool.check(self.full_text)
        word_count = len(self.full_text.split())
        issues_count = len(matches)
        # A PDF without extractable text has zero words; avoid dividing by zero.
        issues_per_1000 = (issues_count / word_count) * 1000 if word_count else 0.0

        serializable_matches = [
            {
                "message": match.message,
                "replacements": match.replacements,
                "offset": match.offset,
                "errorLength": match.errorLength,
                "category": match.category,
                "ruleIssueType": match.ruleIssueType,
                "sentence": match.sentence
            }
            for match in matches
        ]

        return {
            "issues_count": issues_count,
            "issues_per_1000": issues_per_1000,
            "failed": issues_per_1000 > 20,
            "matches": serializable_matches
        }

    def check_language(self) -> Dict[str, Any]:
        """Check language quality."""
        return {
            "plain_language": bool(re.search(r'plain language summary', self.full_text, re.IGNORECASE)),
            "readability_issues": False,  # Placeholder for future implementation
            "language_issues": self.check_language_issues()
        }

    def check_figure_order(self) -> Dict[str, Any]:
        """Summarize which figure numbers the text mentions.

        Returns:
            ``sequential_order``: the distinct figure numbers, once sorted,
            have no gaps between neighbours (position of mention is not
            considered); ``figure_count``: number of distinct figures;
            ``missing_figures``: numbers in ``1..max`` never mentioned
            (``None`` when no figure is mentioned at all); ``figure_order``:
            the sorted distinct numbers; ``duplicate_references``: number
            strings mentioned more than once; ``not_mentioned``: number
            strings mentioned exactly once.
        """
        figure_pattern = r'(?:Fig(?:ure)?\.?|Figure)\s*(\d+)'
        figure_references = re.findall(figure_pattern, self.full_text, re.IGNORECASE)
        figure_numbers = sorted(set(int(num) for num in figure_references))

        is_sequential = all(a + 1 == b for a, b in zip(figure_numbers, figure_numbers[1:]))

        if figure_numbers:
            expected_figures = set(range(1, max(figure_numbers) + 1))
            # Sorted so the report is deterministic.
            missing_figures = sorted(expected_figures - set(figure_numbers))
        else:
            missing_figures = None

        # Counter keeps first-seen order, so both lists below are stable.
        mention_counts = Counter(figure_references)
        duplicates = [num for num, count in mention_counts.items() if count > 1]
        mentioned_once = [num for num, count in mention_counts.items() if count == 1]

        return {
            "sequential_order": is_sequential,
            "figure_count": len(figure_numbers),
            "missing_figures": missing_figures,
            "figure_order": figure_numbers,
            "duplicate_references": duplicates,
            "not_mentioned": mentioned_once
        }

    def check_reference_order(self) -> Dict[str, Any]:
        """Check if numeric ``[n]`` citations are introduced in ascending order.

        A citation is out of order when its number exceeds the highest number
        seen so far by more than one.

        Returns:
            ``max_reference``, ``out_of_order`` as ``(1-based position, number)``
            pairs, ``missing_references`` (numbers in ``1..max`` never cited)
            and ``is_ordered`` (nothing out of order and nothing missing).
        """
        reference_pattern = r'\[(\d+)\]'
        references = re.findall(reference_pattern, self.full_text)
        ref_numbers = [int(ref) for ref in references]

        max_ref = 0
        out_of_order = []
        for position, ref in enumerate(ref_numbers, start=1):
            if ref > max_ref + 1:
                out_of_order.append((position, ref))
            max_ref = max(max_ref, ref)

        missing_refs = sorted(set(range(1, max_ref + 1)) - set(ref_numbers))

        return {
            "max_reference": max_ref,
            "out_of_order": out_of_order,
            "missing_references": missing_refs,
            "is_ordered": not out_of_order and not missing_refs
        }

    def check_reference_style(self) -> Dict[str, Any]:
        """Check the reference style used in the paper and identify inconsistencies.

        The reference section is everything after the last line consisting
        only of a (optionally numbered) "References" heading; if no such line
        exists, everything after the last occurrence of the word is used.

        Returns:
            On success ``majority_style``, ``inconsistent_refs`` as
            ``(1-based index, reference text, detected style)`` tuples and
            ``consistency`` (share of entries in the majority style). When
            the section or its entries cannot be found, a dict with
            ``style`` = "Unknown", a ``reason`` and empty ``inconsistent_refs``.
        """
        # The section runs to the end of the text. Stopping at the first
        # newline followed by a non-space character would cut it off before
        # the very first entry.
        headings = list(re.finditer(
            r'^[ \t]*(?:\d+\.?[ \t]*)?References[ \t]*:?[ \t]*$',
            self.full_text,
            re.IGNORECASE | re.MULTILINE,
        ))
        if not headings:
            headings = list(re.finditer(r'References\b', self.full_text, re.IGNORECASE))
        if not headings:
            return {"style": "Unknown", "reason": "References section not found", "inconsistent_refs": []}

        references_text = self.full_text[headings[-1].end():]
        # Entries start on a new line with "[n]", "n. " or "(Name, 2020)".
        reference_list = re.split(r'\n(?=\[\d+\]|\d+\.\s|\(\w+,\s*\d{4}\))', references_text)
        references = [ref.strip() for ref in reference_list if ref.strip()]

        styles = []
        inconsistent_refs = []
        # First matching pattern wins. NOTE: "Harvard" and "APA" share the
        # same pattern, so an entry is never classified as "APA".
        patterns = {
            "IEEE": r'^\[\d+\]',
            "Harvard": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
            "APA": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
            "MLA": r'^[A-Z][a-z]+,\s[A-Z][a-z]+\.',
            "Vancouver": r'^\d+\.\s',
            "Chicago": r'^\d+\s[A-Z][a-z]+\s[A-Z]',
        }

        for i, ref in enumerate(references, 1):
            detected = next(
                (style for style, pattern in patterns.items() if re.match(pattern, ref)),
                "Unknown",
            )
            styles.append(detected)
            if detected == "Unknown":
                inconsistent_refs.append((i, ref, "Unknown"))

        if not styles:
            return {"style": "Unknown", "reason": "No references found", "inconsistent_refs": []}

        majority_style, majority_count = Counter(styles).most_common(1)[0]

        # Recognized styles that differ from the majority are inconsistent too.
        for i, style in enumerate(styles, 1):
            if style != majority_style and style != "Unknown":
                inconsistent_refs.append((i, references[i-1], style))

        consistency = majority_count / len(styles)

        return {
            "majority_style": majority_style,
            "inconsistent_refs": inconsistent_refs,
            "consistency": consistency
        }

    def highlight_issues_in_pdf(self, inconsistent_refs: List[Tuple[int, str, str]], language_matches: List[Dict[str, Any]]) -> str:
        """Highlight inconsistent references and add notes for language issues in a single PDF.

        Args:
            inconsistent_refs: ``(index, reference text, style)`` tuples as
                produced by ``check_reference_style``.
            language_matches: Match dicts as produced by
                ``check_language_issues`` (``sentence``, ``message`` and
                ``replacements`` are read).

        Returns:
            Path of the annotated copy, written next to the input file with
            the suffix ``_annotated_combined.pdf``; an empty string if
            anything goes wrong (the error is printed to stderr).
        """
        try:
            # Loop-invariant: determine the target style once, not per page
            # and per reference.
            majority_style = 'Unknown'
            if inconsistent_refs:
                majority_style = self.check_reference_style().get('majority_style', 'Unknown')

            # Swap only the real extension so the output can never collide
            # with the input path.
            root, _ = os.path.splitext(self.file_path)
            annotated_file_path = f"{root}_annotated_combined.pdf"

            # Each language issue is annotated on the first page where its
            # sentence is found and skipped on later pages.
            added_notes = set()

            # The context manager closes the document even if annotating fails.
            with fitz.open(self.file_path) as doc:
                for page in doc:
                    words = page.get_text("words")

                    for ref_num, ref_text, ref_style in inconsistent_refs or []:
                        self.highlight_text(page, words, ref_text, f"Reference {ref_num}: Inconsistent style ({ref_style}). Should be {majority_style}.")

                    for match in language_matches or []:
                        issue_text = match['sentence']
                        error_message = f"{match['message']}\nSuggested correction: {match['replacements'][0] if match['replacements'] else 'No suggestion'}"
                        issue_key = (issue_text, error_message)

                        if issue_key not in added_notes:
                            if self.highlight_text(page, words, issue_text, error_message):
                                added_notes.add(issue_key)

                doc.save(annotated_file_path)

            if os.path.exists(annotated_file_path):
                return annotated_file_path
            print(f"Error: Annotated PDF was not saved at {annotated_file_path}", file=sys.stderr)
            return ""

        except Exception as e:
            # Annotation is best-effort: report the problem and let the
            # caller continue with the textual results.
            print(f"An error occurred while annotating the PDF: {str(e)}", file=sys.stderr)
            traceback.print_exc()
            return ""

    def highlight_text(self, page, words, text, annotation) -> bool:
        """Highlight text and add annotation.

        Args:
            page: PyMuPDF page to annotate.
            words: Word tuples of that page (``page.get_text("words")``).
            text: Phrase to locate.
            annotation: Note text attached to every hit.

        Returns:
            ``True`` if at least one occurrence was highlighted.
        """
        highlighted = False
        for inst in self.find_text_instances(words, text):
            highlight = page.add_highlight_annot(inst)
            highlight.update()
            # The note is anchored at the top-left corner of the highlight.
            comment = page.add_text_annot(inst[:2], annotation)
            comment.update()
            highlighted = True
        return highlighted

    def find_text_instances(self, words, text) -> list:
        """Find all instances of text in words.

        Args:
            words: Sequence of word tuples whose first four items are the
                bounding box and whose item 4 is the word string.
            text: Phrase to search for; compared word by word,
                case-insensitively.

        Returns:
            One rectangle per occurrence, covering all words of the phrase.
            Empty when ``text`` contains no words.
        """
        text_words = text.lower().split()
        # An empty phrase would "match" at every offset, including one past
        # the last word, and raise IndexError.
        if not text_words:
            return []

        instances = []
        for i in range(len(words) - len(text_words) + 1):
            if all(words[i+j][4].lower() == target for j, target in enumerate(text_words)):
                inst = fitz.Rect(words[i][:4])
                for j in range(1, len(text_words)):
                    # Union of the word boxes spans the whole phrase.
                    inst = inst | fitz.Rect(words[i+j][:4])
                instances.append(inst)
        return instances

    def analyze(self) -> Dict[str, Any]:
        """Perform full analysis of the PDF.

        Labels the author line in ``self.full_text``, runs every check and,
        if there are inconsistent references or language issues, writes an
        annotated copy of the PDF.

        Returns:
            One entry per check plus ``annotated_pdf_path`` (empty string
            when no annotated copy was produced).
        """
        self.full_text = self.label_authors()

        results = {
            "metadata": self.check_metadata(),
            "disclosures": self.check_disclosures(),
            "figures_and_tables": self.check_figures_and_tables(),
            "figure_order": self.check_figure_order(),
            "references": self.check_references(),
            "reference_order": self.check_reference_order(),
            "reference_style": self.check_reference_style(),
            "structure": self.check_structure(),
            "language": self.check_language(),
            "annotated_pdf_path": ""
        }

        inconsistent_refs = results.get("reference_style", {}).get("inconsistent_refs", [])
        language_matches = results.get("language", {}).get("language_issues", {}).get("matches", [])

        if inconsistent_refs or language_matches:
            annotated_path = self.highlight_issues_in_pdf(inconsistent_refs, language_matches)
            results["annotated_pdf_path"] = annotated_path

        return results

def analyze_pdf(file):
    """Gradio callback: analyze an uploaded PDF and return the report.

    Args:
        file: The upload as handed over by ``gr.File``. Accepted forms are a
            filesystem path, an object whose ``name`` attribute is the path
            of an existing file, or a binary file-like object with ``read()``.
            NOTE(review): which form Gradio passes depends on the Gradio
            version and the component's ``type`` setting — confirm against
            the deployed version.

    Returns:
        A ``(json_text, annotated_pdf_path)`` tuple. ``json_text`` is the
        analysis result (or an ``error``/``traceback`` object on failure)
        serialized as JSON; ``annotated_pdf_path`` is the path of the
        annotated PDF, or ``None`` if none was produced.
    """
    import shutil  # local import: needed only by this callback

    try:
        if file is None:
            raise ValueError("No file was uploaded")

        # Create a temporary directory to store files
        with tempfile.TemporaryDirectory() as temp_dir:
            # Save the uploaded file temporarily
            temp_path = os.path.join(temp_dir, "uploaded.pdf")

            if isinstance(file, (str, os.PathLike)):
                source_path = file
            else:
                source_path = getattr(file, "name", None)

            if isinstance(source_path, (str, os.PathLike)) and os.path.isfile(source_path):
                # Prefer the on-disk file: a wrapper object's own stream may
                # be empty or already consumed.
                shutil.copyfile(source_path, temp_path)
            elif hasattr(file, "read"):
                with open(temp_path, "wb") as f:
                    f.write(file.read())
            else:
                raise ValueError(f"Unsupported upload object: {type(file).__name__}")

            analyzer = PDFAnalyzer(temp_path)
            results = analyzer.analyze()

            # Ensure all keys are present in the results, even if they're empty
            default_results = {
                "annotated_pdf_path": "",
                "metadata": {},
                "disclosures": {},
                "figures_and_tables": {},
                "figure_order": {},
                "references": {},
                "reference_order": {},
                "reference_style": {},
                "structure": {},
                "language": {},
            }

            # Update default_results with actual results
            default_results.update(results)

            # Handle the annotated PDF. The file output needs a path that is
            # still valid after the temporary directory is removed, so the
            # annotated copy is moved out of it first.
            annotated_pdf_path = results.get("annotated_pdf_path", "")
            annotated_output = None
            if annotated_pdf_path and os.path.exists(annotated_pdf_path):
                fd, annotated_output = tempfile.mkstemp(prefix="annotated_", suffix=".pdf")
                os.close(fd)
                shutil.copyfile(annotated_pdf_path, annotated_output)

            # Remove the annotated_pdf_path from the results as we're returning the file separately
            default_results.pop("annotated_pdf_path", None)

            # default=str: anything json cannot encode natively is stringified.
            return json.dumps(default_results, indent=2, default=str), annotated_output

    except Exception as e:
        # Top-level UI boundary: surface the failure in the JSON panel
        # instead of crashing the app.
        error_message = {
            "error": str(e),
            "traceback": traceback.format_exc()
        }
        return json.dumps(error_message, indent=2), None

# Gradio UI: one PDF upload in; a JSON report and the annotated PDF out.
pdf_upload = gr.File(label="Upload PDF")
report_components = [
    gr.JSON(label="Analysis Results"),
    gr.File(label="Annotated PDF"),
]

iface = gr.Interface(
    fn=analyze_pdf,
    inputs=pdf_upload,
    outputs=report_components,
    title="PDF Analyzer",
    description="Upload a PDF document to analyze its structure, references, language, and more.",
)

# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()