Spaces:

samyak152002
/

texmetrics-regex-checks-gradio-1

Sleeping

App Files Files Community

samyak152002 committed on Nov 2, 2024

Commit

0e6dbe2

verified ·

1 Parent(s): 3700c3a

Update app.py

Browse files

Files changed (1) hide show

app.py +401 -346

app.py CHANGED Viewed

@@ -1,380 +1,435 @@
-import gradio as gr
-import PyPDF2
 import re
-import fitz
 from pdfminer.high_level import extract_text
 from pdfminer.layout import LAParams
 import language_tool_python
-from tqdm import tqdm
 from typing import List, Dict, Any, Tuple
 from collections import Counter
 import json
-import sys
 import traceback
 import io
-import os
 import tempfile
-class PDFAnalyzer:
-    def __init__(self, file_path: str):
-        self.file_path = file_path
-        self.pages_text = self.extract_pdf_text_by_page()
-        self.full_text = self.extract_pdf_text()
-        self.language_tool = language_tool_python.LanguageTool('en-US')
-    def extract_pdf_text_by_page(self) -> List[str]:
-        """Extracts text from a PDF file, page by page, using PyMuPDF."""
-        with fitz.open(self.file_path) as doc:
             return [page.get_text("text") for page in doc]
-    def extract_pdf_text(self) -> str:
-        """Extracts text from a PDF file using pdfminer."""
-        return extract_text(self.file_path, laparams=LAParams())
-    def check_text_presence(self, search_terms: List[str]) -> Dict[str, bool]:
-        """Checks for the presence of required terms in the text."""
-        return {term: term.lower() in self.full_text.lower() for term in search_terms}
-    def label_authors(self) -> str:
-        """Label authors in the text with 'Authors:' if not already labeled."""
-        author_line_regex = r"^(?:.*\n)(.*?)(?:\n\n)"
-        match = re.search(author_line_regex, self.full_text, re.MULTILINE)
-        if match:
-            authors = match.group(1).strip()
-            return self.full_text.replace(authors, f"Authors: {authors}")
-        return self.full_text
-    def check_metadata(self) -> Dict[str, Any]:
-        """Check for metadata elements."""
-        return {
-            "author_email": bool(re.search(r'\b[\w.-]+?@\w+?\.\w+?\b', self.full_text)),
-            "list_of_authors": bool(re.search(r'Authors?:', self.full_text, re.IGNORECASE)),
-            "keywords_list": bool(re.search(r'Keywords?:', self.full_text, re.IGNORECASE)),
-            "word_count": len(self.full_text.split()) or "Missing"
         }
-    def check_disclosures(self) -> Dict[str, bool]:
-        """Check for disclosure statements."""
-        search_terms = [
-            "author contributions statement",
-            "conflict of interest statement",
-            "ethics statement",
-            "funding statement",
-            "data access statement"
-        ]
-        return self.check_text_presence(search_terms)
-    def check_figures_and_tables(self) -> Dict[str, bool]:
-        """Check for figures and tables."""
-        return {
-            "figures_with_citations": bool(re.search(r'Figure \d+.*?citation', self.full_text, re.IGNORECASE)),
-            "figures_legends": bool(re.search(r'Figure \d+.*?legend', self.full_text, re.IGNORECASE)),
-            "tables_legends": bool(re.search(r'Table \d+.*?legend', self.full_text, re.IGNORECASE))
-        }
-    def check_references(self) -> Dict[str, Any]:
-        """Check for references."""
-        return {
-            "old_references": bool(re.search(r'\b19[0-9]{2}\b', self.full_text)),
-            "citations_in_abstract": bool(re.search(r'\b(citation|reference)\b', self.full_text[:1000], re.IGNORECASE)),
-            "reference_count": len(re.findall(r'\[.*?\]', self.full_text)),
-            "self_citations": bool(re.search(r'Self-citation', self.full_text, re.IGNORECASE))
         }
-    def check_structure(self) -> Dict[str, bool]:
-        """Check document structure."""
-        return {
-            "imrad_structure": all(section in self.full_text for section in ["Introduction", "Methods", "Results", "Discussion"]),
-            "abstract_structure": "structured abstract" in self.full_text.lower()
         }
-    def check_language_issues(self) -> Dict[str, Any]:
-        """Check for issues with capitalization, hyphenation, punctuation, spacing, etc."""
-        matches = self.language_tool.check(self.full_text)
-        word_count = len(self.full_text.split())
-        issues_count = len(matches)
-        issues_per_1000 = (issues_count / word_count) * 1000
-        serializable_matches = [
-            {
-                "message": match.message,
-                "replacements": match.replacements,
-                "offset": match.offset,
-                "errorLength": match.errorLength,
-                "category": match.category,
-                "ruleIssueType": match.ruleIssueType,
-                "sentence": match.sentence
-            }
-            for match in matches
-        ]
-        return {
-            "issues_count": issues_count,
-            "issues_per_1000": issues_per_1000,
-            "failed": issues_per_1000 > 20,
-            "matches": serializable_matches
-        }
-    def check_language(self) -> Dict[str, Any]:
-        """Check language quality."""
-        return {
-            "plain_language": bool(re.search(r'plain language summary', self.full_text, re.IGNORECASE)),
-            "readability_issues": False,  # Placeholder for future implementation
-            "language_issues": self.check_language_issues()
-        }
-    def check_figure_order(self) -> Dict[str, Any]:
-        """Check if figures are referred to in sequential order."""
-        figure_pattern = r'(?:Fig(?:ure)?\.?|Figure)\s*(\d+)'
-        figure_references = re.findall(figure_pattern, self.full_text, re.IGNORECASE)
-        figure_numbers = sorted(set(int(num) for num in figure_references))
-        is_sequential = all(a + 1 == b for a, b in zip(figure_numbers, figure_numbers[1:]))
-        if figure_numbers:
-            expected_figures = set(range(1, max(figure_numbers) + 1))
-            missing_figures = list(expected_figures - set(figure_numbers))
-        else:
-            missing_figures = None
-        duplicates = [num for num, count in Counter(figure_references).items() if count > 1]
-        duplicate_numbers = [int(num) for num in duplicates]
-        notMentioned = list(set(figure_references) - set(duplicates))
-        return {
-            "sequential_order": is_sequential,
-            "figure_count": len(figure_numbers),
-            "missing_figures": missing_figures,
-            "figure_order": figure_numbers,
-            "duplicate_references": duplicates,
-            "not_mentioned": notMentioned
-        }
-    def check_reference_order(self) -> Dict[str, Any]:
-        """Check if references in the main body text are in order."""
-        reference_pattern = r'\[(\d+)\]'
-        references = re.findall(reference_pattern, self.full_text)
-        ref_numbers = [int(ref) for ref in references]
-        max_ref = 0
-        out_of_order = []
-        for i, ref in enumerate(ref_numbers):
-            if ref > max_ref + 1:
-                out_of_order.append((i+1, ref))
-            max_ref = max(max_ref, ref)
-        all_refs = set(range(1, max_ref + 1))
-        used_refs = set(ref_numbers)
-        missing_refs = list(all_refs - used_refs)
-        return {
-            "max_reference": max_ref,
-            "out_of_order": out_of_order,
-            "missing_references": missing_refs,
-            "is_ordered": len(out_of_order) == 0 and len(missing_refs) == 0
-        }
-    def check_reference_style(self) -> Dict[str, Any]:
-        """Check the reference style used in the paper and identify inconsistencies."""
-        reference_section_match = re.search(r'References\b([\s\S]*?)(?:\n\S|\Z)', self.full_text, re.IGNORECASE)
-        if not reference_section_match:
-            return {"style": "Unknown", "reason": "References section not found", "inconsistent_refs": []}
-        references_text = reference_section_match.group(1)
-        reference_list = re.split(r'\n(?=\[\d+\]|\d+\.\s|\(\w+,\s*\d{4}\))', references_text)
-        references = [ref.strip() for ref in reference_list if ref.strip()]
-        styles = []
-        inconsistent_refs = []
-        patterns = {
-            "IEEE": r'^\[\d+\]',
-            "Harvard": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
-            "APA": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
-            "MLA": r'^[A-Z][a-z]+,\s[A-Z][a-z]+\.',
-            "Vancouver": r'^\d+\.\s',
-            "Chicago": r'^\d+\s[A-Z][a-z]+\s[A-Z]',
-        }
-        for i, ref in enumerate(references, 1):
-            matched = False
-            for style, pattern in patterns.items():
-                if re.match(pattern, ref):
-                    styles.append(style)
-                    matched = True
-                    break
-            if not matched:
-                styles.append("Unknown")
-                inconsistent_refs.append((i, ref, "Unknown"))
-        if not styles:
-            return {"style": "Unknown", "reason": "No references found", "inconsistent_refs": []}
-        style_counts = Counter(styles)
-        majority_style, majority_count = style_counts.most_common(1)[0]
-        for i, style in enumerate(styles, 1):
-            if style != majority_style and style != "Unknown":
-                inconsistent_refs.append((i, references[i-1], style))
-        consistency = majority_count / len(styles)
-        return {
-            "majority_style": majority_style,
-            "inconsistent_refs": inconsistent_refs,
-            "consistency": consistency
-        }
-    def highlight_issues_in_pdf(self, inconsistent_refs: List[Tuple[int, str, str]], language_matches: List[Dict[str, Any]]) -> str:
-        """Highlight inconsistent references and add notes for language issues in a single PDF."""
-        try:
-            doc = fitz.open(self.file_path)
-            added_notes = set()
-            for page_number, page in enumerate(doc, start=1):
-                words = page.get_text("words")
-                if inconsistent_refs:
-                    for ref_num, ref_text, ref_style in inconsistent_refs:
-                        self.highlight_text(page, words, ref_text, f"Reference {ref_num}: Inconsistent style ({ref_style}). Should be {self.check_reference_style().get('majority_style', 'Unknown')}.")
-                if language_matches:
-                    for match in language_matches:
-                        issue_text = match['sentence']
-                        error_message = f"{match['message']}\nSuggested correction: {match['replacements'][0] if match['replacements'] else 'No suggestion'}"
-                        issue_key = (issue_text, error_message)
-                        if issue_key not in added_notes:
-                            if self.highlight_text(page, words, issue_text, error_message):
-                                added_notes.add(issue_key)
-            annotated_file_path = self.file_path.replace(".pdf", "_annotated_combined.pdf")
-            doc.save(annotated_file_path)
-            doc.close()
-            if os.path.exists(annotated_file_path):
-                return annotated_file_path
-            else:
-                print(f"Error: Annotated PDF was not saved at {annotated_file_path}")
-                return ""
-        except Exception as e:
-            print(f"An error occurred while annotating the PDF: {str(e)}", file=sys.stderr)
-            traceback.print_exc()
-            return ""
-    def highlight_text(self, page, words, text, annotation):
-        """Highlight text and add annotation."""
-        text_instances = self.find_text_instances(words, text)
-        highlighted = False
-        for inst in text_instances:
-            highlight = page.add_highlight_annot(inst)
-            highlight.update()
-            comment = page.add_text_annot(inst[:2], annotation)
-            comment.update()
-            highlighted = True
-        return highlighted
-    def find_text_instances(self, words, text):
-        """Find all instances of text in words."""
-        text_lower = text.lower()
-        text_words = text_lower.split()
-        instances = []
-        for i in range(len(words) - len(text_words) + 1):
-            if all(words[i+j][4].lower() == text_words[j] for j in range(len(text_words))):
-                inst = fitz.Rect(words[i][:4])
-                for j in range(1, len(text_words)):
-                    inst = inst | fitz.Rect(words[i+j][:4])
-                instances.append(inst)
-        return instances
-    def analyze(self) -> Dict[str, Any]:
-        """Perform full analysis of the PDF."""
-        self.full_text = self.label_authors()
-        results = {
-            "metadata": self.check_metadata(),
-            "disclosures": self.check_disclosures(),
-            "figures_and_tables": self.check_figures_and_tables(),
-            "figure_order": self.check_figure_order(),
-            "references": self.check_references(),
-            "reference_order": self.check_reference_order(),
-            "reference_style": self.check_reference_style(),
-            "structure": self.check_structure(),
-            "language": self.check_language(),
-            "annotated_pdf_path": ""
-        }
-        inconsistent_refs = results.get("reference_style", {}).get("inconsistent_refs", [])
-        language_matches = results.get("language", {}).get("language_issues", {}).get("matches", [])
-        if inconsistent_refs or language_matches:
-            annotated_path = self.highlight_issues_in_pdf(inconsistent_refs, language_matches)
-            results["annotated_pdf_path"] = annotated_path
-        return results
-def analyze_pdf(file):
-    try:
-        # Create a temporary directory to store files
-        with tempfile.TemporaryDirectory() as temp_dir:
-            # Save the uploaded file temporarily
-            temp_path = os.path.join(temp_dir, "uploaded.pdf")
-            with open(temp_path, "wb") as f:
-                f.write(file.read())
-            analyzer = PDFAnalyzer(temp_path)
-            results = analyzer.analyze()
-            # Ensure all keys are present in the results, even if they're empty
-            default_results = {
-                "annotated_pdf_path": "",
-                "metadata": {},
-                "disclosures": {},
-                "figures_and_tables": {},
-                "figure_order": {},
-                "references": {},
-                "reference_order": {},
-                "reference_style": {},
-                "structure": {},
-                "language": {},
-            }
-            # Update default_results with actual results
-            default_results.update(results)
-            # Handle the annotated PDF
-            annotated_pdf_path = results.get("annotated_pdf_path", "")
-            if annotated_pdf_path and os.path.exists(annotated_pdf_path):
-                # Read the annotated PDF and return it as bytes
-                with open(annotated_pdf_path, "rb") as f:
-                    annotated_pdf_bytes = f.read()
-            else:
-                annotated_pdf_bytes = None
-            # Remove the annotated_pdf_path from the results as we're returning the file separately
-            default_results.pop("annotated_pdf_path", None)
-            return json.dumps(default_results, indent=2, default=str), annotated_pdf_bytes
-    except Exception as e:
-        error_message = {
-            "error": str(e),
-            "traceback": traceback.format_exc()
-        }
-        return json.dumps(error_message, indent=2), None
-# Create Gradio interface
-iface = gr.Interface(
-    fn=analyze_pdf,
-    inputs=gr.File(label="Upload PDF"),
-    outputs=[
-        gr.JSON(label="Analysis Results"),
-        gr.File(label="Annotated PDF")
-    ],
-    title="PDF Analyzer",
-    description="Upload a PDF document to analyze its structure, references, language, and more.",
-)
-# Launch the app
 if __name__ == "__main__":
-    iface.launch()

 import re
+import fitz  # PyMuPDF
 from pdfminer.high_level import extract_text
 from pdfminer.layout import LAParams
 import language_tool_python
 from typing import List, Dict, Any, Tuple
 from collections import Counter
 import json
 import traceback
 import io
 import tempfile
+import os
+import gradio as gr
# Provide a default JAVA_HOME (presumably for the Java-based LanguageTool
# backend used below -- TODO confirm). setdefault keeps any JAVA_HOME the
# environment already defines instead of overwriting it with this fixed path.
os.environ.setdefault('JAVA_HOME', '/usr/lib/jvm/java-11-openjdk-amd64')
+# ------------------------------
+# Analysis Functions
+# ------------------------------
def extract_pdf_text_by_page(file) -> List[str]:
    """Extract the text of every page of a PDF using PyMuPDF.

    Args:
        file: Either a filesystem path (``str``) or a binary file-like
            object holding the PDF data.

    Returns:
        One string per page, in document order.
    """
    if isinstance(file, str):
        doc = fitz.open(file)
    else:
        # A stream that was already read by another consumer sits at EOF and
        # would yield an empty document, so start from the beginning.
        if getattr(file, "seekable", lambda: False)():
            file.seek(0)
        doc = fitz.open(stream=file.read(), filetype="pdf")
    with doc:
        return [page.get_text("text") for page in doc]
def extract_pdf_text(file) -> str:
    """Extract the full text of a PDF using pdfminer.

    Args:
        file: Either a filesystem path (``str``) or a binary file-like
            object holding the PDF data.

    Returns:
        The text returned by pdfminer's ``extract_text`` with default
        ``LAParams``.
    """
    if isinstance(file, str):
        with open(file, 'rb') as pdf_handle:
            return extract_text(pdf_handle, laparams=LAParams())
    # Rewind so that a stream already consumed elsewhere is parsed from its
    # first byte rather than from EOF.
    if getattr(file, "seekable", lambda: False)():
        file.seek(0)
    return extract_text(file, laparams=LAParams())
def check_text_presence(full_text: str, search_terms: List[str]) -> Dict[str, bool]:
    """Report which search terms occur in the text, ignoring case.

    Args:
        full_text: Text to search.
        search_terms: Terms to look for; each is matched as a plain substring.

    Returns:
        A dict mapping every term (as given) to ``True`` when its lowercase
        form appears in the lowercase text.
    """
    # Lowercase the (potentially large) document once, not once per term.
    lowered_text = full_text.lower()
    return {term: term.lower() in lowered_text for term in search_terms}
def label_authors(full_text: str) -> str:
    """Prefix the presumed author line with ``'Authors: '``.

    The author line is taken to be the line that follows the first line of a
    line pair terminated by a blank line (heuristic: title, then authors).

    Args:
        full_text: Extracted document text.

    Returns:
        The text with ``'Authors: '`` inserted in front of that line, or the
        text unchanged when no such line is found, the line is empty, or it
        already starts with an ``Author:``/``Authors:`` label.
    """
    author_line_regex = r"^(?:.*\n)(.*?)(?:\n\n)"
    match = re.search(author_line_regex, full_text, re.MULTILINE)
    if not match:
        return full_text

    raw_line = match.group(1)
    authors = raw_line.strip()
    # An empty capture must be skipped: str.replace("", ...) would insert the
    # label between every character of the document.
    if not authors or re.match(r'authors?\s*:', authors, re.IGNORECASE):
        return full_text

    # Insert at the matched position only, so identical text appearing later
    # in the document is not relabelled as well.
    insert_at = match.start(1) + (len(raw_line) - len(raw_line.lstrip()))
    return f"{full_text[:insert_at]}Authors: {full_text[insert_at:]}"
def check_metadata(full_text: str) -> Dict[str, Any]:
    """Look for basic manuscript metadata in the text.

    Args:
        full_text: Extracted document text.

    Returns:
        A dict with:
        ``author_email`` -- whether something shaped like an e-mail address
        occurs; ``list_of_authors`` -- whether ``Author:``/``Authors:`` occurs
        (case-insensitive); ``keywords_list`` -- whether
        ``Keyword:``/``Keywords:`` occurs (case-insensitive);
        ``word_count`` -- number of whitespace-separated tokens, or the string
        ``"Missing"`` when there are none.
    """
    # Domain labels may contain hyphens and there may be several of them
    # (e.g. name@my-uni.ac.uk); the local part may also contain '+'.
    email_pattern = r'\b[\w.+-]+@[\w-]+(?:\.[\w-]+)+\b'
    return {
        "author_email": bool(re.search(email_pattern, full_text)),
        "list_of_authors": bool(re.search(r'Authors?:', full_text, re.IGNORECASE)),
        "keywords_list": bool(re.search(r'Keywords?:', full_text, re.IGNORECASE)),
        "word_count": len(full_text.split()) or "Missing"
    }
def check_disclosures(full_text: str) -> Dict[str, bool]:
    """Report which standard disclosure statements are present in the text.

    Delegates the case-insensitive substring lookup to
    ``check_text_presence`` and returns its result: one boolean per
    statement name.
    """
    required_statements = [
        f"{topic} statement"
        for topic in (
            "author contributions",
            "conflict of interest",
            "ethics",
            "funding",
            "data access",
        )
    ]
    return check_text_presence(full_text, required_statements)
def check_figures_and_tables(full_text: str) -> Dict[str, bool]:
    """Detect figure/table captions that mention citations or legends.

    Each check is a case-insensitive regex search for ``Figure N`` or
    ``Table N`` followed, on the same line, by the word ``citation`` or
    ``legend``.
    """
    def found(pattern: str) -> bool:
        # True when the pattern occurs anywhere in the document.
        return re.search(pattern, full_text, re.IGNORECASE) is not None

    report = {}
    report["figures_with_citations"] = found(r'Figure \d+.*?citation')
    report["figures_legends"] = found(r'Figure \d+.*?legend')
    report["tables_legends"] = found(r'Table \d+.*?legend')
    return report
def check_references(full_text: str) -> Dict[str, Any]:
    """Collect coarse reference-related indicators from the text.

    Returns a dict with:
    ``old_references`` -- a standalone four-digit number 1900-1999 occurs;
    ``citations_in_abstract`` -- the word ``citation`` or ``reference``
    occurs in the first 1000 characters;
    ``reference_count`` -- number of ``[...]`` bracketed spans;
    ``self_citations`` -- the phrase ``Self-citation`` occurs
    (case-insensitive).
    """
    opening_chunk = full_text[:1000]
    bracketed_spans = re.findall(r'\[.*?\]', full_text)

    has_old_year = re.search(r'\b19[0-9]{2}\b', full_text) is not None
    cites_early = re.search(r'\b(citation|reference)\b', opening_chunk, re.IGNORECASE) is not None
    has_self_citation = re.search(r'Self-citation', full_text, re.IGNORECASE) is not None

    return {
        "old_references": has_old_year,
        "citations_in_abstract": cites_early,
        "reference_count": len(bracketed_spans),
        "self_citations": has_self_citation
    }
def check_structure(full_text: str) -> Dict[str, bool]:
    """Check for IMRaD section names and a structured-abstract mention.

    ``imrad_structure`` is true only when all four section names occur
    (case-sensitive); ``abstract_structure`` is true when the phrase
    ``structured abstract`` occurs in any letter case.
    """
    imrad_sections = ("Introduction", "Methods", "Results", "Discussion")
    has_all_sections = True
    for heading in imrad_sections:
        if heading not in full_text:
            has_all_sections = False
            break
    mentions_structured = full_text.lower().find("structured abstract") != -1
    return {
        "imrad_structure": has_all_sections,
        "abstract_structure": mentions_structured
    }
def check_language_issues(full_text: str) -> Dict[str, Any]:
    """Run LanguageTool (en-US) over the text and summarise the findings.

    Args:
        full_text: Extracted document text.

    Returns:
        A dict with ``issues_count``, ``issues_per_1000`` (issues per 1000
        whitespace-separated words, 0 for empty text), ``failed`` (true when
        that rate exceeds 20) and ``matches`` (one plain dict per LanguageTool
        match so the result can be JSON-serialised).
    """
    language_tool = language_tool_python.LanguageTool('en-US')
    try:
        matches = language_tool.check(full_text)
    finally:
        # A new tool is created on every call; release it so repeated
        # analyses do not accumulate open LanguageTool instances.
        language_tool.close()

    word_count = len(full_text.split())
    issues_count = len(matches)
    issues_per_1000 = (issues_count / word_count) * 1000 if word_count else 0
    serializable_matches = [
        {
            "message": match.message,
            "replacements": match.replacements,
            "offset": match.offset,
            "errorLength": match.errorLength,
            "category": match.category,
            "ruleIssueType": match.ruleIssueType,
            "sentence": match.sentence
        }
        for match in matches
    ]
    return {
        "issues_count": issues_count,
        "issues_per_1000": issues_per_1000,
        "failed": issues_per_1000 > 20,
        "matches": serializable_matches
    }
def check_language(full_text: str) -> Dict[str, Any]:
    """Assemble the language-quality section of the report.

    Combines a case-insensitive search for ``plain language summary`` with
    the result of ``check_language_issues``. ``readability_issues`` is a
    fixed ``False`` placeholder for a check that is not implemented yet.
    """
    has_plain_summary = re.search(r'plain language summary', full_text, re.IGNORECASE) is not None
    tool_report = check_language_issues(full_text)
    return {
        "plain_language": has_plain_summary,
        "readability_issues": False,
        "language_issues": tool_report
    }
def check_figure_order(full_text: str) -> Dict[str, Any]:
    """Summarise how figures are referenced in the text.

    Figure mentions are found with a case-insensitive match of ``Fig``,
    ``Fig.``, ``Figure`` (optionally followed by whitespace) and a number.

    Args:
        full_text: Extracted document text.

    Returns:
        A dict with:
        ``sequential_order`` -- the distinct figure numbers, once sorted,
        form a run without gaps (this does not look at the order in which
        they are cited);
        ``figure_count`` -- number of distinct figure numbers;
        ``missing_figures`` -- sorted numbers absent between 1 and the
        highest number, or ``None`` when no figure is mentioned;
        ``figure_order`` -- the sorted distinct numbers;
        ``duplicate_references`` -- number strings mentioned more than once,
        in order of first appearance;
        ``not_mentioned`` -- number strings mentioned exactly once, sorted
        numerically (NOTE(review): the key name suggests something else --
        confirm intended meaning with consumers).
    """
    figure_pattern = r'(?:Fig(?:ure)?\.?|Figure)\s*(\d+)'
    figure_references = re.findall(figure_pattern, full_text, re.IGNORECASE)
    figure_numbers = sorted({int(num) for num in figure_references})
    is_sequential = all(a + 1 == b for a, b in zip(figure_numbers, figure_numbers[1:]))

    if figure_numbers:
        expected_figures = set(range(1, figure_numbers[-1] + 1))
        # Sorted so the report is stable instead of following set order.
        missing_figures = sorted(expected_figures - set(figure_numbers))
    else:
        missing_figures = None

    duplicates = [num for num, count in Counter(figure_references).items() if count > 1]
    # Set iteration order for strings varies between runs; sort numerically.
    not_mentioned = sorted(set(figure_references) - set(duplicates), key=int)

    return {
        "sequential_order": is_sequential,
        "figure_count": len(figure_numbers),
        "missing_figures": missing_figures,
        "figure_order": figure_numbers,
        "duplicate_references": duplicates,
        "not_mentioned": not_mentioned
    }
def check_reference_order(full_text: str) -> Dict[str, Any]:
    """Check whether numeric ``[n]`` citations are introduced in order.

    A citation is flagged when its number is more than one above the highest
    number seen so far, i.e. it skips ahead of the next expected reference.

    Args:
        full_text: Extracted document text.

    Returns:
        A dict with:
        ``max_reference`` -- highest cited number (0 when there are none);
        ``out_of_order`` -- ``(position, number)`` tuples for flagged
        citations, where position is the 1-based index among all ``[n]``
        citations;
        ``missing_references`` -- sorted numbers between 1 and the maximum
        that are never cited;
        ``is_ordered`` -- true when both lists are empty.
    """
    ref_numbers = [int(ref) for ref in re.findall(r'\[(\d+)\]', full_text)]

    max_ref = 0
    out_of_order = []
    for position, ref in enumerate(ref_numbers, start=1):
        if ref > max_ref + 1:
            out_of_order.append((position, ref))
        max_ref = max(max_ref, ref)

    # Sorted for a stable report; plain list(set) follows set order.
    missing_refs = sorted(set(range(1, max_ref + 1)) - set(ref_numbers))

    return {
        "max_reference": max_ref,
        "out_of_order": out_of_order,
        "missing_references": missing_refs,
        "is_ordered": not out_of_order and not missing_refs
    }
def check_reference_style(full_text: str) -> Dict[str, Any]:
    """Guess the reference style of the paper and list entries that deviate.

    The reference section is everything after the last line that consists
    only of the word ``References`` (any case, optional colon). When no such
    heading line exists, the text after the first occurrence of the word is
    used instead. Entries are split at line starts that look like ``[n]``,
    ``n. `` or ``(Name, 2020)`` and each entry is classified by its prefix.

    Args:
        full_text: Extracted document text.

    Returns:
        On success a dict with ``majority_style``, ``inconsistent_refs``
        (``(index, text, style)`` tuples, 1-based; entries of unknown style
        first, then entries of a non-majority style) and ``consistency``
        (share of entries in the majority style).
        When the section or its entries cannot be found, a dict with
        ``style`` = ``"Unknown"``, a ``reason``, an empty
        ``inconsistent_refs``, plus ``majority_style`` = ``"Unknown"`` and
        ``consistency`` = ``0.0`` so both shapes expose the same lookup keys.
    """
    not_found = {
        "style": "Unknown",
        "inconsistent_refs": [],
        "majority_style": "Unknown",
        "consistency": 0.0,
    }

    # Prefer a standalone heading line, and the last one, so the word
    # "references" in running prose is not mistaken for the section start.
    headings = list(re.finditer(r'^[ \t]*References\b[ \t:]*$', full_text,
                                re.IGNORECASE | re.MULTILINE))
    if headings:
        section_start = headings[-1].end()
    else:
        loose_match = re.search(r'References\b', full_text, re.IGNORECASE)
        if not loose_match:
            return {**not_found, "reason": "References section not found"}
        section_start = loose_match.end()

    # Take the rest of the document: stopping at the first newline followed
    # by text would cut the section off before its first entry.
    references_text = full_text[section_start:]
    reference_list = re.split(r'\n(?=\[\d+\]|\d+\.\s|\(\w+,\s*\d{4}\))', references_text)
    references = [ref.strip() for ref in reference_list if ref.strip()]

    # Checked in this order; the first matching pattern wins.
    # NOTE(review): "Harvard" and "APA" share one pattern, so "APA" can never
    # be reported -- a distinguishing pattern is needed to tell them apart.
    patterns = {
        "IEEE": r'^\[\d+\]',
        "Harvard": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
        "APA": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
        "MLA": r'^[A-Z][a-z]+,\s[A-Z][a-z]+\.',
        "Vancouver": r'^\d+\.\s',
        "Chicago": r'^\d+\s[A-Z][a-z]+\s[A-Z]',
    }

    styles = []
    inconsistent_refs = []
    for i, ref in enumerate(references, 1):
        detected = next(
            (style for style, pattern in patterns.items() if re.match(pattern, ref)),
            "Unknown",
        )
        styles.append(detected)
        if detected == "Unknown":
            inconsistent_refs.append((i, ref, "Unknown"))

    if not styles:
        return {**not_found, "reason": "No references found"}

    majority_style, majority_count = Counter(styles).most_common(1)[0]
    for i, style in enumerate(styles, 1):
        if style != majority_style and style != "Unknown":
            inconsistent_refs.append((i, references[i - 1], style))

    return {
        "majority_style": majority_style,
        "inconsistent_refs": inconsistent_refs,
        "consistency": majority_count / len(styles)
    }
+# ------------------------------
+# Annotation Functions
+# ------------------------------
def highlight_text(page, words, text, annotation):
    """Highlight every occurrence of ``text`` on ``page`` and attach a note.

    Occurrences are located with ``find_text_instances``. Each one gets a
    highlight annotation over its rectangle and a text annotation (carrying
    ``annotation``) anchored at the rectangle's first two coordinates.

    Returns:
        ``True`` when at least one occurrence was annotated, else ``False``.
    """
    rects = find_text_instances(words, text)
    for rect in rects:
        page.add_highlight_annot(rect).update()
        page.add_text_annot(rect[:2], annotation).update()
    return len(rects) > 0
def find_text_instances(words, text):
    """Locate every run of consecutive words on a page that spells ``text``.

    Args:
        words: Word records as produced by ``page.get_text("words")``; this
            function reads the first four items as a rectangle and item 4 as
            the word string.
        text: Phrase to find; compared word by word, case-insensitively,
            after splitting on whitespace.

    Returns:
        A list with one ``fitz.Rect`` per occurrence, each the union of the
        rectangles of the matched words. Empty when ``text`` contains no
        words or nothing matches.
    """
    needle = text.lower().split()
    if not needle:
        # An empty phrase would "match" at every index (and index past the
        # end of ``words``), so there is nothing meaningful to highlight.
        return []

    span = len(needle)
    page_words = [word[4].lower() for word in words]
    instances = []
    for start in range(len(words) - span + 1):
        if page_words[start:start + span] != needle:
            continue
        rect = fitz.Rect(words[start][:4])
        for word in words[start + 1:start + span]:
            rect = rect | fitz.Rect(word[:4])
        instances.append(rect)
    return instances
def highlight_issues_in_pdf(file, inconsistent_refs: List[Tuple[int, str, str]], language_matches: List[Dict[str, Any]], majority_style=None) -> bytes:
    """Annotate reference-style and language issues in a copy of the PDF.

    Args:
        file: Filesystem path (``str``) or binary file-like object of the PDF.
        inconsistent_refs: ``(number, text, style)`` tuples of references to
            highlight.
        language_matches: Dicts with at least ``sentence``, ``message`` and
            ``replacements``; each distinct (sentence, note) pair is
            annotated on the first page where the sentence is found.
        majority_style: Optional name of the style references should follow;
            used in the note text. When omitted the note refers to the
            majority style generically.

    Returns:
        The annotated PDF as bytes, or ``b""`` when annotation fails (the
        error and traceback are printed).
    """
    # The note must name the style to converge on, not repeat the offending
    # entry's own style.
    target_style = majority_style or "the document's majority reference style"
    reference_notes = [
        (
            ref_text,
            f"Reference {ref_num}: Inconsistent style ({ref_style}). Should be consolidated to {target_style}.",
        )
        for ref_num, ref_text, ref_style in (inconsistent_refs or [])
    ]
    # Build the language notes once instead of once per page.
    language_notes = [
        (
            match['sentence'],
            f"{match['message']}\nSuggested correction: {match['replacements'][0] if match['replacements'] else 'No suggestion'}",
        )
        for match in (language_matches or [])
    ]

    try:
        if isinstance(file, str):
            doc = fitz.open(file)
        else:
            # Earlier readers may have left the stream at EOF.
            if getattr(file, "seekable", lambda: False)():
                file.seek(0)
            doc = fitz.open(stream=file.read(), filetype="pdf")
        try:
            added_notes = set()
            for page in doc:
                words = page.get_text("words")
                for ref_text, note in reference_notes:
                    highlight_text(page, words, ref_text, note)
                for issue_key in language_notes:
                    if issue_key in added_notes:
                        continue
                    issue_text, error_message = issue_key
                    if highlight_text(page, words, issue_text, error_message):
                        added_notes.add(issue_key)
            return doc.write()
        finally:
            # Close the document even when annotation raises part-way.
            doc.close()
    except Exception as e:
        print(f"An error occurred while annotating the PDF: {str(e)}")
        traceback.print_exc()
        return b""
+# ------------------------------
+# Main Analysis Function
+# ------------------------------
def analyze_pdf(file) -> Tuple[Dict[str, Any], bytes]:
    """Run every check on a PDF and produce an annotated copy when needed.

    Args:
        file: Filesystem path (``str``) or binary file-like object of the PDF.

    Returns:
        ``(results, annotated_pdf_bytes)``. ``results`` maps each check name
        to its report; on failure it is instead a dict with ``error`` and
        ``traceback``. The second item is the annotated PDF as bytes, or
        ``None`` when there is nothing to annotate or the analysis failed
        (it may be ``b""`` when annotation itself failed).
    """
    def rewind(source):
        # File-like inputs are read more than once (text extraction, then
        # annotation); each reader must start at the first byte.
        if getattr(source, "seekable", lambda: False)():
            source.seek(0)

    try:
        # Only the pdfminer text is used by the checks, so the per-page
        # PyMuPDF extraction is not run here.
        rewind(file)
        full_text = label_authors(extract_pdf_text(file))

        reference_style = check_reference_style(full_text)
        language = check_language(full_text)
        results = {
            "metadata": check_metadata(full_text),
            "disclosures": check_disclosures(full_text),
            "figures_and_tables": check_figures_and_tables(full_text),
            "figure_order": check_figure_order(full_text),
            "references": check_references(full_text),
            "reference_order": check_reference_order(full_text),
            "reference_style": reference_style,
            "structure": check_structure(full_text),
            "language": language
        }

        inconsistent_refs = reference_style.get("inconsistent_refs", [])
        language_matches = language.get("language_issues", {}).get("matches", [])
        if inconsistent_refs or language_matches:
            rewind(file)
            annotated_pdf_bytes = highlight_issues_in_pdf(file, inconsistent_refs, language_matches)
        else:
            annotated_pdf_bytes = None
        return results, annotated_pdf_bytes
    except Exception as e:
        error_message = {
            "error": str(e),
            "traceback": traceback.format_exc()
        }
        return error_message, None
+# ------------------------------
+# Gradio Interface
+# ------------------------------
def process_upload(file):
    """Gradio callback: analyse uploaded PDF bytes.

    Args:
        file: Raw bytes of the uploaded PDF, or ``None`` when nothing was
            uploaded.

    Returns:
        ``(results_json, annotated_pdf_path)``. ``results_json`` is a JSON
        string (the analysis report, or an object with ``error`` and
        optionally ``traceback``). ``annotated_pdf_path`` is the path of a
        temporary file holding the annotated PDF, or ``None`` when there is
        none. That output file is intentionally left on disk for the caller
        to serve.
    """
    if file is None:
        return json.dumps({"error": "No file uploaded"}, indent=2), None

    try:
        temp_input_path = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_input:
                temp_input_path = temp_input.name
                temp_input.write(file)
            results, annotated_pdf = analyze_pdf(temp_input_path)
        finally:
            # Remove the input copy on every path, including failures while
            # writing or analysing, so temp files do not pile up.
            if temp_input_path is not None and os.path.exists(temp_input_path):
                os.unlink(temp_input_path)

        results_json = json.dumps(results, indent=2)
        if not annotated_pdf:
            return results_json, None

        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
            tmp_file.write(annotated_pdf)
        return results_json, tmp_file.name
    except Exception as e:
        error_message = json.dumps({
            "error": str(e),
            "traceback": traceback.format_exc()
        }, indent=2)
        return error_message, None
def create_interface():
    """Build the Gradio Blocks UI for the analyzer.

    Layout, top to bottom: title and description, a PDF upload field that
    hands the callback raw bytes, an "Analyze PDF" button, a JSON panel for
    the report and a file panel for the annotated PDF. Clicking the button
    calls ``process_upload`` with the upload and routes its two return
    values to the two output panels.

    Returns:
        The constructed ``gr.Blocks`` object (not yet launched).
    """
    with gr.Blocks(title="PDF Analyzer") as demo:
        gr.Markdown("# PDF Analyzer")
        gr.Markdown("Upload a PDF document to analyze its structure, references, language, and more.")

        with gr.Row():
            upload_field = gr.File(label="Upload PDF", file_types=[".pdf"], type="binary")

        with gr.Row():
            run_button = gr.Button("Analyze PDF")

        with gr.Row():
            report_panel = gr.JSON(label="Analysis Results", show_label=True)

        with gr.Row():
            annotated_panel = gr.File(label="Annotated PDF", show_label=True)

        run_button.click(
            fn=process_upload,
            inputs=[upload_field],
            outputs=[report_panel, annotated_panel],
        )
    return demo
 if __name__ == "__main__":
+    interface = create_interface()
+    interface.launch(
+        share=True,  # Set to False in production
+        # server_name="0.0.0.0",
+        server_port=None
+    )