Spaces:

samyak152002
/

texmetrics-regex-checks-gradio-1

Sleeping

App Files Files Community

samyak152002 committed on Oct 7, 2024

Commit

0c80b43

verified ·

1 Parent(s): 1cac4fd

Create App.py

Browse files

Files changed (1) hide show

App.py +366 -0

App.py ADDED Viewed

	@@ -0,0 +1,366 @@

import io
import json
import os
import re
import shutil
import sys
import tempfile
import traceback
from collections import Counter
from typing import Any, Dict, List, Tuple

import fitz
import gradio as gr
import language_tool_python
import PyPDF2
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams
from tqdm import tqdm
class PDFAnalyzer:
    """Run heuristic manuscript checks on a PDF and annotate the issues found.

    On construction the text is extracted twice: page by page with PyMuPDF
    (``pages_text``) and as a single string with pdfminer (``full_text``).
    Every ``check_*`` method reads ``full_text`` and returns a plain dict.
    """

    # Affiliation line expected two lines below the author line.
    AFFILIATION_LINE = (
        "Netaji Subhas University of Technology, Dwarka, Delhi, 110078, India"
    )

    # Style signatures, tested in order against the start of each reference.
    # NOTE: the "APA" pattern is identical to "Harvard", so with
    # first-match-wins no entry is ever labelled "APA"; both are kept so the
    # set of reported labels does not change.
    REFERENCE_STYLE_PATTERNS = {
        "IEEE": r'^\[\d+\]',
        "Harvard": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
        "APA": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
        "MLA": r'^[A-Z][a-z]+,\s[A-Z][a-z]+\.',
        "Vancouver": r'^\d+\.\s',
        "Chicago": r'^\d+\s[A-Z][a-z]+\s[A-Z]',
    }

    # A line consisting only of an (optionally numbered) "References" heading.
    _REFERENCES_HEADING_RE = re.compile(
        r'^[ \t]*(?:\d+\.?[ \t]*)?References[ \t]*:?[ \t]*$',
        re.IGNORECASE | re.MULTILINE,
    )

    # Numeric citations: "[3]", "[2, 3]", "[4-6]" (hyphen or en dash).
    _CITATION_RE = re.compile(r'\[(\d+(?:\s*[,\-\u2013]\s*\d+)*)\]')

    def __init__(self, file_path: str):
        """Extract the text of ``file_path`` and start a LanguageTool instance."""
        self.file_path = file_path
        self.pages_text = self.extract_pdf_text_by_page()
        self.full_text = self.extract_pdf_text()
        self.language_tool = language_tool_python.LanguageTool('en-US')

    def extract_pdf_text_by_page(self) -> List[str]:
        """Extracts text from a PDF file, page by page, using PyMuPDF."""
        with fitz.open(self.file_path) as doc:
            return [page.get_text("text") for page in doc]

    def extract_pdf_text(self) -> str:
        """Extracts text from a PDF file using pdfminer."""
        return extract_text(self.file_path, laparams=LAParams())

    def check_text_presence(self, search_terms: List[str]) -> Dict[str, bool]:
        """Report, for each term, whether it occurs in the document text.

        Matching ignores case and collapses whitespace, so a heading such as
        "Conflict of Interest\\nStatement" satisfies the term
        "conflict of interest statement".
        """
        haystack = " ".join(self.full_text.split()).casefold()
        return {
            term: " ".join(term.split()).casefold() in haystack
            for term in search_terms
        }

    def label_authors(self) -> str:
        """Return the text with 'Authors: ' prefixed to the author line.

        The author line is the line directly above the blank line that
        precedes ``AFFILIATION_LINE``. The text is returned unchanged when no
        such line is found, when it is empty, or when it already starts with
        an "Author:"/"Authors:" label. Only that one line is modified; other
        occurrences of the same names elsewhere in the text are left alone.
        """
        author_line_regex = (
            r"^(?:.*\n)(.*?)(?:\n\n" + re.escape(self.AFFILIATION_LINE) + ")"
        )
        match = re.search(author_line_regex, self.full_text, re.MULTILINE)
        if not match:
            return self.full_text
        raw_line = match.group(1)
        authors = raw_line.strip()
        if not authors or re.match(r'authors?:', authors, re.IGNORECASE):
            return self.full_text
        # Insert at the first non-blank character of the captured line.
        insert_at = match.start(1) + (len(raw_line) - len(raw_line.lstrip()))
        return (
            self.full_text[:insert_at] + "Authors: " + self.full_text[insert_at:]
        )

    def check_metadata(self) -> Dict[str, Any]:
        """Check for an e-mail address, author and keyword labels, and word count.

        ``word_count`` is the whitespace-separated token count, or the string
        "Missing" when the text contains no tokens.
        """
        return {
            "author_email": bool(re.search(r'\b[\w.-]+?@\w+?\.\w+?\b', self.full_text)),
            "list_of_authors": bool(re.search(r'Authors?:', self.full_text, re.IGNORECASE)),
            "keywords_list": bool(re.search(r'Keywords?:', self.full_text, re.IGNORECASE)),
            "word_count": len(self.full_text.split()) or "Missing"
        }

    def check_disclosures(self) -> Dict[str, bool]:
        """Check for the presence of each required disclosure statement."""
        search_terms = [
            "author contributions statement",
            "conflict of interest statement",
            "ethics statement",
            "funding statement",
            "data access statement"
        ]
        return self.check_text_presence(search_terms)

    def check_figures_and_tables(self) -> Dict[str, bool]:
        """Check for figure/table mentions followed on the same line by the
        words "citation" or "legend"."""
        return {
            "figures_with_citations": bool(re.search(r'Figure \d+.*?citation', self.full_text, re.IGNORECASE)),
            "figures_legends": bool(re.search(r'Figure \d+.*?legend', self.full_text, re.IGNORECASE)),
            "tables_legends": bool(re.search(r'Table \d+.*?legend', self.full_text, re.IGNORECASE))
        }

    def check_references(self) -> Dict[str, Any]:
        """Collect coarse reference indicators.

        ``old_references`` is True when any 19xx year appears,
        ``citations_in_abstract`` looks for the words "citation"/"reference"
        in the first 1000 characters, and ``reference_count`` counts every
        bracketed span in the text.
        """
        return {
            "old_references": bool(re.search(r'\b19[0-9]{2}\b', self.full_text)),
            "citations_in_abstract": bool(re.search(r'\b(citation|reference)\b', self.full_text[:1000], re.IGNORECASE)),
            "reference_count": len(re.findall(r'\[.*?\]', self.full_text)),
            "self_citations": bool(re.search(r'Self-citation', self.full_text, re.IGNORECASE))
        }

    def check_structure(self) -> Dict[str, bool]:
        """Check for the IMRaD section names and the phrase "structured abstract"."""
        return {
            "imrad_structure": all(section in self.full_text for section in ["Introduction", "Methods", "Results", "Discussion"]),
            "abstract_structure": "structured abstract" in self.full_text.lower()
        }

    def check_language_issues(self) -> Dict[str, Any]:
        """Run LanguageTool over the text and summarise the matches.

        Returns the issue count, issues per 1000 words, a ``failed`` flag
        (more than 20 issues per 1000 words) and the matches as plain dicts.
        A document without any words yields an all-zero result instead of
        dividing by zero; LanguageTool is not invoked in that case.
        """
        word_count = len(self.full_text.split())
        if word_count == 0:
            return {
                "issues_count": 0,
                "issues_per_1000": 0.0,
                "failed": False,
                "matches": []
            }
        matches = self.language_tool.check(self.full_text)
        issues_count = len(matches)
        issues_per_1000 = (issues_count / word_count) * 1000
        serializable_matches = [
            {
                "message": match.message,
                "replacements": match.replacements,
                "offset": match.offset,
                "errorLength": match.errorLength,
                "category": match.category,
                "ruleIssueType": match.ruleIssueType,
                "sentence": match.sentence
            }
            for match in matches
        ]
        return {
            "issues_count": issues_count,
            "issues_per_1000": issues_per_1000,
            "failed": issues_per_1000 > 20,
            "matches": serializable_matches
        }

    def check_language(self) -> Dict[str, Any]:
        """Check language quality."""
        return {
            "plain_language": bool(re.search(r'plain language summary', self.full_text, re.IGNORECASE)),
            "readability_issues": False,  # Placeholder for future implementation
            "language_issues": self.check_language_issues()
        }

    def check_figure_order(self) -> Dict[str, Any]:
        """Check whether figures are first mentioned in the order 1, 2, 3, ...

        Returned keys (all figure numbers are ints):
          * ``sequential_order`` - True when the first mentions are exactly
            1..n in that order (vacuously True with no figures).
          * ``figure_count`` - number of distinct figures mentioned.
          * ``missing_figures`` - numbers below the highest one that never
            appear, sorted; empty when no figure is mentioned.
          * ``figure_order`` - distinct figure numbers in first-mention order.
          * ``duplicate_references`` - figures mentioned more than once.
          * ``not_mentioned`` - figures that appear exactly once.
        """
        # \b keeps words such as "config 3" from being read as a figure.
        figure_pattern = r'\bFig(?:ure)?\.?\s*(\d+)'
        mentions = [
            int(num)
            for num in re.findall(figure_pattern, self.full_text, re.IGNORECASE)
        ]
        first_mention_order = list(dict.fromkeys(mentions))
        distinct = set(first_mention_order)
        if distinct:
            missing_figures = sorted(set(range(1, max(distinct) + 1)) - distinct)
        else:
            missing_figures = []
        is_sequential = first_mention_order == list(
            range(1, len(first_mention_order) + 1)
        )
        counts = Counter(mentions)
        duplicates = sorted(num for num, count in counts.items() if count > 1)
        mentioned_once = sorted(num for num, count in counts.items() if count == 1)
        return {
            "sequential_order": is_sequential,
            "figure_count": len(distinct),
            "missing_figures": missing_figures,
            "figure_order": first_mention_order,
            "duplicate_references": duplicates,
            "not_mentioned": mentioned_once
        }

    @staticmethod
    def _expand_citation(body: str) -> List[int]:
        """Expand the inside of a citation bracket, e.g. "2, 4-6" -> [2, 4, 5, 6]."""
        numbers: List[int] = []
        for part in body.split(","):
            bounds = [int(piece) for piece in re.split(r'[-\u2013]', part)]
            if len(bounds) == 2 and bounds[0] <= bounds[1]:
                numbers.extend(range(bounds[0], bounds[1] + 1))
            else:
                # Not an ascending range: keep the numbers as written.
                numbers.extend(bounds)
        return numbers

    def check_reference_order(self) -> Dict[str, Any]:
        """Check that numeric citations are introduced in ascending order.

        Single citations, comma lists and ranges are recognised. A citation
        is out of order when it skips past the next unused number; each such
        case is reported as ``(position, number)`` where position is the
        1-based index among all expanded citation numbers.
        """
        ref_numbers: List[int] = []
        for body in self._CITATION_RE.findall(self.full_text):
            ref_numbers.extend(self._expand_citation(body))
        max_ref = 0
        out_of_order = []
        for position, ref in enumerate(ref_numbers, 1):
            if ref > max_ref + 1:
                out_of_order.append((position, ref))
            max_ref = max(max_ref, ref)
        missing_refs = sorted(set(range(1, max_ref + 1)) - set(ref_numbers))
        return {
            "max_reference": max_ref,
            "out_of_order": out_of_order,
            "missing_references": missing_refs,
            "is_ordered": not out_of_order and not missing_refs
        }

    def _classify_reference(self, ref: str) -> str:
        """Return the first style whose signature matches ``ref``, else "Unknown"."""
        for style, pattern in self.REFERENCE_STYLE_PATTERNS.items():
            if re.match(pattern, ref):
                return style
        return "Unknown"

    def check_reference_style(self) -> Dict[str, Any]:
        """Identify the dominant reference style and list deviating entries.

        The reference list is everything after the last line that consists
        only of a "References" heading. Every result carries ``style``,
        ``majority_style``, ``consistency`` (share of entries in the majority
        style) and ``inconsistent_refs`` (``(index, text, style)`` for each
        entry that is unrecognised or differs from the majority); a
        ``reason`` key is added when no section or no entries are found.
        """
        def unknown(reason: str) -> Dict[str, Any]:
            return {
                "style": "Unknown",
                "reason": reason,
                "majority_style": "Unknown",
                "consistency": 0.0,
                "inconsistent_refs": []
            }

        headings = list(self._REFERENCES_HEADING_RE.finditer(self.full_text))
        if not headings:
            return unknown("References section not found")
        references_text = self.full_text[headings[-1].end():]
        # A new entry starts at a line beginning with "[n]", "n. " or "(Name, 2020)".
        reference_list = re.split(r'\n(?=\[\d+\]|\d+\.\s|\(\w+,\s*\d{4}\))', references_text)
        references = [ref.strip() for ref in reference_list if ref.strip()]
        if not references:
            return unknown("No references found")
        styles = [self._classify_reference(ref) for ref in references]
        majority_style, majority_count = Counter(styles).most_common(1)[0]
        inconsistent_refs = [
            (index, ref, style)
            for index, (ref, style) in enumerate(zip(references, styles), 1)
            if style == "Unknown" or style != majority_style
        ]
        return {
            "style": majority_style,
            "majority_style": majority_style,
            "inconsistent_refs": inconsistent_refs,
            "consistency": majority_count / len(styles)
        }

    def highlight_issues_in_pdf(self, inconsistent_refs: List[Tuple[int, str, str]], language_matches: List[Dict[str, Any]]) -> str:
        """Highlight inconsistent references and language issues in a copy of the PDF.

        The copy is saved beside the input as ``<name>_annotated_combined<ext>``.
        Each language issue is annotated only on the first page where its
        sentence is found. Returns the path of the annotated file, or "" when
        annotating or saving fails (the error is printed to stderr).
        """
        # splitext keeps the output distinct from the input even when the
        # extension is not a lowercase ".pdf".
        root, ext = os.path.splitext(self.file_path)
        annotated_file_path = f"{root}_annotated_combined{ext or '.pdf'}"
        try:
            # The majority style is the same for every page and reference.
            majority_style = "Unknown"
            if inconsistent_refs:
                majority_style = self.check_reference_style().get('majority_style', 'Unknown')
            added_notes = set()
            with fitz.open(self.file_path) as doc:
                for page in doc:
                    words = page.get_text("words")
                    for ref_num, ref_text, ref_style in inconsistent_refs or []:
                        self.highlight_text(page, words, ref_text, f"Reference {ref_num}: Inconsistent style ({ref_style}). Should be {majority_style}.")
                    for match in language_matches or []:
                        issue_text = match['sentence']
                        error_message = f"{match['message']}\nSuggested correction: {match['replacements'][0] if match['replacements'] else 'No suggestion'}"
                        issue_key = (issue_text, error_message)
                        if issue_key not in added_notes:
                            if self.highlight_text(page, words, issue_text, error_message):
                                added_notes.add(issue_key)
                doc.save(annotated_file_path)
            if os.path.exists(annotated_file_path):
                return annotated_file_path
            print(f"Error: Annotated PDF was not saved at {annotated_file_path}", file=sys.stderr)
            return ""
        except Exception as e:
            # Annotation is best-effort: report the failure and let the
            # analysis results be returned without an annotated file.
            print(f"An error occurred while annotating the PDF: {str(e)}", file=sys.stderr)
            traceback.print_exc()
            return ""

    def highlight_text(self, page, words, text, annotation):
        """Highlight every occurrence of ``text`` on ``page`` and attach ``annotation``.

        ``words`` is the page's word list as returned by ``page.get_text("words")``.
        Returns True when at least one occurrence was highlighted.
        """
        text_instances = self.find_text_instances(words, text)
        for inst in text_instances:
            highlight = page.add_highlight_annot(inst)
            highlight.update()
            # Anchor the note at the top-left corner of the highlighted area.
            comment = page.add_text_annot(inst[:2], annotation)
            comment.update()
        return bool(text_instances)

    def find_text_instances(self, words, text):
        """Return one bounding rectangle per occurrence of ``text`` in ``words``.

        ``words`` are tuples whose first four items are the word's rectangle
        coordinates and whose fifth item is the word string. Matching is
        case-insensitive and word by word. Empty or whitespace-only ``text``
        matches nothing.
        """
        text_words = text.lower().split()
        if not text_words:
            return []
        page_words = [word[4].lower() for word in words]
        span = len(text_words)
        instances = []
        for start in range(len(page_words) - span + 1):
            if page_words[start:start + span] != text_words:
                continue
            inst = fitz.Rect(words[start][:4])
            for word in words[start + 1:start + span]:
                inst = inst | fitz.Rect(word[:4])
            instances.append(inst)
        return instances

    def analyze(self) -> Dict[str, Any]:
        """Run every check and, when issues exist, write an annotated PDF.

        ``full_text`` is replaced by its author-labelled form first, so the
        metadata check can find the "Authors:" label. ``annotated_pdf_path``
        in the result is "" when nothing was annotated or annotation failed.
        """
        self.full_text = self.label_authors()
        results = {
            "metadata": self.check_metadata(),
            "disclosures": self.check_disclosures(),
            "figures_and_tables": self.check_figures_and_tables(),
            "figure_order": self.check_figure_order(),
            "references": self.check_references(),
            "reference_order": self.check_reference_order(),
            "reference_style": self.check_reference_style(),
            "structure": self.check_structure(),
            "language": self.check_language(),
            "annotated_pdf_path": ""
        }
        inconsistent_refs = results.get("reference_style", {}).get("inconsistent_refs", [])
        language_matches = results.get("language", {}).get("language_issues", {}).get("matches", [])
        if inconsistent_refs or language_matches:
            annotated_path = self.highlight_issues_in_pdf(inconsistent_refs, language_matches)
            results["annotated_pdf_path"] = annotated_path
        return results
def _write_upload_to(file, destination) -> None:
    """Copy the uploaded PDF, in whichever form it was supplied, into ``destination``.

    ``file`` may be raw bytes, a filesystem path, an object whose ``name``
    attribute is the path of an existing file, or a readable binary file-like
    object. ``destination`` is a binary file object opened for writing.
    """
    if isinstance(file, (bytes, bytearray)):
        destination.write(file)
        return
    if isinstance(file, (str, os.PathLike)):
        source_path = file
    else:
        source_path = getattr(file, "name", None)
    if isinstance(source_path, (str, os.PathLike)) and os.path.isfile(source_path):
        # Copy from disk: the wrapper object may already be closed or have
        # been read, while the file on disk is always complete.
        with open(source_path, "rb") as source:
            shutil.copyfileobj(source, destination)
        return
    if isinstance(file, (str, os.PathLike)):
        raise FileNotFoundError(f"Uploaded file not found: {file}")
    destination.write(file.read())


def analyze_pdf(file):
    """Analyze an uploaded PDF and return the results as a JSON string.

    The upload is copied to a uniquely named temporary ``.pdf`` file so that
    concurrent requests cannot overwrite each other's input; that copy is
    removed afterwards. The annotated PDF written next to it (its path is in
    ``annotated_pdf_path``) is deliberately left in place for the caller.

    Args:
        file: the value supplied by the ``gr.File`` input - a path, an object
            with a ``name`` path attribute, a readable binary object, raw
            bytes, or ``None`` when nothing was uploaded.

    Returns:
        A JSON document holding every analysis section, or - on any failure -
        a JSON document with ``error`` and ``traceback`` keys.
    """
    temp_path = None
    try:
        if file is None:
            raise ValueError("No file was uploaded.")
        fd, temp_path = tempfile.mkstemp(suffix=".pdf")
        with os.fdopen(fd, "wb") as f:
            _write_upload_to(file, f)
        analyzer = PDFAnalyzer(temp_path)
        results = analyzer.analyze()
        # Ensure all keys are present in the results, even if they're empty
        default_results = {
            "annotated_pdf_path": "",
            "metadata": {},
            "disclosures": {},
            "figures_and_tables": {},
            "figure_order": {},
            "references": {},
            "reference_order": {},
            "reference_style": {},
            "structure": {},
            "language": {},
        }
        default_results.update(results)
        return json.dumps(default_results, indent=2, default=str)
    except Exception as e:
        # Top-level UI boundary: report the failure to the user as JSON
        # instead of letting the request crash.
        error_message = {
            "error": str(e),
            "traceback": traceback.format_exc()
        }
        return json.dumps(error_message, indent=2)
    finally:
        # Clean up the temporary copy of the upload
        if temp_path is not None and os.path.exists(temp_path):
            os.remove(temp_path)
# Gradio front end: one PDF upload feeds analyze_pdf and the JSON report it
# returns is rendered in a JSON viewer.
iface = gr.Interface(
    analyze_pdf,
    gr.File(label="Upload PDF"),
    gr.JSON(label="Analysis Results"),
    title="PDF Analyzer",
    description="Upload a PDF document to analyze its structure, references, language, and more.",
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()