# Source: Hugging Face file view of app.py (raw / history / blame), 15 kB.
# Last commit 6ecdc78 (verified) by samyak152002: "Rename App.py to app.py".
import io
import json
import os
import re
import sys
import tempfile
import traceback
from collections import Counter
from typing import List, Dict, Any, Tuple

import fitz
import gradio as gr
import language_tool_python
import PyPDF2
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams
from tqdm import tqdm
class PDFAnalyzer:
    """Run heuristic manuscript checks against the text of a PDF file.

    The text is extracted once in ``__init__`` (page by page with PyMuPDF and
    as a whole with pdfminer).  Every ``check_*`` method inspects
    ``self.full_text`` with substring or regular-expression tests and returns
    a dictionary of plain values.  ``analyze`` runs all checks and, when
    reference-style or language issues are found, writes an annotated copy of
    the PDF next to the input file.
    """

    # Affiliation line that ``label_authors`` looks for by default; the author
    # line is taken to be the line directly above the blank line preceding it.
    DEFAULT_AFFILIATION = (
        "Netaji Subhas University of Technology, Dwarka, Delhi, 110078, India"
    )

    def __init__(self, file_path: str):
        """Extract the text of ``file_path`` and start an en-US LanguageTool."""
        self.file_path = file_path
        self.pages_text = self.extract_pdf_text_by_page()
        self.full_text = self.extract_pdf_text()
        self.language_tool = language_tool_python.LanguageTool('en-US')

    def extract_pdf_text_by_page(self) -> List[str]:
        """Extracts text from a PDF file, page by page, using PyMuPDF."""
        with fitz.open(self.file_path) as doc:
            return [page.get_text("text") for page in doc]

    def extract_pdf_text(self) -> str:
        """Extracts text from a PDF file using pdfminer."""
        return extract_text(self.file_path, laparams=LAParams())

    def check_text_presence(self, search_terms: List[str]) -> Dict[str, bool]:
        """Report, for each term, whether it occurs in the document text.

        Matching ignores letter case and treats any run of whitespace
        (including line breaks) as a single space, so a heading such as
        "Conflict of Interest Statement" that is wrapped over two lines is
        still found by the lowercase search term.

        Returns:
            A dict mapping every term (as given) to True/False.
        """
        haystack = " ".join(self.full_text.lower().split())
        return {
            term: " ".join(term.lower().split()) in haystack
            for term in search_terms
        }

    def label_authors(self, affiliation: str = DEFAULT_AFFILIATION) -> str:
        """Return the text with the author line prefixed by 'Authors: '.

        The author line is the line immediately above the blank line that
        precedes ``affiliation``.  The text is returned unchanged when the
        affiliation is not found, when that line is empty, or when it already
        starts with an "Author:"/"Authors:" label.  Only the located line is
        modified; other occurrences of the same string are left alone.

        Args:
            affiliation: Literal affiliation line to anchor on.
        """
        author_line_regex = (
            r"^(?:.*\n)(.*?)(?:\n\n" + re.escape(affiliation) + r")"
        )
        match = re.search(author_line_regex, self.full_text, re.MULTILINE)
        if not match:
            return self.full_text

        raw_line = match.group(1)
        authors = raw_line.strip()
        # An empty capture must be skipped: str.replace("") would insert the
        # label between every character of the document.
        if not authors or re.match(r'Authors?:', authors, re.IGNORECASE):
            return self.full_text

        start, end = match.span(1)
        labelled_line = raw_line.replace(authors, f"Authors: {authors}", 1)
        return self.full_text[:start] + labelled_line + self.full_text[end:]

    def check_metadata(self) -> Dict[str, Any]:
        """Check for metadata elements.

        Returns:
            ``author_email``, ``list_of_authors`` and ``keywords_list`` as
            booleans, plus ``word_count`` (the whitespace-separated word
            count, or the string "Missing" when the text has no words).
        """
        # Domain labels may contain hyphens and there may be several of them
        # (e.g. name@my-lab.example.org).
        email_regex = r'\b[\w.+-]+@[\w-]+(?:\.[\w-]+)+\b'
        return {
            "author_email": bool(re.search(email_regex, self.full_text)),
            "list_of_authors": bool(re.search(r'Authors?:', self.full_text, re.IGNORECASE)),
            "keywords_list": bool(re.search(r'Keywords?:', self.full_text, re.IGNORECASE)),
            "word_count": len(self.full_text.split()) or "Missing"
        }

    def check_disclosures(self) -> Dict[str, bool]:
        """Check for disclosure statements."""
        search_terms = [
            "author contributions statement",
            "conflict of interest statement",
            "ethics statement",
            "funding statement",
            "data access statement"
        ]
        return self.check_text_presence(search_terms)

    def check_figures_and_tables(self) -> Dict[str, bool]:
        """Check for figures and tables.

        Each flag is True when "Figure N" / "Table N" is followed on the same
        line by the word "citation" or "legend" (case-insensitive).
        """
        return {
            "figures_with_citations": bool(re.search(r'Figure \d+.*?citation', self.full_text, re.IGNORECASE)),
            "figures_legends": bool(re.search(r'Figure \d+.*?legend', self.full_text, re.IGNORECASE)),
            "tables_legends": bool(re.search(r'Table \d+.*?legend', self.full_text, re.IGNORECASE))
        }

    def check_references(self) -> Dict[str, Any]:
        """Check for references.

        Returns:
            ``old_references``: a standalone 19xx number occurs in the text.
            ``citations_in_abstract``: "citation" or "reference" occurs in the
            first 1000 characters.
            ``reference_count``: number of bracketed ``[...]`` spans.
            ``self_citations``: the phrase "Self-citation" occurs.
        """
        return {
            "old_references": bool(re.search(r'\b19[0-9]{2}\b', self.full_text)),
            "citations_in_abstract": bool(re.search(r'\b(citation|reference)\b', self.full_text[:1000], re.IGNORECASE)),
            "reference_count": len(re.findall(r'\[.*?\]', self.full_text)),
            "self_citations": bool(re.search(r'Self-citation', self.full_text, re.IGNORECASE))
        }

    def check_structure(self) -> Dict[str, bool]:
        """Check document structure (IMRaD section names, structured abstract)."""
        return {
            "imrad_structure": all(section in self.full_text for section in ["Introduction", "Methods", "Results", "Discussion"]),
            "abstract_structure": "structured abstract" in self.full_text.lower()
        }

    def check_language_issues(self) -> Dict[str, Any]:
        """Check for issues with capitalization, hyphenation, punctuation, spacing, etc.

        Returns:
            ``issues_count``, ``issues_per_1000`` (issues per 1000 words),
            ``failed`` (True above 20 issues per 1000 words) and ``matches``
            (one plain dict per LanguageTool match).
        """
        word_count = len(self.full_text.split())
        if word_count == 0:
            # No words to check (e.g. a PDF without a text layer): report a
            # clean result instead of dividing by zero below.
            return {
                "issues_count": 0,
                "issues_per_1000": 0.0,
                "failed": False,
                "matches": []
            }

        matches = self.language_tool.check(self.full_text)
        issues_count = len(matches)
        issues_per_1000 = (issues_count / word_count) * 1000
        serializable_matches = [
            {
                "message": match.message,
                "replacements": match.replacements,
                "offset": match.offset,
                "errorLength": match.errorLength,
                "category": match.category,
                "ruleIssueType": match.ruleIssueType,
                "sentence": match.sentence
            }
            for match in matches
        ]
        return {
            "issues_count": issues_count,
            "issues_per_1000": issues_per_1000,
            "failed": issues_per_1000 > 20,
            "matches": serializable_matches
        }

    def check_language(self) -> Dict[str, Any]:
        """Check language quality."""
        return {
            "plain_language": bool(re.search(r'plain language summary', self.full_text, re.IGNORECASE)),
            "readability_issues": False,  # Placeholder for future implementation
            "language_issues": self.check_language_issues()
        }

    def check_figure_order(self) -> Dict[str, Any]:
        """Summarise which figure numbers the text refers to.

        Returns:
            ``sequential_order``: True when the distinct figure numbers form a
            run without gaps (order of mention is not considered).
            ``figure_count``: number of distinct figure numbers.
            ``missing_figures``: sorted numbers between 1 and the highest
            figure that are never referenced, or None when no figure is
            referenced at all.
            ``figure_order``: sorted distinct figure numbers.
            ``duplicate_references``: number strings referenced more than once.
            ``not_mentioned``: number strings referenced exactly once (key
            name kept for compatibility), sorted numerically.
        """
        # The leading \b keeps words that merely end in "fig" (e.g. "Config 2")
        # from being counted as figure references.
        figure_pattern = r'\bFig(?:ure)?\.?\s*(\d+)'
        figure_references = re.findall(figure_pattern, self.full_text, re.IGNORECASE)
        figure_numbers = sorted({int(num) for num in figure_references})
        is_sequential = all(a + 1 == b for a, b in zip(figure_numbers, figure_numbers[1:]))

        if figure_numbers:
            expected_figures = set(range(1, max(figure_numbers) + 1))
            missing_figures = sorted(expected_figures - set(figure_numbers))
        else:
            missing_figures = None

        reference_counts = Counter(figure_references)
        duplicates = [num for num, count in reference_counts.items() if count > 1]
        # Sorted so the output does not depend on set/hash iteration order.
        referenced_once = sorted(
            (num for num, count in reference_counts.items() if count == 1),
            key=int,
        )
        return {
            "sequential_order": is_sequential,
            "figure_count": len(figure_numbers),
            "missing_figures": missing_figures,
            "figure_order": figure_numbers,
            "duplicate_references": duplicates,
            "not_mentioned": referenced_once
        }

    def check_reference_order(self) -> Dict[str, Any]:
        """Check if references in the main body text are in order.

        A numeric citation ``[n]`` is out of order when it is more than one
        above the highest number seen so far.

        Returns:
            ``max_reference``, ``out_of_order`` (list of
            ``(1-based citation position, number)``), ``missing_references``
            (sorted numbers up to the maximum that are never cited) and
            ``is_ordered``.
        """
        reference_pattern = r'\[(\d+)\]'
        ref_numbers = [int(ref) for ref in re.findall(reference_pattern, self.full_text)]

        max_ref = 0
        out_of_order = []
        for position, ref in enumerate(ref_numbers, 1):
            if ref > max_ref + 1:
                out_of_order.append((position, ref))
            max_ref = max(max_ref, ref)

        missing_refs = sorted(set(range(1, max_ref + 1)) - set(ref_numbers))
        return {
            "max_reference": max_ref,
            "out_of_order": out_of_order,
            "missing_references": missing_refs,
            "is_ordered": not out_of_order and not missing_refs
        }

    def check_reference_style(self) -> Dict[str, Any]:
        """Check the reference style used in the paper and identify inconsistencies.

        The reference list is everything after the last line consisting only
        of a "References" heading (optionally numbered or followed by a
        colon); if there is no such line, everything after the last
        occurrence of the word "references" is used.  Entries are split at
        line starts that look like ``[n]``, ``n. `` or ``(Name, 2020)`` and
        each entry is classified by the first matching style pattern.

        Returns:
            Always ``style``, ``majority_style`` and ``inconsistent_refs``
            (list of ``(1-based index, entry text, detected style)``).
            On success also ``consistency`` (share of entries in the majority
            style); on failure also ``reason``.
        """
        def not_found(reason: str) -> Dict[str, Any]:
            return {
                "style": "Unknown",
                "majority_style": "Unknown",
                "reason": reason,
                "inconsistent_refs": []
            }

        heading_regex = r'^[ \t]*(?:\d+\.?[ \t]+)?References[ \t]*:?[ \t]*$'
        headings = list(re.finditer(heading_regex, self.full_text, re.IGNORECASE | re.MULTILINE))
        if not headings:
            headings = list(re.finditer(r'\bReferences\b', self.full_text, re.IGNORECASE))
        if not headings:
            return not_found("References section not found")

        # Use the last heading: the reference list comes at the end, while the
        # word itself may also appear earlier in running text.
        references_text = self.full_text[headings[-1].end():]
        reference_list = re.split(r'\n(?=\[\d+\]|\d+\.\s|\(\w+,\s*\d{4}\))', references_text)
        references = [ref.strip() for ref in reference_list if ref.strip()]
        if not references:
            return not_found("No references found")

        # Heuristic first-match-wins patterns.  APA is tested before Harvard
        # and requires "(year)." so that both labels can actually be produced;
        # Harvard catches the remaining "Surname, I. (year)" / "Surname, I. year"
        # forms.
        patterns = {
            "IEEE": r'^\[\d+\]',
            "APA": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(\d{4}\)\.',
            "Harvard": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
            "MLA": r'^[A-Z][a-z]+,\s[A-Z][a-z]+\.',
            "Vancouver": r'^\d+\.\s',
            "Chicago": r'^\d+\s[A-Z][a-z]+\s[A-Z]',
        }
        styles = []
        for ref in references:
            detected = next(
                (style for style, pattern in patterns.items() if re.match(pattern, ref)),
                "Unknown",
            )
            styles.append(detected)

        majority_style, majority_count = Counter(styles).most_common(1)[0]
        # Unclassifiable entries are always reported; classified ones only
        # when they differ from the majority style.
        inconsistent_refs = [
            (index, ref, style)
            for index, (ref, style) in enumerate(zip(references, styles), 1)
            if style == "Unknown" or style != majority_style
        ]
        return {
            "style": majority_style,
            "majority_style": majority_style,
            "inconsistent_refs": inconsistent_refs,
            "consistency": majority_count / len(styles)
        }

    def highlight_issues_in_pdf(self, inconsistent_refs: List[Tuple[int, str, str]], language_matches: List[Dict[str, Any]]) -> str:
        """Highlight inconsistent references and add notes for language issues in a single PDF.

        Args:
            inconsistent_refs: ``(index, entry text, detected style)`` tuples
                as produced by ``check_reference_style``.
            language_matches: match dicts as produced by
                ``check_language_issues`` (``sentence``, ``message`` and
                ``replacements`` are read).

        Returns:
            Path of the annotated copy ("<input stem>_annotated_combined" plus
            the input extension, or ".pdf" if it has none), or "" when
            annotating or saving failed.
        """
        root, ext = os.path.splitext(self.file_path)
        annotated_file_path = f"{root}_annotated_combined{ext or '.pdf'}"
        try:
            # The majority style is the same for every reference and page, so
            # it is computed once rather than inside the loops.
            majority_style = (
                self.check_reference_style().get('majority_style', 'Unknown')
                if inconsistent_refs else 'Unknown'
            )
            added_notes = set()
            with fitz.open(self.file_path) as doc:
                for page in doc:
                    words = page.get_text("words")
                    for ref_num, ref_text, ref_style in inconsistent_refs or []:
                        self.highlight_text(page, words, ref_text, f"Reference {ref_num}: Inconsistent style ({ref_style}). Should be {majority_style}.")
                    for match in language_matches or []:
                        issue_text = match['sentence']
                        error_message = f"{match['message']}\nSuggested correction: {match['replacements'][0] if match['replacements'] else 'No suggestion'}"
                        issue_key = (issue_text, error_message)
                        # Each distinct issue is annotated only on the first
                        # page where its sentence is found.
                        if issue_key not in added_notes:
                            if self.highlight_text(page, words, issue_text, error_message):
                                added_notes.add(issue_key)
                doc.save(annotated_file_path)
            if os.path.exists(annotated_file_path):
                return annotated_file_path
            print(f"Error: Annotated PDF was not saved at {annotated_file_path}", file=sys.stderr)
            return ""
        except Exception as e:
            print(f"An error occurred while annotating the PDF: {str(e)}", file=sys.stderr)
            traceback.print_exc()
            return ""

    def highlight_text(self, page, words, text, annotation):
        """Highlight every occurrence of ``text`` on ``page`` and attach ``annotation``.

        Returns:
            True when at least one occurrence was highlighted.
        """
        highlighted = False
        for inst in self.find_text_instances(words, text):
            highlight = page.add_highlight_annot(inst)
            highlight.update()
            # The note icon is anchored at the first two coordinates of the
            # highlighted rectangle.
            comment = page.add_text_annot(inst[:2], annotation)
            comment.update()
            highlighted = True
        return highlighted

    def find_text_instances(self, words, text):
        """Find all instances of text in words.

        Args:
            words: Sequence of word tuples as returned by
                ``page.get_text("words")``; items 0-3 are used as the
                rectangle and item 4 as the word string.
            text: Phrase to look for; compared word by word, ignoring case.

        Returns:
            One ``fitz.Rect`` per occurrence, covering all words of the
            phrase.  Empty when ``text`` contains no words.
        """
        text_words = text.lower().split()
        if not text_words:
            # Nothing to search for; without this guard the loop below would
            # index one position past the end of ``words``.
            return []

        instances = []
        for i in range(len(words) - len(text_words) + 1):
            if all(words[i + j][4].lower() == target for j, target in enumerate(text_words)):
                inst = fitz.Rect(words[i][:4])
                for j in range(1, len(text_words)):
                    inst = inst | fitz.Rect(words[i + j][:4])
                instances.append(inst)
        return instances

    def analyze(self) -> Dict[str, Any]:
        """Perform full analysis of the PDF.

        Replaces ``self.full_text`` with the author-labelled text, runs every
        check, and stores the annotated PDF path under ``annotated_pdf_path``
        ("" when nothing needed annotating or annotation failed).
        """
        self.full_text = self.label_authors()
        results = {
            "metadata": self.check_metadata(),
            "disclosures": self.check_disclosures(),
            "figures_and_tables": self.check_figures_and_tables(),
            "figure_order": self.check_figure_order(),
            "references": self.check_references(),
            "reference_order": self.check_reference_order(),
            "reference_style": self.check_reference_style(),
            "structure": self.check_structure(),
            "language": self.check_language(),
            "annotated_pdf_path": ""
        }
        inconsistent_refs = results.get("reference_style", {}).get("inconsistent_refs", [])
        language_matches = results.get("language", {}).get("language_issues", {}).get("matches", [])
        if inconsistent_refs or language_matches:
            annotated_path = self.highlight_issues_in_pdf(inconsistent_refs, language_matches)
            results["annotated_pdf_path"] = annotated_path
        return results
def _read_upload_bytes(file) -> bytes:
    """Return the raw bytes of an upload, whatever form it arrives in.

    Accepts raw bytes, a filesystem path, an object whose ``name`` attribute
    is the path of an existing file, or any object with a ``read()`` method.

    Raises:
        ValueError: if ``file`` is None (nothing was uploaded).
        TypeError: if ``file`` is none of the supported kinds.
    """
    if file is None:
        raise ValueError("No file was uploaded")
    if isinstance(file, (bytes, bytearray)):
        return bytes(file)
    if isinstance(file, (str, os.PathLike)):
        with open(file, "rb") as source:
            return source.read()
    # Prefer the on-disk file over read(): a temp-file wrapper may carry the
    # uploaded file's path in ``name`` while its own handle is empty or
    # already positioned at the end.
    name = getattr(file, "name", None)
    if isinstance(name, str) and os.path.isfile(name):
        with open(name, "rb") as source:
            return source.read()
    if hasattr(file, "read"):
        return file.read()
    raise TypeError(f"Unsupported upload type: {type(file).__name__}")


def analyze_pdf(file):
    """Analyze an uploaded PDF and return the results as a JSON string.

    The upload is copied to a uniquely named temporary ``.pdf`` file (so that
    concurrent requests cannot overwrite each other), analyzed with
    ``PDFAnalyzer`` and the temporary copy is removed afterwards.  An
    annotated PDF, if one is produced, is written next to that temporary copy
    and is not removed; its path is reported under ``annotated_pdf_path``.

    Args:
        file: The uploaded file as a path, bytes, or file-like object.

    Returns:
        A JSON document with one key per check, or, if anything fails, a JSON
        document with ``error`` and ``traceback`` keys.
    """
    temp_path = None
    try:
        pdf_bytes = _read_upload_bytes(file)

        # Save the uploaded file temporarily
        descriptor, temp_path = tempfile.mkstemp(suffix=".pdf", prefix="uploaded_")
        with os.fdopen(descriptor, "wb") as f:
            f.write(pdf_bytes)

        analyzer = PDFAnalyzer(temp_path)
        results = analyzer.analyze()

        # Ensure all keys are present in the results, even if they're empty
        default_results = {
            "annotated_pdf_path": "",
            "metadata": {},
            "disclosures": {},
            "figures_and_tables": {},
            "figure_order": {},
            "references": {},
            "reference_order": {},
            "reference_style": {},
            "structure": {},
            "language": {},
        }
        # Update default_results with actual results
        default_results.update(results)
        return json.dumps(default_results, indent=2, default=str)
    except Exception as e:
        # Top-level boundary of the web callback: report the failure to the
        # user as JSON instead of letting the request crash.
        error_message = {
            "error": str(e),
            "traceback": traceback.format_exc()
        }
        return json.dumps(error_message, indent=2)
    finally:
        # Clean up the temporary file
        if temp_path is not None and os.path.exists(temp_path):
            os.remove(temp_path)
# Gradio front end: a single file upload goes to analyze_pdf and the JSON it
# returns is shown in a JSON viewer.
_upload_input = gr.File(label="Upload PDF")
_results_output = gr.JSON(label="Analysis Results")
iface = gr.Interface(
    fn=analyze_pdf,
    inputs=_upload_input,
    outputs=_results_output,
    title="PDF Analyzer",
    description="Upload a PDF document to analyze its structure, references, language, and more.",
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()