Spaces:
Sleeping
Sleeping
import io
import json
import os
import re
import subprocess
import tempfile
import traceback
from collections import Counter
from typing import List, Dict, Any, Tuple

import fitz  # PyMuPDF
import language_tool_python
import streamlit as st
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams
# Set JAVA_HOME environment variable
# NOTE(review): presumably required by language_tool_python's Java-based
# backend; this overrides any JAVA_HOME already set on the host -- confirm.
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
# Optional: Verify Java installation
try:
    # stderr is merged into the captured output so the version banner is
    # captured whichever stream `java -version` writes it to.
    java_version = subprocess.check_output(
        ['java', '-version'], stderr=subprocess.STDOUT
    ).decode(errors="replace")
    st.write(f"Java Version: {java_version}")
except (OSError, subprocess.CalledProcessError):
    # OSError covers a missing/non-executable `java` binary;
    # CalledProcessError covers a non-zero exit status.
    st.error("Java is not installed correctly.")
# ------------------------------ | |
# Analysis Functions | |
# ------------------------------ | |
def extract_pdf_text_by_page(file) -> List[str]:
    """Return the plain text of each page of a PDF, in page order, via PyMuPDF.

    ``file`` must be a seekable binary file-like object; it is rewound and
    read in full before being handed to ``fitz.open``.
    """
    file.seek(0)
    pdf_bytes = file.read()
    page_texts = []
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            page_texts.append(page.get_text("text"))
    return page_texts
def extract_pdf_text(file) -> str:
    """Return the whole document's text as one string, extracted with pdfminer.

    ``file`` must be a seekable binary file-like object; it is rewound first.
    Layout analysis uses a default-constructed ``LAParams``.
    """
    file.seek(0)
    layout_params = LAParams()
    document_text = extract_text(file, laparams=layout_params)
    return document_text
def check_text_presence(full_text: str, search_terms: List[str]) -> Dict[str, bool]:
    """Report, for each search term, whether it occurs in the text.

    Matching is a case-insensitive substring test. The returned dict is keyed
    by the terms exactly as they were passed in.
    """
    haystack = full_text.lower()
    presence = {}
    for term in search_terms:
        presence[term] = term.lower() in haystack
    return presence
def label_authors(full_text: str) -> str:
    """Label the presumed author line with 'Authors:' if not already labeled.

    The author line is taken to be the first line that follows another line
    and is itself followed by a blank line (typically the line right after
    the title).

    Args:
        full_text: Extracted document text.

    Returns:
        The text with ``"Authors: "`` inserted in front of the detected author
        line, or the text unchanged when an ``Author:``/``Authors:`` label is
        already present, no candidate line is found, or the candidate line is
        blank.
    """
    # Honour the "if not already labeled" contract.
    if re.search(r'Authors?:', full_text, re.IGNORECASE):
        return full_text

    author_line_regex = r"^(?:.*\n)(.*?)(?:\n\n)"
    match = re.search(author_line_regex, full_text, re.MULTILINE)
    if not match:
        return full_text

    raw_line = match.group(1)
    if not raw_line.strip():
        # An empty capture must not be labeled: str.replace("", ...) would
        # insert the label between every character of the document.
        return full_text

    # Insert at the matched position only, so identical author text appearing
    # elsewhere in the document is left untouched. Leading whitespace of the
    # line stays in front of the label.
    insert_at = match.start(1) + len(raw_line) - len(raw_line.lstrip())
    return f"{full_text[:insert_at]}Authors: {full_text[insert_at:]}"
def check_metadata(full_text: str) -> Dict[str, Any]:
    """Check for metadata elements.

    Args:
        full_text: Extracted document text.

    Returns:
        A dict with:
        - ``author_email``: True if something shaped like an e-mail address
          is present.
        - ``list_of_authors``: True if an ``Author:``/``Authors:`` label is
          present (case-insensitive).
        - ``keywords_list``: True if a ``Keyword:``/``Keywords:`` label is
          present (case-insensitive).
        - ``word_count``: number of whitespace-separated tokens, or the
          string ``"Missing"`` when the text contains none.
    """
    # Domain labels may contain hyphens and there may be several dot-separated
    # labels (e.g. jane@my-lab.example.org); the local part may contain "+".
    email_pattern = r'\b[\w.+-]+@[\w-]+(?:\.[\w-]+)+\b'
    return {
        "author_email": bool(re.search(email_pattern, full_text)),
        "list_of_authors": bool(re.search(r'Authors?:', full_text, re.IGNORECASE)),
        "keywords_list": bool(re.search(r'Keywords?:', full_text, re.IGNORECASE)),
        "word_count": len(full_text.split()) or "Missing"
    }
def check_disclosures(full_text: str) -> Dict[str, bool]:
    """Report which of the expected disclosure statements appear in the text.

    Presence is decided by ``check_text_presence``; the result is keyed by
    statement name.
    """
    required_statements = [
        "author contributions statement",
        "conflict of interest statement",
        "ethics statement",
        "funding statement",
        "data access statement",
    ]
    presence_by_statement = check_text_presence(full_text, required_statements)
    return presence_by_statement
def check_figures_and_tables(full_text: str) -> Dict[str, bool]:
    """Look for figure/table mentions followed by 'citation' or 'legend'.

    Each check is a case-insensitive regex search. ``.`` does not cross
    newlines, so the keyword must be on the same line as the mention.
    """
    def found(pattern: str) -> bool:
        return re.search(pattern, full_text, re.IGNORECASE) is not None

    report = {}
    report["figures_with_citations"] = found(r'Figure \d+.*?citation')
    report["figures_legends"] = found(r'Figure \d+.*?legend')
    report["tables_legends"] = found(r'Table \d+.*?legend')
    return report
def check_references(full_text: str) -> Dict[str, Any]:
    """Run simple regex heuristics about references over the text.

    Returns a dict with:
    - ``old_references``: a standalone four-digit number 1900-1999 occurs.
    - ``citations_in_abstract``: the word 'citation' or 'reference' occurs
      in the first 1000 characters.
    - ``reference_count``: number of bracketed ``[...]`` spans.
    - ``self_citations``: the phrase 'Self-citation' occurs (any case).
    """
    leading_window = full_text[:1000]
    bracketed_spans = re.findall(r'\[.*?\]', full_text)

    has_old_year = re.search(r'\b19[0-9]{2}\b', full_text) is not None
    cites_early = re.search(r'\b(citation|reference)\b', leading_window, re.IGNORECASE) is not None
    mentions_self_citation = re.search(r'Self-citation', full_text, re.IGNORECASE) is not None

    return {
        "old_references": has_old_year,
        "citations_in_abstract": cites_early,
        "reference_count": len(bracketed_spans),
        "self_citations": mentions_self_citation,
    }
def check_structure(full_text: str) -> Dict[str, bool]:
    """Check for IMRaD section names and a 'structured abstract' mention.

    Section names are matched case-sensitively as substrings; the
    'structured abstract' phrase is matched case-insensitively.
    """
    required_sections = ("Introduction", "Methods", "Results", "Discussion")
    has_all_sections = True
    for heading in required_sections:
        if heading not in full_text:
            has_all_sections = False
            break

    mentions_structured_abstract = "structured abstract" in full_text.lower()
    return {
        "imrad_structure": has_all_sections,
        "abstract_structure": mentions_structured_abstract,
    }
def check_language_issues(full_text: str) -> Dict[str, Any]:
    """Check for issues with capitalization, hyphenation, punctuation, spacing, etc.

    Runs LanguageTool ('en-US') over the text and summarizes the matches.

    Args:
        full_text: Text to check.

    Returns:
        A dict with:
        - ``issues_count``: number of matches reported.
        - ``issues_per_1000``: matches per 1000 whitespace-separated words
          (0 when the text has no words).
        - ``failed``: True when ``issues_per_1000`` exceeds 20.
        - ``matches``: one plain dict per match (message, replacements,
          offset, errorLength, category, ruleIssueType, sentence).
    """
    word_count = len(full_text.split())
    if word_count == 0:
        # Nothing to check: skip starting LanguageTool altogether.
        return {
            "issues_count": 0,
            "issues_per_1000": 0,
            "failed": False,
            "matches": []
        }

    language_tool = language_tool_python.LanguageTool('en-US')
    try:
        matches = language_tool.check(full_text)
    finally:
        # Release the tool's resources instead of leaking one instance per
        # analysed document.
        language_tool.close()

    issues_count = len(matches)
    issues_per_1000 = (issues_count / word_count) * 1000
    serializable_matches = [
        {
            "message": match.message,
            "replacements": match.replacements,
            "offset": match.offset,
            "errorLength": match.errorLength,
            "category": match.category,
            "ruleIssueType": match.ruleIssueType,
            "sentence": match.sentence
        }
        for match in matches
    ]
    return {
        "issues_count": issues_count,
        "issues_per_1000": issues_per_1000,
        "failed": issues_per_1000 > 20,
        "matches": serializable_matches
    }
def check_language(full_text: str) -> Dict[str, Any]:
    """Assemble the language-quality section of the report.

    Combines a case-insensitive search for 'plain language summary', a
    hard-coded ``readability_issues`` placeholder, and the output of
    ``check_language_issues``.
    """
    has_plain_summary = re.search(r'plain language summary', full_text, re.IGNORECASE) is not None
    issue_report = check_language_issues(full_text)
    return {
        "plain_language": has_plain_summary,
        "readability_issues": False,  # Placeholder for future implementation
        "language_issues": issue_report,
    }
def check_figure_order(full_text: str) -> Dict[str, Any]:
    """Check if figures are referred to in sequential order.

    Figure mentions are found with a case-insensitive search for
    ``Fig``/``Fig.``/``Figure`` followed by a number.

    Args:
        full_text: Extracted document text.

    Returns:
        A dict with:
        - ``sequential_order``: True when the figures, taken in order of
          first mention, each follow the previous one by exactly 1
          (vacuously True with zero or one figure).
        - ``figure_count``: number of distinct figure numbers mentioned.
        - ``missing_figures``: sorted numbers in ``1..max`` never mentioned,
          or ``None`` when no figure is mentioned at all.
        - ``figure_order``: sorted distinct figure numbers.
        - ``duplicate_references``: number strings mentioned more than once,
          in order of first mention.
        - ``not_mentioned``: number strings mentioned exactly once, in order
          of first mention (key name kept for compatibility).
    """
    figure_pattern = r'Fig(?:ure)?\.?\s*(\d+)'
    figure_references = re.findall(figure_pattern, full_text, re.IGNORECASE)

    # Order of first mention. Sorting before the check would make any set of
    # gap-free numbers look "sequential" regardless of where they appear.
    first_mentions = list(dict.fromkeys(int(num) for num in figure_references))
    is_sequential = all(a + 1 == b for a, b in zip(first_mentions, first_mentions[1:]))

    figure_numbers = sorted(first_mentions)
    if figure_numbers:
        expected_figures = set(range(1, figure_numbers[-1] + 1))
        missing_figures = sorted(expected_figures - set(figure_numbers))
    else:
        missing_figures = None

    # Counter keeps insertion order, so both lists below are deterministic
    # (a set difference of strings would vary with hash randomization).
    mention_counts = Counter(figure_references)
    duplicates = [num for num, count in mention_counts.items() if count > 1]
    not_mentioned = [num for num, count in mention_counts.items() if count == 1]

    return {
        "sequential_order": is_sequential,
        "figure_count": len(figure_numbers),
        "missing_figures": missing_figures,
        "figure_order": figure_numbers,
        "duplicate_references": duplicates,
        "not_mentioned": not_mentioned
    }
def check_reference_order(full_text: str) -> Dict[str, Any]:
    """Check whether bracketed numeric citations like ``[3]`` appear in order.

    A citation is flagged when its number exceeds the highest number seen so
    far by more than one. Flagged entries are ``(position, number)`` pairs,
    where position is the 1-based index among all numeric citations found.
    ``missing_references`` lists numbers in ``1..max`` that are never cited.
    """
    cited_numbers = [int(token) for token in re.findall(r'\[(\d+)\]', full_text)]

    highest_seen = 0
    jumps = []
    for position, number in enumerate(cited_numbers, start=1):
        if number - highest_seen > 1:
            jumps.append((position, number))
        if number > highest_seen:
            highest_seen = number

    never_cited = list(set(range(1, highest_seen + 1)) - set(cited_numbers))
    in_order = not jumps and not never_cited

    return {
        "max_reference": highest_seen,
        "out_of_order": jumps,
        "missing_references": never_cited,
        "is_ordered": in_order,
    }
def _extract_reference_section(full_text: str):
    """Return the text after the last 'References' heading, or None if absent."""
    # Prefer a line consisting only of the heading (optionally numbered, with
    # an optional trailing colon); otherwise accept any 'References' mention.
    headings = list(re.finditer(
        r'^[ \t]*(?:\d+\.?[ \t]+)?References[ \t]*:?[ \t]*$',
        full_text,
        re.IGNORECASE | re.MULTILINE,
    ))
    if not headings:
        headings = list(re.finditer(r'References\b', full_text, re.IGNORECASE))
    if not headings:
        return None
    # The last candidate is used on the assumption that the reference list
    # closes the document.
    return full_text[headings[-1].end():]


def check_reference_style(full_text: str) -> Dict[str, Any]:
    """Check the reference style used in the paper and identify inconsistencies.

    The reference section is everything after the last 'References' heading.
    It is split into entries at lines starting with ``[n]``, ``n. `` or
    ``(Name, year)``, and each entry is classified by the first matching
    style pattern.

    Args:
        full_text: Extracted document text.

    Returns:
        When no section or no entries are found:
        ``{"style": "Unknown", "reason": <why>, "inconsistent_refs": []}``.
        Otherwise a dict with:
        - ``style`` / ``majority_style``: the most common detected style.
        - ``inconsistent_refs``: ``(index, text, style)`` tuples (1-based
          index) for entries whose style is unrecognized or differs from the
          majority; unrecognized entries come first.
        - ``consistency``: fraction of entries that have the majority style.
    """
    references_text = _extract_reference_section(full_text)
    if references_text is None:
        return {"style": "Unknown", "reason": "References section not found", "inconsistent_refs": []}

    reference_list = re.split(r'\n(?=\[\d+\]|\d+\.\s|\(\w+,\s*\d{4}\))', references_text)
    references = [ref.strip() for ref in reference_list if ref.strip()]
    if not references:
        return {"style": "Unknown", "reason": "No references found", "inconsistent_refs": []}

    # NOTE(review): the Harvard and APA patterns are identical, so "APA" can
    # never be reported (Harvard is tried first). Left as-is pending a
    # decision on how the two should be told apart.
    patterns = {
        "IEEE": r'^\[\d+\]',
        "Harvard": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
        "APA": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
        "MLA": r'^[A-Z][a-z]+,\s[A-Z][a-z]+\.',
        "Vancouver": r'^\d+\.\s',
        "Chicago": r'^\d+\s[A-Z][a-z]+\s[A-Z]',
    }

    styles = []
    inconsistent_refs = []
    for i, ref in enumerate(references, 1):
        detected = next(
            (style for style, pattern in patterns.items() if re.match(pattern, ref)),
            "Unknown",
        )
        styles.append(detected)
        if detected == "Unknown":
            inconsistent_refs.append((i, ref, "Unknown"))

    majority_style, majority_count = Counter(styles).most_common(1)[0]
    for i, style in enumerate(styles, 1):
        if style != majority_style and style != "Unknown":
            inconsistent_refs.append((i, references[i - 1], style))

    return {
        # "style" mirrors the key used by the early returns so callers can
        # read it on every path.
        "style": majority_style,
        "majority_style": majority_style,
        "inconsistent_refs": inconsistent_refs,
        "consistency": majority_count / len(styles)
    }
# ------------------------------ | |
# Annotation Functions | |
# ------------------------------ | |
def highlight_text(page, words, text, annotation):
    """Highlight every occurrence of ``text`` on the page and attach a note.

    Occurrences are located with ``find_text_instances``. Each one gets a
    highlight annotation plus a text annotation placed at the rectangle's
    first two coordinates. Returns True if at least one occurrence was
    annotated, else False.
    """
    found_any = False
    for rect in find_text_instances(words, text):
        page.add_highlight_annot(rect).update()
        page.add_text_annot(rect[:2], annotation).update()
        found_any = True
    return found_any
def find_text_instances(words, text):
    """Find all instances of text in words.

    Args:
        words: Sequence of word entries whose first four items are the
            bounding-box coordinates and whose fifth item is the word string
            (the shape produced by ``page.get_text("words")`` at the call
            site).
        text: Phrase to look for; compared word by word, case-insensitively,
            after splitting on whitespace.

    Returns:
        A list of ``fitz.Rect`` objects, one per occurrence, each the union
        of the boxes of the matched words. Empty when ``text`` has no words
        or is longer than ``words``.
    """
    text_words = text.lower().split()
    if not text_words:
        # An empty phrase would otherwise "match" at every index, including
        # one past the end of `words`, and raise IndexError.
        return []

    phrase_len = len(text_words)
    # Lower-case each page word once instead of once per comparison.
    lowered_words = [word[4].lower() for word in words]

    instances = []
    for start in range(len(words) - phrase_len + 1):
        if lowered_words[start:start + phrase_len] != text_words:
            continue
        inst = fitz.Rect(words[start][:4])
        for word in words[start + 1:start + phrase_len]:
            inst = inst | fitz.Rect(word[:4])
        instances.append(inst)
    return instances
def highlight_issues_in_pdf(file, inconsistent_refs: List[Tuple[int, str, str]], language_matches: List[Dict[str, Any]]) -> bytes:
    """Highlight inconsistent references and add notes for language issues in a single PDF.

    Args:
        file: Seekable binary file-like object holding the PDF.
        inconsistent_refs: ``(reference number, reference text, detected
            style)`` tuples; each occurrence of the text is highlighted on
            every page where it is found.
        language_matches: Dicts with at least ``sentence``, ``message`` and
            ``replacements``; each distinct (sentence, note) pair is
            annotated only on the first page where the sentence is found.

    Returns:
        The annotated PDF as bytes, or ``b""`` if anything goes wrong (the
        error and traceback are printed; nothing is raised).
    """
    try:
        # Build the language notes once; they do not depend on the page.
        language_notes = []
        for match in language_matches or ():
            replacements = match['replacements']
            suggestion = replacements[0] if replacements else 'No suggestion'
            language_notes.append(
                (match['sentence'], f"{match['message']}\nSuggested correction: {suggestion}")
            )

        file.seek(0)
        # The context manager closes the document even if annotation fails.
        with fitz.open(stream=file.read(), filetype="pdf") as doc:
            added_notes = set()
            for page in doc:
                words = page.get_text("words")
                for ref_num, ref_text, ref_style in inconsistent_refs or ():
                    # The target style is not known here, so the note must not
                    # claim the reference should be consolidated to its own style.
                    annotation_text = (
                        f"Reference {ref_num}: Inconsistent style ({ref_style}). "
                        "Should be consolidated to the document's majority reference style."
                    )
                    highlight_text(page, words, ref_text, annotation_text)
                for issue_key in language_notes:
                    if issue_key in added_notes:
                        continue
                    issue_text, error_message = issue_key
                    if highlight_text(page, words, issue_text, error_message):
                        added_notes.add(issue_key)
            return doc.write()
    except Exception as e:
        # Best-effort: annotation problems must not abort the analysis.
        print(f"An error occurred while annotating the PDF: {str(e)}")
        traceback.print_exc()
        return b""
# ------------------------------ | |
# Main Analysis Function | |
# ------------------------------ | |
def analyze_pdf(file) -> Tuple[Dict[str, Any], bytes]:
    """
    Analyze the uploaded PDF and return analysis results and annotated PDF bytes.

    Args:
        file: Seekable binary file-like object holding the PDF.

    Returns:
        Tuple containing:
        - Analysis results as a dictionary, one entry per check. If any step
          raises, this is instead ``{"error": ..., "traceback": ...}``.
        - Annotated PDF as bytes when there are inconsistent references or
          language matches to annotate (whatever ``highlight_issues_in_pdf``
          returns), otherwise ``None``. Also ``None`` on error.
    """
    try:
        # The 'file' is a BytesIO object provided by Streamlit
        # (extract_pdf_text rewinds it before reading).
        # The per-page PyMuPDF extraction was dropped: its result was never
        # used and it parsed the whole document a second time.
        full_text = extract_pdf_text(file)
        # Every check below runs on the author-labeled text.
        full_text = label_authors(full_text)

        # Perform analyses
        reference_style = check_reference_style(full_text)
        language = check_language(full_text)
        results = {
            "metadata": check_metadata(full_text),
            "disclosures": check_disclosures(full_text),
            "figures_and_tables": check_figures_and_tables(full_text),
            "figure_order": check_figure_order(full_text),
            "references": check_references(full_text),
            "reference_order": check_reference_order(full_text),
            "reference_style": reference_style,
            "structure": check_structure(full_text),
            "language": language
        }

        # Handle annotations
        inconsistent_refs = reference_style.get("inconsistent_refs", [])
        language_matches = language.get("language_issues", {}).get("matches", [])
        if inconsistent_refs or language_matches:
            annotated_pdf_bytes = highlight_issues_in_pdf(file, inconsistent_refs, language_matches)
        else:
            annotated_pdf_bytes = None
        return results, annotated_pdf_bytes
    except Exception as e:
        # Failures are reported as data so the UI can display them.
        error_message = {
            "error": str(e),
            "traceback": traceback.format_exc()
        }
        return error_message, None
# ------------------------------ | |
# Streamlit Interface | |
# ------------------------------ | |
def main():
    """Render the Streamlit page: upload a PDF, analyze it, show the results.

    After analysis one of three outcomes is shown: a download button for the
    annotated PDF, a success message when nothing needed annotating, or a
    warning when issues were found but the annotated PDF could not be built.
    An analysis failure is shown as an error together with its details.
    """
    st.title("PDF Analyzer")
    st.write("Upload a PDF document to analyze its structure, references, language, and more.")
    uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
    if uploaded_file is None:
        return

    with st.spinner("Analyzing PDF..."):
        results, annotated_pdf = analyze_pdf(uploaded_file)

    # analyze_pdf reports failures as {"error": ..., "traceback": ...};
    # that must not be presented as a clean result.
    if "error" in results:
        st.error(f"Analysis failed: {results['error']}")
        st.json(results)
        return

    st.subheader("Analysis Results")
    st.json(results)
    if annotated_pdf:
        st.subheader("Download Annotated PDF")
        st.download_button(
            label="Download Annotated PDF",
            data=annotated_pdf,
            file_name="annotated.pdf",
            mime="application/pdf"
        )
    elif annotated_pdf is None:
        # None means there was nothing to annotate.
        st.success("No issues found. No annotated PDF to download.")
    else:
        # Empty bytes mean annotation was attempted and failed.
        st.warning("Issues were found, but the annotated PDF could not be generated.")


if __name__ == "__main__":
    main()