# app.py — PDF Analyzer (Hugging Face Space by samyak152002, commit 8cc1285, ~14.5 kB)
# Standard library
import io
import json
import os
import re
import subprocess
import tempfile
import traceback
from collections import Counter
from typing import Any, Dict, List, Tuple

# Third-party
import streamlit as st
import fitz  # PyMuPDF
import language_tool_python
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams
# Point language_tool_python's embedded LanguageTool server at the system JDK.
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'

# Sanity check: surface the Java version in the UI so a missing JRE is
# diagnosed up front instead of failing later inside LanguageTool.
# BUG FIX: the original used `subprocess` without importing it, so this block
# always raised NameError and the broad `except Exception` masked it, making
# the app report "Java is not installed correctly" unconditionally.
try:
    java_version = subprocess.check_output(
        ['java', '-version'], stderr=subprocess.STDOUT
    ).decode()
    st.write(f"Java Version: {java_version}")
except (OSError, subprocess.CalledProcessError):
    # OSError: `java` binary not found; CalledProcessError: it ran but failed.
    st.error("Java is not installed correctly.")
# ------------------------------
# Analysis Functions
# ------------------------------
def extract_pdf_text_by_page(file) -> List[str]:
    """Return the plain text of every page of the PDF, in page order (PyMuPDF)."""
    file.seek(0)
    pdf_bytes = file.read()
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        pages = []
        for page in doc:
            pages.append(page.get_text("text"))
        return pages
def extract_pdf_text(file) -> str:
    """Return the full text of the PDF as a single string (pdfminer.six)."""
    file.seek(0)
    layout_params = LAParams()
    return extract_text(file, laparams=layout_params)
def check_text_presence(full_text: str, search_terms: List[str]) -> Dict[str, bool]:
    """Map each search term to whether it occurs (case-insensitively) in the text."""
    haystack = full_text.lower()
    presence = {}
    for term in search_terms:
        presence[term] = term.lower() in haystack
    return presence
def label_authors(full_text: str) -> str:
    """Prefix the detected author line with 'Authors:' if a candidate line is found.

    The heuristic treats the line after the first line, terminated by a blank
    line, as the author list (typical title/authors layout).

    Args:
        full_text: The full extracted document text.

    Returns:
        The text with the first occurrence of the author line labeled, or the
        unmodified text when no non-empty candidate is found.
    """
    author_line_regex = r"^(?:.*\n)(.*?)(?:\n\n)"
    match = re.search(author_line_regex, full_text, re.MULTILINE)
    if match:
        authors = match.group(1).strip()
        # BUG FIX 1: an empty capture previously made str.replace("") insert
        # "Authors: " between every character — only label a non-empty line.
        # BUG FIX 2: replace only the first occurrence, not every repetition
        # of the author string elsewhere in the document.
        if authors:
            return full_text.replace(authors, f"Authors: {authors}", 1)
    return full_text
def check_metadata(full_text: str) -> Dict[str, Any]:
    """Check for core metadata signals: email, author list, keywords, word count."""
    email_found = re.search(r'\b[\w.-]+?@\w+?\.\w+?\b', full_text) is not None
    authors_found = re.search(r'Authors?:', full_text, re.IGNORECASE) is not None
    keywords_found = re.search(r'Keywords?:', full_text, re.IGNORECASE) is not None
    words = len(full_text.split())
    return {
        "author_email": email_found,
        "list_of_authors": authors_found,
        "keywords_list": keywords_found,
        # Falls back to the string "Missing" for an empty document.
        "word_count": words if words else "Missing",
    }
def check_disclosures(full_text: str) -> Dict[str, bool]:
    """Check that each required disclosure statement appears in the text."""
    required_statements = (
        "author contributions statement",
        "conflict of interest statement",
        "ethics statement",
        "funding statement",
        "data access statement",
    )
    return check_text_presence(full_text, list(required_statements))
def check_figures_and_tables(full_text: str) -> Dict[str, bool]:
    """Heuristically detect figure citations and figure/table legends."""
    def present(pattern: str) -> bool:
        # All checks share the same case-insensitive search.
        return re.search(pattern, full_text, re.IGNORECASE) is not None

    return {
        "figures_with_citations": present(r'Figure \d+.*?citation'),
        "figures_legends": present(r'Figure \d+.*?legend'),
        "tables_legends": present(r'Table \d+.*?legend'),
    }
def check_references(full_text: str) -> Dict[str, Any]:
    """Collect heuristic signals about the document's references."""
    # Only the first 1000 characters are treated as the abstract region.
    abstract_region = full_text[:1000]
    return {
        # Any 19xx year anywhere is flagged as a potentially outdated reference.
        "old_references": re.search(r'\b19[0-9]{2}\b', full_text) is not None,
        "citations_in_abstract": re.search(r'\b(citation|reference)\b', abstract_region, re.IGNORECASE) is not None,
        "reference_count": len(re.findall(r'\[.*?\]', full_text)),
        "self_citations": re.search(r'Self-citation', full_text, re.IGNORECASE) is not None,
    }
def check_structure(full_text: str) -> Dict[str, bool]:
    """Check IMRaD section presence and structured-abstract wording."""
    imrad_sections = ("Introduction", "Methods", "Results", "Discussion")
    has_imrad = True
    for section in imrad_sections:
        # Section headings are matched case-sensitively, as in typical papers.
        if section not in full_text:
            has_imrad = False
            break
    return {
        "imrad_structure": has_imrad,
        "abstract_structure": "structured abstract" in full_text.lower(),
    }
def check_language_issues(full_text: str) -> Dict[str, Any]:
    """Run LanguageTool (en-US) over the text and summarise grammar/style findings."""
    tool = language_tool_python.LanguageTool('en-US')
    findings = tool.check(full_text)
    word_count = len(full_text.split())
    total = len(findings)
    # Normalise to issues per 1000 words; guard against an empty document.
    rate = (total / word_count) * 1000 if word_count else 0
    # Flatten Match objects into plain dicts so the result is JSON-serialisable.
    details = []
    for finding in findings:
        details.append({
            "message": finding.message,
            "replacements": finding.replacements,
            "offset": finding.offset,
            "errorLength": finding.errorLength,
            "category": finding.category,
            "ruleIssueType": finding.ruleIssueType,
            "sentence": finding.sentence,
        })
    return {
        "issues_count": total,
        "issues_per_1000": rate,
        # More than 20 issues per 1000 words fails this check.
        "failed": rate > 20,
        "matches": details,
    }
def check_language(full_text: str) -> Dict[str, Any]:
    """Aggregate language-quality signals for the document."""
    has_plain_summary = re.search(r'plain language summary', full_text, re.IGNORECASE) is not None
    return {
        "plain_language": has_plain_summary,
        "readability_issues": False,  # Placeholder for future implementation
        "language_issues": check_language_issues(full_text),
    }
def check_figure_order(full_text: str) -> Dict[str, Any]:
    """Analyse figure references: consecutive numbering, gaps and duplicates."""
    pattern = r'(?:Fig(?:ure)?\.?|Figure)\s*(\d+)'
    raw_refs = re.findall(pattern, full_text, re.IGNORECASE)
    unique_numbers = sorted({int(n) for n in raw_refs})
    # "Sequential" here means the distinct figure numbers form a consecutive run.
    sequential = all(b - a == 1 for a, b in zip(unique_numbers, unique_numbers[1:]))
    if unique_numbers:
        expected = set(range(1, max(unique_numbers) + 1))
        missing = list(expected - set(unique_numbers))
    else:
        # No figures referenced at all: report None rather than an empty list.
        missing = None
    tally = Counter(raw_refs)
    repeated = [num for num, count in tally.items() if count > 1]
    repeated_as_int = [int(num) for num in repeated]  # kept for parity with original (unused)
    mentioned_once = list(set(raw_refs) - set(repeated))
    return {
        "sequential_order": sequential,
        "figure_count": len(unique_numbers),
        "missing_figures": missing,
        "figure_order": unique_numbers,
        "duplicate_references": repeated,
        "not_mentioned": mentioned_once,
    }
def check_reference_order(full_text: str) -> Dict[str, Any]:
    """Check that bracketed citations [n] appear without forward jumps or gaps."""
    cited = [int(m) for m in re.findall(r'\[(\d+)\]', full_text)]
    highest = 0
    jumps = []
    for position, number in enumerate(cited, start=1):
        # A citation more than one past the running maximum skipped a number.
        if number > highest + 1:
            jumps.append((position, number))
        if number > highest:
            highest = number
    missing = list(set(range(1, highest + 1)) - set(cited))
    return {
        "max_reference": highest,
        "out_of_order": jumps,
        "missing_references": missing,
        "is_ordered": not jumps and not missing,
    }
def check_reference_style(full_text: str) -> Dict[str, Any]:
    """Infer the bibliography's citation style and flag inconsistent entries."""
    section = re.search(r'References\b([\s\S]*?)(?:\n\S|\Z)', full_text, re.IGNORECASE)
    if section is None:
        return {"style": "Unknown", "reason": "References section not found", "inconsistent_refs": []}

    body = section.group(1)
    # Split on newlines that begin a new entry: "[n]", "n. " or "(Author, year)".
    raw_entries = re.split(r'\n(?=\[\d+\]|\d+\.\s|\(\w+,\s*\d{4}\))', body)
    entries = [entry.strip() for entry in raw_entries if entry.strip()]

    style_patterns = {
        "IEEE": r'^\[\d+\]',
        "Harvard": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
        # NOTE(review): identical regex to Harvard — Harvard always wins by dict order.
        "APA": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
        "MLA": r'^[A-Z][a-z]+,\s[A-Z][a-z]+\.',
        "Vancouver": r'^\d+\.\s',
        "Chicago": r'^\d+\s[A-Z][a-z]+\s[A-Z]',
    }

    detected = []
    flagged = []
    for index, entry in enumerate(entries, 1):
        for style_name, style_regex in style_patterns.items():
            if re.match(style_regex, entry):
                detected.append(style_name)
                break
        else:
            # No pattern matched: record the entry as unclassifiable.
            detected.append("Unknown")
            flagged.append((index, entry, "Unknown"))

    if not detected:
        return {"style": "Unknown", "reason": "No references found", "inconsistent_refs": []}

    tally = Counter(detected)
    majority_style, majority_count = tally.most_common(1)[0]
    # Second pass: any classified entry that disagrees with the majority style.
    for index, style_name in enumerate(detected, 1):
        if style_name not in (majority_style, "Unknown"):
            flagged.append((index, entries[index - 1], style_name))

    return {
        "majority_style": majority_style,
        "inconsistent_refs": flagged,
        "consistency": majority_count / len(detected),
    }
# ------------------------------
# Annotation Functions
# ------------------------------
def highlight_text(page, words, text, annotation):
    """Highlight every occurrence of *text* on *page* and attach *annotation*.

    Returns True when at least one occurrence was highlighted.
    """
    hit = False
    for rect in find_text_instances(words, text):
        mark = page.add_highlight_annot(rect)
        mark.update()
        # Anchor the sticky note at the top-left corner of the highlight.
        note = page.add_text_annot(rect[:2], annotation)
        note.update()
        hit = True
    return hit
def find_text_instances(words, text):
    """Locate bounding rectangles for every occurrence of *text*.

    *words* is PyMuPDF's ``page.get_text("words")`` output; each item is a
    tuple whose first four fields are the rect and whose fifth is the word.
    Matching is case-insensitive, word-by-word.
    """
    needle = text.lower().split()
    span = len(needle)
    rects = []
    for start in range(len(words) - span + 1):
        window = words[start:start + span]
        if all(w[4].lower() == t for w, t in zip(window, needle)):
            # Union the word rects into one rectangle covering the phrase.
            rect = fitz.Rect(window[0][:4])
            for w in window[1:]:
                rect = rect | fitz.Rect(w[:4])
            rects.append(rect)
    return rects
def highlight_issues_in_pdf(file, inconsistent_refs: List[Tuple[int, str, str]], language_matches: List[Dict[str, Any]]) -> bytes:
    """Annotate one PDF with reference-style highlights and language-issue notes.

    Returns the annotated PDF as bytes, or b"" if annotation fails.
    """
    try:
        file.seek(0)
        doc = fitz.open(stream=file.read(), filetype="pdf")
        # (sentence, message) pairs already annotated, to avoid duplicate notes.
        noted = set()
        for page in doc:
            words = page.get_text("words")
            for ref_num, ref_text, ref_style in (inconsistent_refs or []):
                note = f"Reference {ref_num}: Inconsistent style ({ref_style}). Should be consolidated to {ref_style}."
                highlight_text(page, words, ref_text, note)
            for match in (language_matches or []):
                sentence = match['sentence']
                message = f"{match['message']}\nSuggested correction: {match['replacements'][0] if match['replacements'] else 'No suggestion'}"
                key = (sentence, message)
                # Only record the note once it was actually placed on a page.
                if key not in noted and highlight_text(page, words, sentence, message):
                    noted.add(key)
        annotated = doc.write()
        doc.close()
        return annotated
    except Exception as e:
        print(f"An error occurred while annotating the PDF: {str(e)}")
        traceback.print_exc()
        return b""
# ------------------------------
# Main Analysis Function
# ------------------------------
def analyze_pdf(file) -> Tuple[Dict[str, Any], bytes]:
    """Analyze the uploaded PDF and return analysis results plus annotated bytes.

    Args:
        file: A seekable binary file object (Streamlit's uploaded BytesIO).

    Returns:
        (results dict, annotated-PDF bytes or None). On failure the results
        dict contains "error" and "traceback" keys instead.
    """
    try:
        file.seek(0)
        # Per-page text is extracted for parity with the original flow,
        # although only the combined text feeds the checks below.
        pages_text = extract_pdf_text_by_page(file)
        full_text = label_authors(extract_pdf_text(file))

        # Run each analysis in a fixed order and collect under stable keys.
        analyses = [
            ("metadata", check_metadata),
            ("disclosures", check_disclosures),
            ("figures_and_tables", check_figures_and_tables),
            ("figure_order", check_figure_order),
            ("references", check_references),
            ("reference_order", check_reference_order),
            ("reference_style", check_reference_style),
            ("structure", check_structure),
            ("language", check_language),
        ]
        results = {key: run(full_text) for key, run in analyses}

        # Annotate only when there is something worth highlighting.
        flagged_refs = results["reference_style"].get("inconsistent_refs", [])
        grammar_matches = results["language"].get("language_issues", {}).get("matches", [])
        annotated_pdf_bytes = None
        if flagged_refs or grammar_matches:
            annotated_pdf_bytes = highlight_issues_in_pdf(file, flagged_refs, grammar_matches)
        return results, annotated_pdf_bytes
    except Exception as e:
        return {"error": str(e), "traceback": traceback.format_exc()}, None
# ------------------------------
# Streamlit Interface
# ------------------------------
def main():
    """Streamlit entry point: upload a PDF, run the analysis, offer downloads."""
    st.title("PDF Analyzer")
    st.write("Upload a PDF document to analyze its structure, references, language, and more.")
    uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
    # Guard clause: nothing to do until a file has been uploaded.
    if uploaded_file is None:
        return
    with st.spinner("Analyzing PDF..."):
        results, annotated_pdf = analyze_pdf(uploaded_file)
    st.subheader("Analysis Results")
    st.json(results)
    if annotated_pdf:
        st.subheader("Download Annotated PDF")
        st.download_button(
            label="Download Annotated PDF",
            data=annotated_pdf,
            file_name="annotated.pdf",
            mime="application/pdf"
        )
    else:
        st.success("No issues found. No annotated PDF to download.")
# Script entry point: launch the Streamlit UI when run directly.
if __name__ == "__main__":
    main()