# annotations.py
import io
import re
from collections import Counter
from typing import Any, Dict, List, Optional, Tuple

import fitz  # PyMuPDF
import language_tool_python


def extract_pdf_text(file) -> str:
    """Extract the text of a PDF using pdfminer.

    Args:
        file: Either a filesystem path (``str``) or a seekable binary
            file-like object containing the PDF.

    Returns:
        The text pdfminer extracted from the document.
    """
    # Imported lazily so pdfminer is only required when text is extracted.
    from pdfminer.high_level import extract_text
    from pdfminer.layout import LAParams

    if isinstance(file, str):
        with open(file, 'rb') as f:
            return extract_text(f, laparams=LAParams())

    # The stream may already have been read by a previous step; rewind it.
    file.seek(0)
    return extract_text(file, laparams=LAParams())


def check_language_issues(full_text: str) -> Dict[str, Any]:
    """Check *full_text* for language issues using LanguageTool (en-US).

    Args:
        full_text: The text to check.

    Returns:
        A dict with ``"total_issues"`` (int) and ``"issues"``, a list of
        dicts with the keys ``message``, ``context``, ``suggestions`` (at
        most three replacements), ``category`` and ``rule_id``.
    """
    if not full_text:
        # Nothing to check; avoid starting LanguageTool at all.
        return {"total_issues": 0, "issues": []}

    language_tool = language_tool_python.LanguageTool('en-US')
    try:
        matches = language_tool.check(full_text)
    finally:
        # Release the LanguageTool backend; without this every call leaks it.
        language_tool.close()

    issues = [
        {
            "message": match.message,
            "context": match.context,
            "suggestions": match.replacements[:3] if match.replacements else [],
            "category": match.category,
            "rule_id": match.ruleId,
        }
        for match in matches
    ]
    return {
        "total_issues": len(issues),
        "issues": issues,
    }


def _search_phrase(context: Any) -> str:
    """Return the text to search for in the PDF for one issue context."""
    if not isinstance(context, str):
        return ""
    phrase = context.strip()
    # NOTE(review): LanguageTool appears to pad truncated contexts with
    # "..." markers that are not part of the document text - confirm.
    # Stripping them only shortens the needle, so a context that matched
    # before still matches.
    if phrase.startswith("..."):
        phrase = phrase[3:]
    if phrase.endswith("..."):
        phrase = phrase[:-3]
    return phrase.strip()


def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
    """Highlight language issues in the PDF and return the annotated PDF.

    Args:
        file: Either a filesystem path (``str``) or a seekable binary
            file-like object containing the PDF.
        language_matches: Issue dicts; the ``"context"`` value of each is
            searched for on every page. Entries without a usable context
            are skipped.

    Returns:
        The annotated PDF as bytes, or ``b""`` if annotating failed (the
        error is printed).
    """
    try:
        if isinstance(file, str):
            doc = fitz.open(file)
        else:
            file.seek(0)
            doc = fitz.open(stream=file.read(), filetype="pdf")

        try:
            # De-duplicate (order-preserving) so a context shared by several
            # issues does not get several stacked highlights.
            phrases = dict.fromkeys(
                _search_phrase(match.get('context')) for match in language_matches
            )
            phrases.pop("", None)

            for page in doc:
                for phrase in phrases:
                    # search_for is a literal text search returning the
                    # rectangles of every hit on this page.
                    for inst in page.search_for(phrase):
                        highlight = page.add_highlight_annot(inst)
                        highlight.update()

            # Save annotated PDF to bytes
            byte_stream = io.BytesIO()
            doc.save(byte_stream)
            return byte_stream.getvalue()
        finally:
            # Close the document even when searching or saving fails.
            doc.close()
    except Exception as e:
        # Best-effort: callers receive empty bytes instead of an exception.
        print(f"Error in highlighting PDF: {e}")
        return b""


def analyze_pdf(file) -> Tuple[Dict[str, Any], Optional[bytes]]:
    """Analyze the PDF for language issues and build an annotated copy.

    Args:
        file: Either a filesystem path (``str``) or a seekable binary
            file-like object containing the PDF.

    Returns:
        A ``(results, annotated_pdf)`` tuple. ``results`` is the dict from
        ``check_language_issues``; ``annotated_pdf`` is the highlighted PDF
        as bytes, or ``None`` when no issues were found. On failure the
        tuple is ``({"error": <message>}, None)``.
    """
    try:
        full_text = extract_pdf_text(file)
        language_issues = check_language_issues(full_text)
        issues = language_issues.get("issues", [])
        annotated_pdf = highlight_issues_in_pdf(file, issues) if issues else None
        return language_issues, annotated_pdf
    except Exception as e:
        return {"error": str(e)}, None