# annotations.py
"""Analyze PDFs for language issues and produce annotated copies.

Uses PyMuPDF (``fitz``) for text extraction / annotation and
``language_tool_python`` for grammar and style checking.
"""

import io
from typing import Any, Dict, List, Tuple

import fitz  # PyMuPDF
import language_tool_python


def _open_pdf(file):
    """Open *file* as a ``fitz.Document``.

    Accepts either a filesystem path (str) or a binary file-like object.
    File-like objects are rewound first, so the same object can be opened
    more than once (e.g. by both extract_pdf_text and
    highlight_issues_in_pdf within one analyze_pdf call).
    """
    if isinstance(file, str):
        return fitz.open(file)
    # Fix: without this rewind, a second .read() on an already-consumed
    # stream returns b"" and fitz fails to open the document.
    if hasattr(file, "seek"):
        file.seek(0)
    return fitz.open(stream=file.read(), filetype="pdf")


def extract_pdf_text(file) -> str:
    """Extract the full text from a PDF using PyMuPDF.

    Args:
        file: Path to a PDF (str) or a binary file-like object.

    Returns:
        The concatenated text of all pages (one trailing newline per
        page), or "" if extraction fails.
    """
    try:
        doc = _open_pdf(file)
        try:
            # Single join instead of repeated string += (quadratic growth).
            return "".join(page.get_text("text") + "\n" for page in doc)
        finally:
            doc.close()
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
        return ""


def check_language_issues(full_text: str) -> Dict[str, Any]:
    """Check *full_text* for language issues using LanguageTool (en-US).

    Returns:
        A dict ``{"total_issues": int, "issues": [...]}`` where each issue
        carries the match's message, context, up to three suggestions,
        category, rule id, character offset and length.
    """
    language_tool = language_tool_python.LanguageTool('en-US')
    try:
        matches = language_tool.check(full_text)
    finally:
        # Fix: shut down the background LanguageTool server; otherwise a
        # Java subprocess is leaked on every call.
        language_tool.close()

    issues = [
        {
            "message": match.message,
            "context": match.context,
            "suggestions": match.replacements[:3] if match.replacements else [],
            "category": match.category,
            "rule_id": match.ruleId,
            "offset": match.offset,
            "length": match.errorLength,
        }
        for match in matches
    ]
    return {"total_issues": len(issues), "issues": issues}


def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
    """Highlight language issues in the PDF; return the annotated PDF bytes.

    Maps each issue's character span onto the document's words and adds a
    yellow highlight annotation over every word the span touches.

    NOTE(review): offsets in *language_matches* were computed against the
    text produced by extract_pdf_text (page text joined with newlines),
    while the mapping below walks a single-space-joined word stream. The
    two offset spaces coincide only when the page text is whitespace-
    normalized the same way, so highlights may drift on documents with
    unusual spacing — confirm against representative PDFs.

    Args:
        file: Path to a PDF (str) or a binary file-like object.
        language_matches: Issue dicts as produced by check_language_issues.

    Returns:
        The annotated PDF as bytes, or b"" on failure.
    """
    try:
        doc = _open_pdf(file)
        try:
            # (page_number, word, x0, y0, x1, y1) for every word, in order.
            word_list = []
            for page_number in range(len(doc)):
                # get_text("words") -> (x0, y0, x1, y1, word, block, line, word_no)
                for w in doc[page_number].get_text("words"):
                    word_list.append((page_number, w[4], w[0], w[1], w[2], w[3]))

            for issue in language_matches:
                offset = issue["offset"]
                end = offset + issue["length"]

                # Collect the words whose span overlaps [offset, end).
                current_pos = 0
                target_words = []
                for word in word_list:
                    word_span = len(word[1]) + 1  # +1 for the joining space
                    if current_pos >= end:
                        break  # past the issue span; no later word can match
                    if current_pos + word_span > offset:
                        target_words.append(word)
                    current_pos += word_span

                # Highlight each affected word with a yellow annotation.
                for page_num, _word_text, x0, y0, x1, y1 in target_words:
                    page = doc[page_num]
                    highlight = page.add_highlight_annot(fitz.Rect(x0, y0, x1, y1))
                    highlight.set_colors(stroke=(1, 1, 0))  # Yellow color
                    highlight.update()

            # Save annotated PDF to bytes.
            byte_stream = io.BytesIO()
            doc.save(byte_stream)
            return byte_stream.getvalue()
        finally:
            doc.close()
    except Exception as e:
        print(f"Error in highlighting PDF: {e}")
        return b""


def analyze_pdf(file) -> Tuple[Dict[str, Any], bytes]:
    """Analyze a PDF for language issues and build an annotated copy.

    Args:
        file: Path to a PDF (str) or a binary file-like object. File-like
            objects are rewound internally, so the same object serves both
            extraction and highlighting.

    Returns:
        ``(results, annotated_pdf)``: *results* is the dict from
        check_language_issues (or ``{"error": ...}`` on failure);
        *annotated_pdf* is the annotated PDF as bytes, or ``None`` when
        there is nothing to highlight or an error occurred.
    """
    try:
        full_text = extract_pdf_text(file)
        if not full_text:
            return {"error": "Failed to extract text from PDF."}, None
        language_issues = check_language_issues(full_text)
        issues = language_issues.get("issues", [])
        annotated_pdf = highlight_issues_in_pdf(file, issues) if issues else None
        return language_issues, annotated_pdf
    except Exception as e:
        return {"error": str(e)}, None