Spaces:
Sleeping
Sleeping
File size: 6,084 Bytes
4b93adb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
import bisect
import io
from typing import Any, Dict, List, Optional, Tuple

import fitz  # PyMuPDF
import language_tool_python
def extract_pdf_text(file) -> str:
    """Extract the full text of a PDF using PyMuPDF.

    Args:
        file: Either a filesystem path (``str``) or a binary file-like object
            exposing ``read()``. A file-like object is read from its current
            position, so the caller is responsible for rewinding it first.

    Returns:
        The text of every page, each followed by a newline, concatenated in
        page order. Returns ``""`` if the PDF cannot be opened or read; the
        error is printed rather than raised.
    """
    try:
        if isinstance(file, str):
            doc = fitz.open(file)
        else:
            doc = fitz.open(stream=file.read(), filetype="pdf")

        page_texts = []
        try:
            for page_num, page in enumerate(doc, start=1):
                text = page.get_text("text")
                page_texts.append(text)
                print(f"Extracted text from page {page_num}: {len(text)} characters.")
        finally:
            # Release the document even if a page fails to parse.
            doc.close()

        # Join once instead of growing a string page by page.
        full_text = "".join(f"{text}\n" for text in page_texts)
        print(f"Total extracted text length: {len(full_text)} characters.")
        return full_text
    except Exception as e:
        # Best-effort contract: callers treat an empty string as failure.
        print(f"Error extracting text from PDF: {e}")
        return ""
def check_language_issues(full_text: str) -> Dict[str, Any]:
    """Check *full_text* for language issues using LanguageTool (en-US).

    Args:
        full_text: The text to check.

    Returns:
        On success, ``{"total_issues": int, "issues": [...]}`` where each
        issue is a dict with the keys ``message``, ``context``,
        ``suggestions`` (at most three replacements), ``category``,
        ``rule_id``, ``offset`` and ``length``. Offsets are positions within
        *full_text*.
        On failure, ``{"error": <message>}``; the error is also printed.
    """
    try:
        language_tool = language_tool_python.LanguageTool('en-US')
        try:
            matches = language_tool.check(full_text)
        finally:
            # Each LanguageTool instance is backed by a local server process;
            # shut it down so repeated calls do not accumulate servers.
            language_tool.close()

        issues = [
            {
                "message": match.message,
                "context": match.context.strip(),
                "suggestions": match.replacements[:3] if match.replacements else [],
                "category": match.category,
                "rule_id": match.ruleId,
                "offset": match.offset,
                "length": match.errorLength,
            }
            for match in matches
        ]
        print(f"Total language issues found: {len(issues)}")
        return {
            "total_issues": len(issues),
            "issues": issues,
        }
    except Exception as e:
        print(f"Error checking language issues: {e}")
        return {"error": str(e)}
def _collect_words(doc) -> List[Tuple[int, str, float, float, float, float]]:
    """Return ``(page_index, text, x0, y0, x1, y1)`` for every word in *doc*."""
    word_list = []
    for page_number, page in enumerate(doc):
        # Each entry is (x0, y0, x1, y1, "word", block_no, line_no, word_no).
        for w in page.get_text("words"):
            # Insert a space before '[' so "globally[2]" is measured as
            # "globally [2]" when offsets are mapped back onto words.
            word_text = w[4].replace('[', ' [')
            word_list.append((page_number, word_text, w[0], w[1], w[2], w[3]))
    return word_list


def _word_spans(word_list) -> Tuple[List[int], List[int]]:
    """Return parallel start/end offsets of each word in the space-joined text."""
    starts = []
    ends = []
    position = 0
    for word in word_list:
        starts.append(position)
        position += len(word[1]) + 1  # +1 for the joining space
        ends.append(position)
    return starts, ends


def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
    """Highlight language issues in a PDF and return the annotated PDF.

    Every word of the document is collected with its bounding box, the words
    are joined with single spaces, and each issue's ``offset``/``length`` is
    resolved against that joined text. Every word overlapping the issue span
    receives a yellow highlight annotation.

    NOTE(review): offsets are interpreted against the space-joined word text
    built here. ``analyze_pdf`` passes offsets computed on the output of
    ``get_text("text")``, whose whitespace can differ, so highlighted
    positions may drift -- confirm the two texts line up for your documents.

    Args:
        file: A filesystem path (``str``) or a binary file-like object with
            ``read()``; a file-like object is read from its current position.
        language_matches: Issue dicts providing integer ``"offset"`` and
            ``"length"`` keys.

    Returns:
        The annotated PDF as bytes, or ``b""`` if anything fails (the error
        is printed). A copy is also written to ``annotated_temp.pdf`` in the
        current directory on a best-effort basis.
    """
    try:
        if isinstance(file, str):
            doc = fitz.open(file)
        else:
            doc = fitz.open(stream=file.read(), filetype="pdf")

        try:
            print(f"Opened PDF with {len(doc)} pages.")

            word_list = _collect_words(doc)
            print(f"Total words extracted: {len(word_list)}")

            concatenated_text = " ".join([w[1] for w in word_list])
            print(f"Concatenated text length: {len(concatenated_text)} characters.")

            # Word spans do not depend on the issue, so compute them once.
            starts, ends = _word_spans(word_list)

            for idx, issue in enumerate(language_matches, start=1):
                offset = issue["offset"]
                length = issue["length"]
                error_text = concatenated_text[offset:offset + length]
                print(f"\nIssue {idx}: '{error_text}' at offset {offset} with length {length}")

                # A word overlaps the issue when it ends after the issue
                # starts and starts before the issue ends; both offset lists
                # are sorted, so the overlapping words form one slice.
                first = bisect.bisect_right(ends, offset)
                last = bisect.bisect_left(starts, offset + length)
                target_words = word_list[first:last]

                if not target_words:
                    print("No matching words found for this issue.")
                    continue

                for page_num, word_text, x0, y0, x1, y1 in target_words:
                    page = doc[page_num]
                    # Pad the word's box by one unit on each side.
                    rect = fitz.Rect(x0 - 1, y0 - 1, x1 + 1, y1 + 1)
                    highlight = page.add_highlight_annot(rect)
                    highlight.set_colors(stroke=(1, 1, 0))  # Yellow color
                    highlight.update()
                    print(f"Highlighted '{word_text}' on page {page_num + 1} at position ({x0}, {y0}, {x1}, {y1})")

            byte_stream = io.BytesIO()
            doc.save(byte_stream)
            annotated_pdf_bytes = byte_stream.getvalue()
        finally:
            # Release the document on both the success and the error path.
            doc.close()

        # The local copy is only a debugging aid; failing to write it (for
        # example on a read-only filesystem) must not discard the result.
        try:
            with open("annotated_temp.pdf", "wb") as f:
                f.write(annotated_pdf_bytes)
            print("Annotated PDF saved as 'annotated_temp.pdf' for manual verification.")
        except OSError as e:
            print(f"Could not save 'annotated_temp.pdf' for manual verification: {e}")

        return annotated_pdf_bytes
    except Exception as e:
        print(f"Error in highlighting PDF: {e}")
        return b""
def analyze_pdf(file) -> Tuple[Dict[str, Any], Optional[bytes]]:
    """Analyze a PDF for language issues and produce an annotated copy.

    Args:
        file: A filesystem path (``str``) or a binary file-like object. A
            seekable file-like object is rewound before each read; a path is
            passed through unchanged.

    Returns:
        A ``(results, annotated_pdf)`` tuple. ``results`` is the dict returned
        by ``check_language_issues`` or ``{"error": <message>}`` on failure.
        ``annotated_pdf`` is the value returned by ``highlight_issues_in_pdf``
        when at least one issue was found, otherwise ``None``.
    """
    try:
        # Paths have no read position to reset; only rewind real streams.
        can_seek = hasattr(file, "seek")

        if can_seek:
            file.seek(0)
        full_text = extract_pdf_text(file)
        if not full_text:
            return {"error": "Failed to extract text from PDF."}, None

        language_issues = check_language_issues(full_text)
        if "error" in language_issues:
            return language_issues, None

        issues = language_issues.get("issues", [])
        if not issues:
            return language_issues, None

        # Extraction consumed the stream, so rewind before reading it again.
        if can_seek:
            file.seek(0)
        annotated_pdf = highlight_issues_in_pdf(file, issues)
        return language_issues, annotated_pdf
    except Exception as e:
        return {"error": str(e)}, None