Spaces:
Sleeping
Sleeping
Update annotations.py
Browse files- annotations.py +65 -28
annotations.py
CHANGED
@@ -1,23 +1,22 @@
|
|
1 |
# annotations.py
|
2 |
|
3 |
import fitz # PyMuPDF
|
4 |
-
import re
|
5 |
from typing import List, Dict, Any, Tuple
|
6 |
-
from collections import Counter
|
7 |
import language_tool_python
|
8 |
import io
|
9 |
|
10 |
def extract_pdf_text(file) -> str:
|
11 |
-
"""Extracts text from a PDF file using
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
|
|
21 |
|
22 |
def check_language_issues(full_text: str) -> Dict[str, Any]:
|
23 |
"""Check for language issues using LanguageTool."""
|
@@ -30,7 +29,9 @@ def check_language_issues(full_text: str) -> Dict[str, Any]:
|
|
30 |
"context": match.context,
|
31 |
"suggestions": match.replacements[:3] if match.replacements else [],
|
32 |
"category": match.category,
|
33 |
-
"rule_id": match.ruleId
|
|
|
|
|
34 |
})
|
35 |
return {
|
36 |
"total_issues": len(issues),
|
@@ -38,23 +39,56 @@ def check_language_issues(full_text: str) -> Dict[str, Any]:
|
|
38 |
}
|
39 |
|
40 |
def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
|
41 |
-
"""
|
|
|
|
|
|
|
|
|
42 |
try:
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
# Save annotated PDF to bytes
|
59 |
byte_stream = io.BytesIO()
|
60 |
doc.save(byte_stream)
|
@@ -69,6 +103,9 @@ def analyze_pdf(file) -> Tuple[Dict[str, Any], bytes]:
|
|
69 |
"""Analyzes the PDF for language issues and returns results and annotated PDF."""
|
70 |
try:
|
71 |
full_text = extract_pdf_text(file)
|
|
|
|
|
|
|
72 |
language_issues = check_language_issues(full_text)
|
73 |
issues = language_issues.get("issues", [])
|
74 |
annotated_pdf = highlight_issues_in_pdf(file, issues) if issues else None
|
|
|
1 |
# annotations.py
|
2 |
|
3 |
import fitz # PyMuPDF
|
|
|
4 |
from typing import List, Dict, Any, Tuple
|
|
|
5 |
import language_tool_python
|
6 |
import io
|
7 |
|
8 |
def extract_pdf_text(file) -> str:
    """Extract the full text of a PDF using PyMuPDF.

    Args:
        file: Either a filesystem path (``str``) or a binary file-like
            object exposing ``read()``.

    Returns:
        The text of every page, each page followed by a newline, or an
        empty string if the document could not be opened or read (the
        error is printed, not raised).
    """
    try:
        if isinstance(file, str):
            doc = fitz.open(file)
        else:
            # Remember where the stream started so it can be rewound after
            # reading: the same file object may be handed to another reader
            # later, and a stream left at EOF would yield no bytes there.
            seekable = getattr(file, "seekable", None)
            start = file.tell() if callable(seekable) and seekable() else None
            data = file.read()
            if start is not None:
                file.seek(start)
            doc = fitz.open(stream=data, filetype="pdf")
        try:
            return "".join(page.get_text("text") + "\n" for page in doc)
        finally:
            # Release the document even if text extraction fails midway.
            doc.close()
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
        return ""
|
20 |
|
21 |
def check_language_issues(full_text: str) -> Dict[str, Any]:
|
22 |
"""Check for language issues using LanguageTool."""
|
|
|
29 |
"context": match.context,
|
30 |
"suggestions": match.replacements[:3] if match.replacements else [],
|
31 |
"category": match.category,
|
32 |
+
"rule_id": match.ruleId,
|
33 |
+
"offset": match.offset,
|
34 |
+
"length": match.errorLength
|
35 |
})
|
36 |
return {
|
37 |
"total_issues": len(issues),
|
|
|
39 |
}
|
40 |
|
41 |
def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
|
42 |
+
"""
|
43 |
+
Highlights language issues in the PDF and returns the annotated PDF as bytes.
|
44 |
+
This function maps LanguageTool matches to specific words in the PDF
|
45 |
+
and highlights those words.
|
46 |
+
"""
|
47 |
try:
|
48 |
+
# Open the PDF
|
49 |
+
doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
|
50 |
+
|
51 |
+
# Extract words with positions from each page
|
52 |
+
word_list = [] # List of tuples: (page_number, word, x0, y0, x1, y1)
|
53 |
+
page_text = ""
|
54 |
+
for page_number in range(len(doc)):
|
55 |
+
page = doc[page_number]
|
56 |
+
words = page.get_text("words") # List of tuples: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
|
57 |
+
for w in words:
|
58 |
+
word_text = w[4]
|
59 |
+
word_list.append((page_number, word_text, w[0], w[1], w[2], w[3]))
|
60 |
+
|
61 |
+
# Concatenate all words to form the full text
|
62 |
+
concatenated_text = " ".join([w[1] for w in word_list])
|
63 |
|
64 |
+
# Iterate over each language issue
|
65 |
+
for issue in language_matches:
|
66 |
+
offset = issue["offset"]
|
67 |
+
length = issue["length"]
|
68 |
+
error_text = concatenated_text[offset:offset+length]
|
69 |
+
|
70 |
+
# Find the words that fall within the error span
|
71 |
+
current_pos = 0
|
72 |
+
target_words = []
|
73 |
+
for word in word_list:
|
74 |
+
word_text = word[1]
|
75 |
+
word_length = len(word_text) + 1 # +1 for the space
|
76 |
+
|
77 |
+
if current_pos + word_length > offset and current_pos < offset + length:
|
78 |
+
target_words.append(word)
|
79 |
+
current_pos += word_length
|
80 |
+
|
81 |
+
# Add highlight annotations to the target words
|
82 |
+
for target in target_words:
|
83 |
+
page_num, word_text, x0, y0, x1, y1 = target
|
84 |
+
page = doc[page_num]
|
85 |
+
# Define a rectangle around the word
|
86 |
+
rect = fitz.Rect(x0, y0, x1, y1)
|
87 |
+
# Add a highlight annotation
|
88 |
+
highlight = page.add_highlight_annot(rect)
|
89 |
+
highlight.set_colors(stroke=(1, 1, 0)) # Yellow color
|
90 |
+
highlight.update()
|
91 |
+
|
92 |
# Save annotated PDF to bytes
|
93 |
byte_stream = io.BytesIO()
|
94 |
doc.save(byte_stream)
|
|
|
103 |
"""Analyzes the PDF for language issues and returns results and annotated PDF."""
|
104 |
try:
|
105 |
full_text = extract_pdf_text(file)
|
106 |
+
if not full_text:
|
107 |
+
return {"error": "Failed to extract text from PDF."}, None
|
108 |
+
|
109 |
language_issues = check_language_issues(full_text)
|
110 |
issues = language_issues.get("issues", [])
|
111 |
annotated_pdf = highlight_issues_in_pdf(file, issues) if issues else None
|