Spaces:
Sleeping
Sleeping
samyak152002
committed on
Commit
•
40e8eb9
1
Parent(s):
52dcb43
Create annotations.py
Browse files- annotations.py +77 -0
annotations.py
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# annotations.py
|
2 |
+
|
3 |
+
import fitz # PyMuPDF
|
4 |
+
import re
|
5 |
+
from typing import List, Dict, Any, Tuple
|
6 |
+
from collections import Counter
|
7 |
+
import language_tool_python
|
8 |
+
import io
|
9 |
+
|
10 |
+
def extract_pdf_text(file) -> str:
    """Extract the text of a PDF using pdfminer.

    Args:
        file: Either a filesystem path (``str`` or any ``os.PathLike`` such
            as ``pathlib.Path``) or a seekable binary file-like object that
            contains the PDF.

    Returns:
        The text returned by ``pdfminer.high_level.extract_text`` when called
        with a default ``LAParams`` instance.
    """
    import os

    from pdfminer.high_level import extract_text
    from pdfminer.layout import LAParams

    # Path-like objects are paths too; with a plain ``str`` check they would
    # fall through to the file-object branch and fail on ``seek``.
    if isinstance(file, (str, os.PathLike)):
        with open(file, 'rb') as f:
            return extract_text(f, laparams=LAParams())

    # File-like object: rewind so extraction starts at the beginning
    # regardless of the stream's current position.
    file.seek(0)
    return extract_text(file, laparams=LAParams())
|
21 |
+
|
22 |
+
def check_language_issues(full_text: str) -> Dict[str, Any]:
    """Check text for language issues using LanguageTool (``en-US``).

    Args:
        full_text: The text to check.

    Returns:
        A dict with two keys:

        * ``"total_issues"``: number of reported matches.
        * ``"issues"``: list of dicts, one per match, with the keys
          ``"message"``, ``"context"``, ``"suggestions"`` (at most the first
          three replacements), ``"category"`` and ``"rule_id"``.
    """
    # Nothing to check: skip creating the LanguageTool instance entirely.
    if not full_text or not full_text.strip():
        return {"total_issues": 0, "issues": []}

    language_tool = language_tool_python.LanguageTool('en-US')
    try:
        matches = language_tool.check(full_text)
    finally:
        # Release the tool (and whatever backend it started) even if the
        # check itself raises.
        language_tool.close()

    issues = [
        {
            "message": match.message,
            "context": match.context,
            "suggestions": match.replacements[:3] if match.replacements else [],
            "category": match.category,
            "rule_id": match.ruleId,
        }
        for match in matches
    ]
    return {
        "total_issues": len(issues),
        "issues": issues,
    }
|
39 |
+
|
40 |
+
def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
    """Highlight language issues in a PDF and return the annotated PDF as bytes.

    Args:
        file: Either a filesystem path (``str`` or any ``os.PathLike``) or a
            seekable binary file-like object containing the PDF.
        language_matches: Issue dicts; each must have a ``"context"`` entry
            holding the text to look for on every page.

    Returns:
        The annotated PDF serialised to bytes, or ``b""`` if anything goes
        wrong (the error is printed, not raised).
    """
    import os

    try:
        if isinstance(file, (str, os.PathLike)):
            doc = fitz.open(os.fspath(file))
        else:
            file.seek(0)
            doc = fitz.open(stream=file.read(), filetype="pdf")

        try:
            # Several issues can share one context string; searching each
            # distinct string once avoids stacking identical highlights.
            # Empty / whitespace-only contexts are skipped as unsearchable.
            # NOTE(review): LanguageTool contexts are presumably short
            # snippets (possibly with "..." markers), not full sentences, so
            # a literal page search may miss some of them -- confirm.
            contexts = dict.fromkeys(match['context'] for match in language_matches)
            needles = [text for text in contexts if text and text.strip()]

            for page in doc:
                for needle in needles:
                    # Plain text search on the page (no regular expressions).
                    for inst in page.search_for(needle):
                        highlight = page.add_highlight_annot(inst)
                        highlight.update()

            # Save annotated PDF to an in-memory buffer.
            byte_stream = io.BytesIO()
            doc.save(byte_stream)
            return byte_stream.getvalue()
        finally:
            # Close the document on both the success and the failure path.
            doc.close()
    except Exception as e:
        print(f"Error in highlighting PDF: {e}")
        return b""
|
67 |
+
|
68 |
+
def analyze_pdf(file) -> Tuple[Dict[str, Any], bytes]:
    """Analyze a PDF for language issues and produce an annotated copy.

    Args:
        file: A filesystem path or a seekable binary file-like object; it is
            passed unchanged to ``extract_pdf_text`` and
            ``highlight_issues_in_pdf``.

    Returns:
        A ``(results, annotated_pdf)`` tuple.

        * ``results`` is the dict returned by ``check_language_issues``, or
          ``{"error": <message>}`` if any step raised.
        * ``annotated_pdf`` is the highlighted PDF as bytes. Despite the
          annotation it is ``None`` when there are no issues, when
          highlighting produced no output, or when an error occurred.
    """
    try:
        full_text = extract_pdf_text(file)
        language_issues = check_language_issues(full_text)
        issues = language_issues.get("issues", [])
        if not issues:
            return language_issues, None

        annotated_pdf = highlight_issues_in_pdf(file, issues)
        # The highlighter signals failure with b""; normalise that to None so
        # callers get one "no annotated PDF" value instead of an empty
        # payload that is not a valid PDF.
        return language_issues, annotated_pdf or None
    except Exception as e:
        return {"error": str(e)}, None
|