samyak152002 committed on
Commit
40e8eb9
1 Parent(s): 52dcb43

Create annotations.py

Browse files
Files changed (1) hide show
  1. annotations.py +77 -0
annotations.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # annotations.py
2
+
3
+ import fitz # PyMuPDF
4
+ import re
5
+ from typing import List, Dict, Any, Tuple
6
+ from collections import Counter
7
+ import language_tool_python
8
+ import io
9
+
10
def extract_pdf_text(file) -> str:
    """Return the text of a PDF as extracted by pdfminer.

    ``file`` may be a path string, which is opened in binary mode, or a
    seekable file-like object, which is rewound to the start before reading.
    """
    # Imported lazily so pdfminer is only required when extraction is used.
    from pdfminer.high_level import extract_text
    from pdfminer.layout import LAParams

    if not isinstance(file, str):
        # File-like object: rewind so extraction starts at the first byte.
        file.seek(0)
        return extract_text(file, laparams=LAParams())

    with open(file, 'rb') as handle:
        return extract_text(handle, laparams=LAParams())
21
+
22
def check_language_issues(full_text: str) -> Dict[str, Any]:
    """Check ``full_text`` for language issues using LanguageTool ('en-US').

    Returns a dict with two keys:
      * ``"total_issues"``: number of issues found.
      * ``"issues"``: one dict per match, with keys ``"message"``,
        ``"context"``, ``"suggestions"`` (at most the first three
        replacements), ``"category"`` and ``"rule_id"``.

    Empty text short-circuits to an empty report without creating a
    LanguageTool instance.
    """
    if not full_text:
        return {"total_issues": 0, "issues": []}

    language_tool = language_tool_python.LanguageTool('en-US')
    try:
        matches = language_tool.check(full_text)
    finally:
        # Release the checker explicitly rather than waiting for garbage
        # collection; a new instance is created on every call.
        language_tool.close()

    issues = [
        {
            "message": match.message,
            "context": match.context,
            "suggestions": match.replacements[:3] if match.replacements else [],
            "category": match.category,
            "rule_id": match.ruleId,
        }
        for match in matches
    ]
    return {
        "total_issues": len(issues),
        "issues": issues,
    }
39
+
40
def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
    """Highlight language issues in a PDF and return the annotated PDF as bytes.

    ``file`` may be a path string or a seekable file-like object (rewound
    before reading). Each entry of ``language_matches`` is expected to carry
    a ``'context'`` string; every occurrence of that string found on a page
    receives a highlight annotation. Entries with a missing or empty context
    are skipped, and identical contexts are only searched once so the same
    region is not highlighted repeatedly.

    On any error the message is printed and ``b""`` is returned. The
    document is closed in all cases.
    """
    doc = None
    try:
        if isinstance(file, str):
            doc = fitz.open(file)
        else:
            file.seek(0)
            doc = fitz.open(stream=file.read(), filetype="pdf")

        # Deduplicate contexts while keeping first-seen order: several issues
        # can share one context, which would otherwise stack highlights.
        contexts = list(dict.fromkeys(
            match.get('context')
            for match in language_matches
            if match.get('context')
        ))

        # Iterate pages once and search each context via the page's text search.
        for page in doc:
            for sentence in contexts:
                for inst in page.search_for(sentence):
                    highlight = page.add_highlight_annot(inst)
                    highlight.update()

        # Save annotated PDF to an in-memory buffer.
        byte_stream = io.BytesIO()
        doc.save(byte_stream)
        return byte_stream.getvalue()
    except Exception as e:
        print(f"Error in highlighting PDF: {e}")
        return b""
    finally:
        # Close the document on both the success and the error path.
        if doc is not None:
            doc.close()
67
+
68
def analyze_pdf(file) -> Tuple[Dict[str, Any], bytes]:
    """Run the language check on a PDF and build its annotated copy.

    Returns ``(report, annotated_pdf)``. ``annotated_pdf`` is ``None`` when
    the report lists no issues. If anything raises, the result is
    ``({"error": <message>}, None)``.
    """
    try:
        report = check_language_issues(extract_pdf_text(file))
        found = report.get("issues", [])
        if not found:
            # Nothing to highlight, so no annotated document is produced.
            return report, None
        return report, highlight_issues_in_pdf(file, found)
    except Exception as exc:
        return {"error": str(exc)}, None