samyak152002 committed on
Commit
a24b0c9
·
verified ·
1 Parent(s): 99dc100

Update annotations.py

Browse files
Files changed (1) hide show
  1. annotations.py +65 -28
annotations.py CHANGED
@@ -1,23 +1,22 @@
1
  # annotations.py
2
 
3
  import fitz # PyMuPDF
4
- import re
5
  from typing import List, Dict, Any, Tuple
6
- from collections import Counter
7
  import language_tool_python
8
  import io
9
 
10
  def extract_pdf_text(file) -> str:
11
- """Extracts text from a PDF file using pdfminer."""
12
- from pdfminer.high_level import extract_text
13
- from pdfminer.layout import LAParams
14
-
15
- if isinstance(file, str):
16
- with open(file, 'rb') as f:
17
- return extract_text(f, laparams=LAParams())
18
- else:
19
- file.seek(0)
20
- return extract_text(file, laparams=LAParams())
 
21
 
22
  def check_language_issues(full_text: str) -> Dict[str, Any]:
23
  """Check for language issues using LanguageTool."""
@@ -30,7 +29,9 @@ def check_language_issues(full_text: str) -> Dict[str, Any]:
30
  "context": match.context,
31
  "suggestions": match.replacements[:3] if match.replacements else [],
32
  "category": match.category,
33
- "rule_id": match.ruleId
 
 
34
  })
35
  return {
36
  "total_issues": len(issues),
@@ -38,23 +39,56 @@ def check_language_issues(full_text: str) -> Dict[str, Any]:
38
  }
39
 
40
  def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
41
- """Highlights language issues in the PDF and returns the annotated PDF as bytes."""
 
 
 
 
42
  try:
43
- if isinstance(file, str):
44
- doc = fitz.open(file)
45
- else:
46
- file.seek(0)
47
- doc = fitz.open(stream=file.read(), filetype="pdf")
 
 
 
 
 
 
 
 
 
 
48
 
49
- for match in language_matches:
50
- sentence = match['context']
51
- # Use regular expressions to find the sentence in the text
52
- for page in doc:
53
- text_instances = page.search_for(sentence)
54
- for inst in text_instances:
55
- # Highlight the sentence
56
- highlight = page.add_highlight_annot(inst)
57
- highlight.update()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  # Save annotated PDF to bytes
59
  byte_stream = io.BytesIO()
60
  doc.save(byte_stream)
@@ -69,6 +103,9 @@ def analyze_pdf(file) -> Tuple[Dict[str, Any], bytes]:
69
  """Analyzes the PDF for language issues and returns results and annotated PDF."""
70
  try:
71
  full_text = extract_pdf_text(file)
 
 
 
72
  language_issues = check_language_issues(full_text)
73
  issues = language_issues.get("issues", [])
74
  annotated_pdf = highlight_issues_in_pdf(file, issues) if issues else None
 
1
  # annotations.py
2
 
3
  import fitz # PyMuPDF
 
4
  from typing import List, Dict, Any, Tuple
 
5
  import language_tool_python
6
  import io
7
 
8
  def extract_pdf_text(file) -> str:
9
+ """Extracts full text from a PDF file using PyMuPDF."""
10
+ try:
11
+ doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
12
+ full_text = ""
13
+ for page in doc:
14
+ full_text += page.get_text("text") + "\n"
15
+ doc.close()
16
+ return full_text
17
+ except Exception as e:
18
+ print(f"Error extracting text from PDF: {e}")
19
+ return ""
20
 
21
  def check_language_issues(full_text: str) -> Dict[str, Any]:
22
  """Check for language issues using LanguageTool."""
 
29
  "context": match.context,
30
  "suggestions": match.replacements[:3] if match.replacements else [],
31
  "category": match.category,
32
+ "rule_id": match.ruleId,
33
+ "offset": match.offset,
34
+ "length": match.errorLength
35
  })
36
  return {
37
  "total_issues": len(issues),
 
39
  }
40
 
41
  def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
42
+ """
43
+ Highlights language issues in the PDF and returns the annotated PDF as bytes.
44
+ This function maps LanguageTool matches to specific words in the PDF
45
+ and highlights those words.
46
+ """
47
  try:
48
+ # Open the PDF
49
+ doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
50
+
51
+ # Extract words with positions from each page
52
+ word_list = [] # List of tuples: (page_number, word, x0, y0, x1, y1)
53
+ page_text = ""
54
+ for page_number in range(len(doc)):
55
+ page = doc[page_number]
56
+ words = page.get_text("words") # List of tuples: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
57
+ for w in words:
58
+ word_text = w[4]
59
+ word_list.append((page_number, word_text, w[0], w[1], w[2], w[3]))
60
+
61
+ # Concatenate all words to form the full text
62
+ concatenated_text = " ".join([w[1] for w in word_list])
63
 
64
+ # Iterate over each language issue
65
+ for issue in language_matches:
66
+ offset = issue["offset"]
67
+ length = issue["length"]
68
+ error_text = concatenated_text[offset:offset+length]
69
+
70
+ # Find the words that fall within the error span
71
+ current_pos = 0
72
+ target_words = []
73
+ for word in word_list:
74
+ word_text = word[1]
75
+ word_length = len(word_text) + 1 # +1 for the space
76
+
77
+ if current_pos + word_length > offset and current_pos < offset + length:
78
+ target_words.append(word)
79
+ current_pos += word_length
80
+
81
+ # Add highlight annotations to the target words
82
+ for target in target_words:
83
+ page_num, word_text, x0, y0, x1, y1 = target
84
+ page = doc[page_num]
85
+ # Define a rectangle around the word
86
+ rect = fitz.Rect(x0, y0, x1, y1)
87
+ # Add a highlight annotation
88
+ highlight = page.add_highlight_annot(rect)
89
+ highlight.set_colors(stroke=(1, 1, 0)) # Yellow color
90
+ highlight.update()
91
+
92
  # Save annotated PDF to bytes
93
  byte_stream = io.BytesIO()
94
  doc.save(byte_stream)
 
103
  """Analyzes the PDF for language issues and returns results and annotated PDF."""
104
  try:
105
  full_text = extract_pdf_text(file)
106
+ if not full_text:
107
+ return {"error": "Failed to extract text from PDF."}, None
108
+
109
  language_issues = check_language_issues(full_text)
110
  issues = language_issues.get("issues", [])
111
  annotated_pdf = highlight_issues_in_pdf(file, issues) if issues else None