samyak152002 committed on
Commit
c85e0b2
·
verified ·
1 Parent(s): 3eafa03

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -51
app.py CHANGED
@@ -236,74 +236,80 @@ def check_reference_order(full_text: str) -> Dict[str, Any]:
236
 
237
  def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
238
  """
239
- Highlights language issues in the PDF and skips highlighting in the references section.
 
 
240
  """
241
  try:
242
  # Open the PDF
243
  doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
244
-
245
- word_list = []
246
- concatenated_text = ""
247
- offsets = [] # Track start offsets of words
248
-
249
- # Extract words and build concatenated text
250
  for page_number in range(len(doc)):
251
  page = doc[page_number]
252
- words = page.get_text("words") # (x0, y0, x1, y1, word, ...)
253
  for w in words:
 
254
  word_text = w[4]
255
- if "[" in word_text:
256
- word_text = word_text.replace("[", " [") # Adjust for spaces before '['
 
257
  word_list.append((page_number, word_text, w[0], w[1], w[2], w[3]))
258
- offsets.append(len(concatenated_text))
259
- concatenated_text += word_text + " "
260
 
261
- # Find "References" section and exclude from processing
262
- references_start = concatenated_text.lower().find("references")
263
- references_offset = len(concatenated_text) if references_start == -1 else references_start
264
 
265
- for issue in language_matches:
266
- offset = issue["offset"]
 
267
  length = issue["length"]
 
 
 
 
268
 
269
- # Skip issues in the references section
270
- if offset >= references_offset:
271
- continue
272
-
273
- # Map the issue to corresponding words in the PDF
274
- error_text = concatenated_text[offset:offset + length]
275
  target_words = []
 
 
 
276
 
277
- for idx, word in enumerate(word_list):
278
- word_offset_start = offsets[idx]
279
- word_offset_end = word_offset_start + len(word[1])
280
-
281
- if word_offset_start < offset + length and word_offset_end > offset:
282
  target_words.append(word)
 
283
 
284
  if not target_words:
285
- print(f"Skipping issue: {error_text} - No matching words found.")
286
  continue
287
 
288
- # Get bounding box and validate it
289
- page_number = target_words[0][0]
290
- page = doc[page_number]
291
-
292
- # Calculate rectangle, handling multi-line or disjoint words
293
- x0 = min(word[2] for word in target_words)
294
- y0 = min(word[3] for word in target_words)
295
- x1 = max(word[4] for word in target_words)
296
- y1 = max(word[5] for word in target_words)
297
-
298
- # Ensure valid rectangle
299
- if x0 >= x1 or y0 >= y1:
300
- print(f"Invalid rectangle for issue: {error_text} - Skipping.")
301
- continue
302
-
303
- rect = fitz.Rect(x0 - 1, y0 - 1, x1 + 1, y1 + 1)
304
- highlight = page.add_highlight_annot(rect)
305
- highlight.set_colors(stroke=(1, 1, 0)) # Yellow
306
- highlight.update()
 
 
 
 
307
 
308
  # Save annotated PDF to bytes
309
  byte_stream = io.BytesIO()
@@ -311,14 +317,16 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
311
  annotated_pdf_bytes = byte_stream.getvalue()
312
  doc.close()
313
 
 
 
 
 
 
314
  return language_matches, annotated_pdf_bytes
315
  except Exception as e:
316
  print(f"Error in highlighting PDF: {e}")
317
  return b""
318
 
319
-
320
-
321
-
322
  # ------------------------------
323
  # Main Analysis Function
324
  # ------------------------------
 
236
 
237
  def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
238
  """
239
+ Highlights language issues in the PDF and returns the annotated PDF as bytes.
240
+ This function maps LanguageTool matches to specific words in the PDF
241
+ and highlights those words.
242
  """
243
  try:
244
  # Open the PDF
245
  doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
246
+ # print(f"Opened PDF with {len(doc)} pages.")
247
+ # print(language_matches)
248
+ # Extract words with positions from each page
249
+ word_list = [] # List of tuples: (page_number, word, x0, y0, x1, y1)
 
 
250
  for page_number in range(len(doc)):
251
  page = doc[page_number]
252
+ words = page.get_text("words") # List of tuples: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
253
  for w in words:
254
+ # print(w)
255
  word_text = w[4]
256
+ # **Fix:** Insert a space before '[' to ensure "globally [2]" instead of "globally[2]"
257
+ if '[' in word_text:
258
+ word_text = word_text.replace('[', ' [')
259
  word_list.append((page_number, word_text, w[0], w[1], w[2], w[3]))
260
+ # print(f"Total words extracted: {len(word_list)}")
 
261
 
262
+ # Concatenate all words to form the full text
263
+ concatenated_text = " ".join([w[1] for w in word_list])
264
+ # print(f"Concatenated text length: {concatenated_text} characters.")
265
 
266
+ # Iterate over each language issue
267
+ for idx, issue in enumerate(language_matches, start=1):
268
+ offset = issue["offset"] # offset+line_no-1
269
  length = issue["length"]
270
+
271
+
272
+ error_text = concatenated_text[offset:offset+length+1]
273
+ # print(f"\nIssue {idx}: '{error_text}' at offset {offset} with length {length}")
274
 
275
+ # Find the words that fall within the error span
276
+ current_pos = 0
 
 
 
 
277
  target_words = []
278
+ for word in word_list:
279
+ word_text = word[1]
280
+ word_length = len(word_text) + 1 # +1 for the space
281
 
282
+ if current_pos + word_length > offset and current_pos < offset + length:
 
 
 
 
283
  target_words.append(word)
284
+ current_pos += word_length
285
 
286
  if not target_words:
287
+ # print("No matching words found for this issue.")
288
  continue
289
 
290
+ initial_x = target_words[0][2]
291
+ initial_y = target_words[0][3]
292
+ final_x = target_words[len(target_words)-1][4]
293
+ final_y = target_words[len(target_words)-1][5]
294
+ issue["coordinates"] = [initial_x, initial_y, final_x, final_y]
295
+ issue["page"] = target_words[0][0] + 1
296
+ # Add highlight annotations to the target words
297
+ print()
298
+ print("issue", issue)
299
+ print("error text", error_text)
300
+ print(target_words)
301
+ print()
302
+ for target in target_words:
303
+ page_num, word_text, x0, y0, x1, y1 = target
304
+ page = doc[page_num]
305
+ # Define a rectangle around the word with some padding
306
+ rect = fitz.Rect(x0 - 1, y0 - 1, x1 + 1, y1 + 1)
307
+ # Add a highlight annotation
308
+ highlight = page.add_highlight_annot(rect)
309
+ highlight.set_colors(stroke=(1, 1, 0)) # Yellow color
310
+ highlight.update()
311
+ # print(f"Highlighted '{word_text}' on page {page_num + 1} at position ({x0}, {y0}, {x1}, {y1})")
312
+
313
 
314
  # Save annotated PDF to bytes
315
  byte_stream = io.BytesIO()
 
317
  annotated_pdf_bytes = byte_stream.getvalue()
318
  doc.close()
319
 
320
+ # Save annotated PDF locally for verification
321
+ with open("annotated_temp.pdf", "wb") as f:
322
+ f.write(annotated_pdf_bytes)
323
+ # print("Annotated PDF saved as 'annotated_temp.pdf' for manual verification.")
324
+
325
  return language_matches, annotated_pdf_bytes
326
  except Exception as e:
327
  print(f"Error in highlighting PDF: {e}")
328
  return b""
329
 
 
 
 
330
  # ------------------------------
331
  # Main Analysis Function
332
  # ------------------------------