samyak152002 committed
Commit d399efc · verified · 1 Parent(s): 18c6797

Update app.py

Files changed (1)
  1. app.py +32 -39
app.py CHANGED
@@ -229,35 +229,25 @@ def check_reference_order(full_text: str) -> Dict[str, Any]:
 
 def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
     """
-    Highlights language issues in the PDF and returns the annotated PDF as bytes.
-    This function maps LanguageTool matches to specific words in the PDF
-    and highlights those words.
+    Highlights language issues in the PDF, adds a comment box with text on the side of the page,
+    and draws arrows pointing from the highlighted text to the comment box.
+    Returns the annotated PDF as bytes.
     """
     try:
         # Open the PDF
         doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
-        # print(f"Opened PDF with {len(doc)} pages.")
-        # print(language_matches)
+
         # Extract words with positions from each page
         word_list = []  # List of tuples: (page_number, word, x0, y0, x1, y1)
         for page_number in range(len(doc)):
             page = doc[page_number]
-            print(page.get_text("words"))
             words = page.get_text("words")  # List of tuples: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
             for w in words:
-                # print(w)
                 word_text = w[4]
-                # **Fix:** Insert a space before '[' to ensure "globally [2]" instead of "globally[2]"
-                # if '[' in word_text:
-                #     word_text = word_text.replace('[', ' [')
                 word_list.append((page_number, word_text, w[0], w[1], w[2], w[3]))
-        # print(f"Total words extracted: {len(word_list)}")
 
         # Concatenate all words to form the full text
-        concatenated_text=""
        concatenated_text = " ".join([w[1] for w in word_list])
-
-        # print(f"Concatenated text length: {concatenated_text} characters.")
 
         # Find "Abstract" section and set the processing start point
         abstract_start = concatenated_text.lower().find("abstract")
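The mapping from LanguageTool matches back to page coordinates hinges entirely on this space-joined text: every word contributes len(word) + 1 characters to concatenated_text, so an (offset, length) pair can be walked word by word to recover the boxes it covers; the function does this walk inline with current_pos further down. A minimal, self-contained sketch of the idea (the word_list layout mirrors the function, the sample data is made up):

# Sketch of the offset-to-word mapping used by highlight_issues_in_pdf.
# word_list entries follow the same layout as in the function:
# (page_number, word, x0, y0, x1, y1).
from typing import List, Tuple

Word = Tuple[int, str, float, float, float, float]

def words_in_span(word_list: List[Word], offset: int, length: int) -> List[Word]:
    """Return the words whose characters overlap [offset, offset + length)."""
    target, current_pos = [], 0
    for entry in word_list:
        word = entry[1]
        word_start, word_end = current_pos, current_pos + len(word)
        if word_start < offset + length and word_end > offset:
            target.append(entry)
        current_pos = word_end + 1  # +1 for the joining space
    return target

# Made-up example: an error span covering "tset" in "This is a tset sentence".
words = [(0, w, 72.0 + 30.0 * i, 700.0, 72.0 + 30.0 * i + 25.0, 712.0)
         for i, w in enumerate("This is a tset sentence".split())]
print(words_in_span(words, offset=10, length=4))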
@@ -269,16 +259,14 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
 
         # Iterate over each language issue
         for idx, issue in enumerate(language_matches, start=1):
-            offset = issue["offset"]  # offset+line_no-1
+            offset = issue["offset"]
             length = issue["length"]
 
             # Skip issues in the references section
             if offset < abstract_offset or offset >= references_offset:
                 continue
-
-
-            error_text = concatenated_text[offset:offset+length]
-            print(f"\nIssue {idx}: '{error_text}' at offset {offset} with length {length}")
+
+            error_text = concatenated_text[offset:offset + length]
 
             # Find the words that fall within the error span
             current_pos = 0
@@ -292,32 +280,37 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
                 current_pos += word_length
 
             if not target_words:
-                # print("No matching words found for this issue.")
                 continue
 
             initial_x = target_words[0][2]
             initial_y = target_words[0][3]
-            final_x = target_words[len(target_words)-1][4]
-            final_y = target_words[len(target_words)-1][5]
+            final_x = target_words[len(target_words) - 1][4]
+            final_y = target_words[len(target_words) - 1][5]
             issue["coordinates"] = [initial_x, initial_y, final_x, final_y]
             issue["page"] = target_words[0][0] + 1
+
             # Add highlight annotations to the target words
-            print()
-            print("issue", issue)
-            print("error text", error_text)
-            print(target_words)
-            print()
-            for target in target_words:
-                page_num, word_text, x0, y0, x1, y1 = target
-                page = doc[page_num]
-                # Define a rectangle around the word with some padding
-                rect = fitz.Rect(x0 - 1, y0 - 1, x1 + 1, y1 + 1)
-                # Add a highlight annotation
-                highlight = page.add_highlight_annot(rect)
-                highlight.set_colors(stroke=(1, 1, 0))  # Yellow color
-                highlight.update()
-                # print(f"Highlighted '{word_text}' on page {page_num + 1} at position ({x0}, {y0}, {x1}, {y1})")
+            page_num = target_words[0][0]
+            page = doc[page_num]
 
+            # Create a rectangle around the highlighted text
+            rect = fitz.Rect(initial_x - 1, initial_y - 1, final_x + 1, final_y + 1)
+            highlight = page.add_highlight_annot(rect)
+            highlight.set_colors(stroke=(1, 1, 0))  # Yellow color
+            highlight.update()
+
+            # Create the comment box on the side of the page
+            comment_x = page.rect.width - 150  # Adjust this value as needed
+            comment_y = initial_y  # Position the comment box near the highlighted text
+            comment_rect = fitz.Rect(comment_x, comment_y, comment_x + 140, comment_y + 100)
+            page.add_freetext_annot(comment_rect, error_text)
+
+            # Draw an arrow from the highlighted word to the comment box
+            arrow_start = (initial_x + final_x) / 2, (initial_y + final_y) / 2  # Center of the highlighted word
+            arrow_end = comment_rect.center  # Center of the comment box
+
+            # Draw the arrow
+            page.add_arrow(arrow_start, arrow_end, color=(0, 0, 0), width=2)
 
         # Save annotated PDF to bytes
         byte_stream = io.BytesIO()
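One caveat on the last added line: PyMuPDF's Page class has no add_arrow method, so page.add_arrow(...) would raise AttributeError at runtime; add_highlight_annot and add_freetext_annot, by contrast, are real Page methods. The usual way to get an arrow is a line annotation with an open-arrow line ending. The sketch below shows that approach on a blank page; it is an assumption about the intended effect, not code from this commit, and the coordinates are invented.

# Sketch: draw an "arrow" as a line annotation with an arrow head (PyMuPDF).
# Stand-ins for the commit's variables: arrow_start ~ center of the highlight,
# arrow_end ~ center of the comment box. Coordinates here are made up.
import fitz  # PyMuPDF

doc = fitz.open()      # new, empty PDF
page = doc.new_page()  # default page size
arrow_start = (150, 400)
arrow_end = (450, 380)

arrow = page.add_line_annot(arrow_start, arrow_end)
arrow.set_line_ends(fitz.PDF_ANNOT_LE_NONE, fitz.PDF_ANNOT_LE_OPEN_ARROW)  # head at arrow_end
arrow.set_colors(stroke=(0, 0, 0))  # black stroke
arrow.set_border(width=2)           # 2 pt line width
arrow.update()

doc.save("arrow_demo.pdf")
doc.close()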
@@ -325,10 +318,9 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
         annotated_pdf_bytes = byte_stream.getvalue()
         doc.close()
 
-        # Save annotated PDF locally for verification
+        # Save annotated PDF locally for verification (optional)
         with open("annotated_temp.pdf", "wb") as f:
             f.write(annotated_pdf_bytes)
-        # print("Annotated PDF saved as 'annotated_temp.pdf' for manual verification.")
 
         return language_matches, annotated_pdf_bytes
     except Exception as e:
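The call that actually writes the document into byte_stream sits on an unchanged line between these hunks and is not visible in the diff. For reference, one common PyMuPDF pattern for serializing to memory is sketched below; whether app.py uses exactly this form is an assumption.

# Sketch: serializing a (possibly annotated) PyMuPDF document to bytes.
import io
import fitz  # PyMuPDF

doc = fitz.open()  # stand-in for the annotated document
doc.new_page()

byte_stream = io.BytesIO()
doc.save(byte_stream)                         # Document.save accepts a file-like object
annotated_pdf_bytes = byte_stream.getvalue()  # equivalently: doc.tobytes()
doc.close()

print(f"{len(annotated_pdf_bytes)} bytes")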
@@ -338,6 +330,7 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
 
 
 
+
 # ------------------------------
 # Main Analysis Function
 # ------------------------------
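A final note on the unchanged signature: the function is annotated -> bytes, but the hunks above show it returning the (language_matches, annotated_pdf_bytes) tuple. A hypothetical caller, not shown in this commit, would therefore unpack both values:

# Hypothetical caller, not part of this commit: the function returns a tuple,
# so both values are unpacked despite the "-> bytes" annotation.
matches = [{"offset": 120, "length": 4}]  # minimal match dicts; real ones come from LanguageTool

with open("paper.pdf", "rb") as fh:
    enriched_matches, pdf_bytes = highlight_issues_in_pdf(fh, matches)

with open("paper_annotated.pdf", "wb") as out:
    out.write(pdf_bytes)

for match in enriched_matches:
    print(match.get("page"), match.get("coordinates"))  # added by the function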
 