samyak152002 committed
Commit dde32e5 · verified · 1 Parent(s): 28757b4

Update app.py

Files changed (1)
  1. app.py +40 -44
app.py CHANGED
@@ -226,27 +226,38 @@ def check_reference_order(full_text: str) -> Dict[str, Any]:
         "missing_references": missing_refs,
         "is_ordered": len(out_of_order) == 0 and len(missing_refs) == 0
     }
+
 def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
     """
-    Highlights language issues in the PDF, adds a dynamic comment box with text on the side of the page,
-    and draws arrows pointing from the highlighted text to the comment box.
-    Returns the annotated PDF as bytes.
+    Highlights language issues in the PDF and returns the annotated PDF as bytes.
+    This function maps LanguageTool matches to specific words in the PDF
+    and highlights those words.
     """
     try:
         # Open the PDF
         doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
-
+        # print(f"Opened PDF with {len(doc)} pages.")
+        # print(language_matches)
         # Extract words with positions from each page
         word_list = []  # List of tuples: (page_number, word, x0, y0, x1, y1)
         for page_number in range(len(doc)):
             page = doc[page_number]
+            print(page.get_text("words"))
             words = page.get_text("words")  # List of tuples: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
             for w in words:
+                # print(w)
                 word_text = w[4]
+                # **Fix:** Insert a space before '[' to ensure "globally [2]" instead of "globally[2]"
+                # if '[' in word_text:
+                #     word_text = word_text.replace('[', ' [')
                 word_list.append((page_number, word_text, w[0], w[1], w[2], w[3]))
+        # print(f"Total words extracted: {len(word_list)}")

         # Concatenate all words to form the full text
+        concatenated_text=""
         concatenated_text = " ".join([w[1] for w in word_list])
+
+        # print(f"Concatenated text length: {concatenated_text} characters.")

         # Find "Abstract" section and set the processing start point
         abstract_start = concatenated_text.lower().find("abstract")
@@ -258,14 +269,16 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:

         # Iterate over each language issue
         for idx, issue in enumerate(language_matches, start=1):
-            offset = issue["offset"]
+            offset = issue["offset"]  # offset+line_no-1
             length = issue["length"]

             # Skip issues in the references section
             if offset < abstract_offset or offset >= references_offset:
                 continue
-
-            error_text = concatenated_text[offset:offset + length]
+
+
+            error_text = concatenated_text[offset:offset+length]
+            print(f"\nIssue {idx}: '{error_text}' at offset {offset} with length {length}")

             # Find the words that fall within the error span
             current_pos = 0
@@ -279,48 +292,32 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
                 current_pos += word_length

             if not target_words:
+                # print("No matching words found for this issue.")
                 continue

             initial_x = target_words[0][2]
             initial_y = target_words[0][3]
-            final_x = target_words[len(target_words) - 1][4]
-            final_y = target_words[len(target_words) - 1][5]
+            final_x = target_words[len(target_words)-1][4]
+            final_y = target_words[len(target_words)-1][5]
             issue["coordinates"] = [initial_x, initial_y, final_x, final_y]
             issue["page"] = target_words[0][0] + 1
-
             # Add highlight annotations to the target words
-            page_num = target_words[0][0]
-            page = doc[page_num]
+            print()
+            print("issue", issue)
+            print("error text", error_text)
+            print(target_words)
+            print()
+            for target in target_words:
+                page_num, word_text, x0, y0, x1, y1 = target
+                page = doc[page_num]
+                # Define a rectangle around the word with some padding
+                rect = fitz.Rect(x0 - 1, y0 - 1, x1 + 1, y1 + 1)
+                # Add a highlight annotation
+                highlight = page.add_highlight_annot(rect)
+                highlight.set_colors(stroke=(1, 1, 0))  # Yellow color
+                highlight.update()
+                # print(f"Highlighted '{word_text}' on page {page_num + 1} at position ({x0}, {y0}, {x1}, {y1})")

-            # Create a rectangle around the highlighted text
-            rect = fitz.Rect(initial_x - 1, initial_y - 1, final_x + 1, final_y + 1)
-            highlight = page.add_highlight_annot(rect)
-            highlight.set_colors(stroke=(1, 1, 0))  # Yellow color
-            highlight.update()
-
-            # Dynamically calculate the position of the comment box
-            page_width, page_height = page.rect.width, page.rect.height
-            comment_box_width = min(140, page_width / 3)  # Ensure the comment box width is a reasonable fraction of the page width
-            comment_box_height = 100  # Set a reasonable height for the comment box
-
-            # Position the comment box dynamically
-            if initial_x < page_width / 2:  # If the highlighted text is on the left half of the page
-                comment_x = page_width - comment_box_width - 10  # Position it on the right side
-            else:  # If the highlighted text is on the right half of the page
-                comment_x = 10  # Position it on the left side
-
-            comment_y = initial_y  # Position the comment box near the highlighted text
-            comment_rect = fitz.Rect(comment_x, comment_y, comment_x + comment_box_width, comment_y + comment_box_height)
-            page.add_freetext_annot(comment_rect, error_text)
-
-            # Draw an arrow from the highlighted word to the comment box
-            arrow_start_x = (initial_x + final_x) / 2  # Center X of the highlighted word
-            arrow_start_y = (initial_y + final_y) / 2  # Center Y of the highlighted word
-            arrow_end_x = (comment_rect.x0 + comment_rect.x1) / 2  # Center X of the comment box
-            arrow_end_y = (comment_rect.y0 + comment_rect.y1) / 2  # Center Y of the comment box
-
-            # Draw the arrow
-            page.add_arrow((arrow_start_x, arrow_start_y), (arrow_end_x, arrow_end_y), color=(0, 0, 0), width=2)

         # Save annotated PDF to bytes
         byte_stream = io.BytesIO()
@@ -328,9 +325,10 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
         annotated_pdf_bytes = byte_stream.getvalue()
         doc.close()

-        # Save annotated PDF locally for verification (optional)
+        # Save annotated PDF locally for verification
         with open("annotated_temp.pdf", "wb") as f:
             f.write(annotated_pdf_bytes)
+        # print("Annotated PDF saved as 'annotated_temp.pdf' for manual verification.")

         return language_matches, annotated_pdf_bytes
     except Exception as e:
@@ -340,8 +338,6 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:



-
-
 # ------------------------------
 # Main Analysis Function
 # ------------------------------
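
For context, a minimal sketch of how the updated highlight_issues_in_pdf could be driven. It is not part of this commit: the import path, file names, and example match values are hypothetical, and only the "offset" and "length" keys that the diff actually reads are assumed. Offsets must be computed against the same space-joined word text the function builds internally, otherwise the highlighted span will drift.

# Illustrative driver, not from app.py. Assumes app.py is importable and that
# language_matches are plain dicts carrying the "offset"/"length" keys used above.
from app import highlight_issues_in_pdf  # hypothetical import path

language_matches = [
    {"offset": 1500, "length": 9},  # made-up values for illustration
]

with open("input.pdf", "rb") as fh:  # hypothetical input file
    matches, annotated_bytes = highlight_issues_in_pdf(fh, language_matches)

with open("reviewed.pdf", "wb") as out:  # hypothetical output file
    out.write(annotated_bytes)

for m in matches:
    # The function adds "page" and "coordinates" to each match it could locate.
    print(m.get("page"), m.get("coordinates"))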