samyak152002 committed on
Commit
3eafa03
·
verified ·
1 Parent(s): a0e200f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -52
app.py CHANGED
@@ -43,7 +43,7 @@ def extract_pdf_text(file) -> str:
43
  text = block[4] # The text content is at index 4
44
 
45
  # Handle line-break hyphens
46
- text = re.sub(r'(\w+)-\s*\n\s*(\w+)', lambda m: m.group(1) + m.group(2), text)
47
 
48
  # Preserve regular hyphens within words (e.g., "state-of-the-art")
49
  processed_text += text + "\n"
@@ -236,73 +236,74 @@ def check_reference_order(full_text: str) -> Dict[str, Any]:
236
 
237
  def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
238
  """
239
- Highlights language issues in the PDF and returns the annotated PDF as bytes.
240
- This function maps LanguageTool matches to specific words in the PDF
241
- and highlights those words.
242
  """
243
  try:
244
  # Open the PDF
245
  doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
246
- print(f"Opened PDF with {len(doc)} pages.")
247
- print(language_matches)
248
- # Extract words with positions from each page
249
- word_list = [] # List of tuples: (page_number, word, x0, y0, x1, y1)
 
 
250
  for page_number in range(len(doc)):
251
  page = doc[page_number]
252
- words = page.get_text("words") # List of tuples: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
253
  for w in words:
254
- # print(w)
255
  word_text = w[4]
256
- # **Fix:** Insert a space before '[' to ensure "globally [2]" instead of "globally[2]"
257
- if '[' in word_text:
258
- word_text = word_text.replace('[', ' [')
259
  word_list.append((page_number, word_text, w[0], w[1], w[2], w[3]))
260
- print(f"Total words extracted: {len(word_list)}")
 
261
 
262
- # Concatenate all words to form the full text
263
- concatenated_text = " ".join([w[1] for w in word_list])
264
- print(f"Concatenated text length: {len(concatenated_text)} characters.")
265
 
266
- # Iterate over each language issue
267
- for idx, issue in enumerate(language_matches, start=1):
268
  offset = issue["offset"]
269
  length = issue["length"]
270
- error_text = concatenated_text[offset:offset+length]
271
- print(f"\nIssue {idx}: '{error_text}' at offset {offset} with length {length}")
272
 
273
- # Find the words that fall within the error span
274
- current_pos = 0
 
 
 
 
275
  target_words = []
276
- for word in word_list:
277
- word_text = word[1]
278
- word_length = len(word_text) + 1 # +1 for the space
279
 
280
- if current_pos + word_length > offset and current_pos < offset + length:
 
 
 
 
281
  target_words.append(word)
282
- current_pos += word_length
283
 
284
  if not target_words:
285
- print("No matching words found for this issue.")
286
  continue
287
 
288
- initial_x = target_words[0][2]
289
- initial_y = target_words[0][3]
290
- final_x = target_words[len(target_words)-1][4]
291
- final_y = target_words[len(target_words)-1][5]
292
- issue["coordinates"] = [initial_x, initial_y, final_x, final_y]
293
- issue["page"] = target_words[0][0] + 1
294
- # Add highlight annotations to the target words
295
- for target in target_words:
296
- page_num, word_text, x0, y0, x1, y1 = target
297
- page = doc[page_num]
298
- # Define a rectangle around the word with some padding
299
- rect = fitz.Rect(x0 - 1, y0 - 1, x1 + 1, y1 + 1)
300
- # Add a highlight annotation
301
- highlight = page.add_highlight_annot(rect)
302
- highlight.set_colors(stroke=(1, 1, 0)) # Yellow color
303
- highlight.update()
304
- print(f"Highlighted '{word_text}' on page {page_num + 1} at position ({x0}, {y0}, {x1}, {y1})")
305
-
 
306
 
307
  # Save annotated PDF to bytes
308
  byte_stream = io.BytesIO()
@@ -310,16 +311,14 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
310
  annotated_pdf_bytes = byte_stream.getvalue()
311
  doc.close()
312
 
313
- # Save annotated PDF locally for verification
314
- with open("annotated_temp.pdf", "wb") as f:
315
- f.write(annotated_pdf_bytes)
316
- print("Annotated PDF saved as 'annotated_temp.pdf' for manual verification.")
317
-
318
  return language_matches, annotated_pdf_bytes
319
  except Exception as e:
320
  print(f"Error in highlighting PDF: {e}")
321
  return b""
322
 
 
 
 
323
  # ------------------------------
324
  # Main Analysis Function
325
  # ------------------------------
 
43
  text = block[4] # The text content is at index 4
44
 
45
  # Handle line-break hyphens
46
+ # text = re.sub(r'(\w+)-\s*\n\s*(\w+)', lambda m: m.group(1) + m.group(2), text)
47
 
48
  # Preserve regular hyphens within words (e.g., "state-of-the-art")
49
  processed_text += text + "\n"
 
236
 
237
  def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
238
  """
239
+ Highlights language issues in the PDF and skips highlighting in the references section.
 
 
240
  """
241
  try:
242
  # Open the PDF
243
  doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
244
+
245
+ word_list = []
246
+ concatenated_text = ""
247
+ offsets = [] # Track start offsets of words
248
+
249
+ # Extract words and build concatenated text
250
  for page_number in range(len(doc)):
251
  page = doc[page_number]
252
+ words = page.get_text("words") # (x0, y0, x1, y1, word, ...)
253
  for w in words:
 
254
  word_text = w[4]
255
+ if "[" in word_text:
256
+ word_text = word_text.replace("[", " [") # Adjust for spaces before '['
 
257
  word_list.append((page_number, word_text, w[0], w[1], w[2], w[3]))
258
+ offsets.append(len(concatenated_text))
259
+ concatenated_text += word_text + " "
260
 
261
+ # Find "References" section and exclude from processing
262
+ references_start = concatenated_text.lower().find("references")
263
+ references_offset = len(concatenated_text) if references_start == -1 else references_start
264
 
265
+ for issue in language_matches:
 
266
  offset = issue["offset"]
267
  length = issue["length"]
 
 
268
 
269
+ # Skip issues in the references section
270
+ if offset >= references_offset:
271
+ continue
272
+
273
+ # Map the issue to corresponding words in the PDF
274
+ error_text = concatenated_text[offset:offset + length]
275
  target_words = []
 
 
 
276
 
277
+ for idx, word in enumerate(word_list):
278
+ word_offset_start = offsets[idx]
279
+ word_offset_end = word_offset_start + len(word[1])
280
+
281
+ if word_offset_start < offset + length and word_offset_end > offset:
282
  target_words.append(word)
 
283
 
284
  if not target_words:
285
+ print(f"Skipping issue: {error_text} - No matching words found.")
286
  continue
287
 
288
+ # Get bounding box and validate it
289
+ page_number = target_words[0][0]
290
+ page = doc[page_number]
291
+
292
+ # Calculate rectangle, handling multi-line or disjoint words
293
+ x0 = min(word[2] for word in target_words)
294
+ y0 = min(word[3] for word in target_words)
295
+ x1 = max(word[4] for word in target_words)
296
+ y1 = max(word[5] for word in target_words)
297
+
298
+ # Ensure valid rectangle
299
+ if x0 >= x1 or y0 >= y1:
300
+ print(f"Invalid rectangle for issue: {error_text} - Skipping.")
301
+ continue
302
+
303
+ rect = fitz.Rect(x0 - 1, y0 - 1, x1 + 1, y1 + 1)
304
+ highlight = page.add_highlight_annot(rect)
305
+ highlight.set_colors(stroke=(1, 1, 0)) # Yellow
306
+ highlight.update()
307
 
308
  # Save annotated PDF to bytes
309
  byte_stream = io.BytesIO()
 
311
  annotated_pdf_bytes = byte_stream.getvalue()
312
  doc.close()
313
 
 
 
 
 
 
314
  return language_matches, annotated_pdf_bytes
315
  except Exception as e:
316
  print(f"Error in highlighting PDF: {e}")
317
  return b""
318
 
319
+
320
+
321
+
322
  # ------------------------------
323
  # Main Analysis Function
324
  # ------------------------------