samyak152002 committed on
Commit
c85e0b2
·
verified ·
1 Parent(s): 3eafa03

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -51
app.py CHANGED
@@ -236,74 +236,80 @@ def check_reference_order(full_text: str) -> Dict[str, Any]:
236
 
237
  def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
238
  """
239
- Highlights language issues in the PDF and skips highlighting in the references section.
 
 
240
  """
241
  try:
242
  # Open the PDF
243
  doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
244
-
245
- word_list = []
246
- concatenated_text = ""
247
- offsets = [] # Track start offsets of words
248
-
249
- # Extract words and build concatenated text
250
  for page_number in range(len(doc)):
251
  page = doc[page_number]
252
- words = page.get_text("words") # (x0, y0, x1, y1, word, ...)
253
  for w in words:
 
254
  word_text = w[4]
255
- if "[" in word_text:
256
- word_text = word_text.replace("[", " [") # Adjust for spaces before '['
 
257
  word_list.append((page_number, word_text, w[0], w[1], w[2], w[3]))
258
- offsets.append(len(concatenated_text))
259
- concatenated_text += word_text + " "
260
 
261
- # Find "References" section and exclude from processing
262
- references_start = concatenated_text.lower().find("references")
263
- references_offset = len(concatenated_text) if references_start == -1 else references_start
264
 
265
- for issue in language_matches:
266
- offset = issue["offset"]
 
267
  length = issue["length"]
 
 
 
 
268
 
269
- # Skip issues in the references section
270
- if offset >= references_offset:
271
- continue
272
-
273
- # Map the issue to corresponding words in the PDF
274
- error_text = concatenated_text[offset:offset + length]
275
  target_words = []
 
 
 
276
 
277
- for idx, word in enumerate(word_list):
278
- word_offset_start = offsets[idx]
279
- word_offset_end = word_offset_start + len(word[1])
280
-
281
- if word_offset_start < offset + length and word_offset_end > offset:
282
  target_words.append(word)
 
283
 
284
  if not target_words:
285
- print(f"Skipping issue: {error_text} - No matching words found.")
286
  continue
287
 
288
- # Get bounding box and validate it
289
- page_number = target_words[0][0]
290
- page = doc[page_number]
291
-
292
- # Calculate rectangle, handling multi-line or disjoint words
293
- x0 = min(word[2] for word in target_words)
294
- y0 = min(word[3] for word in target_words)
295
- x1 = max(word[4] for word in target_words)
296
- y1 = max(word[5] for word in target_words)
297
-
298
- # Ensure valid rectangle
299
- if x0 >= x1 or y0 >= y1:
300
- print(f"Invalid rectangle for issue: {error_text} - Skipping.")
301
- continue
302
-
303
- rect = fitz.Rect(x0 - 1, y0 - 1, x1 + 1, y1 + 1)
304
- highlight = page.add_highlight_annot(rect)
305
- highlight.set_colors(stroke=(1, 1, 0)) # Yellow
306
- highlight.update()
 
 
 
 
307
 
308
  # Save annotated PDF to bytes
309
  byte_stream = io.BytesIO()
@@ -311,14 +317,16 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
311
  annotated_pdf_bytes = byte_stream.getvalue()
312
  doc.close()
313
 
 
 
 
 
 
314
  return language_matches, annotated_pdf_bytes
315
  except Exception as e:
316
  print(f"Error in highlighting PDF: {e}")
317
  return b""
318
 
319
-
320
-
321
-
322
  # ------------------------------
323
  # Main Analysis Function
324
  # ------------------------------
 
236
 
237
  def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
238
  """
239
+ Highlights language issues in the PDF and returns the annotated PDF as bytes.
240
+ This function maps LanguageTool matches to specific words in the PDF
241
+ and highlights those words.
242
  """
243
  try:
244
  # Open the PDF
245
  doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
246
+ # print(f"Opened PDF with {len(doc)} pages.")
247
+ # print(language_matches)
248
+ # Extract words with positions from each page
249
+ word_list = [] # List of tuples: (page_number, word, x0, y0, x1, y1)
 
 
250
  for page_number in range(len(doc)):
251
  page = doc[page_number]
252
+ words = page.get_text("words") # List of tuples: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
253
  for w in words:
254
+ # print(w)
255
  word_text = w[4]
256
+ # **Fix:** Insert a space before '[' to ensure "globally [2]" instead of "globally[2]"
257
+ if '[' in word_text:
258
+ word_text = word_text.replace('[', ' [')
259
  word_list.append((page_number, word_text, w[0], w[1], w[2], w[3]))
260
+ # print(f"Total words extracted: {len(word_list)}")
 
261
 
262
+ # Concatenate all words to form the full text
263
+ concatenated_text = " ".join([w[1] for w in word_list])
264
+ # print(f"Concatenated text length: {concatenated_text} characters.")
265
 
266
+ # Iterate over each language issue
267
+ for idx, issue in enumerate(language_matches, start=1):
268
+ offset = issue["offset"] # offset+line_no-1
269
  length = issue["length"]
270
+
271
+
272
+ error_text = concatenated_text[offset:offset+length+1]
273
+ # print(f"\nIssue {idx}: '{error_text}' at offset {offset} with length {length}")
274
 
275
+ # Find the words that fall within the error span
276
+ current_pos = 0
 
 
 
 
277
  target_words = []
278
+ for word in word_list:
279
+ word_text = word[1]
280
+ word_length = len(word_text) + 1 # +1 for the space
281
 
282
+ if current_pos + word_length > offset and current_pos < offset + length:
 
 
 
 
283
  target_words.append(word)
284
+ current_pos += word_length
285
 
286
  if not target_words:
287
+ # print("No matching words found for this issue.")
288
  continue
289
 
290
+ initial_x = target_words[0][2]
291
+ initial_y = target_words[0][3]
292
+ final_x = target_words[len(target_words)-1][4]
293
+ final_y = target_words[len(target_words)-1][5]
294
+ issue["coordinates"] = [initial_x, initial_y, final_x, final_y]
295
+ issue["page"] = target_words[0][0] + 1
296
+ # Add highlight annotations to the target words
297
+ print()
298
+ print("issue", issue)
299
+ print("error text", error_text)
300
+ print(target_words)
301
+ print()
302
+ for target in target_words:
303
+ page_num, word_text, x0, y0, x1, y1 = target
304
+ page = doc[page_num]
305
+ # Define a rectangle around the word with some padding
306
+ rect = fitz.Rect(x0 - 1, y0 - 1, x1 + 1, y1 + 1)
307
+ # Add a highlight annotation
308
+ highlight = page.add_highlight_annot(rect)
309
+ highlight.set_colors(stroke=(1, 1, 0)) # Yellow color
310
+ highlight.update()
311
+ # print(f"Highlighted '{word_text}' on page {page_num + 1} at position ({x0}, {y0}, {x1}, {y1})")
312
+
313
 
314
  # Save annotated PDF to bytes
315
  byte_stream = io.BytesIO()
 
317
  annotated_pdf_bytes = byte_stream.getvalue()
318
  doc.close()
319
 
320
+ # Save annotated PDF locally for verification
321
+ with open("annotated_temp.pdf", "wb") as f:
322
+ f.write(annotated_pdf_bytes)
323
+ # print("Annotated PDF saved as 'annotated_temp.pdf' for manual verification.")
324
+
325
  return language_matches, annotated_pdf_bytes
326
  except Exception as e:
327
  print(f"Error in highlighting PDF: {e}")
328
  return b""
329
 
 
 
 
330
  # ------------------------------
331
  # Main Analysis Function
332
  # ------------------------------