Spaces:

samyak152002
/

texmetrics-regex-checks-gradio-1

Sleeping

App Files Files Community

samyak152002 committed on Dec 6, 2024

Commit

3eafa03

verified ·

1 Parent(s): a0e200f

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -52

app.py CHANGED Viewed

@@ -43,7 +43,7 @@ def extract_pdf_text(file) -> str:
                 text = block[4]  # The text content is at index 4
                 # Handle line-break hyphens
-                text = re.sub(r'(\w+)-\s*\n\s*(\w+)', lambda m: m.group(1) + m.group(2), text)
                 # Preserve regular hyphens within words (e.g., "state-of-the-art")
                 processed_text += text + "\n"
@@ -236,73 +236,74 @@ def check_reference_order(full_text: str) -> Dict[str, Any]:
 def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
     """
-    Highlights language issues in the PDF and returns the annotated PDF as bytes.
-    This function maps LanguageTool matches to specific words in the PDF
-    and highlights those words.
     """
     try:
         # Open the PDF
         doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
-        print(f"Opened PDF with {len(doc)} pages.")
-        print(language_matches)
-        # Extract words with positions from each page
-        word_list = []  # List of tuples: (page_number, word, x0, y0, x1, y1)
         for page_number in range(len(doc)):
             page = doc[page_number]
-            words = page.get_text("words")  # List of tuples: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
             for w in words:
-#                 print(w)
                 word_text = w[4]
-                # **Fix:** Insert a space before '[' to ensure "globally [2]" instead of "globally[2]"
-                if '[' in word_text:
-                    word_text = word_text.replace('[', ' [')
                 word_list.append((page_number, word_text, w[0], w[1], w[2], w[3]))
-        print(f"Total words extracted: {len(word_list)}")
-        # Concatenate all words to form the full text
-        concatenated_text = " ".join([w[1] for w in word_list])
-        print(f"Concatenated text length: {len(concatenated_text)} characters.")
-        # Iterate over each language issue
-        for idx, issue in enumerate(language_matches, start=1):
             offset = issue["offset"]
             length = issue["length"]
-            error_text = concatenated_text[offset:offset+length]
-            print(f"\nIssue {idx}: '{error_text}' at offset {offset} with length {length}")
-            # Find the words that fall within the error span
-            current_pos = 0
             target_words = []
-            for word in word_list:
-                word_text = word[1]
-                word_length = len(word_text) + 1  # +1 for the space
-                if current_pos + word_length > offset and current_pos < offset + length:
                     target_words.append(word)
-                current_pos += word_length
             if not target_words:
-                print("No matching words found for this issue.")
                 continue
-            initial_x = target_words[0][2]
-            initial_y = target_words[0][3]
-            final_x = target_words[len(target_words)-1][4]
-            final_y = target_words[len(target_words)-1][5]
-            issue["coordinates"] = [initial_x, initial_y, final_x, final_y]
-            issue["page"] = target_words[0][0] + 1
-            # Add highlight annotations to the target words
-            for target in target_words:
-                page_num, word_text, x0, y0, x1, y1 = target
-                page = doc[page_num]
-                # Define a rectangle around the word with some padding
-                rect = fitz.Rect(x0 - 1, y0 - 1, x1 + 1, y1 + 1)
-                # Add a highlight annotation
-                highlight = page.add_highlight_annot(rect)
-                highlight.set_colors(stroke=(1, 1, 0))  # Yellow color
-                highlight.update()
-                print(f"Highlighted '{word_text}' on page {page_num + 1} at position ({x0}, {y0}, {x1}, {y1})")
         # Save annotated PDF to bytes
         byte_stream = io.BytesIO()
@@ -310,16 +311,14 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
         annotated_pdf_bytes = byte_stream.getvalue()
         doc.close()
-        # Save annotated PDF locally for verification
-        with open("annotated_temp.pdf", "wb") as f:
-            f.write(annotated_pdf_bytes)
-        print("Annotated PDF saved as 'annotated_temp.pdf' for manual verification.")
         return language_matches, annotated_pdf_bytes
     except Exception as e:
         print(f"Error in highlighting PDF: {e}")
         return b""
 # ------------------------------
 # Main Analysis Function
 # ------------------------------

                 text = block[4]  # The text content is at index 4
                 # Handle line-break hyphens
+                # text = re.sub(r'(\w+)-\s*\n\s*(\w+)', lambda m: m.group(1) + m.group(2), text)
                 # Preserve regular hyphens within words (e.g., "state-of-the-art")
                 processed_text += text + "\n"
 def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
     """
+    Highlights language issues in the PDF and skips highlighting in the references section.
     """
     try:
         # Open the PDF
         doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
+        word_list = []
+        concatenated_text = ""
+        offsets = []  # Track start offsets of words
+        # Extract words and build concatenated text
         for page_number in range(len(doc)):
             page = doc[page_number]
+            words = page.get_text("words")  # (x0, y0, x1, y1, word, ...)
             for w in words:
                 word_text = w[4]
+                if "[" in word_text:
+                    word_text = word_text.replace("[", " [")  # Adjust for spaces before '['
                 word_list.append((page_number, word_text, w[0], w[1], w[2], w[3]))
+                offsets.append(len(concatenated_text))
+                concatenated_text += word_text + " "
+        # Find "References" section and exclude from processing
+        references_start = concatenated_text.lower().find("references")
+        references_offset = len(concatenated_text) if references_start == -1 else references_start
+        for issue in language_matches:
             offset = issue["offset"]
             length = issue["length"]
+            # Skip issues in the references section
+            if offset >= references_offset:
+                continue
+            # Map the issue to corresponding words in the PDF
+            error_text = concatenated_text[offset:offset + length]
             target_words = []
+            for idx, word in enumerate(word_list):
+                word_offset_start = offsets[idx]
+                word_offset_end = word_offset_start + len(word[1])
+                if word_offset_start < offset + length and word_offset_end > offset:
                     target_words.append(word)
             if not target_words:
+                print(f"Skipping issue: {error_text} - No matching words found.")
                 continue
+            # Get bounding box and validate it
+            page_number = target_words[0][0]
+            page = doc[page_number]
+            # Calculate rectangle, handling multi-line or disjoint words
+            x0 = min(word[2] for word in target_words)
+            y0 = min(word[3] for word in target_words)
+            x1 = max(word[4] for word in target_words)
+            y1 = max(word[5] for word in target_words)
+            # Ensure valid rectangle
+            if x0 >= x1 or y0 >= y1:
+                print(f"Invalid rectangle for issue: {error_text} - Skipping.")
+                continue
+            rect = fitz.Rect(x0 - 1, y0 - 1, x1 + 1, y1 + 1)
+            highlight = page.add_highlight_annot(rect)
+            highlight.set_colors(stroke=(1, 1, 0))  # Yellow
+            highlight.update()
         # Save annotated PDF to bytes
         byte_stream = io.BytesIO()
         annotated_pdf_bytes = byte_stream.getvalue()
         doc.close()
         return language_matches, annotated_pdf_bytes
     except Exception as e:
         print(f"Error in highlighting PDF: {e}")
         return b""
 # ------------------------------
 # Main Analysis Function
 # ------------------------------