textmetric-stramlit-1

Sleeping

App Files Files Community

samyak152002 committed on Nov 3, 2024

Commit

a24b0c9

verified ·

1 Parent(s): 99dc100

Update annotations.py

Browse files

Files changed (1) hide show

annotations.py +65 -28

annotations.py CHANGED Viewed

@@ -1,23 +1,22 @@
 # annotations.py
 import fitz  # PyMuPDF
-import re
 from typing import List, Dict, Any, Tuple
-from collections import Counter
 import language_tool_python
 import io
 def extract_pdf_text(file) -> str:
-    """Extracts text from a PDF file using pdfminer."""
-    from pdfminer.high_level import extract_text
-    from pdfminer.layout import LAParams
-    if isinstance(file, str):
-        with open(file, 'rb') as f:
-            return extract_text(f, laparams=LAParams())
-    else:
-        file.seek(0)
-        return extract_text(file, laparams=LAParams())
 def check_language_issues(full_text: str) -> Dict[str, Any]:
     """Check for language issues using LanguageTool."""
@@ -30,7 +29,9 @@ def check_language_issues(full_text: str) -> Dict[str, Any]:
             "context": match.context,
             "suggestions": match.replacements[:3] if match.replacements else [],
             "category": match.category,
-            "rule_id": match.ruleId
         })
     return {
         "total_issues": len(issues),
@@ -38,23 +39,56 @@ def check_language_issues(full_text: str) -> Dict[str, Any]:
     }
 def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
-    """Highlights language issues in the PDF and returns the annotated PDF as bytes."""
     try:
-        if isinstance(file, str):
-            doc = fitz.open(file)
-        else:
-            file.seek(0)
-            doc = fitz.open(stream=file.read(), filetype="pdf")
-        for match in language_matches:
-            sentence = match['context']
-            # Use regular expressions to find the sentence in the text
-            for page in doc:
-                text_instances = page.search_for(sentence)
-                for inst in text_instances:
-                    # Highlight the sentence
-                    highlight = page.add_highlight_annot(inst)
-                    highlight.update()
         # Save annotated PDF to bytes
         byte_stream = io.BytesIO()
         doc.save(byte_stream)
@@ -69,6 +103,9 @@ def analyze_pdf(file) -> Tuple[Dict[str, Any], bytes]:
     """Analyzes the PDF for language issues and returns results and annotated PDF."""
     try:
         full_text = extract_pdf_text(file)
         language_issues = check_language_issues(full_text)
         issues = language_issues.get("issues", [])
         annotated_pdf = highlight_issues_in_pdf(file, issues) if issues else None

 # annotations.py
 import fitz  # PyMuPDF
 from typing import List, Dict, Any, Tuple
 import language_tool_python
 import io
 def extract_pdf_text(file) -> str:
     """Extract the full text of a PDF using PyMuPDF.

     Args:
         file: Either a filesystem path (``str``) or a binary file-like
             object holding the PDF data.

     Returns:
         The text of every page, each page followed by a newline, or an
         empty string if the document could not be opened or read (the
         error is printed, not raised).
     """
     try:
         if isinstance(file, str):
             doc = fitz.open(file)
         else:
             # Only rewind streams that support it, so non-seekable inputs
             # keep working exactly as before.
             seekable = getattr(file, "seekable", None)
             can_rewind = callable(seekable) and seekable()
             # Rewind before reading: a stream that was already consumed
             # would otherwise yield empty bytes.
             if can_rewind:
                 file.seek(0)
             data = file.read()
             # Rewind afterwards so the same object can be read again by a
             # later consumer instead of being left at EOF.
             if can_rewind:
                 file.seek(0)
             doc = fitz.open(stream=data, filetype="pdf")
         try:
             # Single join instead of repeated string concatenation.
             return "".join(page.get_text("text") + "\n" for page in doc)
         finally:
             # Release the document even if text extraction fails midway.
             doc.close()
     except Exception as e:
         # Best-effort by design: report the problem and signal failure
         # with an empty string rather than propagating.
         print(f"Error extracting text from PDF: {e}")
         return ""
 def check_language_issues(full_text: str) -> Dict[str, Any]:
     """Check for language issues using LanguageTool."""
             "context": match.context,
             "suggestions": match.replacements[:3] if match.replacements else [],
             "category": match.category,
+            "rule_id": match.ruleId,
+            "offset": match.offset,
+            "length": match.errorLength
         })
     return {
         "total_issues": len(issues),
     }
 def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
+    """
+    Highlights language issues in the PDF and returns the annotated PDF as bytes.
+    This function maps LanguageTool matches to specific words in the PDF
+    and highlights those words.
+    """
     try:
+        # Open the PDF
+        doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
+        # Extract words with positions from each page
+        word_list = []  # List of tuples: (page_number, word, x0, y0, x1, y1)
+        page_text = ""
+        for page_number in range(len(doc)):
+            page = doc[page_number]
+            words = page.get_text("words")  # List of tuples: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
+            for w in words:
+                word_text = w[4]
+                word_list.append((page_number, word_text, w[0], w[1], w[2], w[3]))
+        # Concatenate all words to form the full text
+        concatenated_text = " ".join([w[1] for w in word_list])
+        # Iterate over each language issue
+        for issue in language_matches:
+            offset = issue["offset"]
+            length = issue["length"]
+            error_text = concatenated_text[offset:offset+length]
+            # Find the words that fall within the error span
+            current_pos = 0
+            target_words = []
+            for word in word_list:
+                word_text = word[1]
+                word_length = len(word_text) + 1  # +1 for the space
+                if current_pos + word_length > offset and current_pos < offset + length:
+                    target_words.append(word)
+                current_pos += word_length
+            # Add highlight annotations to the target words
+            for target in target_words:
+                page_num, word_text, x0, y0, x1, y1 = target
+                page = doc[page_num]
+                # Define a rectangle around the word
+                rect = fitz.Rect(x0, y0, x1, y1)
+                # Add a highlight annotation
+                highlight = page.add_highlight_annot(rect)
+                highlight.set_colors(stroke=(1, 1, 0))  # Yellow color
+                highlight.update()
         # Save annotated PDF to bytes
         byte_stream = io.BytesIO()
         doc.save(byte_stream)
     """Analyzes the PDF for language issues and returns results and annotated PDF."""
     try:
         full_text = extract_pdf_text(file)
+        if not full_text:
+            return {"error": "Failed to extract text from PDF."}, None
         language_issues = check_language_issues(full_text)
         issues = language_issues.get("issues", [])
         annotated_pdf = highlight_issues_in_pdf(file, issues) if issues else None