samyak152002 committed on
Commit
810882a
·
verified ·
1 Parent(s): 4bb46a1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -31
app.py CHANGED
@@ -19,14 +19,14 @@ os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
19
  # Analysis Functions
20
  # ------------------------------
21
 
22
- def extract_pdf_text_by_page(file) -> List[str]:
23
- """Extracts text from a PDF file, page by page, using PyMuPDF."""
24
- if isinstance(file, str):
25
- with fitz.open(file) as doc:
26
- return [page.get_text("text") for page in doc]
27
- else:
28
- with fitz.open(stream=file.read(), filetype="pdf") as doc:
29
- return [page.get_text("text") for page in doc]
30
 
31
  def extract_pdf_text(file) -> str:
32
  """Extracts full text from a PDF file using PyMuPDF."""
@@ -34,23 +34,12 @@ def extract_pdf_text(file) -> str:
34
  doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
35
  full_text = ""
36
 
37
- for page_num, page in enumerate(doc, start=1):
38
- # Get text blocks with their coordinates
39
- blocks = page.get_text("blocks")
40
- processed_text = ""
41
-
42
- for block in blocks:
43
- text = block[4] # The text content is at index 4
44
-
45
- # Handle line-break hyphens
46
- # text = re.sub(r'(\w+)-\s*\n\s*(\w+)', lambda m: m.group(1) + m.group(2), text)
47
-
48
- # Preserve regular hyphens within words (e.g., "state-of-the-art")
49
- processed_text += text + "\n"
50
-
51
- full_text += processed_text
52
- print(f"Extracted text from page {page_num}: {len(processed_text)} characters.")
53
-
54
  doc.close()
55
  print(f"Total extracted text length: {len(full_text)} characters.")
56
  return full_text
@@ -125,6 +114,10 @@ def check_language_issues(full_text: str) -> Dict[str, Any]:
125
 
126
  # Process LanguageTool matches
127
  for match in matches:
 
 
 
 
128
  issues.append({
129
  "message": match.message,
130
  "context": match.context.strip(),
@@ -249,18 +242,21 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
249
  word_list = [] # List of tuples: (page_number, word, x0, y0, x1, y1)
250
  for page_number in range(len(doc)):
251
  page = doc[page_number]
 
252
  words = page.get_text("words") # List of tuples: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
253
  for w in words:
254
  # print(w)
255
  word_text = w[4]
256
  # **Fix:** Insert a space before '[' to ensure "globally [2]" instead of "globally[2]"
257
- if '[' in word_text:
258
- word_text = word_text.replace('[', ' [')
259
  word_list.append((page_number, word_text, w[0], w[1], w[2], w[3]))
260
  # print(f"Total words extracted: {len(word_list)}")
261
 
262
  # Concatenate all words to form the full text
 
263
  concatenated_text = " ".join([w[1] for w in word_list])
 
264
  # print(f"Concatenated text length: {concatenated_text} characters.")
265
 
266
  # Find "References" section and exclude from processing
@@ -277,8 +273,8 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
277
  continue
278
 
279
 
280
- error_text = concatenated_text[offset:offset+length+1]
281
- # print(f"\nIssue {idx}: '{error_text}' at offset {offset} with length {length}")
282
 
283
  # Find the words that fall within the error span
284
  current_pos = 0
@@ -350,7 +346,8 @@ def analyze_pdf(filepath: str) -> Tuple[Dict[str, Any], bytes]:
350
  full_text = extract_pdf_text(filepath)
351
  if not full_text:
352
  return {"error": "Failed to extract text from PDF."}, None
353
-
 
354
  language_issues = check_language_issues(full_text)
355
 
356
  # Handle potential errors from check_language_issues
@@ -456,7 +453,7 @@ def create_interface():
456
  if __name__ == "__main__":
457
  interface = create_interface()
458
  interface.launch(
459
- share=True, # Set to False in production
460
  # server_name="0.0.0.0",
461
  server_port=None
462
  )
 
19
  # Analysis Functions
20
  # ------------------------------
21
 
22
+ # def extract_pdf_text_by_page(file) -> List[str]:
23
+ # """Extracts text from a PDF file, page by page, using PyMuPDF."""
24
+ # if isinstance(file, str):
25
+ # with fitz.open(file) as doc:
26
+ # return [page.get_text("text") for page in doc]
27
+ # else:
28
+ # with fitz.open(stream=file.read(), filetype="pdf") as doc:
29
+ # return [page.get_text("text") for page in doc]
30
 
31
  def extract_pdf_text(file) -> str:
32
  """Extracts full text from a PDF file using PyMuPDF."""
 
34
  doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
35
  full_text = ""
36
 
37
+ for page_number in range(len(doc)):
38
+ page = doc[page_number]
39
+ words = page.get_text("word")
40
+ full_text += words
41
+
42
+ print(full_text)
 
 
 
 
 
 
 
 
 
 
 
43
  doc.close()
44
  print(f"Total extracted text length: {len(full_text)} characters.")
45
  return full_text
 
114
 
115
  # Process LanguageTool matches
116
  for match in matches:
117
+ # Ignore issues with rule_id 'EN_SPLIT_WORDS_HYPHEN'
118
+ if match.ruleId == "EN_SPLIT_WORDS_HYPHEN":
119
+ continue
120
+
121
  issues.append({
122
  "message": match.message,
123
  "context": match.context.strip(),
 
242
  word_list = [] # List of tuples: (page_number, word, x0, y0, x1, y1)
243
  for page_number in range(len(doc)):
244
  page = doc[page_number]
245
+ print(page.get_text("words"))
246
  words = page.get_text("words") # List of tuples: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
247
  for w in words:
248
  # print(w)
249
  word_text = w[4]
250
  # **Fix:** Insert a space before '[' to ensure "globally [2]" instead of "globally[2]"
251
+ # if '[' in word_text:
252
+ # word_text = word_text.replace('[', ' [')
253
  word_list.append((page_number, word_text, w[0], w[1], w[2], w[3]))
254
  # print(f"Total words extracted: {len(word_list)}")
255
 
256
  # Concatenate all words to form the full text
257
+ concatenated_text=""
258
  concatenated_text = " ".join([w[1] for w in word_list])
259
+
260
  # print(f"Concatenated text length: {concatenated_text} characters.")
261
 
262
  # Find "References" section and exclude from processing
 
273
  continue
274
 
275
 
276
+ error_text = concatenated_text[offset:offset+length]
277
+ print(f"\nIssue {idx}: '{error_text}' at offset {offset} with length {length}")
278
 
279
  # Find the words that fall within the error span
280
  current_pos = 0
 
346
  full_text = extract_pdf_text(filepath)
347
  if not full_text:
348
  return {"error": "Failed to extract text from PDF."}, None
349
+
350
+ # print(full_text)
351
  language_issues = check_language_issues(full_text)
352
 
353
  # Handle potential errors from check_language_issues
 
453
  if __name__ == "__main__":
454
  interface = create_interface()
455
  interface.launch(
456
+ share=False, # Set to False in production
457
  # server_name="0.0.0.0",
458
  server_port=None
459
  )