Spaces:

samyak152002
/

texmetrics-regex-checks-gradio-1

Sleeping

App Files Files Community

samyak152002 committed on Nov 30, 2024

Commit

6a6e3b4

verified ·

1 Parent(s): 4dd18db

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -92

app.py CHANGED Viewed

@@ -33,17 +33,17 @@ def extract_pdf_text(file) -> str:
     print("me llamo samyak")
     try:
         # Open the PDF file
-        print("me llamo samyak")
         doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
         full_text = ""
-        print(doc)
         for page_num, page in enumerate(doc, start=1):
             text = page.get_text("text")
             full_text += text + "\n"
             print(f"Extracted text from page {page_num}: {len(text)} characters.")
         doc.close()
         print(f"Total extracted text length: {len(full_text)} characters.")
-        print(full_text)
         return full_text
     except Exception as e:
         print(f"Error extracting text from PDF: {e}")
@@ -120,7 +120,8 @@ def check_language_issues(full_text: str) -> Dict[str, Any]:
                 "category": match.category,
                 "rule_id": match.ruleId,
                 "offset": match.offset,
-                "length": match.errorLength
             })
         print(f"Total language issues found: {len(issues)}")
         return {
@@ -190,85 +191,6 @@ def check_reference_order(full_text: str) -> Dict[str, Any]:
         "is_ordered": len(out_of_order) == 0 and len(missing_refs) == 0
     }
-def check_reference_style(full_text: str) -> Dict[str, Any]:
-    """Check the reference style used in the paper and identify inconsistencies."""
-    reference_section_match = re.search(r'References\b([\s\S]*?)(?:\n\S|\Z)', full_text, re.IGNORECASE)
-    if not reference_section_match:
-        return {"style": "Unknown", "reason": "References section not found", "inconsistent_refs": []}
-    references_text = reference_section_match.group(1)
-    reference_list = re.split(r'\n(?=\[\d+\]|\d+\.\s|\(\w+,\s*\d{4}\))', references_text)
-    references = [ref.strip() for ref in reference_list if ref.strip()]
-    styles = []
-    inconsistent_refs = []
-    patterns = {
-        "IEEE": r'^\[\d+\]',
-        "Harvard": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
-        "APA": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
-        "MLA": r'^[A-Z][a-z]+,\s[A-Z][a-z]+\.',
-        "Vancouver": r'^\d+\.\s',
-        "Chicago": r'^\d+\s[A-Z][a-z]+\s[A-Z]',
-    }
-    for i, ref in enumerate(references, 1):
-        matched = False
-        for style, pattern in patterns.items():
-            if re.match(pattern, ref):
-                styles.append(style)
-                matched = True
-                break
-        if not matched:
-            styles.append("Unknown")
-            inconsistent_refs.append((i, ref, "Unknown"))
-    if not styles:
-        return {"style": "Unknown", "reason": "No references found", "inconsistent_refs": []}
-    style_counts = Counter(styles)
-    majority_style, majority_count = style_counts.most_common(1)[0]
-    for i, style in enumerate(styles, 1):
-        if style != majority_style and style != "Unknown":
-            inconsistent_refs.append((i, references[i-1], style))
-    consistency = majority_count / len(styles)
-    return {
-        "majority_style": majority_style,
-        "inconsistent_refs": inconsistent_refs,
-        "consistency": consistency
-    }
-# ------------------------------
-# Annotation Functions
-# ------------------------------
-def highlight_text(page, words, text, annotation):
-    """Highlight text and add annotation."""
-    text_instances = find_text_instances(words, text)
-    highlighted = False
-    for inst in text_instances:
-        highlight = page.add_highlight_annot(inst)
-        highlight.update()
-        comment = page.add_text_annot(inst[:2], annotation)
-        comment.update()
-        highlighted = True
-    return highlighted
-def find_text_instances(words, text):
-    """Find all instances of text in words."""
-    text_lower = text.lower()
-    text_words = text_lower.split()
-    instances = []
-    for i in range(len(words) - len(text_words) + 1):
-        if all(words[i+j][4].lower() == text_words[j] for j in range(len(text_words))):
-            inst = fitz.Rect(words[i][:4])
-            for j in range(1, len(text_words)):
-                inst = inst | fitz.Rect(words[i+j][:4])
-            instances.append(inst)
-    return instances
 def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
     """
     Highlights language issues in the PDF and returns the annotated PDF as bytes.
@@ -279,13 +201,14 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
         # Open the PDF
         doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
         print(f"Opened PDF with {len(doc)} pages.")
         # Extract words with positions from each page
         word_list = []  # List of tuples: (page_number, word, x0, y0, x1, y1)
         for page_number in range(len(doc)):
             page = doc[page_number]
             words = page.get_text("words")  # List of tuples: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
             for w in words:
                 word_text = w[4]
                 # **Fix:** Insert a space before '[' to ensure "globally [2]" instead of "globally[2]"
                 if '[' in word_text:
@@ -318,7 +241,12 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
             if not target_words:
                 print("No matching words found for this issue.")
                 continue
             # Add highlight annotations to the target words
             for target in target_words:
                 page_num, word_text, x0, y0, x1, y1 = target
@@ -330,6 +258,7 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
                 highlight.set_colors(stroke=(1, 1, 0))  # Yellow color
                 highlight.update()
                 print(f"Highlighted '{word_text}' on page {page_num + 1} at position ({x0}, {y0}, {x1}, {y1})")
         # Save annotated PDF to bytes
         byte_stream = io.BytesIO()
@@ -342,7 +271,7 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
             f.write(annotated_pdf_bytes)
         print("Annotated PDF saved as 'annotated_temp.pdf' for manual verification.")
-        return annotated_pdf_bytes
     except Exception as e:
         print(f"Error in highlighting PDF: {e}")
         return b""
@@ -358,12 +287,11 @@ def analyze_pdf(filepath: str) -> Tuple[Dict[str, Any], bytes]:
             return {"error": "Failed to extract text from PDF."}, None
         language_issues = check_language_issues(full_text)
-        if "error" in language_issues:
-            return language_issues, None
-        issues = language_issues.get("issues", [])
-        annotated_pdf = highlight_issues_in_pdf(filepath, issues) if issues else None
-        return language_issues, annotated_pdf
     except Exception as e:
         return {"error": str(e)}, None

     print("me llamo samyak")
     try:
         # Open the PDF file
+#         print("me llamo samyak")
         doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
         full_text = ""
+#         print(doc)
         for page_num, page in enumerate(doc, start=1):
             text = page.get_text("text")
             full_text += text + "\n"
             print(f"Extracted text from page {page_num}: {len(text)} characters.")
         doc.close()
         print(f"Total extracted text length: {len(full_text)} characters.")
+#         print(full_text)
         return full_text
     except Exception as e:
         print(f"Error extracting text from PDF: {e}")
                 "category": match.category,
                 "rule_id": match.ruleId,
                 "offset": match.offset,
+                "length": match.errorLength,
+                "coordinates":[]
             })
         print(f"Total language issues found: {len(issues)}")
         return {
         "is_ordered": len(out_of_order) == 0 and len(missing_refs) == 0
     }
 def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
     """
     Highlights language issues in the PDF and returns the annotated PDF as bytes.
         # Open the PDF
         doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
         print(f"Opened PDF with {len(doc)} pages.")
+        print(language_matches)
         # Extract words with positions from each page
         word_list = []  # List of tuples: (page_number, word, x0, y0, x1, y1)
         for page_number in range(len(doc)):
             page = doc[page_number]
             words = page.get_text("words")  # List of tuples: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
             for w in words:
+#                 print(w)
                 word_text = w[4]
                 # **Fix:** Insert a space before '[' to ensure "globally [2]" instead of "globally[2]"
                 if '[' in word_text:
             if not target_words:
                 print("No matching words found for this issue.")
                 continue
+            initial_x = target_words[0][2]
+            initial_y = target_words[0][3]
+            final_x = target_words[len(target_words)-1][4]
+            final_y = target_words[len(target_words)-1][5]
+            issue["coordinates"] = [initial_x, initial_y, final_x, final_y]
             # Add highlight annotations to the target words
             for target in target_words:
                 page_num, word_text, x0, y0, x1, y1 = target
                 highlight.set_colors(stroke=(1, 1, 0))  # Yellow color
                 highlight.update()
                 print(f"Highlighted '{word_text}' on page {page_num + 1} at position ({x0}, {y0}, {x1}, {y1})")
         # Save annotated PDF to bytes
         byte_stream = io.BytesIO()
             f.write(annotated_pdf_bytes)
         print("Annotated PDF saved as 'annotated_temp.pdf' for manual verification.")
+        return language_matches, annotated_pdf_bytes
     except Exception as e:
         print(f"Error in highlighting PDF: {e}")
         return b""
             return {"error": "Failed to extract text from PDF."}, None
         language_issues = check_language_issues(full_text)
+        if language_issues:
+            issues = language_issues.get("issues", [])
+            language_issues, annotated_pdf = highlight_issues_in_pdf(filepath, issues) if issues else None
+            return language_issues, annotated_pdf
     except Exception as e:
         return {"error": str(e)}, None