Spaces:

samyak152002
/

texmetrics-regex-checks-gradio-1

Running

App Files Community

samyak152002 committed on Oct 7, 2024

Commit

3700c3a

verified ·

1 Parent(s): 6ecdc78

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -34

app.py CHANGED Viewed

@@ -13,6 +13,7 @@ import sys
 import traceback
 import io
 import os
 class PDFAnalyzer:
     def __init__(self, file_path: str):
@@ -32,11 +33,11 @@ class PDFAnalyzer:
     def check_text_presence(self, search_terms: List[str]) -> Dict[str, bool]:
         """Checks for the presence of required terms in the text."""
-        return {term: term in self.full_text for term in search_terms}
     def label_authors(self) -> str:
         """Label authors in the text with 'Authors:' if not already labeled."""
-        author_line_regex = r"^(?:.*\n)(.*?)(?:\n\nNetaji Subhas University of Technology, Dwarka, Delhi, 110078, India)"
         match = re.search(author_line_regex, self.full_text, re.MULTILINE)
         if match:
             authors = match.group(1).strip()
@@ -314,49 +315,62 @@ class PDFAnalyzer:
 def analyze_pdf(file):
     try:
-        # Save the uploaded file temporarily
-        temp_path = "temp_uploaded.pdf"
-        with open(temp_path, "wb") as f:
-            f.write(file.read())
-        analyzer = PDFAnalyzer(temp_path)
-        results = analyzer.analyze()
-        # Ensure all keys are present in the results, even if they're empty
-        default_results = {
-            "annotated_pdf_path": "",
-            "metadata": {},
-            "disclosures": {},
-            "figures_and_tables": {},
-            "figure_order": {},
-            "references": {},
-            "reference_order": {},
-            "reference_style": {},
-            "structure": {},
-            "language": {},
-        }
-        # Update default_results with actual results
-        default_results.update(results)
-        return json.dumps(default_results, indent=2, default=str)
     except Exception as e:
         error_message = {
             "error": str(e),
             "traceback": traceback.format_exc()
         }
-        return json.dumps(error_message, indent=2)
-    finally:
-        # Clean up the temporary file
-        if os.path.exists(temp_path):
-            os.remove(temp_path)
 # Create Gradio interface
 iface = gr.Interface(
     fn=analyze_pdf,
     inputs=gr.File(label="Upload PDF"),
-    outputs=gr.JSON(label="Analysis Results"),
     title="PDF Analyzer",
     description="Upload a PDF document to analyze its structure, references, language, and more.",
 )

 import traceback
 import io
 import os
+import tempfile
 class PDFAnalyzer:
     def __init__(self, file_path: str):
     def check_text_presence(self, search_terms: List[str]) -> Dict[str, bool]:
         """Checks for the presence of required terms in the text."""
+        return {term: term.lower() in self.full_text.lower() for term in search_terms}
     def label_authors(self) -> str:
         """Label authors in the text with 'Authors:' if not already labeled."""
+        author_line_regex = r"^(?:.*\n)(.*?)(?:\n\n)"
         match = re.search(author_line_regex, self.full_text, re.MULTILINE)
         if match:
             authors = match.group(1).strip()
 def analyze_pdf(file):
     try:
+        # Create a temporary directory to store files
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # Save the uploaded file temporarily
+            temp_path = os.path.join(temp_dir, "uploaded.pdf")
+            with open(temp_path, "wb") as f:
+                f.write(file.read())
+            analyzer = PDFAnalyzer(temp_path)
+            results = analyzer.analyze()
+            # Ensure all keys are present in the results, even if they're empty
+            default_results = {
+                "annotated_pdf_path": "",
+                "metadata": {},
+                "disclosures": {},
+                "figures_and_tables": {},
+                "figure_order": {},
+                "references": {},
+                "reference_order": {},
+                "reference_style": {},
+                "structure": {},
+                "language": {},
+            }
+            # Update default_results with actual results
+            default_results.update(results)
+            # Handle the annotated PDF
+            annotated_pdf_path = results.get("annotated_pdf_path", "")
+            if annotated_pdf_path and os.path.exists(annotated_pdf_path):
+                # Read the annotated PDF and return it as bytes
+                with open(annotated_pdf_path, "rb") as f:
+                    annotated_pdf_bytes = f.read()
+            else:
+                annotated_pdf_bytes = None
+            # Remove the annotated_pdf_path from the results as we're returning the file separately
+            default_results.pop("annotated_pdf_path", None)
+            return json.dumps(default_results, indent=2, default=str), annotated_pdf_bytes
     except Exception as e:
         error_message = {
             "error": str(e),
             "traceback": traceback.format_exc()
         }
+        return json.dumps(error_message, indent=2), None
 # Create Gradio interface
 iface = gr.Interface(
     fn=analyze_pdf,
     inputs=gr.File(label="Upload PDF"),
+    outputs=[
+        gr.JSON(label="Analysis Results"),
+        gr.File(label="Annotated PDF")
+    ],
     title="PDF Analyzer",
     description="Upload a PDF document to analyze its structure, references, language, and more.",
 )