Spaces:

samyak152002
/

texmetrics-regex-checks-gradio-1

Running

App Files Files Community

samyak152002 committed on Nov 19, 2024

Commit

4dd18db

verified ·

1 Parent(s): e444d56

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -37

app.py CHANGED Viewed

@@ -30,16 +30,20 @@ def extract_pdf_text_by_page(file) -> List[str]:
 def extract_pdf_text(file) -> str:
     """Extracts full text from a PDF file using PyMuPDF."""
     try:
         # Open the PDF file
         doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
         full_text = ""
         for page_num, page in enumerate(doc, start=1):
             text = page.get_text("text")
             full_text += text + "\n"
             print(f"Extracted text from page {page_num}: {len(text)} characters.")
         doc.close()
         print(f"Total extracted text length: {len(full_text)} characters.")
         return full_text
     except Exception as e:
         print(f"Error extracting text from PDF: {e}")
@@ -346,12 +350,10 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
 # Main Analysis Function
 # ------------------------------
-def analyze_pdf(file) -> Tuple[Dict[str, Any], bytes]:
     """Analyzes the PDF for language issues and returns results and annotated PDF."""
     try:
-        # Reset file pointer before reading
-        file.seek(0)
-        full_text = extract_pdf_text(file)
         if not full_text:
             return {"error": "Failed to extract text from PDF."}, None
@@ -360,9 +362,7 @@ def analyze_pdf(file) -> Tuple[Dict[str, Any], bytes]:
             return language_issues, None
         issues = language_issues.get("issues", [])
-        # Reset file pointer before highlighting
-        file.seek(0)
-        annotated_pdf = highlight_issues_in_pdf(file, issues) if issues else None
         return language_issues, annotated_pdf
     except Exception as e:
         return {"error": str(e)}, None
@@ -375,36 +375,45 @@ def process_upload(file):
     """
     Process the uploaded PDF file and return analysis results and annotated PDF.
     """
-    try:
-        if file is None:
-            return json.dumps({"error": "No file uploaded"}, indent=2), None
-        # Create a temporary file to work with
-        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_input:
-            temp_input.write(file)
-            temp_input_path = temp_input.name
-        # Analyze the PDF
-        results, annotated_pdf = analyze_pdf(temp_input_path)
-        results_json = json.dumps(results, indent=2)
-        # Clean up the temporary input file
-        os.unlink(temp_input_path)
-        # If we have an annotated PDF, save it temporarily
-        if annotated_pdf:
-            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
-                tmp_file.write(annotated_pdf)
-                return results_json, tmp_file.name
-        return results_json, None
-    except Exception as e:
-        error_message = json.dumps({
-            "error": str(e),
-            "traceback": traceback.format_exc()
-        }, indent=2)
-        return error_message, None
 def create_interface():
@@ -416,7 +425,7 @@ def create_interface():
             file_input = gr.File(
                 label="Upload PDF",
                 file_types=[".pdf"],
-                type="binary"  # Changed from "file" to "binary"
             )
         with gr.Row():

 def extract_pdf_text(file) -> str:
     """Extracts full text from a PDF file using PyMuPDF."""
+    print("me llamo samyak")
     try:
         # Open the PDF file
+        print("me llamo samyak")
         doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
         full_text = ""
+        print(doc)
         for page_num, page in enumerate(doc, start=1):
             text = page.get_text("text")
             full_text += text + "\n"
             print(f"Extracted text from page {page_num}: {len(text)} characters.")
         doc.close()
         print(f"Total extracted text length: {len(full_text)} characters.")
+        print(full_text)
         return full_text
     except Exception as e:
         print(f"Error extracting text from PDF: {e}")
 # Main Analysis Function
 # ------------------------------
+def analyze_pdf(filepath: str) -> Tuple[Dict[str, Any], bytes]:
     """Analyzes the PDF for language issues and returns results and annotated PDF."""
     try:
+        full_text = extract_pdf_text(filepath)
         if not full_text:
             return {"error": "Failed to extract text from PDF."}, None
             return language_issues, None
         issues = language_issues.get("issues", [])
+        annotated_pdf = highlight_issues_in_pdf(filepath, issues) if issues else None
         return language_issues, annotated_pdf
     except Exception as e:
         return {"error": str(e)}, None
     """
     Process the uploaded PDF file and return analysis results and annotated PDF.
     """
+    # print(file.name)
+    if file is None:
+        return json.dumps({"error": "No file uploaded"}, indent=2), None
+    # # Create a temporary file to work with
+    # with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_input:
+    #     temp_input.write(file)
+    #     temp_input_path = temp_input.name
+    #     print(temp_input_path)
+    temp_input = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
+    temp_input.write(file)
+    temp_input_path = temp_input.name
+    print(temp_input_path)
+    # Analyze the PDF
+    results, annotated_pdf = analyze_pdf(temp_input_path)
+    print(results)
+    results_json = json.dumps(results, indent=2)
+    # Clean up the temporary input file
+    os.unlink(temp_input_path)
+    # If we have an annotated PDF, save it temporarily
+    if annotated_pdf:
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
+            tmp_file.write(annotated_pdf)
+            return results_json, tmp_file.name
+    return results_json, None
+    # except Exception as e:
+    #     error_message = json.dumps({
+    #         "error": str(e),
+    #         "traceback": traceback.format_exc()
+    #     }, indent=2)
+    #     return error_message, None
 def create_interface():
             file_input = gr.File(
                 label="Upload PDF",
                 file_types=[".pdf"],
+                type="binary"
             )
         with gr.Row():