Spaces:

sblumenf
/

pdf-convert

Sleeping

App Files Community

sblumenf committed on Dec 12, 2024

Commit

8e00b94

verified ·

1 Parent(s): ce354c6

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -46

app.py CHANGED Viewed

@@ -1,57 +1,25 @@
-import PyPDF2
-from pdfminer.high_level import extract_pages
-from pdfminer.layout import LTTextBoxHorizontal, LTFigure
 import gradio as gr
-def process_figure(fig):
-    # Replace this with your actual figure processing logic (e.g., save image, get URL)
-    # This is a placeholder for demonstration purposes
-    processed_image_url = "https://via.placeholder.com/150"  # Placeholder image URL
-    return processed_image_url
 def parse_pdf(pdf_file, output_format):
     with open(pdf_file, 'rb') as file:
-        pages = extract_pages(file)
-        text = ""
-        tables = []  # Placeholder for tables (implementation needed)
-        figures = []
-        for page in pages:
-            for element in page:
-                if isinstance(element, LTTextBoxHorizontal):
-                    text += element.get_text()
-                elif isinstance(element, LTFigure):
-                    figures.append(element)
-        # Extract tables (more advanced techniques might be needed)
-        # ... (Implement table extraction logic here)
-    if output_format == "JSON":
-        # Replace this with your JSON conversion logic, including tables and figures
-        json_output = {"text": text, "figures": figures}  # Placeholder for JSON conversion
-        return json_output
-    elif output_format == "Markdown":
-        processed_image_url = ""
-        markdown_output = f"# Extracted Text\n\n{text}\n\n# Figures\n"
-        for fig in figures:
-            # Process each figure (e.g., save as image)
-            processed_image_url = process_figure(fig)
-            markdown_output += f"\n![]({processed_image_url})"
-        return markdown_output
-    elif output_format == "HTML":
-        processed_image_url = ""  # Define outside the loop for HTML output
-        html_output = f"<p>{text}</p>\n"
-        for fig in figures:
-            # Process each figure (e.g., save as image)
-            processed_image_url = process_figure(fig)
-            html_output += f"<img src='{processed_image_url}' alt='Figure'>"
-        return html_output
 # Create the Gradio interface
 iface = gr.Interface(
     fn=parse_pdf,
-    inputs=["file", gr.Dropdown(["JSON", "Markdown", "HTML"])],
     outputs="text",
     title="PDF Parser",
     description="Parse a PDF and choose the output format."

+import marker
 import gradio as gr
 def parse_pdf(pdf_file, output_format):
     with open(pdf_file, 'rb') as file:
+        if output_format == "Markdown":
+            markdown_text = marker.convert(file)
+            return markdown_text
+        elif output_format == "HTML":
+            # Convert to HTML using marker-pdf's advanced parsing capabilities
+            # You might need to explore additional options and parameters for fine-tuning the output
+            html_text = marker.convert(file, output_format="html")
+            return html_text
+        elif output_format == "JSON":
+            # Convert to JSON using marker-pdf's structured output
+            json_output = marker.convert(file, output_format="json")
+            return json_output
 # Create the Gradio interface
 iface = gr.Interface(
     fn=parse_pdf,
+    inputs=["file", gr.Dropdown(["Markdown", "HTML", "JSON"])],
     outputs="text",
     title="PDF Parser",
     description="Parse a PDF and choose the output format."