Spaces:

sblumenf
/

pdf-convert

Sleeping

App Files Files Community

sblumenf committed on Dec 11, 2024

Commit

b4b5bbe

verified ·

1 Parent(s): cc5c62b

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -19

app.py CHANGED Viewed

@@ -1,27 +1,54 @@
 import gradio as gr
-import PyMuPDF as fitz  # Importing PyMuPDF as fitz
-# Function to extract text from a PDF
-def extract_pdf_text(file):
-    doc = fitz.open(file.name)  # Open the PDF file using PyMuPDF
-    text = ""
-    for page in doc:
-        text += page.get_text()  # Extract text from each page
-    return text
-# Gradio interface
-output_format_dropdown = gr.Dropdown(
-    choices=["txt", "pdf", "docx"],
-    label="Output Format",
-    default="txt"
-)
 iface = gr.Interface(
-    fn=extract_pdf_text,
-    inputs=gr.File(label="Upload PDF File"),
-    outputs=[gr.Textbox(label="Extracted Text"), output_format_dropdown],
-    live=True
 )
 if __name__ == "__main__":
-    iface.launch()

+import PyPDF2
+from pdfminer.high_level import extract_pages
+from pdfminer.layout import LTTextBoxHorizontal, LTFigure
 import gradio as gr
+def parse_pdf(pdf_file, output_format):
+    """Extract text and figures from a PDF and render them in the chosen format.
+
+    Args:
+        pdf_file: Path to the PDF, or an uploaded-file object that exposes the
+            path as ``.name`` (some Gradio versions pass such a wrapper for
+            "file" inputs instead of a plain path).
+        output_format: One of "JSON", "Markdown" or "HTML".
+
+    Returns:
+        For "JSON", a dict with "text" and "figures" keys; for "Markdown" and
+        "HTML", a string.
+
+    Raises:
+        ValueError: If no file was supplied or the format is not recognised.
+    """
+    # Local imports keep this block self-contained; only stdlib is added.
+    import html
+    import os
+
+    if pdf_file is None:
+        raise ValueError("No PDF file provided.")
+    if output_format not in ("JSON", "Markdown", "HTML"):
+        # Previously an unknown/unselected format silently returned None.
+        raise ValueError(f"Unsupported output format: {output_format!r}")
+
+    # Anything open() accepts directly is used as-is; otherwise fall back to
+    # the wrapper's .name. (A blanket getattr(..., "name") would be wrong for
+    # pathlib.Path, whose .name is only the final path component.)
+    if isinstance(pdf_file, (str, bytes, os.PathLike, int)):
+        pdf_path = pdf_file
+    else:
+        pdf_path = pdf_file.name
+
+    text_parts = []
+    figures = []
+    with open(pdf_path, 'rb') as file:
+        # extract_pages yields pages lazily, so iterate while the file is open.
+        for page in extract_pages(file):
+            for element in page:
+                if isinstance(element, LTTextBoxHorizontal):
+                    text_parts.append(element.get_text())
+                elif isinstance(element, LTFigure):
+                    figures.append(element)
+    text = "".join(text_parts)
+    # TODO: extract tables (more advanced techniques might be needed).
+
+    if output_format == "JSON":
+        # Placeholder for JSON conversion.
+        # NOTE: the figures are raw LTFigure objects and are not
+        # JSON-serialisable; convert them before encoding this dict.
+        return {"text": text, "figures": figures}
+    if output_format == "Markdown":
+        # TODO: handle figure conversion (e.g., saving as images). The empty
+        # per-figure loop that stood here had no body and was a syntax error.
+        return f"# Extracted Text\n\n{text}\n\n# Figures\n"
+    # output_format == "HTML"
+    # Escape the PDF text so characters such as < and & cannot break or
+    # inject markup.
+    # TODO: handle figure conversion (e.g., embedding images).
+    return f"<p>{html.escape(text)}</p>\n"
+# Create the Gradio interface: an uploaded file and an output-format dropdown
+# are passed to parse_pdf, and its return value is shown in a text output.
 iface = gr.Interface(
+    fn=parse_pdf,
+    inputs=["file", gr.Dropdown(["JSON", "Markdown", "HTML"])],
+    outputs="text",
+    title="PDF Parser",
+    description="Parse a PDF and choose the output format."
 )
+# Launch the Gradio app only when this file is run as a script, not on import
 if __name__ == "__main__":
+    iface.launch()