Spaces:

sblumenf
/

pdf-convert

Sleeping

sblumenf committed on Dec 12, 2024

Commit

c506d0d

verified ·

1 Parent(s): f15272f

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ import json
 import gradio as gr
 from pdfminer.high_level import extract_pages, extract_text
 from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
 def parse_pdf(pdf_file, output_format):
     with open(pdf_file, 'rb') as file:
@@ -21,7 +22,9 @@ def parse_pdf(pdf_file, output_format):
                     # Here's an example of saving images with a unique filename
                     image_data = element  # Replace with your image extraction logic
                     image_filename = f"extracted_image_{len(images)}.jpg"
-                    # ... (Implement image saving logic using the filename)
                     images.append({"filename": image_filename})  # Add filename to image data
         # Implement table extraction logic (e.g., using heuristics or advanced techniques)
@@ -58,7 +61,11 @@ def parse_pdf(pdf_file, output_format):
         # ... (e.g., use `<img>` tags with image source)
         download_data = html_text.encode("utf-8")
-    return text, download_data
 iface = gr.Interface(
     fn=parse_pdf,

 import gradio as gr
 from pdfminer.high_level import extract_pages, extract_text
 from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
+import os  # Import os for file path manipulation
 def parse_pdf(pdf_file, output_format):
     with open(pdf_file, 'rb') as file:
                     # Here's an example of saving images with a unique filename
                     image_data = element  # Replace with your image extraction logic
                     image_filename = f"extracted_image_{len(images)}.jpg"
+                    # Save the image using the filename
+                    with open(image_filename, 'wb') as image_file:
+                        image_file.write(image_data)  # Assuming image_data is binary data
                     images.append({"filename": image_filename})  # Add filename to image data
         # Implement table extraction logic (e.g., using heuristics or advanced techniques)
         # ... (e.g., use `<img>` tags with image source)
         download_data = html_text.encode("utf-8")
+    # Create a temporary directory to store downloaded files (optional)
+    # download_dir = tempfile.mkdtemp()  # Uncomment if needed for temporary storage
+    # Return the extracted text and the filename (or path) for download
+    return text, os.path.join(os.getcwd(), images[0]["filename"])  # Example using first image
 iface = gr.Interface(
     fn=parse_pdf,