Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -2,6 +2,7 @@ import json
|
|
2 |
import gradio as gr
|
3 |
from pdfminer.high_level import extract_pages, extract_text
|
4 |
from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
|
|
|
5 |
|
6 |
def parse_pdf(pdf_file, output_format):
|
7 |
with open(pdf_file, 'rb') as file:
|
@@ -21,7 +22,9 @@ def parse_pdf(pdf_file, output_format):
|
|
21 |
# Here's an example of saving images with a unique filename
|
22 |
image_data = element # Replace with your image extraction logic
|
23 |
image_filename = f"extracted_image_{len(images)}.jpg"
|
24 |
-
#
|
|
|
|
|
25 |
images.append({"filename": image_filename}) # Add filename to image data
|
26 |
|
27 |
# Implement table extraction logic (e.g., using heuristics or advanced techniques)
|
@@ -58,7 +61,11 @@ def parse_pdf(pdf_file, output_format):
|
|
58 |
# ... (e.g., use `<img>` tags with image source)
|
59 |
download_data = html_text.encode("utf-8")
|
60 |
|
61 |
-
|
|
|
|
|
|
|
|
|
62 |
|
63 |
iface = gr.Interface(
|
64 |
fn=parse_pdf,
|
|
|
2 |
import gradio as gr
|
3 |
from pdfminer.high_level import extract_pages, extract_text
|
4 |
from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
|
5 |
+
import os # Import os for file path manipulation
|
6 |
|
7 |
def parse_pdf(pdf_file, output_format):
|
8 |
with open(pdf_file, 'rb') as file:
|
|
|
22 |
# Here's an example of saving images with a unique filename
|
23 |
image_data = element # Replace with your image extraction logic
|
24 |
image_filename = f"extracted_image_{len(images)}.jpg"
|
25 |
+
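# Save the image using the filename. NOTE(review): image_data must be raw bytes here; above it is still the layout element placeholder, so real extraction (e.g. the LTImage stream data) needs to replace it first — TODO confirm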
|
26 |
+
with open(image_filename, 'wb') as image_file:
|
27 |
+
image_file.write(image_data) # Assuming image_data is binary data
|
28 |
images.append({"filename": image_filename}) # Add filename to image data
|
29 |
|
30 |
# Implement table extraction logic (e.g., using heuristics or advanced techniques)
|
|
|
61 |
# ... (e.g., use `<img>` tags with image source)
|
62 |
download_data = html_text.encode("utf-8")
|
63 |
|
64 |
+
# Create a temporary directory to store downloaded files (optional)
|
65 |
+
# download_dir = tempfile.mkdtemp() # Uncomment if needed for temporary storage (also requires `import tempfile` at the top of the file)
|
66 |
+
|
67 |
+
# Return the extracted text and the filename (or path) for download
|
68 |
+
return text, os.path.join(os.getcwd(), images[0]["filename"]) # Example using first image
|
69 |
|
70 |
iface = gr.Interface(
|
71 |
fn=parse_pdf,
|