Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -5,36 +5,7 @@ from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
|
|
5 |
import os # Import os for file path manipulation
|
6 |
|
7 |
def parse_pdf(pdf_file, output_format):
|
8 |
-
|
9 |
-
pages = extract_pages(file)
|
10 |
-
|
11 |
-
text = ""
|
12 |
-
tables = [] # Placeholder for extracted table data
|
13 |
-
images = [] # List to store extracted image data
|
14 |
-
|
15 |
-
for page in pages:
|
16 |
-
for element in page:
|
17 |
-
if isinstance(element, LTTextBoxHorizontal):
|
18 |
-
text += element.get_text()
|
19 |
-
elif isinstance(element, (LTFigure, LTImage)):
|
20 |
-
# Extract image data (e.g., save as image, convert to base64)
|
21 |
-
# ... (Implement image processing logic)
|
22 |
-
# Here's an example of extracting image data and saving the image
|
23 |
-
if hasattr(element, 'stream'): # Check for image data stream (LTImage)
|
24 |
-
image_data = element.stream.read()
|
25 |
-
else: # Handle LTFigure (may require additional processing)
|
26 |
-
# ... (Implement logic to extract image data from LTFigure)
|
27 |
-
# You might need libraries like Pillow for image manipulation
|
28 |
-
image_data = b"Placeholder for extracted image data" # Example placeholder
|
29 |
-
|
30 |
-
image_filename = f"extracted_image_{len(images)}.jpg"
|
31 |
-
with open(image_filename, 'wb') as image_file:
|
32 |
-
image_file.write(image_data)
|
33 |
-
images.append({"filename": image_filename}) # Add filename to image data
|
34 |
-
|
35 |
-
# Implement table extraction logic (e.g., using heuristics or advanced techniques)
|
36 |
-
# You can use libraries like Camelot for complex tables
|
37 |
-
# ...
|
38 |
|
39 |
# Convert extracted data to desired format and populate download_data
|
40 |
if output_format == "JSON":
|
@@ -43,34 +14,17 @@ def parse_pdf(pdf_file, output_format):
|
|
43 |
"tables": tables, # Replace with actual table data
|
44 |
"images": images # List of dictionaries with filenames
|
45 |
}
|
46 |
-
download_data = json.dumps(json_data)
|
47 |
|
48 |
elif output_format == "Markdown":
|
49 |
-
#
|
50 |
-
|
51 |
-
|
52 |
-
markdown_text = f"# Extracted Text\n\n{text}\n\n# Images\n"
|
53 |
-
# Implement logic to embed images within Markdown (optional)
|
54 |
-
# ... (e.g., use relative paths if images are saved locally)
|
55 |
-
# or (consider alternative Markdown image embedding methods)
|
56 |
-
download_data = markdown_text.encode("utf-8")
|
57 |
|
58 |
elif output_format == "HTML":
|
59 |
-
#
|
60 |
-
|
61 |
-
# ... (Implement table data conversion to HTML)
|
62 |
-
# html_tables += "</table>"
|
63 |
-
|
64 |
-
html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n{html_tables}\n\n<h2>Images</h2>\n"
|
65 |
-
# Implement logic to display images within HTML (optional)
|
66 |
-
# ... (e.g., use `<img>` tags with image source)
|
67 |
-
download_data = html_text.encode("utf-8")
|
68 |
-
|
69 |
-
# Create a temporary directory to store downloaded files (optional)
|
70 |
-
# download_dir = tempfile.mkdtemp() # Uncomment if needed for temporary storage
|
71 |
|
72 |
-
|
73 |
-
return text, os.path.join(os.getcwd(), images[0]["filename"]) # Example using first image
|
74 |
|
75 |
iface = gr.Interface(
|
76 |
fn=parse_pdf,
|
@@ -84,4 +38,4 @@ iface = gr.Interface(
|
|
84 |
)
|
85 |
|
86 |
if __name__ == "__main__":
|
87 |
-
iface.launch(share=False) # Set share=False
|
|
|
5 |
import os # Import os for file path manipulation
|
6 |
|
7 |
def parse_pdf(pdf_file, output_format):
|
8 |
+
# ... (Your existing parsing logic)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
# Convert extracted data to desired format and populate download_data
|
11 |
if output_format == "JSON":
|
|
|
14 |
"tables": tables, # Replace with actual table data
|
15 |
"images": images # List of dictionaries with filenames
|
16 |
}
|
17 |
+
download_data = json.dumps(json_data) # No need to encode as Gradio handles it
|
18 |
|
19 |
elif output_format == "Markdown":
|
20 |
+
# ... (Your Markdown conversion logic)
|
21 |
+
download_data = markdown_text
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
elif output_format == "HTML":
|
24 |
+
# ... (Your HTML conversion logic)
|
25 |
+
download_data = html_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
+
return text, download_data
|
|
|
28 |
|
29 |
iface = gr.Interface(
|
30 |
fn=parse_pdf,
|
|
|
38 |
)
|
39 |
|
40 |
if __name__ == "__main__":
|
41 |
+
iface.launch(share=False) # Set share=False as Gradio warns about it on Hugging Face Spaces
|