Spaces:

sblumenf
/

pdf-convert

Sleeping

App Files Community

sblumenf committed on Dec 12, 2024

Commit

f15272f

verified ·

1 Parent(s): f598e4b

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -20

app.py CHANGED Viewed

@@ -3,27 +3,26 @@ import gradio as gr
 from pdfminer.high_level import extract_pages, extract_text
 from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
-# Optional import for Markdown table generation (comment out if not needed)
-# import mistletoe
 def parse_pdf(pdf_file, output_format):
     with open(pdf_file, 'rb') as file:
         pages = extract_pages(file)
         text = ""
         tables = []  # Placeholder for extracted table data
-        images = []  # Placeholder for extracted image data
         for page in pages:
             for element in page:
                 if isinstance(element, LTTextBoxHorizontal):
                     text += element.get_text()
-                elif isinstance(element, LTFigure):
-                    # Extract image data (e.g., save as image, convert to base64)
-                    images.append(element)
-                elif isinstance(element, LTImage):
                     # Extract image data (e.g., save as image, convert to base64)
-                    images.append(element)
         # Implement table extraction logic (e.g., using heuristics or advanced techniques)
         # You can use libraries like Camelot for complex tables
@@ -34,7 +33,7 @@ def parse_pdf(pdf_file, output_format):
         json_data = {
             "text": text,
             "tables": tables,  # Replace with actual table data
-            "images": images  # Replace with actual image data (e.g., base64)
         }
         download_data = json.dumps(json_data).encode("utf-8")  # Encode JSON for download
@@ -42,13 +41,10 @@ def parse_pdf(pdf_file, output_format):
         # Implement table conversion using mistletoe or other Markdown libraries (uncomment if needed)
         # markdown_tables = mistletoe.markdown(convert_table=True)(tables)  # Example using mistletoe
-        markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
-        # Implement table conversion (e.g., manually create Markdown table structure)
-        #  ... (replace with your table conversion logic)
-        # markdown_text += markdown_tables  # Uncomment if using mistletoe
-        markdown_text += "\n# Images\n"
-        # Implement image conversion (e.g., relative paths or base64 encoding)
-        # ...
         download_data = markdown_text.encode("utf-8")
     elif output_format == "HTML":
@@ -58,8 +54,8 @@ def parse_pdf(pdf_file, output_format):
         # html_tables += "</table>"
         html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n{html_tables}\n\n<h2>Images</h2>\n"
-        # Implement image conversion using `<img>` tag
-        # ...
         download_data = html_text.encode("utf-8")
     return text, download_data
@@ -76,4 +72,4 @@ iface = gr.Interface(
 )
 if __name__ == "__main__":
-    iface.launch(share=True)  # Set share=True to create a public link

 from pdfminer.high_level import extract_pages, extract_text
 from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
 def parse_pdf(pdf_file, output_format):
     with open(pdf_file, 'rb') as file:
         pages = extract_pages(file)
         text = ""
         tables = []  # Placeholder for extracted table data
+        images = []  # List to store extracted image data
         for page in pages:
             for element in page:
                 if isinstance(element, LTTextBoxHorizontal):
                     text += element.get_text()
+                elif isinstance(element, (LTFigure, LTImage)):
                     # Extract image data (e.g., save as image, convert to base64)
+                    # ... (Implement image processing logic)
+                    # Here's an example of saving images with a unique filename
+                    image_data = element  # Replace with your image extraction logic
+                    image_filename = f"extracted_image_{len(images)}.jpg"
+                    # ... (Implement image saving logic using the filename)
+                    images.append({"filename": image_filename})  # Add filename to image data
         # Implement table extraction logic (e.g., using heuristics or advanced techniques)
         # You can use libraries like Camelot for complex tables
         json_data = {
             "text": text,
             "tables": tables,  # Replace with actual table data
+            "images": images  # List of dictionaries with filenames
         }
         download_data = json.dumps(json_data).encode("utf-8")  # Encode JSON for download
         # Implement table conversion using mistletoe or other Markdown libraries (uncomment if needed)
         # markdown_tables = mistletoe.markdown(convert_table=True)(tables)  # Example using mistletoe
+        markdown_text = f"# Extracted Text\n\n{text}\n\n# Images\n"
+        # Implement logic to embed images within Markdown (optional)
+        # ... (e.g., use relative paths if images are saved locally)
+        #  or (consider alternative Markdown image embedding methods)
         download_data = markdown_text.encode("utf-8")
     elif output_format == "HTML":
         # html_tables += "</table>"
         html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n{html_tables}\n\n<h2>Images</h2>\n"
+        # Implement logic to display images within HTML (optional)
+        # ... (e.g., use `<img>` tags with image source)
         download_data = html_text.encode("utf-8")
     return text, download_data
 )
 if __name__ == "__main__":
+    iface.launch(share=False)  # Set share=False as Gradio warns about it on Hugging Face Spaces