Spaces:

sblumenf
/

pdf-convert

Sleeping

App Files Community

sblumenf committed on Dec 12, 2024

Commit

f598e4b

verified ·

1 Parent(s): 7dec78f

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -5

app.py CHANGED Viewed

@@ -2,7 +2,9 @@ import json
 import gradio as gr
 from pdfminer.high_level import extract_pages, extract_text
 from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
-import mistletoe  # for Markdown table generation (optional)
 def parse_pdf(pdf_file, output_format):
     with open(pdf_file, 'rb') as file:
@@ -37,10 +39,14 @@ def parse_pdf(pdf_file, output_format):
         download_data = json.dumps(json_data).encode("utf-8")  # Encode JSON for download
     elif output_format == "Markdown":
-        # Implement table conversion using mistletoe or other Markdown libraries
-        markdown_tables = mistletoe.markdown(convert_table=True)(tables)  # Example using mistletoe
-        markdown_text = f"# Extracted Text\n\n{text}\n\n{markdown_tables}\n\n# Images\n"
         # Implement image conversion (e.g., relative paths or base64 encoding)
         # ...
         download_data = markdown_text.encode("utf-8")
@@ -49,7 +55,7 @@ def parse_pdf(pdf_file, output_format):
         # Implement table conversion using HTML table tags
         html_tables = "<table>"  # Start of HTML table (replace with actual table structure)
         # ... (Implement table data conversion to HTML)
-        html_tables += "</table>"
         html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n{html_tables}\n\n<h2>Images</h2>\n"
         # Implement image conversion using `<img>` tag

 import gradio as gr
 from pdfminer.high_level import extract_pages, extract_text
 from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
+# Optional import for Markdown table generation (comment out if not needed)
+# import mistletoe
 def parse_pdf(pdf_file, output_format):
     with open(pdf_file, 'rb') as file:
         download_data = json.dumps(json_data).encode("utf-8")  # Encode JSON for download
     elif output_format == "Markdown":
+        # Implement table conversion using mistletoe or other Markdown libraries (uncomment if needed)
+        # markdown_tables = mistletoe.markdown(convert_table=True)(tables)  # Example using mistletoe
+        markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
+        # Implement table conversion (e.g., manually create Markdown table structure)
+        #  ... (replace with your table conversion logic)
+        # markdown_text += markdown_tables  # Uncomment if using mistletoe
+        markdown_text += "\n# Images\n"
         # Implement image conversion (e.g., relative paths or base64 encoding)
         # ...
         download_data = markdown_text.encode("utf-8")
         # Implement table conversion using HTML table tags
         html_tables = "<table>"  # Start of HTML table (replace with actual table structure)
         # ... (Implement table data conversion to HTML)
+        # html_tables += "</table>"
         html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n{html_tables}\n\n<h2>Images</h2>\n"
         # Implement image conversion using `<img>` tag