Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -2,7 +2,9 @@ import json
|
|
2 |
import gradio as gr
|
3 |
from pdfminer.high_level import extract_pages, extract_text
|
4 |
from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
|
5 |
-
|
|
|
|
|
6 |
|
7 |
def parse_pdf(pdf_file, output_format):
|
8 |
with open(pdf_file, 'rb') as file:
|
@@ -37,10 +39,14 @@ def parse_pdf(pdf_file, output_format):
|
|
37 |
download_data = json.dumps(json_data).encode("utf-8") # Encode JSON for download
|
38 |
|
39 |
elif output_format == "Markdown":
|
40 |
-
# Implement table conversion using mistletoe or other Markdown libraries
|
41 |
-
markdown_tables = mistletoe.markdown(convert_table=True)(tables) # Example using mistletoe
|
42 |
|
43 |
-
markdown_text = f"# Extracted Text\n\n{text}\n\n
|
|
|
|
|
|
|
|
|
44 |
# Implement image conversion (e.g., relative paths or base64 encoding)
|
45 |
# ...
|
46 |
download_data = markdown_text.encode("utf-8")
|
@@ -49,7 +55,7 @@ def parse_pdf(pdf_file, output_format):
|
|
49 |
# Implement table conversion using HTML table tags
|
50 |
html_tables = "<table>" # Start of HTML table (replace with actual table structure)
|
51 |
# ... (Implement table data conversion to HTML)
|
52 |
-
html_tables += "</table>"
|
53 |
|
54 |
html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n{html_tables}\n\n<h2>Images</h2>\n"
|
55 |
# Implement image conversion using `<img>` tag
|
|
|
2 |
import gradio as gr
|
3 |
from pdfminer.high_level import extract_pages, extract_text
|
4 |
from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
|
5 |
+
|
6 |
+
# Optional import for Markdown table generation (uncomment if needed)
|
7 |
+
# import mistletoe
|
8 |
|
9 |
def parse_pdf(pdf_file, output_format):
|
10 |
with open(pdf_file, 'rb') as file:
|
|
|
39 |
download_data = json.dumps(json_data).encode("utf-8") # Encode JSON for download
|
40 |
|
41 |
elif output_format == "Markdown":
|
42 |
+
# Implement table conversion using mistletoe or other Markdown libraries (uncomment if needed)
|
43 |
+
# markdown_tables = mistletoe.markdown(convert_table=True)(tables) # Example using mistletoe
|
44 |
|
45 |
+
markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
|
46 |
+
# Implement table conversion (e.g., manually create Markdown table structure)
|
47 |
+
# ... (replace with your table conversion logic)
|
48 |
+
# markdown_text += markdown_tables # Uncomment if using mistletoe
|
49 |
+
markdown_text += "\n# Images\n"
|
50 |
# Implement image conversion (e.g., relative paths or base64 encoding)
|
51 |
# ...
|
52 |
download_data = markdown_text.encode("utf-8")
|
|
|
55 |
# Implement table conversion using HTML table tags
|
56 |
html_tables = "<table>" # Start of HTML table (replace with actual table structure)
|
57 |
# ... (Implement table data conversion to HTML)
|
58 |
+
# html_tables += "</table>"
|
59 |
|
60 |
html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n{html_tables}\n\n<h2>Images</h2>\n"
|
61 |
# Implement image conversion using `<img>` tag
|