Spaces:
Running
Running
File size: 3,329 Bytes
bf8799d 7d337b8 bf8799d 7d337b8 bf8799d 7d337b8 bf8799d 7d337b8 bf8799d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
import streamlit as st
import tempfile
import os
from pdf2markdown4llm import PDF2Markdown4LLM
# Page chrome: browser-tab title and wide layout first, then the visible
# heading and a one-line description of the tool.
# NOTE(review): Streamlit has historically required set_page_config to be the
# first Streamlit command in the script — keep it ahead of the other calls.
st.set_page_config(
    page_title="PDF to Markdown Converter",
    layout="wide",
)
st.title("PDF to Markdown Converter")
st.write("Convert your PDF files to Markdown format")
def progress_callback(progress):
    """Mirror a converter progress update into the Streamlit widgets.

    Updates the module-level ``progress_bar`` and ``status_text`` widgets,
    which the script creates just before starting a conversion.

    Args:
        progress: Progress object supplied by the converter. This function
            reads ``percentage`` (treated as a 0-100 number),
            ``phase.value``, ``current_page``, ``total_pages`` and
            ``message`` from it.
    """
    # st.progress rejects floats outside [0.0, 1.0]; clamp so a slightly
    # out-of-range percentage cannot abort the whole conversion.
    fraction = min(max(progress.percentage / 100, 0.0), 1.0)
    progress_bar.progress(fraction)
    status_text.text(
        f"Phase: {progress.phase.value}, Page {progress.current_page}/{progress.total_pages}\n"
        f"Progress: {progress.percentage:.1f}%, Message: {progress.message}"
    )
def format_markdown_for_preview(markdown_text):
    """Return *markdown_text* with hard line breaks on every non-blank line.

    Markdown only treats a newline as a hard line break when the line ends
    with two or more spaces, so two spaces are appended to each line that
    has visible content. Blank and whitespace-only lines are left untouched
    so paragraph breaks are preserved.

    Args:
        markdown_text: Markdown source, with lines separated by ``"\\n"``.

    Returns:
        The text with the same number of lines, each non-blank line ending
        in two extra spaces.
    """
    hard_break = "  "  # exactly two spaces: the Markdown hard-break marker
    return "\n".join(
        line + hard_break if line.strip() else line
        for line in markdown_text.split("\n")
    )
# File upload
uploaded_file = st.file_uploader("Select a PDF file", type=['pdf'])

if uploaded_file is not None:
    # Configuration options forwarded to the converter
    with st.expander("Conversion Settings"):
        remove_headers = st.checkbox("Remove Headers", value=False)
        skip_empty_tables = st.checkbox("Skip Empty Tables", value=True)
        table_header = st.text_input("Table Header Format", value="### Table")

    if st.button("Start Conversion"):
        # Widgets updated by progress_callback while the conversion runs
        progress_bar = st.progress(0)
        status_text = st.empty()

        # Path of the on-disk copy of the upload; stays None until the
        # temporary file actually exists so cleanup knows whether to act.
        tmp_file_path = None
        try:
            # The converter is given a filesystem path, so the uploaded
            # bytes are written to a temporary file first. delete=False
            # keeps the file after the `with` closes it; it is removed
            # explicitly in `finally`.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                # Record the path before writing so a failed write is
                # still cleaned up.
                tmp_file_path = tmp_file.name
                tmp_file.write(uploaded_file.getvalue())

            # Initialize converter
            converter = PDF2Markdown4LLM(
                remove_headers=remove_headers,
                skip_empty_tables=skip_empty_tables,
                table_header=table_header,
                progress_callback=progress_callback
            )

            # Perform conversion
            markdown_content = converter.convert(tmp_file_path)

            # Display results
            st.success("Conversion completed successfully!")

            # Raw markdown download
            st.download_button(
                label="Download Markdown File",
                data=markdown_content,
                file_name="converted.md",
                mime="text/markdown"
            )

            # Preview with proper formatting
            st.subheader("Preview")

            # Create tabs for different preview modes
            preview_tab, raw_tab = st.tabs(["Formatted Preview", "Raw Markdown"])
            with preview_tab:
                formatted_content = format_markdown_for_preview(markdown_content)
                st.markdown(formatted_content)
            with raw_tab:
                st.code(markdown_content, language="markdown")
        except Exception as e:
            # Top-level UI boundary: show the failure to the user instead of
            # crashing the script, and clear the stale progress widgets.
            st.error(f"An error occurred: {str(e)}")
            progress_bar.empty()
            status_text.empty()
        finally:
            # Remove the temporary PDF whether or not the conversion
            # succeeded, so failed runs do not leave files behind.
            if tmp_file_path is not None and os.path.exists(tmp_file_path):
                os.unlink(tmp_file_path)