HawkClaws's picture
Update app.py
7d337b8 verified
import streamlit as st
import tempfile
import os
from pdf2markdown4llm import PDF2Markdown4LLM
# Configure the page first, then render the heading and the one-line intro.
# The same title is used for the browser tab and the on-page heading.
APP_TITLE = "PDF to Markdown Converter"

st.set_page_config(layout="wide", page_title=APP_TITLE)
st.title(APP_TITLE)
st.write("Convert your PDF files to Markdown format")
def progress_callback(progress):
    """Mirror a converter progress update onto the page's progress widgets.

    Reads the module-level ``progress_bar`` and ``status_text`` widgets, which
    the script creates when the user starts a conversion, so this must only be
    invoked after they exist.

    Args:
        progress: Update object supplied by the converter. This function reads
            its ``percentage``, ``phase.value``, ``current_page``,
            ``total_pages`` and ``message`` attributes.
    """
    # percentage is used as a 0-100 value. st.progress raises for floats
    # outside [0.0, 1.0], so clamp: a slightly out-of-range report (rounding,
    # overshoot) must not abort the whole conversion.
    fraction = min(max(progress.percentage / 100, 0.0), 1.0)
    progress_bar.progress(fraction)
    status_text.text(
        f"Phase: {progress.phase.value}, Page {progress.current_page}/{progress.total_pages}\n"
        f"Progress: {progress.percentage:.1f}%, Message: {progress.message}"
    )
def format_markdown_for_preview(markdown_text):
    """Return *markdown_text* with a hard line break after every non-blank line.

    Markdown collapses a single newline into a space when rendered; ending a
    line with two spaces forces a visible line break instead. Blank and
    whitespace-only lines are left untouched so paragraph breaks stay intact.

    Args:
        markdown_text: Markdown source as a single string.

    Returns:
        The text with two spaces appended to each line that contains
        non-whitespace characters; lines are still joined with ``"\\n"``.
    """
    # A hard line break needs two trailing spaces; one space is simply
    # ignored by Markdown renderers.
    hard_break = "  "
    return "\n".join(
        line + hard_break if line.strip() else line
        for line in markdown_text.split("\n")
    )
# File upload
uploaded_file = st.file_uploader("Select a PDF file", type=['pdf'])

if uploaded_file is not None:
    # Configuration options passed straight through to the converter.
    with st.expander("Conversion Settings"):
        remove_headers = st.checkbox("Remove Headers", value=False)
        skip_empty_tables = st.checkbox("Skip Empty Tables", value=True)
        table_header = st.text_input("Table Header Format", value="### Table")

    if st.button("Start Conversion"):
        # progress_callback reads these two module-level widgets, so they
        # must exist before the converter starts reporting progress.
        progress_bar = st.progress(0)
        status_text = st.empty()

        try:
            # The converter is given a file path, so the upload is written to
            # a temporary file. delete=False keeps the file on disk after the
            # `with` block closes it; the `finally` below removes it.
            tmp_file_path = None
            try:
                with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                    tmp_file_path = tmp_file.name
                    tmp_file.write(uploaded_file.getvalue())

                # Initialize converter
                converter = PDF2Markdown4LLM(
                    remove_headers=remove_headers,
                    skip_empty_tables=skip_empty_tables,
                    table_header=table_header,
                    progress_callback=progress_callback,
                )

                # Perform conversion
                markdown_content = converter.convert(tmp_file_path)
            finally:
                # Remove the temporary file even when writing or converting
                # fails; otherwise every failed run leaves a PDF behind.
                if tmp_file_path is not None:
                    os.unlink(tmp_file_path)

            # Display results
            st.success("Conversion completed successfully!")

            # Raw markdown download
            st.download_button(
                label="Download Markdown File",
                data=markdown_content,
                file_name="converted.md",
                mime="text/markdown",
            )

            # Preview: one tab renders the Markdown, the other shows its source.
            st.subheader("Preview")
            preview_tab, raw_tab = st.tabs(["Formatted Preview", "Raw Markdown"])

            with preview_tab:
                formatted_content = format_markdown_for_preview(markdown_content)
                st.markdown(formatted_content)

            with raw_tab:
                st.code(markdown_content, language="markdown")

        except Exception as e:
            # Top-level UI boundary: report the failure on the page instead of
            # letting the script crash, then clear the stale progress widgets.
            # Both widgets are always assigned before the `try`, so no
            # existence check is needed.
            st.error(f"An error occurred: {e}")
            progress_bar.empty()
            status_text.empty()