File size: 2,510 Bytes
bf8799d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import streamlit as st
import tempfile
import os
from pdf2markdown4llm import PDF2Markdown4LLM

# The same title is used for the browser tab and the on-page heading.
PAGE_TITLE = "PDF to Markdown Converter"

# Page-level configuration is issued before any other Streamlit call in the script.
st.set_page_config(
    page_title=PAGE_TITLE,
    layout="wide",
)

# Page heading followed by a one-line description of the tool.
st.title(PAGE_TITLE)
st.write("Convert your PDF files to Markdown format")

def progress_callback(progress):
    """Push a converter progress update into the Streamlit progress widgets.

    Reads the module-level names ``progress_bar`` and ``status_text``, which
    this script assigns when the "Start Conversion" button is pressed, before
    the converter is run.

    Args:
        progress: Progress object passed in by the converter. The attributes
            read here are ``percentage``, ``phase.value``, ``current_page``,
            ``total_pages`` and ``message``. ``percentage`` is presumably on a
            0-100 scale (it is divided by 100 below) -- confirm against the
            converter's documentation.
    """
    # st.progress only accepts floats within [0.0, 1.0]; clamp so that a
    # slightly out-of-range percentage cannot raise and abort the conversion.
    fraction = min(max(progress.percentage / 100, 0.0), 1.0)
    progress_bar.progress(fraction)
    status_text.text(f"Phase: {progress.phase.value}, Page {progress.current_page}/{progress.total_pages}\n"
                     f"Progress: {progress.percentage:.1f}%, Message: {progress.message}")

# File upload
uploaded_file = st.file_uploader("Select a PDF file", type=['pdf'])

if uploaded_file is not None:
    # Configuration options, passed to the converter unchanged
    with st.expander("Conversion Settings"):
        remove_headers = st.checkbox("Remove Headers", value=False)
        skip_empty_tables = st.checkbox("Skip Empty Tables", value=True)
        table_header = st.text_input("Table Header Format", value="### Table")

    # NOTE(review): Streamlit reruns the script on every widget interaction, so
    # pressing the download button below presumably makes this branch (and the
    # preview) disappear again -- consider st.session_state if that matters.
    if st.button("Start Conversion"):
        # progress_callback looks these two widgets up as module-level names,
        # so they have to exist before the conversion starts.
        progress_bar = st.progress(0)
        status_text = st.empty()

        # Path of the on-disk copy of the upload; stays None until the
        # temporary file has actually been created.
        tmp_file_path = None

        try:
            # The converter is given a file path, so the uploaded bytes are
            # written to a temporary file. delete=False keeps the file after
            # the handle is closed; it is removed in the finally clause below.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                # Record the path first so cleanup still works if the write fails.
                tmp_file_path = tmp_file.name
                tmp_file.write(uploaded_file.getvalue())

            # Initialize converter
            converter = PDF2Markdown4LLM(
                remove_headers=remove_headers,
                skip_empty_tables=skip_empty_tables,
                table_header=table_header,
                progress_callback=progress_callback
            )

            # Perform conversion
            markdown_content = converter.convert(tmp_file_path)

            # Display results
            st.success("Conversion completed successfully!")
            st.download_button(
                label="Download Markdown File",
                data=markdown_content,
                file_name="converted.md",
                mime="text/markdown"
            )

            st.subheader("Preview")
            st.markdown(markdown_content)

        except Exception as e:
            # Top-level UI boundary: report the failure and clear the
            # progress widgets created above.
            st.error(f"An error occurred: {str(e)}")
            progress_bar.empty()
            status_text.empty()

        finally:
            # Remove the temporary file on every path, not only on success,
            # so a failed conversion does not leave the upload on disk.
            if tmp_file_path is not None:
                try:
                    os.unlink(tmp_file_path)
                except OSError as cleanup_error:
                    st.warning(f"Could not remove temporary file: {cleanup_error}")