File size: 3,329 Bytes
bf8799d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7d337b8
 
 
 
 
 
 
bf8799d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7d337b8
 
bf8799d
 
 
 
 
 
 
7d337b8
bf8799d
7d337b8
 
 
 
 
 
 
 
 
 
bf8799d
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import streamlit as st
import tempfile
import os
from pdf2markdown4llm import PDF2Markdown4LLM

# Configure the page (title and wide layout) first, then render the
# visible heading and the one-line description beneath it.
st.set_page_config(
    layout="wide",
    page_title="PDF to Markdown Converter",
)

st.title("PDF to Markdown Converter")
st.write("Convert your PDF files to Markdown format")

def progress_callback(progress):
    """Render one converter progress update in the Streamlit UI.

    Updates the module-level ``progress_bar`` and ``status_text`` widgets,
    which the script creates right before starting a conversion, so this
    must only be invoked after they exist.

    Args:
        progress: Progress object supplied by the converter. The attributes
            read here are ``percentage`` (on a 0-100 scale), ``phase.value``,
            ``current_page``, ``total_pages`` and ``message``.
    """
    # st.progress rejects floats outside [0.0, 1.0]; clamp so a percentage
    # that slightly over- or undershoots cannot abort the whole conversion.
    fraction = min(max(progress.percentage / 100, 0.0), 1.0)
    progress_bar.progress(fraction)
    status_text.text(f"Phase: {progress.phase.value}, Page {progress.current_page}/{progress.total_pages}\n"
                     f"Progress: {progress.percentage:.1f}%, Message: {progress.message}")

def format_markdown_for_preview(markdown_text):
    """Return *markdown_text* with single newlines kept visible in a preview.

    Every line that contains non-whitespace characters gets two trailing
    spaces appended; blank and whitespace-only lines are passed through
    unchanged. Lines are split and re-joined on ``'\\n'``.
    """
    line_break_marker = '  '
    rendered = []
    for row in markdown_text.split('\n'):
        if row.strip():
            rendered.append(row + line_break_marker)
        else:
            # Blank / whitespace-only rows are left exactly as they were.
            rendered.append(row)
    return '\n'.join(rendered)

# File upload
uploaded_file = st.file_uploader("Select a PDF file", type=['pdf'])

if uploaded_file is not None:
    # Configuration options forwarded to the converter below
    with st.expander("Conversion Settings"):
        remove_headers = st.checkbox("Remove Headers", value=False)
        skip_empty_tables = st.checkbox("Skip Empty Tables", value=True)
        table_header = st.text_input("Table Header Format", value="### Table")

    if st.button("Start Conversion"):
        # progress_callback reads these two module-level names, so they must
        # be bound before the converter starts reporting progress.
        progress_bar = st.progress(0)
        status_text = st.empty()

        try:
            tmp_file_path = None
            try:
                # The converter is given a filesystem path, so the uploaded
                # bytes are spilled to a temporary .pdf file first.
                with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                    # Record the path before writing so a failed write
                    # still gets cleaned up in the finally below.
                    tmp_file_path = tmp_file.name
                    tmp_file.write(uploaded_file.getvalue())

                # Initialize converter
                converter = PDF2Markdown4LLM(
                    remove_headers=remove_headers,
                    skip_empty_tables=skip_empty_tables,
                    table_header=table_header,
                    progress_callback=progress_callback
                )

                # Perform conversion
                markdown_content = converter.convert(tmp_file_path)
            finally:
                # delete=False means nothing removes the file for us; unlink
                # it whether the conversion succeeded or raised.
                if tmp_file_path is not None:
                    os.unlink(tmp_file_path)

            # Display results
            st.success("Conversion completed successfully!")

            # Raw markdown download
            st.download_button(
                label="Download Markdown File",
                data=markdown_content,
                file_name="converted.md",
                mime="text/markdown"
            )

            # Preview with proper formatting
            st.subheader("Preview")

            # Create tabs for different preview modes
            preview_tab, raw_tab = st.tabs(["Formatted Preview", "Raw Markdown"])

            with preview_tab:
                formatted_content = format_markdown_for_preview(markdown_content)
                st.markdown(formatted_content)

            with raw_tab:
                st.code(markdown_content, language="markdown")

        except Exception as e:
            # Top-level UI boundary: report the failure to the user and
            # clear the progress widgets, which are always bound by now.
            st.error(f"An error occurred: {str(e)}")
            progress_bar.empty()
            status_text.empty()