# Space: traversaal-internal/Transform-PDF-Tables-to-HTML-and-Excel (status: Running)
# File size: 30,345 Bytes

import io
import streamlit as st
import requests
import time
import os
from pathlib import Path
import glob
import base64
import pandas as pd
from datetime import datetime

# Browser-tab title/icon and overall layout for the app
_page_settings = {
    "page_title": "PDF Parser - Table Extraction Tool",
    "page_icon": "📋",
    "layout": "wide",
    "initial_sidebar_state": "collapsed",
}
st.set_page_config(**_page_settings)

# Inject the app-wide stylesheet (grey and white theme) as raw HTML.
# The sheet styles the custom HTML classes emitted by the page functions in this
# file (e.g. main-header, feature-card, success-message, table-container,
# file-info-card) and overrides the look of several Streamlit widgets.
# NOTE: `.stSelectbox > div > div` is declared twice in the sheet; the later
# rule carries `!important`.
st.markdown("""
<style>
    .main-header {
        text-align: center;
        padding: 2rem 0;
        background: linear-gradient(135deg, #6c757d 0%, #495057 100%);
        border-radius: 10px;
        margin-bottom: 2rem;
        color: white;
    }
    
    .feature-card {
        background: #f8f9fa;
        padding: 1.5rem;
        border-radius: 10px;
        box-shadow: 0 2px 10px rgba(0,0,0,0.1);
        text-align: center;
        margin: 1rem 0;
        border: 1px solid #dee2e6;
    }
    
    .demo-button {
        background: linear-gradient(45deg, #6c757d, #495057);
        color: white;
        border: none;
        padding: 12px 24px;
        border-radius: 25px;
        font-weight: bold;
        cursor: pointer;
        margin: 10px;
    }
    
    .upload-button {
        background: #495057;
        color: white;
        border: none;
        padding: 12px 24px;
        border-radius: 25px;
        font-weight: bold;
        cursor: pointer;
        margin: 10px;
    }
    
    .success-message {
        background: #f8f9fa;
        color: #495057;
        padding: 15px;
        border-radius: 5px;
        border-left: 4px solid #6c757d;
        margin: 20px 0;
    }
    
    .processing-message {
        background: #f8f9fa;
        color: #495057;
        padding: 15px;
        border-radius: 5px;
        border-left: 4px solid #adb5bd;
        margin: 20px 0;
    }
    
    .method-tab {
        background: #f8f9fa;
        padding: 10px 15px;
        border-radius: 5px;
        margin: 5px;
        cursor: pointer;
        border: 2px solid #dee2e6;
    }
    
    .method-tab-active {
        background: #6c757d;
        color: white;
        border: 2px solid #495057;
    }
    
    .html-file-card {
        background: #f8f9fa;
        padding: 15px;
        border-radius: 8px;
        margin: 10px 0;
        border-left: 4px solid #6c757d;
    }
    
    .file-info-card {
        background: #f8f9fa;
        padding: 12px;
        border-radius: 8px;
        margin: 5px 0;
        border-left: 4px solid #6c757d;
        font-size: 0.9em;
    }
    
    .file-stats {
        color: #6c757d;
        font-size: 0.85em;
        margin-top: 5px;
    }
    
    .stSelectbox > div > div {
        background-color: #f8f9fa;
    }
    
    .hidden-text {
        color: #adb5bd;
        font-style: italic;
    }
    
    .table-container {
        max-height: 400px;
        overflow-y: auto;
        border: 1px solid #dee2e6;
        border-radius: 5px;
        padding: 10px;
        margin: 10px 0;
        background-color: white;
    }
    
    .table-header {
        background: #f8f9fa;
        padding: 10px;
        border-radius: 5px;
        margin-bottom: 10px;
        border-left: 4px solid #6c757d;
    }
    
    /* Override Streamlit button styles */
    .stButton > button {
        background-color: #6c757d !important;
        color: white !important;
        border: 1px solid #495057 !important;
        border-radius: 5px !important;
    }
    
    .stButton > button:hover {
        background-color: #495057 !important;
        border-color: #343a40 !important;
    }
    
    /* Override primary button styles */
    .stButton > button[kind="primary"] {
        background-color: #495057 !important;
        color: white !important;
        border: 1px solid #343a40 !important;
    }
    
    .stButton > button[kind="primary"]:hover {
        background-color: #343a40 !important;
    }
    
    /* Style checkboxes */
    .stCheckbox > label {
        color: #495057 !important;
    }
    
    /* Style text inputs */
    .stTextInput > div > div > input {
        background-color: #f8f9fa !important;
        border-color: #dee2e6 !important;
    }
    
    /* Style file uploader */
    .stFileUploader > div {
        background-color: #f8f9fa !important;
        border-color: #dee2e6 !important;
    }
    
    /* Style dataframes */
    .stDataFrame {
        background-color: white !important;
        border: 1px solid #dee2e6 !important;
    }
    
    /* Style selectbox */
    .stSelectbox > div > div {
        background-color: #f8f9fa !important;
        border-color: #dee2e6 !important;
    }
    
    /* Style progress bar */
    .stProgress > div > div > div {
        background-color: #6c757d !important;
    }
</style>
""", unsafe_allow_html=True)

# Seed each session key with its starting value, leaving keys that are
# already present in the session untouched.
_session_defaults = {
    'page': 'home',
    'processing': False,
    'results': None,
    'show_output_dir': False,
    'selected_method': None,
    'demo_results': None,
    'demo_selected_methods': {'docling': True, 'llamaparse': False, 'unstructured': False},
}
for _state_key, _state_default in _session_defaults.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default

# Directory that contains this script (described as "src" by the original author)
SCRIPT_DIR = Path(__file__).parent

# Tesla demo document; expected to sit next to this script (adjust if it lives elsewhere)
TESLA_DOC_PATH = SCRIPT_DIR / "tesla_docs_28-41 (1)-9-14.pdf"

# Root folder for extraction output (<script dir>/output). The result views in
# this file look for generated files under OUTPUT_BASE_PATH/<method name>.
OUTPUT_BASE_PATH = SCRIPT_DIR / "output"

def show_home_page():
    """Render the landing page: hero banner, the two entry buttons and feature cards.

    Clicking a button stores the target page name in ``st.session_state.page``
    and triggers a rerun so the router can show that page.
    """
    # Hero banner
    st.markdown("""
    <div class="main-header">
        <h1 style="font-size: 3rem; margin: 0; color: #f8f9fa;">Transform PDF Tables to</h1>
        <h1 style="font-size: 3rem; margin: 0; color: #ffffff;">HTML and Excel</h1>
        <p style="margin-top: 1rem; font-size: 1.2rem; opacity: 0.9;">Powered by Traversaal.ai</p>
        <p style="margin-top: 0.5rem; opacity: 0.8;">Perfect for financial reports, research papers, and data analysis.</p>
    </div>
    """, unsafe_allow_html=True)

    # Entry buttons live in the wide middle column, split into two halves
    _, center, _ = st.columns([1, 2, 1])
    with center:
        upload_slot, demo_slot = st.columns(2)
        with upload_slot:
            if st.button("📄 Upload PDF Document", key="upload_btn", help="Upload your own PDF document"):
                st.session_state.page = 'upload'
                st.rerun()

        with demo_slot:
            if st.button("⚡ Try Tesla 10K Demo", key="demo_btn", help="Try with Tesla's 10K form"):
                st.session_state.page = 'demo_setup'
                st.rerun()

    # Feature cards, one per column, in display order
    st.markdown("---")
    feature_cards = (
        """
        <div class="feature-card">
            <h3 style="color: #495057;">⚡ Lightning Fast</h3>
            <p style="color: #6c757d;">Process complex PDFs in seconds with our advanced AI algorithms</p>
        </div>
        """,
        """
        <div class="feature-card">
            <h3 style="color: #495057;">🔒 Secure & Private</h3>
            <p style="color: #6c757d;">Your documents are processed securely and never stored permanently</p>
        </div>
        """,
        """
        <div class="feature-card">
            <h3 style="color: #495057;">🔄 Batch Processing</h3>
            <p style="color: #6c757d;">Handle multiple documents and tables simultaneously</p>
        </div>
        """,
    )
    for column, card_html in zip(st.columns(3), feature_cards):
        with column:
            st.markdown(card_html, unsafe_allow_html=True)

def show_upload_page():
    """Render the upload page and start extraction for a user-supplied PDF.

    The document can be supplied through the file uploader or as a path typed
    into the text box; a typed path takes precedence when both are given.
    Processing starts only when an input, an output directory and at least
    one extraction method are present; otherwise an error is shown.

    An uploaded file exists only in memory, and its ``name`` is a bare file
    name rather than a path. Since ``process_document`` forwards a file path
    to the extraction service, the upload is first written to a temporary
    directory, which is removed once ``process_document`` returns.
    """
    import tempfile

    st.markdown("## 📄 Upload Your Document")
    
    # File upload
    uploaded_file = st.file_uploader(
        "Choose a PDF file", 
        type=['pdf'],
        help="Upload a PDF document to extract tables from"
    )
    
    # Input file path (alternative)
    st.markdown("**Or specify file path:**")
    input_file_path = st.text_input(
        "Input File Path",
        placeholder="C:\\path\\to\\your\\document.pdf",
        help="Enter the full path to your PDF file"
    )
    
    # Output directory; masked like a password field until the user reveals it
    output_dir = st.text_input(
        "Output Directory",
        placeholder="C:\\path\\to\\output\\folder",
        help="Directory where extracted tables will be saved",
        type="password" if not st.session_state.show_output_dir else "default"
    )
    
    # Show/Hide output directory toggle
    col1, col2 = st.columns([3, 1])
    with col2:
        if st.button("👁️ View/Hide Path"):
            st.session_state.show_output_dir = not st.session_state.show_output_dir
            st.rerun()
    
    # Extraction method selection
    st.markdown("### 🔧 Select Extraction Methods")
    col1, col2, col3 = st.columns(3)
    
    with col1:
        docling = st.checkbox("Docling", value=True, help="Advanced document processing")
    with col2:
        llamaparse = st.checkbox("LlamaParse", value=False, help="AI-powered parsing")
    with col3:
        unstructured = st.checkbox("Unstructured", value=False, help="General purpose extraction")
    
    # Process button
    if st.button("🚀 Process Document", type="primary"):
        if (uploaded_file or input_file_path) and output_dir and (docling or llamaparse or unstructured):
            if input_file_path:
                process_document(input_file_path, output_dir, docling, llamaparse, unstructured)
            else:
                # Persist the in-memory upload so a real, readable path is handed on.
                with tempfile.TemporaryDirectory(prefix="pdf_upload_") as upload_dir:
                    # basename() guards against a client-supplied name containing directories
                    saved_path = os.path.join(upload_dir, os.path.basename(uploaded_file.name))
                    with open(saved_path, "wb") as pdf_out:
                        pdf_out.write(uploaded_file.getvalue())
                    process_document(saved_path, output_dir, docling, llamaparse, unstructured)
        else:
            st.error("Please provide input file, output directory, and select at least one extraction method.")
    
    # Back button
    if st.button("← Back to Home"):
        st.session_state.page = 'home'
        st.rerun()

def show_demo_setup_page():
    """Render the demo setup page where extraction methods for the Tesla PDF are chosen.

    The checkbox values are written back to
    ``st.session_state.demo_selected_methods`` on every run. Starting the demo
    requires at least one method; it then switches to the ``demo`` page with
    ``processing`` set to True.
    """
    st.markdown("## ⚡ Tesla 10K Demo Setup")
    st.markdown("*Configure extraction methods for Tesla's 10K document processing*")

    # Document info
    st.markdown("### 📄 Document Information")
    st.info("**Document:** tesla_docs_28-41 (1)-9-14.pdf")

    # One checkbox per method, pre-filled from the previously stored choice
    st.markdown("### 🔧 Select Extraction Methods")
    method_options = (
        ('docling', "Docling", "Advanced document processing"),
        ('llamaparse', "LlamaParse", "AI-powered parsing"),
        ('unstructured', "Unstructured", "General purpose extraction"),
    )
    selection = {}
    for column, (method_key, label, hint) in zip(st.columns(3), method_options):
        with column:
            selection[method_key] = st.checkbox(
                label,
                value=st.session_state.demo_selected_methods[method_key],
                help=hint,
            )

    # Remember the current choice for the processing and results pages
    st.session_state.demo_selected_methods = selection

    run_col, back_col = st.columns([2, 1])
    with run_col:
        if st.button("🚀 Process Tesla Document", type="primary"):
            if not any(selection.values()):
                st.error("Please select at least one extraction method.")
            else:
                st.session_state.page = 'demo'
                st.session_state.processing = True
                st.rerun()

    with back_col:
        if st.button("← Back to Home"):
            st.session_state.page = 'home'
            st.rerun()

def show_demo_page():
    """Show the processing animation while the demo is running, otherwise its results."""
    render = show_processing_demo if st.session_state.processing else show_demo_results
    render()

def show_processing_demo():
    """Animate a progress display for the selected methods, then run the demo.

    Each selected method gets 30 progress steps with a 0.33 s pause per step.
    After the loop the success banner is shown, ``process_tesla_demo`` is
    called, ``processing`` is cleared and the app reruns after a 2 s pause.
    """
    st.markdown("## ⚡ Processing Tesla 10K Document...")

    # Methods the user ticked on the setup page
    chosen = [name for name, enabled in st.session_state.demo_selected_methods.items() if enabled]
    st.markdown(f"*Processing with selected methods: {', '.join([m.title() for m in chosen])}*")

    # Placeholders updated in place during the loop
    bar = st.progress(0)
    step_message = st.empty()
    overall_message = st.empty()

    steps_per_method = 30
    total_steps = len(chosen) * steps_per_method

    for step in range(total_steps):
        overall = (step + 1) / total_steps
        bar.progress(overall)

        # Which method this step belongs to, and how far into it we are
        method_index, method_step = divmod(step, steps_per_method)
        method_title = chosen[method_index].title()
        method_fraction = (method_step + 1) / steps_per_method
        method_percent = int(method_fraction * 100)

        if method_fraction < 0.3:
            step_message.text(f"📄 {method_title}: Reading document... {method_percent}%")
        elif method_fraction < 0.7:
            step_message.text(f"🔍 {method_title}: Extracting tables... {method_percent}%")
        else:
            step_message.text(f"💾 {method_title}: Generating HTML outputs... {method_percent}%")

        overall_message.markdown(f"**Overall Progress:** {int(overall * 100)}% | **Current Method:** {method_title}")

        time.sleep(0.33)

    # Completion banner
    st.markdown("""
    <div class="success-message">
        ✅ <strong>Document processed successfully!</strong><br>
        Tables have been extracted using selected methods and HTML files are ready for viewing.
    </div>
    """, unsafe_allow_html=True)

    # Produce the demo results, then hand over to the results view
    process_tesla_demo()

    st.session_state.processing = False
    time.sleep(2)
    st.rerun()

def process_tesla_demo():
    """Store simulated extraction results for the selected demo methods.

    No request is sent: a payload for the extraction endpoint is assembled but
    unused, and ``st.session_state.demo_results`` is filled with a fixed
    success entry for each selected method. Any exception is reported with
    ``st.error``.
    """
    try:
        chosen = st.session_state.demo_selected_methods

        # Payload for the FastAPI endpoint (http://localhost:8000/extract);
        # the call itself is not wired in yet, so this is currently unused.
        request_payload = {
            'input_file_path': str(TESLA_DOC_PATH),
            'output_dir': str(OUTPUT_BASE_PATH / "tesla_demo"),
            'docling': chosen['docling'],
            'llamaparse': chosen['llamaparse'],
            'unstructured': chosen['unstructured']
        }

        # Fixed table counts reported per method in the simulated run
        simulated_table_counts = (('docling', 5), ('llamaparse', 3), ('unstructured', 4))
        simulated = {
            name: {'status': 'success', 'total_tables': table_count}
            for name, table_count in simulated_table_counts
            if chosen[name]
        }

        st.session_state.demo_results = {'results': simulated}

    except Exception as e:
        st.error(f"Error processing Tesla demo: {str(e)}")

def count_html_files(directory):
    """Count the HTML files in ``directory`` and all of its subdirectories.

    Args:
        directory: Folder to search (``str`` or path-like).

    Returns:
        Number of distinct ``*.html`` files found, or 0 when the directory
        does not exist.
    """
    if not os.path.exists(directory):
        return 0

    # With recursive=True, "**" also matches zero directories, so this single
    # pattern already covers top-level files; a separate top-level glob would
    # count them twice.
    pattern = os.path.join(str(directory), "**", "*.html")
    return len(set(glob.glob(pattern, recursive=True)))

def get_excel_files(directory):
    """Collect spreadsheet files (.xlsx, .xls, .csv) under ``directory``.

    The search covers the directory itself and all subdirectories.

    Args:
        directory: Folder to search (``str`` or path-like).

    Returns:
        Sorted list of distinct file paths as strings; empty when the
        directory does not exist.
    """
    if not os.path.exists(directory):
        return []

    root = str(directory)
    found = set()
    for extension in ("xlsx", "xls", "csv"):
        # With recursive=True, "**" matches zero or more directories, so one
        # pattern covers top-level and nested files without duplicates.
        pattern = os.path.join(root, "**", f"*.{extension}")
        found.update(glob.glob(pattern, recursive=True))

    # Sorted so callers get a stable order across runs
    return sorted(found)

def get_file_info(file_path):
    """Return display-ready size and modification time for ``file_path``.

    Returns:
        ``{"size": "<n.n> KB", "modified": "YYYY-MM-DD HH:MM"}`` for an
        existing path, or ``{"size": 0, "modified": "Unknown"}`` when the
        path does not exist.
    """
    if os.path.exists(file_path):
        details = os.stat(file_path)
        kilobytes = details.st_size / 1024
        # Modification time rendered in local time
        stamp = datetime.fromtimestamp(details.st_mtime).strftime("%Y-%m-%d %H:%M")
        return {"size": f"{kilobytes:.1f} KB", "modified": stamp}

    return {"size": 0, "modified": "Unknown"}

def show_demo_results():
    """Render the results page for the Tesla demo.

    Shows the document summary and a Reset button, offers one button per
    selected method when more than one was selected, and then renders the
    results of the currently selected method (defaulting to the first one).
    """
    state = st.session_state
    st.markdown("## 📊 Tesla 10K Processing Results")

    # Document summary on the left, reset on the right
    summary_col, reset_col = st.columns([2, 1])
    with summary_col:
        st.markdown("### 📄 tesla_docs_28-41 (1)-9-14.pdf")
        st.markdown("**Status:** ✅ Complete")
        method_titles = [name.title() for name, enabled in state.demo_selected_methods.items() if enabled]
        st.markdown(f"**Processed with:** {', '.join(method_titles)}")

    with reset_col:
        if st.button("🔄 Reset"):
            # Return every demo-related key to its starting value
            state.page = 'home'
            state.processing = False
            state.results = None
            state.demo_results = None
            state.selected_method = None
            state.demo_selected_methods = {'docling': True, 'llamaparse': False, 'unstructured': False}
            st.rerun()

    # Only methods that were selected for the run can be viewed
    available_methods = [name for name, enabled in state.demo_selected_methods.items() if enabled]

    if len(available_methods) > 1:
        st.markdown("### 🔧 Select Extraction Method to View")

        method_labels = {
            'docling': '🔧 Docling',
            'llamaparse': '🦙 LlamaParse', 
            'unstructured': '📊 Unstructured'
        }

        # One button per method, labelled with the number of HTML files found
        for column, name in zip(st.columns(len(available_methods)), available_methods):
            with column:
                method_dir = OUTPUT_BASE_PATH / name
                if os.path.exists(method_dir):
                    html_pattern = os.path.join(str(method_dir), "**", "*.html")
                    html_count = len(set(glob.glob(html_pattern, recursive=True)))
                else:
                    html_count = 0

                if st.button(f"{method_labels[name]} ({html_count} HTML files)",
                             key=f"tab_{name}", use_container_width=True):
                    state.selected_method = name

    # Fall back to the first available method when nothing valid is selected
    if state.selected_method is None or state.selected_method not in available_methods:
        state.selected_method = available_methods[0] if available_methods else None

    if state.selected_method:
        show_method_results(state.selected_method)

def show_method_results(method):
    """Show one method's output: HTML tables (wide column) beside Excel files (narrow column)."""
    st.markdown(f"### 📋 Results from {method.title()}")

    # Heading and renderer per column; columns are split 3:1
    panels = (
        ("#### 📄 HTML Tables", show_html_tables),
        ("#### 📊 Excel Files", show_excel_files),
    )
    for column, (heading, render_panel) in zip(st.columns([3, 1]), panels):
        with column:
            st.markdown(heading)
            render_panel(method)

def show_html_tables(method):
    """Display the HTML tables found in the method's output directory.

    Files are looked up recursively under ``OUTPUT_BASE_PATH / method`` and
    ordered by the number in a ``table_<n>`` / ``table-<n>`` file name, with
    unnumbered files last. Ties are broken by file name and then path, so the
    "Table N" labels and widget keys stay the same from one rerun to the next.
    Each file is read once; the same content feeds the inline preview and the
    download button. If a file cannot be read, an error is shown and no
    download button is offered for it.

    Args:
        method: Name of the extraction method, used as the subdirectory name.
    """
    import re

    method_output_dir = OUTPUT_BASE_PATH / method
    
    # Get actual HTML files from directory
    html_files = []
    if os.path.exists(method_output_dir):
        # The recursive glob also matches files directly in the directory
        html_files = glob.glob(os.path.join(str(method_output_dir), "**", "*.html"), recursive=True)
        # Remove duplicates just in case
        html_files = list(set(html_files))
    
    table_number_pattern = re.compile(r"table[_-](\d+)", re.IGNORECASE)

    def table_sort_key(path):
        """Sort by table number (unnumbered last), then name and path for a stable order."""
        name = os.path.basename(path)
        match = table_number_pattern.search(name)
        number = int(match.group(1)) if match else float('inf')
        return (number, name, path)

    html_files.sort(key=table_sort_key)
    
    if html_files:
        st.markdown(f"**Found {len(html_files)} HTML table(s):**")
        
        # Display all HTML files in one scrollable container
        st.markdown('<div class="table-container">', unsafe_allow_html=True)
        
        for i, html_file in enumerate(html_files):
            st.markdown(f"""
            <div class="table-header">
                <h4 style="color: #495057;">📋 Table {i+1}</h4>
                <small style="color: #6c757d;">File: {os.path.basename(html_file)}</small>
            </div>
            """, unsafe_allow_html=True)
            
            # Read once, then preview; html_content stays None if the read fails
            html_content = None
            try:
                with open(html_file, 'r', encoding='utf-8') as f:
                    html_content = f.read()
                st.components.v1.html(html_content, height=300, scrolling=True)
            except Exception as e:
                st.error(f"Error displaying HTML file: {e}")
            
            # Download button for individual HTML file (only when it was readable)
            col_download1, col_download2, col_download3 = st.columns([1, 1, 2])
            with col_download1:
                if html_content is not None:
                    st.download_button(
                        label=f"⬇️ Table {i+1}",
                        data=html_content,
                        file_name=f"table_{i+1}_{method}.html",
                        mime="text/html",
                        key=f"download_html_{method}_{i}",
                        use_container_width=True
                    )
            
            if i < len(html_files) - 1:
                st.markdown("---")
        
        st.markdown('</div>', unsafe_allow_html=True)
        
    else:
        st.warning(f"No HTML files found in {method_output_dir}")

def show_excel_files(method):
    """Display the spreadsheet files found in the method's output directory.

    For each file returned by ``get_excel_files`` this shows an info card
    (name, size, modification time), a preview of the first rows and a
    download button. ``.csv`` files are read directly as CSV; other files are
    read as Excel first and fall back to CSV if that fails. The download MIME
    type is chosen from the file extension.

    Args:
        method: Name of the extraction method, used as the subdirectory name.
    """
    method_output_dir = OUTPUT_BASE_PATH / method
    
    # Get actual Excel files from directory
    excel_files = get_excel_files(method_output_dir)

    # MIME type sent with the download, keyed by lower-cased extension
    mime_by_extension = {
        ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ".xls": "application/vnd.ms-excel",
        ".csv": "text/csv",
    }
    
    if excel_files:
        st.markdown(f"**Found {len(excel_files)} Excel file(s):**")
        
        for i, excel_file in enumerate(excel_files):
            # Get file info
            file_info = get_file_info(excel_file)
            file_name = os.path.basename(excel_file)
            extension = os.path.splitext(file_name)[1].lower()
            
            # File info card
            st.markdown(f"""
            <div class="file-info-card">
                <strong style="color: #495057;">📊 {file_name}</strong>
                <div class="file-stats">
                    <strong>Size:</strong> {file_info['size']}<br>
                    <strong>Modified:</strong> {file_info['modified']}
                </div>
            </div>
            """, unsafe_allow_html=True)
            
            # Load a preview with the reader that matches the file type
            df = None
            read_as_csv = extension == ".csv"
            try:
                if read_as_csv:
                    df = pd.read_csv(excel_file)
                else:
                    try:
                        df = pd.read_excel(excel_file)
                    except Exception:
                        # Fall back to CSV, e.g. for CSV content saved with an Excel extension
                        df = pd.read_csv(excel_file)
                        read_as_csv = True
            except Exception as e:
                st.warning(f"Could not preview file as Excel or CSV: {e}")

            if df is not None:
                if df.empty:
                    st.info("CSV file is empty" if read_as_csv else "Excel file is empty")
                else:
                    if read_as_csv:
                        st.markdown("**Preview (first 5 rows, read as CSV):**")
                    else:
                        st.markdown("**Preview (first 5 rows):**")
                    st.dataframe(df.head(), use_container_width=True)
                    st.markdown(f"**Dimensions:** {df.shape[0]} × {df.shape[1]}")
            
            # Download button for Excel file
            try:
                with open(excel_file, 'rb') as f:
                    excel_data = f.read()
                st.download_button(
                    label=f"⬇️ Download",
                    data=excel_data,
                    file_name=file_name,
                    mime=mime_by_extension.get(extension, "application/octet-stream"),
                    key=f"download_excel_{method}_{i}",
                    use_container_width=True
                )
            except Exception as e:
                st.error(f"Error reading Excel file for download: {e}")
            
            if i < len(excel_files) - 1:
                st.markdown("---")
    else:
        st.warning(f"No Excel files found in {method_output_dir}")

def process_document(file_path, output_dir, docling, llamaparse, unstructured):
    """Send a document to the FastAPI extraction endpoint and show the outcome.

    On a 200 response the JSON body is stored in ``st.session_state.results``
    and the user can pick a method to inspect: its JSON result is shown along
    with the HTML and spreadsheet files found under ``output_dir/<method>``.
    Failures are reported in the UI rather than raised.

    Args:
        file_path: Path of the PDF, as seen by the extraction service.
        output_dir: Directory where the service should write its output.
        docling: Whether to run the Docling extractor.
        llamaparse: Whether to run the LlamaParse extractor.
        unstructured: Whether to run the Unstructured extractor.
    """
    try:
        # Prepare the request data
        data = {
            'input_file_path': file_path,
            'output_dir': output_dir,
            'docling': docling,
            'llamaparse': llamaparse,
            'unstructured': unstructured
        }
        
        # Show processing message
        with st.spinner('Processing document...'):
            # Make request to FastAPI endpoint
            # Replace with your actual FastAPI endpoint URL
            # Bound only the connect phase: an unreachable host fails after 10 s
            # (ConnectTimeout is a ConnectionError, handled below), while a long
            # extraction is still allowed to run to completion.
            response = requests.post('http://localhost:8000/extract', data=data, timeout=(10, None))
            
            if response.status_code == 200:
                st.session_state.results = response.json()
                st.success("Document processed successfully!")
                
                # Show results
                results = st.session_state.results['results']
                
                # Method selection for viewing results
                st.markdown("### 📊 View Results")
                available_methods = [method for method in ['docling', 'llamaparse', 'unstructured'] 
                                   if method in results and isinstance(results[method], dict)]
                
                if available_methods:
                    selected_method = st.selectbox(
                        "Select extraction method to view:",
                        available_methods,
                        help="Choose which extraction method results to display"
                    )
                    
                    if selected_method and isinstance(results[selected_method], dict):
                        method_result = results[selected_method]
                        st.json(method_result)
                        
                        # List files in output directory
                        method_dir = os.path.join(output_dir, selected_method)
                        
                        # HTML files: the recursive pattern also matches files
                        # directly in method_dir, so one glob lists each file once
                        html_files = sorted(set(
                            glob.glob(os.path.join(method_dir, "**", "*.html"), recursive=True)
                        ))
                        
                        # Excel files
                        excel_files = get_excel_files(method_dir)
                        
                        if html_files or excel_files:
                            st.markdown("### 📄 Generated Files")
                            
                            if html_files:
                                st.markdown("**HTML Files:**")
                                for html_file in html_files:
                                    st.markdown(f"- {os.path.basename(html_file)}")
                            
                            if excel_files:
                                st.markdown("**Excel Files:**")
                                for excel_file in excel_files:
                                    st.markdown(f"- {os.path.basename(excel_file)}")
                else:
                    st.warning("No successful extractions found.")
                    
            else:
                st.error(f"Error processing document: {response.text}")
                
    except requests.exceptions.ConnectionError:
        st.error("Could not connect to the processing service. Please ensure the FastAPI server is running.")
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")

def main():
    """Draw the navigation header and render the page named in ``st.session_state.page``."""
    # Header: app title on the left, navigation buttons on the right
    title_col, nav_col = st.columns([1, 1])
    with title_col:
        st.markdown("### 📋 PDF Parser")
        st.markdown("*Table Extraction Tool*")
    with nav_col:
        dashboard_col, history_col = st.columns(2)
        with dashboard_col:
            if st.button("Dashboard", use_container_width=True):
                st.session_state.page = 'home'
                st.rerun()
        with history_col:
            # Rendered only; no action is attached to this button here
            st.button("History", use_container_width=True)
    st.markdown("---")

    # Page name -> renderer; an unrecognised page name renders nothing
    page_renderers = {
        'home': show_home_page,
        'upload': show_upload_page,
        'demo_setup': show_demo_setup_page,
        'demo': show_demo_page,
    }
    render_page = page_renderers.get(st.session_state.page)
    if render_page is not None:
        render_page()

if __name__ == "__main__":
    main()