Spaces:

traversaal-internal
/

Transform-PDF-Tables-to-HTML-and-Excel

Running

App Files Files Community

AreejMehboob commited on 4 days ago

Commit

4c10590

verified ·

1 Parent(s): 452d262

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +824 -38

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,826 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

+import io
 import streamlit as st
+import requests
+import time
+import os
+from pathlib import Path
+import glob
+import base64
+import pandas as pd
+from datetime import datetime
+# Configure page: browser-tab title and icon, wide layout, and a sidebar
+# that starts collapsed.
+st.set_page_config(
+    page_title="PDF Parser - Table Extraction Tool",
+    page_icon="📋",
+    layout="wide",
+    initial_sidebar_state="collapsed"
+)
+# Custom CSS for styling - Grey and White Theme
+st.markdown("""
+<style>
+    .main-header {
+        text-align: center;
+        padding: 2rem 0;
+        background: linear-gradient(135deg, #6c757d 0%, #495057 100%);
+        border-radius: 10px;
+        margin-bottom: 2rem;
+        color: white;
+    }
+    .feature-card {
+        background: #f8f9fa;
+        padding: 1.5rem;
+        border-radius: 10px;
+        box-shadow: 0 2px 10px rgba(0,0,0,0.1);
+        text-align: center;
+        margin: 1rem 0;
+        border: 1px solid #dee2e6;
+    }
+    .demo-button {
+        background: linear-gradient(45deg, #6c757d, #495057);
+        color: white;
+        border: none;
+        padding: 12px 24px;
+        border-radius: 25px;
+        font-weight: bold;
+        cursor: pointer;
+        margin: 10px;
+    }
+    .upload-button {
+        background: #495057;
+        color: white;
+        border: none;
+        padding: 12px 24px;
+        border-radius: 25px;
+        font-weight: bold;
+        cursor: pointer;
+        margin: 10px;
+    }
+    .success-message {
+        background: #f8f9fa;
+        color: #495057;
+        padding: 15px;
+        border-radius: 5px;
+        border-left: 4px solid #6c757d;
+        margin: 20px 0;
+    }
+    .processing-message {
+        background: #f8f9fa;
+        color: #495057;
+        padding: 15px;
+        border-radius: 5px;
+        border-left: 4px solid #adb5bd;
+        margin: 20px 0;
+    }
+    .method-tab {
+        background: #f8f9fa;
+        padding: 10px 15px;
+        border-radius: 5px;
+        margin: 5px;
+        cursor: pointer;
+        border: 2px solid #dee2e6;
+    }
+    .method-tab-active {
+        background: #6c757d;
+        color: white;
+        border: 2px solid #495057;
+    }
+    .html-file-card {
+        background: #f8f9fa;
+        padding: 15px;
+        border-radius: 8px;
+        margin: 10px 0;
+        border-left: 4px solid #6c757d;
+    }
+    .file-info-card {
+        background: #f8f9fa;
+        padding: 12px;
+        border-radius: 8px;
+        margin: 5px 0;
+        border-left: 4px solid #6c757d;
+        font-size: 0.9em;
+    }
+    .file-stats {
+        color: #6c757d;
+        font-size: 0.85em;
+        margin-top: 5px;
+    }
+    .stSelectbox > div > div {
+        background-color: #f8f9fa;
+    }
+    .hidden-text {
+        color: #adb5bd;
+        font-style: italic;
+    }
+    .table-container {
+        max-height: 400px;
+        overflow-y: auto;
+        border: 1px solid #dee2e6;
+        border-radius: 5px;
+        padding: 10px;
+        margin: 10px 0;
+        background-color: white;
+    }
+    .table-header {
+        background: #f8f9fa;
+        padding: 10px;
+        border-radius: 5px;
+        margin-bottom: 10px;
+        border-left: 4px solid #6c757d;
+    }
+    /* Override Streamlit button styles */
+    .stButton > button {
+        background-color: #6c757d !important;
+        color: white !important;
+        border: 1px solid #495057 !important;
+        border-radius: 5px !important;
+    }
+    .stButton > button:hover {
+        background-color: #495057 !important;
+        border-color: #343a40 !important;
+    }
+    /* Override primary button styles */
+    .stButton > button[kind="primary"] {
+        background-color: #495057 !important;
+        color: white !important;
+        border: 1px solid #343a40 !important;
+    }
+    .stButton > button[kind="primary"]:hover {
+        background-color: #343a40 !important;
+    }
+    /* Style checkboxes */
+    .stCheckbox > label {
+        color: #495057 !important;
+    }
+    /* Style text inputs */
+    .stTextInput > div > div > input {
+        background-color: #f8f9fa !important;
+        border-color: #dee2e6 !important;
+    }
+    /* Style file uploader */
+    .stFileUploader > div {
+        background-color: #f8f9fa !important;
+        border-color: #dee2e6 !important;
+    }
+    /* Style dataframes */
+    .stDataFrame {
+        background-color: white !important;
+        border: 1px solid #dee2e6 !important;
+    }
+    /* Style selectbox */
+    .stSelectbox > div > div {
+        background-color: #f8f9fa !important;
+        border-color: #dee2e6 !important;
+    }
+    /* Style progress bar */
+    .stProgress > div > div > div {
+        background-color: #6c757d !important;
+    }
+</style>
+""", unsafe_allow_html=True)
+# Initialize session state
+if 'page' not in st.session_state:
+    st.session_state.page = 'home'
+if 'processing' not in st.session_state:
+    st.session_state.processing = False
+if 'results' not in st.session_state:
+    st.session_state.results = None
+if 'show_output_dir' not in st.session_state:
+    st.session_state.show_output_dir = False
+if 'selected_method' not in st.session_state:
+    st.session_state.selected_method = None
+if 'demo_results' not in st.session_state:
+    st.session_state.demo_results = None
+if 'demo_selected_methods' not in st.session_state:
+    st.session_state.demo_selected_methods = {'docling': True, 'llamaparse': False, 'unstructured': False}
+# Tesla demo document path (adjust as needed)
+TESLA_DOC_PATH = r"C:\Users\Areej\Desktop\get-tables-fastapi\tesla_docs_28-41 (1)-9-14.pdf"
+OUTPUT_BASE_PATH = r"C:\Users\Areej\Desktop\get-tables-fastapi\output"
+def show_home_page():
+    # Header
+    st.markdown("""
+    <div class="main-header">
+        <h1 style="font-size: 3rem; margin: 0; color: #f8f9fa;">Transform PDF Tables to</h1>
+        <h1 style="font-size: 3rem; margin: 0; color: #ffffff;">HTML and Excel</h1>
+        <p style="margin-top: 1rem; font-size: 1.2rem; opacity: 0.9;">Powered by Traversaal.ai</p>
+        <p style="margin-top: 0.5rem; opacity: 0.8;">Perfect for financial reports, research papers, and data analysis.</p>
+    </div>
+    """, unsafe_allow_html=True)
+    # Main buttons
+    col1, col2, col3 = st.columns([1, 2, 1])
+    with col2:
+        col_btn1, col_btn2 = st.columns(2)
+        with col_btn1:
+            if st.button("📄 Upload PDF Document", key="upload_btn", help="Upload your own PDF document"):
+                st.session_state.page = 'upload'
+                st.rerun()
+        with col_btn2:
+            if st.button("⚡ Try Tesla 10K Demo", key="demo_btn", help="Try with Tesla's 10K form"):
+                st.session_state.page = 'demo_setup'
+                st.rerun()
+    # Features section
+    st.markdown("---")
+    col1, col2, col3 = st.columns(3)
+    with col1:
+        st.markdown("""
+        <div class="feature-card">
+            <h3 style="color: #495057;">⚡ Lightning Fast</h3>
+            <p style="color: #6c757d;">Process complex PDFs in seconds with our advanced AI algorithms</p>
+        </div>
+        """, unsafe_allow_html=True)
+    with col2:
+        st.markdown("""
+        <div class="feature-card">
+            <h3 style="color: #495057;">🔒 Secure & Private</h3>
+            <p style="color: #6c757d;">Your documents are processed securely and never stored permanently</p>
+        </div>
+        """, unsafe_allow_html=True)
+    with col3:
+        st.markdown("""
+        <div class="feature-card">
+            <h3 style="color: #495057;">🔄 Batch Processing</h3>
+            <p style="color: #6c757d;">Handle multiple documents and tables simultaneously</p>
+        </div>
+        """, unsafe_allow_html=True)
+def show_upload_page():
+    st.markdown("## 📄 Upload Your Document")
+    # File upload
+    uploaded_file = st.file_uploader(
+        "Choose a PDF file",
+        type=['pdf'],
+        help="Upload a PDF document to extract tables from"
+    )
+    # Input file path (alternative)
+    st.markdown("**Or specify file path:**")
+    input_file_path = st.text_input(
+        "Input File Path",
+        placeholder="C:\\path\\to\\your\\document.pdf",
+        help="Enter the full path to your PDF file"
+    )
+    # Output directory with show/hide functionality
+    output_dir = st.text_input(
+        "Output Directory",
+        placeholder="C:\\path\\to\\output\\folder",
+        help="Directory where extracted tables will be saved",
+        type="password" if not st.session_state.show_output_dir else "default"
+    )
+    # Show/Hide output directory toggle
+    col1, col2 = st.columns([3, 1])
+    with col2:
+        if st.button("👁️ View/Hide Path"):
+            st.session_state.show_output_dir = not st.session_state.show_output_dir
+            st.rerun()
+    # Extraction method selection
+    st.markdown("### 🔧 Select Extraction Methods")
+    col1, col2, col3 = st.columns(3)
+    with col1:
+        docling = st.checkbox("Docling", value=True, help="Advanced document processing")
+    with col2:
+        llamaparse = st.checkbox("LlamaParse", value=False, help="AI-powered parsing")
+    with col3:
+        unstructured = st.checkbox("Unstructured", value=False, help="General purpose extraction")
+    # Process button
+    if st.button("🚀 Process Document", type="primary"):
+        if (uploaded_file or input_file_path) and output_dir and (docling or llamaparse or unstructured):
+            file_path = input_file_path if input_file_path else uploaded_file.name
+            process_document(file_path, output_dir, docling, llamaparse, unstructured)
+        else:
+            st.error("Please provide input file, output directory, and select at least one extraction method.")
+    # Back button
+    if st.button("← Back to Home"):
+        st.session_state.page = 'home'
+        st.rerun()
+def show_demo_setup_page():
+    st.markdown("## ⚡ Tesla 10K Demo Setup")
+    st.markdown("*Configure extraction methods for Tesla's 10K document processing*")
+    # Document info
+    st.markdown("### 📄 Document Information")
+    st.info("**Document:** tesla_docs_28-41 (1)-9-14.pdf")
+    # Extraction method selection (removed output directory section completely)
+    st.markdown("### 🔧 Select Extraction Methods")
+    col1, col2, col3 = st.columns(3)
+    with col1:
+        docling = st.checkbox("Docling",
+                             value=st.session_state.demo_selected_methods['docling'],
+                             help="Advanced document processing")
+    with col2:
+        llamaparse = st.checkbox("LlamaParse",
+                                value=st.session_state.demo_selected_methods['llamaparse'],
+                                help="AI-powered parsing")
+    with col3:
+        unstructured = st.checkbox("Unstructured",
+                                  value=st.session_state.demo_selected_methods['unstructured'],
+                                  help="General purpose extraction")
+    # Update session state
+    st.session_state.demo_selected_methods = {
+        'docling': docling,
+        'llamaparse': llamaparse,
+        'unstructured': unstructured
+    }
+    # Process button
+    col1, col2 = st.columns([2, 1])
+    with col1:
+        if st.button("🚀 Process Tesla Document", type="primary"):
+            if docling or llamaparse or unstructured:
+                st.session_state.page = 'demo'
+                st.session_state.processing = True
+                st.rerun()
+            else:
+                st.error("Please select at least one extraction method.")
+    with col2:
+        if st.button("← Back to Home"):
+            st.session_state.page = 'home'
+            st.rerun()
+def show_demo_page():
+    if st.session_state.processing:
+        show_processing_demo()
+    else:
+        show_demo_results()
+def show_processing_demo():
+    st.markdown("## ⚡ Processing Tesla 10K Document...")
+    # Show selected methods
+    selected_methods = [method for method, selected in st.session_state.demo_selected_methods.items() if selected]
+    st.markdown(f"*Processing with selected methods: {', '.join([m.title() for m in selected_methods])}*")
+    # Progress bar
+    progress_bar = st.progress(0)
+    status_text = st.empty()
+    method_status = st.empty()
+    # Calculate total steps based on selected methods
+    total_methods = len(selected_methods)
+    steps_per_method = 30
+    total_steps = total_methods * steps_per_method
+    current_method_index = 0
+    for i in range(total_steps):
+        progress = (i + 1) / total_steps
+        progress_bar.progress(progress)
+        # Determine current method
+        method_step = i % steps_per_method
+        if method_step == 0 and i > 0:
+            current_method_index += 1
+        current_method = selected_methods[current_method_index]
+        method_progress = (method_step + 1) / steps_per_method
+        # Update status messages
+        if method_progress < 0.3:
+            status_text.text(f"📄 {current_method.title()}: Reading document... {int(method_progress * 100)}%")
+        elif method_progress < 0.7:
+            status_text.text(f"🔍 {current_method.title()}: Extracting tables... {int(method_progress * 100)}%")
+        else:
+            status_text.text(f"💾 {current_method.title()}: Generating HTML outputs... {int(method_progress * 100)}%")
+        method_status.markdown(f"**Overall Progress:** {int(progress * 100)}% | **Current Method:** {current_method.title()}")
+        time.sleep(0.33)
+    # Show completion
+    st.markdown("""
+    <div class="success-message">
+        ✅ <strong>Document processed successfully!</strong><br>
+        Tables have been extracted using selected methods and HTML files are ready for viewing.
+    </div>
+    """, unsafe_allow_html=True)
+    # Process Tesla demo
+    process_tesla_demo()
+    st.session_state.processing = False
+    time.sleep(2)
+    st.rerun()
+def process_tesla_demo():
+    """Process Tesla demo document using selected extraction methods"""
+    try:
+        # Create output directory for demo (using the base path)
+        demo_output_dir = os.path.join(OUTPUT_BASE_PATH, "tesla_demo")
+        # Prepare the request data for selected methods only
+        data = {
+            'input_file_path': TESLA_DOC_PATH,
+            'output_dir': demo_output_dir,
+            'docling': st.session_state.demo_selected_methods['docling'],
+            'llamaparse': st.session_state.demo_selected_methods['llamaparse'],
+            'unstructured': st.session_state.demo_selected_methods['unstructured']
+        }
+        # Make request to FastAPI endpoint (uncomment when ready)
+        # response = requests.post('http://localhost:8000/extract', data=data)
+        # if response.status_code == 200:
+        #     st.session_state.demo_results = response.json()
+        # For demo purposes, simulate successful processing for selected methods only
+        results = {}
+        if st.session_state.demo_selected_methods['docling']:
+            results['docling'] = {'status': 'success', 'total_tables': 5}
+        if st.session_state.demo_selected_methods['llamaparse']:
+            results['llamaparse'] = {'status': 'success', 'total_tables': 3}
+        if st.session_state.demo_selected_methods['unstructured']:
+            results['unstructured'] = {'status': 'success', 'total_tables': 4}
+        st.session_state.demo_results = {'results': results}
+    except Exception as e:
+        st.error(f"Error processing Tesla demo: {str(e)}")
+def count_html_files(directory):
+    """Count only HTML files in directory"""
+    if not os.path.exists(directory):
+        return 0
+    html_files = glob.glob(os.path.join(directory, "*.html"))
+    html_files.extend(glob.glob(os.path.join(directory, "**", "*.html"), recursive=True))
+    return len(html_files)
+def get_excel_files(directory):
+    """Get all Excel files from directory"""
+    if not os.path.exists(directory):
+        return []
+    excel_files = glob.glob(os.path.join(directory, "*.xlsx"))
+    excel_files.extend(glob.glob(os.path.join(directory, "*.xls")))
+    excel_files.extend(glob.glob(os.path.join(directory, "*.csv")))
+    excel_files.extend(glob.glob(os.path.join(directory, "**", "*.xlsx"), recursive=True))
+    excel_files.extend(glob.glob(os.path.join(directory, "**", "*.xls"), recursive=True))
+    return excel_files
+def get_file_info(file_path):
+    """Get file information including size and modification time"""
+    if not os.path.exists(file_path):
+        return {"size": 0, "modified": "Unknown"}
+    stat = os.stat(file_path)
+    size_kb = stat.st_size / 1024
+    modified = datetime.fromtimestamp(stat.st_mtime)
+    return {
+        "size": f"{size_kb:.1f} KB",
+        "modified": modified.strftime("%Y-%m-%d %H:%M")
+    }
+def show_demo_results():
+    """Render the Tesla demo results page.
+
+    Shows the document name and the methods that were selected, a Reset button
+    that restores the demo-related session state and returns to the home page,
+    one button per selected method (only when more than one was selected), and
+    finally the results of the currently selected method.
+    """
+    st.markdown("## 📊 Tesla 10K Processing Results")
+    # Document info
+    col1, col2 = st.columns([2, 1])
+    with col1:
+        st.markdown("### 📄 tesla_docs_28-41 (1)-9-14.pdf")
+        # The status line is a fixed string; it is not derived from
+        # st.session_state.demo_results.
+        st.markdown("**Status:** ✅ Complete")
+        processed_methods = [method.title() for method, selected in st.session_state.demo_selected_methods.items() if selected]
+        st.markdown(f"**Processed with:** {', '.join(processed_methods)}")
+    with col2:
+        if st.button("🔄 Reset"):
+            # Restore the same values the module-level initialisation uses.
+            st.session_state.page = 'home'
+            st.session_state.processing = False
+            st.session_state.results = None
+            st.session_state.demo_results = None
+            st.session_state.selected_method = None
+            st.session_state.demo_selected_methods = {'docling': True, 'llamaparse': False, 'unstructured': False}
+            st.rerun()
+    # Method selection tabs - only show selected methods
+    available_methods = [method for method, selected in st.session_state.demo_selected_methods.items() if selected]
+    if len(available_methods) > 1:
+        st.markdown("### 🔧 Select Extraction Method to View")
+        method_labels = {
+            'docling': '🔧 Docling',
+            'llamaparse': '🦙 LlamaParse',
+            'unstructured': '📊 Unstructured'
+        }
+        # Create columns based on number of available methods
+        cols = st.columns(len(available_methods))
+        for i, method in enumerate(available_methods):
+            with cols[i]:
+                # Show HTML file count for each method using the same logic as show_html_tables
+                # NOTE(review): files are looked up in OUTPUT_BASE_PATH/<method>, while
+                # process_tesla_demo prepares OUTPUT_BASE_PATH/tesla_demo as its
+                # output_dir — confirm which directory the service writes to.
+                method_output_dir = os.path.join(OUTPUT_BASE_PATH, method)
+                html_files = []
+                if os.path.exists(method_output_dir):
+                    html_files = glob.glob(os.path.join(method_output_dir, "**", "*.html"), recursive=True)
+                    html_files = list(set(html_files))
+                html_count = len(html_files)
+                button_label = f"{method_labels[method]} ({html_count} HTML files)"
+                if st.button(button_label, key=f"tab_{method}", use_container_width=True):
+                    # No rerun needed: the selection is read further down in this same run.
+                    st.session_state.selected_method = method
+    # Default to first available method if no method selected
+    if st.session_state.selected_method is None or st.session_state.selected_method not in available_methods:
+        st.session_state.selected_method = available_methods[0] if available_methods else None
+    # Show results for selected method
+    if st.session_state.selected_method:
+        show_method_results(st.session_state.selected_method)
+def show_method_results(method):
+    st.markdown(f"### 📋 Results from {method.title()}")
+    # Changed column ratio: 3:1 for HTML tables:Excel files
+    col1, col2 = st.columns([3, 1])
+    with col1:
+        st.markdown("#### 📄 HTML Tables")
+        show_html_tables(method)
+    with col2:
+        st.markdown("#### 📊 Excel Files")
+        show_excel_files(method)
+def show_html_tables(method):
+    """Display HTML tables from the method's output directory"""
+    method_output_dir = os.path.join(OUTPUT_BASE_PATH, method)
+    # Get actual HTML files from directory
+    html_files = []
+    if os.path.exists(method_output_dir):
+        # Use only the recursive glob, which includes the top-level directory
+        html_files = glob.glob(os.path.join(method_output_dir, "**", "*.html"), recursive=True)
+        # Remove duplicates just in case
+        html_files = list(set(html_files))
+    # Sort files by table number if possible (e.g., table_1, table_2, ...)
+    import re
+    def extract_table_number(filename):
+        match = re.search(r"table[_-](\d+)", filename, re.IGNORECASE)
+        if match:
+            return int(match.group(1))
+        return float('inf')  # Put files without a number at the end
+    html_files.sort(key=lambda f: extract_table_number(os.path.basename(f)))
+    if html_files:
+        st.markdown(f"**Found {len(html_files)} HTML table(s):**")
+        # Display all HTML files in one scrollable container
+        st.markdown('<div class="table-container">', unsafe_allow_html=True)
+        for i, html_file in enumerate(html_files):
+            st.markdown(f"""
+            <div class="table-header">
+                <h4 style="color: #495057;">📋 Table {i+1}</h4>
+                <small style="color: #6c757d;">File: {os.path.basename(html_file)}</small>
+            </div>
+            """, unsafe_allow_html=True)
+            # Display HTML content
+            try:
+                with open(html_file, 'r', encoding='utf-8') as f:
+                    html_content = f.read()
+                st.components.v1.html(html_content, height=300, scrolling=True)
+            except Exception as e:
+                st.error(f"Error displaying HTML file: {e}")
+            # Download button for individual HTML file
+            col_download1, col_download2, col_download3 = st.columns([1, 1, 2])
+            with col_download1:
+                try:
+                    with open(html_file, 'r', encoding='utf-8') as f:
+                        html_content = f.read()
+                    st.download_button(
+                        label=f"⬇️ Table {i+1}",
+                        data=html_content,
+                        file_name=f"table_{i+1}_{method}.html",
+                        mime="text/html",
+                        key=f"download_html_{method}_{i}",
+                        use_container_width=True
+                    )
+                except Exception as e:
+                    st.error(f"Error reading file for download: {e}")
+            if i < len(html_files) - 1:
+                st.markdown("---")
+        st.markdown('</div>', unsafe_allow_html=True)
+    else:
+        st.warning(f"No HTML files found in {method_output_dir}")
+def show_excel_files(method):
+    """Display Excel files from the method's output directory"""
+    method_output_dir = os.path.join(OUTPUT_BASE_PATH, method)
+    # Get actual Excel files from directory
+    excel_files = get_excel_files(method_output_dir)
+    if excel_files:
+        st.markdown(f"**Found {len(excel_files)} Excel file(s):**")
+        for i, excel_file in enumerate(excel_files):
+            # Get file info
+            file_info = get_file_info(excel_file)
+            file_name = os.path.basename(excel_file)
+            # File info card
+            st.markdown(f"""
+            <div class="file-info-card">
+                <strong style="color: #495057;">📊 {file_name}</strong>
+                <div class="file-stats">
+                    <strong>Size:</strong> {file_info['size']}<br>
+                    <strong>Modified:</strong> {file_info['modified']}
+                </div>
+            </div>
+            """, unsafe_allow_html=True)
+            # Try to read and display Excel file preview
+            try:
+                df = pd.read_excel(excel_file)
+                if not df.empty:
+                    st.markdown(f"**Preview (first 5 rows):**")
+                    st.dataframe(df.head(), use_container_width=True)
+                    st.markdown(f"**Dimensions:** {df.shape[0]} × {df.shape[1]}")
+                else:
+                    st.info("Excel file is empty")
+            except Exception as e:
+                # Try reading as CSV if Excel reading fails
+                try:
+                    df = pd.read_csv(excel_file)
+                    if not df.empty:
+                        st.markdown(f"**Preview (first 5 rows, read as CSV):**")
+                        st.dataframe(df.head(), use_container_width=True)
+                        st.markdown(f"**Dimensions:** {df.shape[0]} × {df.shape[1]}")
+                    else:
+                        st.info("CSV file is empty")
+                except Exception as e2:
+                    st.warning(f"Could not preview file as Excel or CSV: {e2}")
+            # Download button for Excel file
+            try:
+                with open(excel_file, 'rb') as f:
+                    excel_data = f.read()
+                st.download_button(
+                    label=f"⬇️ Download",
+                    data=excel_data,
+                    file_name=file_name,
+                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+                    key=f"download_excel_{method}_{i}",
+                    use_container_width=True
+                )
+            except Exception as e:
+                st.error(f"Error reading Excel file for download: {e}")
+            if i < len(excel_files) - 1:
+                st.markdown("---")
+    else:
+        st.warning(f"No Excel files found in {method_output_dir}")
+def process_document(file_path, output_dir, docling, llamaparse, unstructured):
+    """Process document using the FastAPI endpoint"""
+    try:
+        # Prepare the request data
+        data = {
+            'input_file_path': file_path,
+            'output_dir': output_dir,
+            'docling': docling,
+            'llamaparse': llamaparse,
+            'unstructured': unstructured
+        }
+        # Show processing message
+        with st.spinner('Processing document...'):
+            # Make request to FastAPI endpoint
+            # Replace with your actual FastAPI endpoint URL
+            response = requests.post('http://localhost:8000/extract', data=data)
+            if response.status_code == 200:
+                st.session_state.results = response.json()
+                st.success("Document processed successfully!")
+                # Show results
+                results = st.session_state.results['results']
+                # Method selection for viewing results
+                st.markdown("### 📊 View Results")
+                available_methods = [method for method in ['docling', 'llamaparse', 'unstructured']
+                                   if method in results and isinstance(results[method], dict)]
+                if available_methods:
+                    selected_method = st.selectbox(
+                        "Select extraction method to view:",
+                        available_methods,
+                        help="Choose which extraction method results to display"
+                    )
+                    if selected_method and isinstance(results[selected_method], dict):
+                        method_result = results[selected_method]
+                        st.json(method_result)
+                        # List files in output directory
+                        method_dir = os.path.join(output_dir, selected_method)
+                        # HTML files
+                        html_files = glob.glob(os.path.join(method_dir, "*.html"))
+                        html_files.extend(glob.glob(os.path.join(method_dir, "**", "*.html"), recursive=True))
+                        # Excel files
+                        excel_files = get_excel_files(method_dir)
+                        if html_files or excel_files:
+                            st.markdown("### 📄 Generated Files")
+                            if html_files:
+                                st.markdown("**HTML Files:**")
+                                for html_file in html_files:
+                                    st.markdown(f"- {os.path.basename(html_file)}")
+                            if excel_files:
+                                st.markdown("**Excel Files:**")
+                                for excel_file in excel_files:
+                                    st.markdown(f"- {os.path.basename(excel_file)}")
+                else:
+                    st.warning("No successful extractions found.")
+            else:
+                st.error(f"Error processing document: {response.text}")
+    except requests.exceptions.ConnectionError:
+        st.error("Could not connect to the processing service. Please ensure the FastAPI server is running.")
+    except Exception as e:
+        st.error(f"An error occurred: {str(e)}")
+def main():
+    # Navigation header
+    col1, col2 = st.columns([1, 1])
+    with col1:
+        st.markdown("### 📋 PDF Parser")
+        st.markdown("*Table Extraction Tool*")
+    with col2:
+        nav_col1, nav_col2 = st.columns(2)
+        with nav_col1:
+            if st.button("Dashboard", use_container_width=True):
+                st.session_state.page = 'home'
+                st.rerun()
+        with nav_col2:
+            st.button("History", use_container_width=True)
+    st.markdown("---")
+    # Route to appropriate page
+    if st.session_state.page == 'home':
+        show_home_page()
+    elif st.session_state.page == 'upload':
+        show_upload_page()
+    elif st.session_state.page == 'demo_setup':
+        show_demo_setup_page()
+    elif st.session_state.page == 'demo':
+        show_demo_page()
+# Script entry point: render the app when this file is run as the main module.
+if __name__ == "__main__":
+    main()