|
import io |
|
import streamlit as st |
|
import requests |
|
import time |
|
import os |
|
from pathlib import Path |
|
import glob |
|
import base64 |
|
import pandas as pd |
|
from datetime import datetime |
|
|
|
|
|
# Browser-tab title/icon and overall page layout for the Streamlit app.
_PAGE_SETTINGS = {
    "page_title": "PDF Parser - Table Extraction Tool",
    "page_icon": "π",
    "layout": "wide",
    "initial_sidebar_state": "collapsed",
}
st.set_page_config(**_PAGE_SETTINGS)
|
|
|
|
|
# Stylesheet for the custom HTML snippets emitted by the page functions, plus
# overrides for Streamlit's own widget classes (.stButton, .stSelectbox, ...).
_CUSTOM_CSS = """
<style>
    .main-header {
        text-align: center;
        padding: 2rem 0;
        background: linear-gradient(135deg, #6c757d 0%, #495057 100%);
        border-radius: 10px;
        margin-bottom: 2rem;
        color: white;
    }

    .feature-card {
        background: #f8f9fa;
        padding: 1.5rem;
        border-radius: 10px;
        box-shadow: 0 2px 10px rgba(0,0,0,0.1);
        text-align: center;
        margin: 1rem 0;
        border: 1px solid #dee2e6;
    }

    .demo-button {
        background: linear-gradient(45deg, #6c757d, #495057);
        color: white;
        border: none;
        padding: 12px 24px;
        border-radius: 25px;
        font-weight: bold;
        cursor: pointer;
        margin: 10px;
    }

    .upload-button {
        background: #495057;
        color: white;
        border: none;
        padding: 12px 24px;
        border-radius: 25px;
        font-weight: bold;
        cursor: pointer;
        margin: 10px;
    }

    .success-message {
        background: #f8f9fa;
        color: #495057;
        padding: 15px;
        border-radius: 5px;
        border-left: 4px solid #6c757d;
        margin: 20px 0;
    }

    .processing-message {
        background: #f8f9fa;
        color: #495057;
        padding: 15px;
        border-radius: 5px;
        border-left: 4px solid #adb5bd;
        margin: 20px 0;
    }

    .method-tab {
        background: #f8f9fa;
        padding: 10px 15px;
        border-radius: 5px;
        margin: 5px;
        cursor: pointer;
        border: 2px solid #dee2e6;
    }

    .method-tab-active {
        background: #6c757d;
        color: white;
        border: 2px solid #495057;
    }

    .html-file-card {
        background: #f8f9fa;
        padding: 15px;
        border-radius: 8px;
        margin: 10px 0;
        border-left: 4px solid #6c757d;
    }

    .file-info-card {
        background: #f8f9fa;
        padding: 12px;
        border-radius: 8px;
        margin: 5px 0;
        border-left: 4px solid #6c757d;
        font-size: 0.9em;
    }

    .file-stats {
        color: #6c757d;
        font-size: 0.85em;
        margin-top: 5px;
    }

    .stSelectbox > div > div {
        background-color: #f8f9fa;
    }

    .hidden-text {
        color: #adb5bd;
        font-style: italic;
    }

    .table-container {
        max-height: 400px;
        overflow-y: auto;
        border: 1px solid #dee2e6;
        border-radius: 5px;
        padding: 10px;
        margin: 10px 0;
        background-color: white;
    }

    .table-header {
        background: #f8f9fa;
        padding: 10px;
        border-radius: 5px;
        margin-bottom: 10px;
        border-left: 4px solid #6c757d;
    }

    /* Override Streamlit button styles */
    .stButton > button {
        background-color: #6c757d !important;
        color: white !important;
        border: 1px solid #495057 !important;
        border-radius: 5px !important;
    }

    .stButton > button:hover {
        background-color: #495057 !important;
        border-color: #343a40 !important;
    }

    /* Override primary button styles */
    .stButton > button[kind="primary"] {
        background-color: #495057 !important;
        color: white !important;
        border: 1px solid #343a40 !important;
    }

    .stButton > button[kind="primary"]:hover {
        background-color: #343a40 !important;
    }

    /* Style checkboxes */
    .stCheckbox > label {
        color: #495057 !important;
    }

    /* Style text inputs */
    .stTextInput > div > div > input {
        background-color: #f8f9fa !important;
        border-color: #dee2e6 !important;
    }

    /* Style file uploader */
    .stFileUploader > div {
        background-color: #f8f9fa !important;
        border-color: #dee2e6 !important;
    }

    /* Style dataframes */
    .stDataFrame {
        background-color: white !important;
        border: 1px solid #dee2e6 !important;
    }

    /* Style selectbox */
    .stSelectbox > div > div {
        background-color: #f8f9fa !important;
        border-color: #dee2e6 !important;
    }

    /* Style progress bar */
    .stProgress > div > div > div {
        background-color: #6c757d !important;
    }
</style>
"""

# unsafe_allow_html is required for the <style> element to be emitted as markup.
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
|
|
|
|
|
# Seed st.session_state with a default for every key the app reads.
# Keys that are already present are left untouched.
_SESSION_DEFAULTS = {
    'page': 'home',
    'processing': False,
    'results': None,
    'show_output_dir': False,
    'selected_method': None,
    'demo_results': None,
    'demo_selected_methods': {'docling': True, 'llamaparse': False, 'unstructured': False},
}
for _state_key, _state_default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
|
|
|
|
|
# Directory that contains this script; the paths below are built from it.
SCRIPT_DIR = Path(__file__).parent

# Sample PDF used by the Tesla demo flow.
TESLA_DOC_PATH = SCRIPT_DIR.joinpath("tesla_docs_28-41 (1)-9-14.pdf")

# Root folder whose per-method subfolders are scanned for HTML/Excel results.
OUTPUT_BASE_PATH = SCRIPT_DIR.joinpath("output")
|
|
|
def show_home_page():
    """Render the landing page: hero banner, two entry buttons, three feature cards.

    Clicking a button stores the target page name in ``st.session_state.page``
    and triggers a rerun.
    """
    hero_html = """
    <div class="main-header">
        <h1 style="font-size: 3rem; margin: 0; color: #f8f9fa;">Transform PDF Tables to</h1>
        <h1 style="font-size: 3rem; margin: 0; color: #ffffff;">HTML and Excel</h1>
        <p style="margin-top: 1rem; font-size: 1.2rem; opacity: 0.9;">Powered by Traversaal.ai</p>
        <p style="margin-top: 0.5rem; opacity: 0.8;">Perfect for financial reports, research papers, and data analysis.</p>
    </div>
    """
    st.markdown(hero_html, unsafe_allow_html=True)

    # Only the wide middle column is populated; the outer two act as margins.
    _, centre, _ = st.columns([1, 2, 1])
    with centre:
        upload_slot, demo_slot = st.columns(2)

        with upload_slot:
            upload_clicked = st.button(
                "π Upload PDF Document",
                key="upload_btn",
                help="Upload your own PDF document",
            )
            if upload_clicked:
                st.session_state.page = 'upload'
                st.rerun()

        with demo_slot:
            demo_clicked = st.button(
                "β‘ Try Tesla 10K Demo",
                key="demo_btn",
                help="Try with Tesla's 10K form",
            )
            if demo_clicked:
                st.session_state.page = 'demo_setup'
                st.rerun()

    st.markdown("---")

    # (heading, description) for each card, left to right.
    feature_cards = (
        ("β‘ Lightning Fast",
         "Process complex PDFs in seconds with our advanced AI algorithms"),
        ("π Secure & Private",
         "Your documents are processed securely and never stored permanently"),
        ("π Batch Processing",
         "Handle multiple documents and tables simultaneously"),
    )
    for column, (heading, blurb) in zip(st.columns(3), feature_cards):
        with column:
            st.markdown(f"""
            <div class="feature-card">
                <h3 style="color: #495057;">{heading}</h3>
                <p style="color: #6c757d;">{blurb}</p>
            </div>
            """, unsafe_allow_html=True)
|
|
|
def show_upload_page():
    """Render the upload form and run extraction when the user submits it.

    The PDF can be supplied through the upload widget or as a path typed by
    the user; when both are present the typed path is used. An uploaded file
    is first written to a temporary directory, because ``process_document``
    is given a filesystem path rather than the file's bytes.
    """
    import tempfile  # function-scope import: only this page needs it

    st.markdown("## π Upload Your Document")

    uploaded_file = st.file_uploader(
        "Choose a PDF file",
        type=['pdf'],
        help="Upload a PDF document to extract tables from",
    )

    st.markdown("**Or specify file path:**")
    input_file_path = st.text_input(
        "Input File Path",
        placeholder="C:\\path\\to\\your\\document.pdf",
        help="Enter the full path to your PDF file",
    )

    # The field is masked like a password until the toggle below flips
    # st.session_state.show_output_dir.
    output_dir = st.text_input(
        "Output Directory",
        placeholder="C:\\path\\to\\output\\folder",
        help="Directory where extracted tables will be saved",
        type="password" if not st.session_state.show_output_dir else "default",
    )

    col1, col2 = st.columns([3, 1])
    with col2:
        if st.button("ποΈ View/Hide Path"):
            st.session_state.show_output_dir = not st.session_state.show_output_dir
            st.rerun()

    st.markdown("### π§ Select Extraction Methods")
    col1, col2, col3 = st.columns(3)

    with col1:
        docling = st.checkbox("Docling", value=True, help="Advanced document processing")
    with col2:
        llamaparse = st.checkbox("LlamaParse", value=False, help="AI-powered parsing")
    with col3:
        unstructured = st.checkbox("Unstructured", value=False, help="General purpose extraction")

    if st.button("π Process Document", type="primary"):
        has_source = bool(uploaded_file or input_file_path)
        has_method = docling or llamaparse or unstructured

        if not (has_source and output_dir and has_method):
            st.error("Please provide input file, output directory, and select at least one extraction method.")
        elif input_file_path:
            process_document(input_file_path, output_dir, docling, llamaparse, unstructured)
        else:
            # uploaded_file.name is only the client-side file name, not a path
            # that exists on disk, so the bytes are staged to a real file first.
            # NOTE(review): assumes the extraction service reads the same
            # filesystem and has finished with the file once process_document
            # returns; the staging directory is deleted at that point.
            try:
                with tempfile.TemporaryDirectory(prefix="pdf_parser_upload_") as staging_dir:
                    # Keep only the last path component so a crafted name
                    # cannot place the file outside staging_dir.
                    staged_pdf = Path(staging_dir) / Path(uploaded_file.name).name
                    staged_pdf.write_bytes(uploaded_file.getvalue())
                    process_document(str(staged_pdf), output_dir, docling, llamaparse, unstructured)
            except OSError as e:
                st.error(f"Could not save the uploaded file to a temporary location: {e}")

    if st.button("β Back to Home"):
        st.session_state.page = 'home'
        st.rerun()
|
|
|
def show_demo_setup_page():
    """Render the demo configuration page.

    The three method checkboxes are initialised from, and written back to,
    ``st.session_state.demo_selected_methods``. Submitting with at least one
    method ticked switches to the 'demo' page with ``processing`` set.
    """
    st.markdown("## β‘ Tesla 10K Demo Setup")
    st.markdown("*Configure extraction methods for Tesla's 10K document processing*")

    st.markdown("### π Document Information")
    st.info("**Document:** tesla_docs_28-41 (1)-9-14.pdf")

    st.markdown("### π§ Select Extraction Methods")

    # (session-state key, checkbox label, tooltip), left to right.
    method_specs = (
        ('docling', "Docling", "Advanced document processing"),
        ('llamaparse', "LlamaParse", "AI-powered parsing"),
        ('unstructured', "Unstructured", "General purpose extraction"),
    )
    previous = st.session_state.demo_selected_methods
    chosen = {}
    for column, (state_key, label, tooltip) in zip(st.columns(3), method_specs):
        with column:
            chosen[state_key] = st.checkbox(label, value=previous[state_key], help=tooltip)

    # Persist the current ticks so the processing and results pages can read them.
    st.session_state.demo_selected_methods = chosen

    action_col, back_col = st.columns([2, 1])
    with action_col:
        if st.button("π Process Tesla Document", type="primary"):
            if not any(chosen.values()):
                st.error("Please select at least one extraction method.")
            else:
                st.session_state.page = 'demo'
                st.session_state.processing = True
                st.rerun()

    with back_col:
        if st.button("β Back to Home"):
            st.session_state.page = 'home'
            st.rerun()
|
|
|
def show_demo_page():
    """Show the progress view while ``processing`` is set, otherwise the results view."""
    render = show_processing_demo if st.session_state.processing else show_demo_results
    render()
|
|
|
def show_processing_demo():
    """Show a timed progress display for the demo, then switch to the results view.

    Each selected method gets 30 progress steps with a 0.33 s pause per step.
    When the steps finish, a success banner is shown, ``process_tesla_demo``
    is called, ``processing`` is cleared and the script reruns so that
    ``show_demo_page`` renders the results.
    """
    st.markdown("## β‘ Processing Tesla 10K Document...")

    selected_methods = [method for method, selected in st.session_state.demo_selected_methods.items() if selected]
    st.markdown(f"*Processing with selected methods: {', '.join([m.title() for m in selected_methods])}*")

    progress_bar = st.progress(0)
    status_text = st.empty()
    method_status = st.empty()

    steps_per_method = 30
    total_steps = len(selected_methods) * steps_per_method

    for step in range(total_steps):
        progress = (step + 1) / total_steps
        progress_bar.progress(progress)

        # Quotient: which method this step belongs to; remainder: the step
        # within that method.
        current_method_index, method_step = divmod(step, steps_per_method)
        method_name = selected_methods[current_method_index].title()
        method_progress = (method_step + 1) / steps_per_method
        method_percent = int(method_progress * 100)

        if method_progress < 0.3:
            status_text.text(f"π {method_name}: Reading document... {method_percent}%")
        elif method_progress < 0.7:
            status_text.text(f"π {method_name}: Extracting tables... {method_percent}%")
        else:
            status_text.text(f"πΎ {method_name}: Generating HTML outputs... {method_percent}%")

        method_status.markdown(f"**Overall Progress:** {int(progress * 100)}% | **Current Method:** {method_name}")

        time.sleep(0.33)

    # NOTE(review): the banner is rendered before process_tesla_demo() runs, so
    # a failure there would appear underneath a success message.
    st.markdown("""
    <div class="success-message">
        ✅ <strong>Document processed successfully!</strong><br>
        Tables have been extracted using selected methods and HTML files are ready for viewing.
    </div>
    """, unsafe_allow_html=True)

    process_tesla_demo()

    st.session_state.processing = False
    time.sleep(2)  # leave the banner on screen briefly before rerunning
    st.rerun()
|
|
|
def process_tesla_demo():
    """Store placeholder demo results in ``st.session_state.demo_results``.

    No extraction call is made here: every method ticked in
    ``st.session_state.demo_selected_methods`` is recorded as successful with
    a fixed table count. The request payload the original built was never
    used, so it is no longer constructed.
    """
    # Fixed table count recorded for each method.
    demo_table_counts = {'docling': 5, 'llamaparse': 3, 'unstructured': 4}

    try:
        selected = st.session_state.demo_selected_methods
        results = {
            method: {'status': 'success', 'total_tables': table_count}
            for method, table_count in demo_table_counts.items()
            if selected[method]
        }
        st.session_state.demo_results = {'results': results}
    except Exception as e:
        # UI boundary: report the problem on the page instead of aborting the run.
        st.error(f"Error processing Tesla demo: {str(e)}")
|
|
|
def count_html_files(directory):
    """Count the ``.html`` files in *directory* and all of its subdirectories.

    Each file is counted once. Returns 0 when *directory* does not exist.
    """
    if not os.path.exists(directory):
        return 0

    # With recursive=True, "**" also matches zero directories, so this single
    # pattern already covers files directly inside *directory*. Adding a
    # separate "*.html" glob, as before, counted those files twice.
    # glob.escape keeps characters such as "[" in the directory name literal.
    pattern = os.path.join(glob.escape(str(directory)), "**", "*.html")
    return len(glob.glob(pattern, recursive=True))
|
|
|
def get_excel_files(directory):
    """Return the spreadsheet files found under *directory*, each listed once.

    ``.xlsx`` and ``.xls`` files are collected from *directory* and all of its
    subdirectories; ``.csv`` files only from *directory* itself. The result is
    a sorted list of path strings, or ``[]`` when *directory* does not exist.
    """
    if not os.path.exists(directory):
        return []

    # glob.escape keeps characters such as "[" in the directory name literal.
    root = glob.escape(str(directory))

    found = set(glob.glob(os.path.join(root, "*.csv")))
    for extension in ("xlsx", "xls"):
        # "**" with recursive=True also matches zero directories, so top-level
        # files are included here; a separate non-recursive glob would list
        # them a second time.
        found.update(glob.glob(os.path.join(root, "**", f"*.{extension}"), recursive=True))

    # Sorted so callers that key widgets by list position get a stable order.
    return sorted(found)
|
|
|
def get_file_info(file_path):
    """Describe a file for display.

    Returns a dict with ``"size"`` (kilobytes, one decimal, suffixed " KB")
    and ``"modified"`` (local time as ``YYYY-MM-DD HH:MM``). For a path that
    does not exist the dict is ``{"size": 0, "modified": "Unknown"}``.
    """
    if os.path.exists(file_path):
        details = os.stat(file_path)
        kilobytes = details.st_size / 1024
        last_change = datetime.fromtimestamp(details.st_mtime)
        return {
            "size": format(kilobytes, ".1f") + " KB",
            "modified": f"{last_change:%Y-%m-%d %H:%M}",
        }

    return {"size": 0, "modified": "Unknown"}
|
|
|
def show_demo_results():
    """Render the demo results page.

    Shows a summary of the processed document, a Reset button that restores
    the session defaults and returns to the home page, a row of buttons for
    choosing which method's output to view (only when more than one method
    was selected), and finally the output of the chosen method.
    """
    st.markdown("## π Tesla 10K Processing Results")

    # Methods ticked on the setup page, in dict order.
    available_methods = [method for method, selected in st.session_state.demo_selected_methods.items() if selected]

    col1, col2 = st.columns([2, 1])
    with col1:
        st.markdown("### π tesla_docs_28-41 (1)-9-14.pdf")
        st.markdown("**Status:** ✅ Complete")
        processed_methods = [method.title() for method in available_methods]
        st.markdown(f"**Processed with:** {', '.join(processed_methods)}")

    with col2:
        if st.button("π Reset"):
            st.session_state.page = 'home'
            st.session_state.processing = False
            st.session_state.results = None
            st.session_state.demo_results = None
            st.session_state.selected_method = None
            st.session_state.demo_selected_methods = {'docling': True, 'llamaparse': False, 'unstructured': False}
            st.rerun()

    if len(available_methods) > 1:
        st.markdown("### π§ Select Extraction Method to View")

        method_labels = {
            'docling': 'π§ Docling',
            'llamaparse': 'π¦ LlamaParse',
            'unstructured': 'π Unstructured'
        }

        cols = st.columns(len(available_methods))
        for column, method in zip(cols, available_methods):
            with column:
                method_output_dir = OUTPUT_BASE_PATH / method
                html_count = 0
                if os.path.exists(method_output_dir):
                    # One recursive pattern lists every file once, including
                    # those directly inside the method folder.
                    pattern = os.path.join(str(method_output_dir), "**", "*.html")
                    html_count = len(glob.glob(pattern, recursive=True))
                button_label = f"{method_labels[method]} ({html_count} HTML files)"

                if st.button(button_label, key=f"tab_{method}", use_container_width=True):
                    st.session_state.selected_method = method

    # Fall back to the first available method when nothing valid is selected.
    if st.session_state.selected_method is None or st.session_state.selected_method not in available_methods:
        st.session_state.selected_method = available_methods[0] if available_methods else None

    if st.session_state.selected_method:
        show_method_results(st.session_state.selected_method)
|
|
|
def show_method_results(method):
    """Render one method's output: HTML tables in a wide column, Excel files in a narrow one."""
    st.markdown(f"### π Results from {method.title()}")

    # (section heading, renderer) for the 3:1 column pair, left to right.
    sections = (
        ("#### π HTML Tables", show_html_tables),
        ("#### π Excel Files", show_excel_files),
    )
    for pane, (heading, render) in zip(st.columns([3, 1]), sections):
        with pane:
            st.markdown(heading)
            render(method)
|
|
|
def show_html_tables(method):
    """Display the HTML table files found under ``OUTPUT_BASE_PATH / method``.

    Files are searched recursively and ordered by the number in a
    ``table_<n>`` / ``table-<n>`` file name; names without such a number come
    last, ordered by name. Each file is rendered in an embedded frame with a
    download button beneath it. A warning is shown when no file is found.
    """
    import html
    import re

    method_output_dir = OUTPUT_BASE_PATH / method

    html_files = []
    if os.path.exists(method_output_dir):
        html_files = glob.glob(os.path.join(str(method_output_dir), "**", "*.html"), recursive=True)

    table_number = re.compile(r"table[_-](\d+)", re.IGNORECASE)

    def sort_key(path):
        # The file name is the tie-breaker, so files without a table number
        # (all ranked at infinity) still come out in a deterministic order.
        name = os.path.basename(path)
        match = table_number.search(name)
        return (int(match.group(1)) if match else float('inf'), name)

    html_files = sorted(set(html_files), key=sort_key)

    if not html_files:
        st.warning(f"No HTML files found in {method_output_dir}")
        return

    st.markdown(f"**Found {len(html_files)} HTML table(s):**")

    st.markdown('<div class="table-container">', unsafe_allow_html=True)

    for i, html_file in enumerate(html_files):
        # The name goes into raw HTML, so escape it.
        safe_name = html.escape(os.path.basename(html_file))
        st.markdown(f"""
        <div class="table-header">
            <h4 style="color: #495057;">π Table {i+1}</h4>
            <small style="color: #6c757d;">File: {safe_name}</small>
        </div>
        """, unsafe_allow_html=True)

        # Read the file once and reuse the text for both preview and download.
        try:
            with open(html_file, 'r', encoding='utf-8') as f:
                html_content = f.read()
        except (OSError, UnicodeDecodeError) as e:
            st.error(f"Error displaying HTML file: {e}")
        else:
            st.components.v1.html(html_content, height=300, scrolling=True)

            # Only the first of the 1:1:2 columns is used, which keeps the
            # button at a quarter of the available width.
            download_col, _, _ = st.columns([1, 1, 2])
            with download_col:
                st.download_button(
                    label=f"β¬οΈ Table {i+1}",
                    data=html_content,
                    file_name=f"table_{i+1}_{method}.html",
                    mime="text/html",
                    key=f"download_html_{method}_{i}",
                    use_container_width=True
                )

        if i < len(html_files) - 1:
            st.markdown("---")

    st.markdown('</div>', unsafe_allow_html=True)
|
|
|
def show_excel_files(method):
    """Display the spreadsheet files found under ``OUTPUT_BASE_PATH / method``.

    For each file returned by ``get_excel_files`` this shows an info card
    (name, size, modification time), a five-row preview and a download
    button. ``.csv`` files are previewed with ``pandas.read_csv`` directly;
    other files are tried with ``pandas.read_excel`` first and fall back to
    ``read_csv``. A warning is shown when no file is found.
    """
    import html

    # Download MIME type per extension; anything else is sent as a generic
    # binary stream.
    mime_types = {
        ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ".xls": "application/vnd.ms-excel",
        ".csv": "text/csv",
    }

    method_output_dir = OUTPUT_BASE_PATH / method
    excel_files = get_excel_files(method_output_dir)

    if not excel_files:
        st.warning(f"No Excel files found in {method_output_dir}")
        return

    st.markdown(f"**Found {len(excel_files)} Excel file(s):**")

    for i, excel_file in enumerate(excel_files):
        file_info = get_file_info(excel_file)
        file_name = os.path.basename(excel_file)
        extension = os.path.splitext(file_name)[1].lower()

        # The name goes into raw HTML, so escape it.
        st.markdown(f"""
        <div class="file-info-card">
            <strong style="color: #495057;">π {html.escape(file_name)}</strong>
            <div class="file-stats">
                <strong>Size:</strong> {file_info['size']}<br>
                <strong>Modified:</strong> {file_info['modified']}
            </div>
        </div>
        """, unsafe_allow_html=True)

        # (reader, preview heading, message for an empty file), tried in order.
        readers = []
        if extension != ".csv":
            readers.append((pd.read_excel, "**Preview (first 5 rows):**", "Excel file is empty"))
        readers.append((pd.read_csv, "**Preview (first 5 rows, read as CSV):**", "CSV file is empty"))

        last_error = None
        for reader, heading, empty_note in readers:
            try:
                df = reader(excel_file)
            except Exception as e:
                # pandas raises many unrelated exception types for unreadable
                # files; remember the failure and try the next reader.
                last_error = e
                continue

            if df.empty:
                st.info(empty_note)
            else:
                st.markdown(heading)
                st.dataframe(df.head(), use_container_width=True)
                st.markdown(f"**Dimensions:** {df.shape[0]} Γ {df.shape[1]}")
            break
        else:
            st.warning(f"Could not preview file as Excel or CSV: {last_error}")

        try:
            with open(excel_file, 'rb') as f:
                excel_data = f.read()
        except OSError as e:
            st.error(f"Error reading Excel file for download: {e}")
        else:
            st.download_button(
                label=f"β¬οΈ Download",
                data=excel_data,
                file_name=file_name,
                mime=mime_types.get(extension, "application/octet-stream"),
                key=f"download_excel_{method}_{i}",
                use_container_width=True
            )

        if i < len(excel_files) - 1:
            st.markdown("---")
|
|
|
def process_document(file_path, output_dir, docling, llamaparse, unstructured):
    """Send a document to the extraction service and render its response.

    Posts the paths and method flags as form fields to
    ``http://localhost:8000/extract``. On HTTP 200 the JSON body is stored in
    ``st.session_state.results``; the user can then pick one of the returned
    methods to see its raw result and the HTML/Excel files found under
    ``<output_dir>/<method>``. Every failure is reported on the page; nothing
    is raised to the caller.

    Args:
        file_path: Path of the PDF, as the service should see it.
        output_dir: Directory the service is asked to write results into.
        docling, llamaparse, unstructured: Whether to run each method.
    """
    payload = {
        'input_file_path': file_path,
        'output_dir': output_dir,
        'docling': docling,
        'llamaparse': llamaparse,
        'unstructured': unstructured
    }
    # (connect, read) seconds. Without a timeout a stalled service would
    # block this script run indefinitely.
    request_timeout = (5, 600)

    try:
        with st.spinner('Processing document...'):
            response = requests.post('http://localhost:8000/extract', data=payload, timeout=request_timeout)

        if response.status_code != 200:
            st.error(f"Error processing document: {response.text}")
            return

        st.session_state.results = response.json()
        st.success("Document processed successfully!")

        results = st.session_state.results['results']

        st.markdown("### π View Results")
        available_methods = [method for method in ['docling', 'llamaparse', 'unstructured']
                             if method in results and isinstance(results[method], dict)]

        if not available_methods:
            st.warning("No successful extractions found.")
            return

        # NOTE(review): this function is called from inside a button branch,
        # so after the rerun triggered by changing this selectbox the section
        # will presumably not be rendered again. Confirm, and if so re-render
        # from st.session_state.results in the caller.
        selected_method = st.selectbox(
            "Select extraction method to view:",
            available_methods,
            help="Choose which extraction method results to display"
        )
        if not selected_method:
            return

        st.json(results[selected_method])

        method_dir = os.path.join(output_dir, selected_method)

        # One recursive pattern lists each file once; combining it with a
        # top-level "*.html" glob listed top-level files twice.
        html_pattern = os.path.join(glob.escape(method_dir), "**", "*.html")
        html_files = sorted(glob.glob(html_pattern, recursive=True))
        excel_files = get_excel_files(method_dir)

        if html_files or excel_files:
            st.markdown("### π Generated Files")

            if html_files:
                st.markdown("**HTML Files:**")
                for html_file in html_files:
                    st.markdown(f"- {os.path.basename(html_file)}")

            if excel_files:
                st.markdown("**Excel Files:**")
                for excel_file in excel_files:
                    st.markdown(f"- {os.path.basename(excel_file)}")

    except requests.exceptions.ConnectionError:
        st.error("Could not connect to the processing service. Please ensure the FastAPI server is running.")
    except requests.exceptions.Timeout:
        st.error("The processing service did not respond in time. Please try again.")
    except Exception as e:
        # UI boundary: show anything unexpected (bad JSON, missing keys, ...)
        # on the page rather than aborting the script run.
        st.error(f"An error occurred: {str(e)}")
|
|
|
def main():
    """Draw the header/navigation row, then render the page named in ``st.session_state.page``."""
    brand_col, nav_col = st.columns([1, 1])

    with brand_col:
        st.markdown("### π PDF Parser")
        st.markdown("*Table Extraction Tool*")

    with nav_col:
        dashboard_col, history_col = st.columns(2)
        with dashboard_col:
            go_home = st.button("Dashboard", use_container_width=True)
            if go_home:
                st.session_state.page = 'home'
                st.rerun()
        with history_col:
            # The return value is not used, so clicking this only reruns the script.
            st.button("History", use_container_width=True)

    st.markdown("---")

    # Page name -> renderer; a name that is not listed renders nothing.
    page_renderers = {
        'home': show_home_page,
        'upload': show_upload_page,
        'demo_setup': show_demo_setup_page,
        'demo': show_demo_page,
    }
    renderer = page_renderers.get(st.session_state.page)
    if renderer is not None:
        renderer()
|
|
|
# Entry point: render the app when this file is executed as a script.
if __name__ == "__main__":
    main()