import io
import streamlit as st
import requests
import time
import os
from pathlib import Path
import glob
import base64
import pandas as pd
from datetime import datetime
# Configure page
st.set_page_config(
    page_title="PDF Parser - Table Extraction Tool",
    page_icon="📋",
    layout="wide",
    initial_sidebar_state="collapsed",
)

# Custom CSS for styling - Grey and White Theme
st.markdown("""
""", unsafe_allow_html=True)

# Session-state keys used by the pages below, with the value each one gets
# the first time the script runs in a session. Existing values are kept.
_SESSION_DEFAULTS = {
    'page': 'home',
    'processing': False,
    'results': None,
    'show_output_dir': False,
    'selected_method': None,
    'demo_results': None,
    'demo_selected_methods': {'docling': True, 'llamaparse': False, 'unstructured': False},
}
for _state_key, _state_default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default

# Extraction output lives in an "output" folder next to this script;
# create it on start-up if it is missing.
CURRENT_DIR = Path(__file__).parent
OUTPUT_BASE_PATH = CURRENT_DIR / "output"
OUTPUT_BASE_PATH.mkdir(exist_ok=True)
def check_existing_results():
    """Return the extraction methods that already have HTML output on disk.

    A method counts as "existing" when its sub-directory of
    ``OUTPUT_BASE_PATH`` exists and contains at least one ``*.html`` path at
    any depth. The result keeps the fixed order docling, llamaparse,
    unstructured.
    """
    candidates = ('docling', 'llamaparse', 'unstructured')
    return [
        name
        for name in candidates
        if (OUTPUT_BASE_PATH / name).exists()
        and any((OUTPUT_BASE_PATH / name).glob("**/*.html"))
    ]
def show_home_page():
    """Render the landing page: headline, navigation buttons and feature blurbs.

    When earlier results are found on disk a third "View Results" button is
    offered and the other two buttons use shorter labels.
    """
    found_methods = check_existing_results()
    has_results = bool(found_methods)

    # Header
    st.markdown("""
Transform PDF Tables to
HTML and Excel
Powered by Traversaal.ai
Perfect for financial reports, research papers, and data analysis.
""", unsafe_allow_html=True)

    # Tell the user about results left over from a previous run.
    if has_results:
        method_titles = ', '.join(name.title() for name in found_methods)
        st.info(f"📁 Found existing results from: {method_titles}. Click 'View Results' to see them.")

    # Main buttons, centred in the middle column.
    _, centre, _ = st.columns([1, 2, 1])
    with centre:
        upload_label = "📄 Upload PDF" if has_results else "📄 Upload PDF Document"
        demo_label = "⚡ Try Demo" if has_results else "⚡ Try Tesla 10K Demo"
        slots = st.columns(3 if has_results else 2)

        with slots[0]:
            if st.button(upload_label, key="upload_btn", help="Upload your own PDF document"):
                st.session_state.page = 'upload'
                st.rerun()

        with slots[1]:
            if st.button(demo_label, key="demo_btn", help="Try with Tesla's 10K form"):
                st.session_state.page = 'demo_setup'
                st.rerun()

        if has_results:
            with slots[2]:
                if st.button("👁️ View Results", key="view_results_btn", help="View existing results"):
                    st.session_state.page = 'demo'
                    st.session_state.processing = False
                    # Pre-select exactly the methods that have output on disk.
                    st.session_state.demo_selected_methods = {
                        name: name in found_methods
                        for name in ['docling', 'llamaparse', 'unstructured']
                    }
                    st.rerun()

    # Features section: one blurb per column.
    st.markdown("---")
    feature_blurbs = (
        """
⚡ Lightning Fast
Process complex PDFs in seconds with our advanced AI algorithms
""",
        """
🔒 Secure & Private
Your documents are processed securely and never stored permanently
""",
        """
🔄 Batch Processing
Handle multiple documents and tables simultaneously
""",
    )
    for column, blurb in zip(st.columns(3), feature_blurbs):
        with column:
            st.markdown(blurb, unsafe_allow_html=True)
def show_upload_page():
    """Render the upload form and start processing when the user submits it.

    The document can be supplied either through the file uploader or as a
    path typed into the text box; a typed path takes precedence. An uploaded
    file is written to a temporary directory first, because the processing
    service is given a file *path* and the upload otherwise exists only in
    memory. The temporary copy is removed once ``process_document`` returns.
    """
    import tempfile

    st.markdown("## 📄 Upload Your Document")

    # File upload
    uploaded_file = st.file_uploader(
        "Choose a PDF file",
        type=['pdf'],
        help="Upload a PDF document to extract tables from"
    )

    # Input file path (alternative)
    st.markdown("**Or specify file path:**")
    input_file_path = st.text_input(
        "Input File Path",
        placeholder="path/to/your/document.pdf",
        help="Enter the path to your PDF file"
    )

    # Output directory; masked like a password field until the user reveals it.
    output_dir = st.text_input(
        "Output Directory",
        value=str(OUTPUT_BASE_PATH),
        help="Directory where extracted tables will be saved",
        type="password" if not st.session_state.show_output_dir else "default"
    )

    # Show/Hide output directory toggle
    _, toggle_col = st.columns([3, 1])
    with toggle_col:
        if st.button("👁️ View/Hide Path"):
            st.session_state.show_output_dir = not st.session_state.show_output_dir
            st.rerun()

    # Extraction method selection
    st.markdown("### 🔧 Select Extraction Methods")
    col1, col2, col3 = st.columns(3)
    with col1:
        docling = st.checkbox("Docling", value=True, help="Advanced document processing")
    with col2:
        llamaparse = st.checkbox("LlamaParse", value=False, help="AI-powered parsing")
    with col3:
        unstructured = st.checkbox("Unstructured", value=False, help="General purpose extraction")

    # Process button
    if st.button("🚀 Process Document", type="primary"):
        any_method = docling or llamaparse or unstructured
        if (uploaded_file or input_file_path) and output_dir and any_method:
            if input_file_path:
                process_document(input_file_path, output_dir, docling, llamaparse, unstructured)
            else:
                try:
                    with tempfile.TemporaryDirectory() as tmp_dir:
                        # Keep only the base name so a crafted upload name
                        # cannot point outside the temporary directory.
                        saved_path = Path(tmp_dir) / Path(uploaded_file.name).name
                        saved_path.write_bytes(uploaded_file.getvalue())
                        process_document(str(saved_path), output_dir, docling, llamaparse, unstructured)
                except OSError as e:
                    st.error(f"Could not save the uploaded file for processing: {e}")
        else:
            st.error("Please provide input file, output directory, and select at least one extraction method.")

    # Back button
    if st.button("← Back to Home"):
        st.session_state.page = 'home'
        st.rerun()
def show_demo_setup_page():
    """Render the demo configuration page.

    Lets the user pick extraction methods, remembers the choice in
    ``st.session_state.demo_selected_methods`` and routes to the demo page,
    either to view results already on disk or to start the processing run.
    """
    st.markdown("## ⚡ Tesla 10K Demo Setup")
    st.markdown("*Configure extraction methods for Tesla's 10K document processing*")

    found_methods = check_existing_results()

    # Document info
    st.markdown("### 📄 Document Information")
    if not found_methods:
        st.info("**Document:** Tesla 10K form - Financial tables extraction demo")
    else:
        st.success(f"**Found existing results from:** {', '.join([m.title() for m in found_methods])}")
        st.info("**Note:** You can view existing results or process with different methods")

    # Extraction method selection: (state key, label, fallback default, help text)
    st.markdown("### 🔧 Select Extraction Methods")
    checkbox_specs = (
        ('docling', "Docling", True, "Advanced document processing"),
        ('llamaparse', "LlamaParse", False, "AI-powered parsing"),
        ('unstructured', "Unstructured", False, "General purpose extraction"),
    )
    remembered = st.session_state.demo_selected_methods
    chosen = {}
    for column, (state_key, label, fallback, help_text) in zip(st.columns(3), checkbox_specs):
        with column:
            chosen[state_key] = st.checkbox(
                label,
                value=remembered.get(state_key, fallback),
                help=help_text,
            )

    # Persist the current tick marks for the next rerun / the demo page.
    st.session_state.demo_selected_methods = chosen
    any_selected = any(chosen.values())

    def launch_processing():
        # Shared by both "process" buttons: start the run or complain.
        if any_selected:
            st.session_state.page = 'demo'
            st.session_state.processing = True
            st.rerun()
        else:
            st.error("Please select at least one extraction method.")

    action_col, back_col = st.columns([2, 1])
    with action_col:
        if found_methods:
            # Results exist: offer viewing them next to a fresh run.
            view_slot, process_slot = st.columns(2)
            with view_slot:
                if st.button("👁️ View Existing Results", type="secondary"):
                    st.session_state.page = 'demo'
                    st.session_state.processing = False
                    st.session_state.demo_selected_methods = {
                        name: name in found_methods
                        for name in ['docling', 'llamaparse', 'unstructured']
                    }
                    st.rerun()
            with process_slot:
                if st.button("🚀 Process New", type="primary"):
                    launch_processing()
        elif st.button("🚀 Process Tesla Document", type="primary"):
            # No results yet: a single process button.
            launch_processing()

    with back_col:
        if st.button("← Back to Home"):
            st.session_state.page = 'home'
            st.rerun()
def show_demo_page():
    """Show the processing animation while a run is active, else the results."""
    render = show_processing_demo if st.session_state.processing else show_demo_results
    render()
def show_processing_demo():
    """Animate a simulated processing run for the selected demo methods.

    Each selected method gets 30 progress steps of 0.1 s. When the animation
    ends the simulated results are stored via ``process_tesla_demo``, the
    ``processing`` flag is cleared and the script is rerun so the results
    view takes over.
    """
    st.markdown("## ⚡ Processing Tesla 10K Document...")

    # Methods the user ticked on the setup page, in stored order.
    chosen = [name for name, ticked in st.session_state.demo_selected_methods.items() if ticked]
    st.markdown(f"*Processing with selected methods: {', '.join([m.title() for m in chosen])}*")

    # Placeholders that are updated in place during the loop.
    progress_bar = st.progress(0)
    status_text = st.empty()
    method_status = st.empty()

    steps_per_method = 30
    total_steps = len(chosen) * steps_per_method

    for step in range(total_steps):
        progress = (step + 1) / total_steps
        progress_bar.progress(progress)

        # Which method this step belongs to, and how far into it we are.
        method_index, method_step = divmod(step, steps_per_method)
        current_method = chosen[method_index]
        method_progress = (method_step + 1) / steps_per_method

        # Status line moves through three phases per method.
        if method_progress < 0.3:
            status_text.text(f"📄 {current_method.title()}: Reading document... {int(method_progress * 100)}%")
        elif method_progress < 0.7:
            status_text.text(f"🔍 {current_method.title()}: Extracting tables... {int(method_progress * 100)}%")
        else:
            status_text.text(f"💾 {current_method.title()}: Generating HTML outputs... {int(method_progress * 100)}%")

        method_status.markdown(f"**Overall Progress:** {int(progress * 100)}% | **Current Method:** {current_method.title()}")
        time.sleep(0.1)  # Reduced sleep time for faster demo

    # Show completion
    st.markdown("""
✅ Document processed successfully!
Tables have been extracted using selected methods and HTML files are ready for viewing.
""", unsafe_allow_html=True)

    # Record the simulated results, then hand over to the results view.
    process_tesla_demo()
    st.session_state.processing = False
    time.sleep(1)
    st.rerun()
def process_tesla_demo():
    """Store simulated demo results for the selected extraction methods.

    No document is actually processed here: for every method ticked in
    ``st.session_state.demo_selected_methods`` a ``success`` entry with a
    simulated table count (3 to 5) is written to
    ``st.session_state.demo_results`` under the ``'results'`` key.

    The count is derived from the bytes of the method name rather than from
    ``hash()``, because string hashes are salted per interpreter process and
    would give different counts after every restart.
    """
    try:
        selected_methods = [
            method
            for method, selected in st.session_state.demo_selected_methods.items()
            if selected
        ]
        results = {
            method: {
                'status': 'success',
                # Deterministic stand-in for "different table counts".
                'total_tables': 3 + sum(method.encode('utf-8')) % 3,
            }
            for method in selected_methods
        }
        st.session_state.demo_results = {'results': results}
    except Exception as e:
        # Best-effort: surface the problem in the UI instead of crashing the page.
        st.error(f"Error processing Tesla demo: {str(e)}")
def count_html_files(directory):
    """Return how many ``*.html`` paths exist under *directory*, at any depth.

    *directory* is a ``pathlib.Path``; a path that does not exist counts as 0.
    """
    if directory.exists():
        return sum(1 for _ in directory.glob("**/*.html"))
    return 0
def get_excel_files(directory):
    """Return the spreadsheet files found under *directory*, sorted by path.

    Matches ``*.xlsx``, ``*.xls`` and ``*.csv`` at any depth. Only regular
    files are returned, so a directory whose name happens to end in one of
    those suffixes is skipped. The list is sorted so that callers which
    derive widget keys from the list position see a stable order from one
    run to the next.

    Args:
        directory: ``pathlib.Path`` or path string of the folder to search.

    Returns:
        A list of ``pathlib.Path`` objects; empty when *directory* does not
        exist or holds no matching files.
    """
    directory = Path(directory)
    if not directory.exists():
        return []
    patterns = ('*.xlsx', '*.xls', '*.csv')
    matches = {
        path
        for pattern in patterns
        for path in directory.glob(f"**/{pattern}")
        if path.is_file()
    }
    return sorted(matches)
def get_file_info(file_path):
    """Return display-ready size and modification time for *file_path*.

    Args:
        file_path: ``pathlib.Path`` or path string of the file to inspect.

    Returns:
        A dict with two string values:
        ``"size"`` - size in kilobytes with one decimal, e.g. ``"12.3 KB"``;
        ``"modified"`` - local modification time as ``YYYY-MM-DD HH:MM``.
        When the file cannot be stat'ed (for example because it is missing)
        the result is ``{"size": "0.0 KB", "modified": "Unknown"}`` so both
        values are always strings.
    """
    try:
        # One stat() call instead of exists() + stat(): the file may vanish
        # between the two.
        stat = Path(file_path).stat()
    except OSError:
        return {"size": "0.0 KB", "modified": "Unknown"}

    size_kb = stat.st_size / 1024
    modified = datetime.fromtimestamp(stat.st_mtime)
    return {
        "size": f"{size_kb:.1f} KB",
        "modified": modified.strftime("%Y-%m-%d %H:%M"),
    }
def show_demo_results():
    """Render the results view for whatever extraction output is on disk.

    Shows a summary with a reset button, a row of method buttons when more
    than one method has output, and the detailed results of the currently
    selected method (the first available one by default).
    """
    st.markdown("## 📊 Tesla 10K Processing Results")

    available = check_existing_results()

    # Document info on the left, reset on the right.
    info_col, reset_col = st.columns([2, 1])
    with info_col:
        st.markdown("### 📄 Tesla 10K Document")
        st.markdown("**Status:** ✅ Complete")
        if not available:
            st.warning("No results found in output directory")
        else:
            st.markdown(f"**Available results:** {', '.join([m.title() for m in available])}")

    with reset_col:
        if st.button("🔄 Reset"):
            # Put every piece of demo state back to its start-up value.
            st.session_state.page = 'home'
            st.session_state.processing = False
            st.session_state.results = None
            st.session_state.demo_results = None
            st.session_state.selected_method = None
            st.session_state.demo_selected_methods = {'docling': True, 'llamaparse': False, 'unstructured': False}
            st.rerun()

    if not available:
        st.info("No results found. Please process a document first.")
        return

    # Method buttons are only needed when there is a choice to make.
    if len(available) > 1:
        st.markdown("### 🔧 Select Extraction Method to View")
        method_labels = {
            'docling': '🔧 Docling',
            'llamaparse': '🦙 LlamaParse',
            'unstructured': '📊 Unstructured'
        }
        for column, name in zip(st.columns(len(available)), available):
            with column:
                # Each button shows how many HTML files the method produced.
                html_count = count_html_files(OUTPUT_BASE_PATH / name)
                button_label = f"{method_labels[name]} ({html_count} HTML files)"
                if st.button(button_label, key=f"tab_{name}", use_container_width=True):
                    st.session_state.selected_method = name

    # Fall back to the first available method when nothing valid is selected.
    current = st.session_state.selected_method
    if current is None or current not in available:
        st.session_state.selected_method = available[0]

    if st.session_state.selected_method:
        show_method_results(st.session_state.selected_method)
def show_method_results(method):
    """Lay out one method's results: HTML tables (wide) beside Excel files."""
    st.markdown(f"### 📋 Results from {method.title()}")

    # 3:1 width split between the HTML tables and the Excel file list.
    tables_col, files_col = st.columns([3, 1])

    with tables_col:
        st.markdown("#### 📄 HTML Tables")
        show_html_tables(method)

    with files_col:
        st.markdown("#### 📊 Excel Files")
        show_excel_files(method)
def show_html_tables(method):
    """Display every HTML table file found in *method*'s output directory.

    Files are ordered by the number in a ``table_<n>`` / ``table-<n>`` part of
    the file name (files without one go last), with the file name as a
    tie-breaker so the order, and with it the per-index widget keys, stays the
    same from one run to the next. Each file is read once and reused for both
    the embedded preview and the download button.

    Args:
        method: Name of the extraction method; also the sub-directory of
            ``OUTPUT_BASE_PATH`` that is searched.
    """
    import re

    method_output_dir = OUTPUT_BASE_PATH / method

    # Get actual HTML files from directory
    html_files = []
    if method_output_dir.exists():
        html_files = list(method_output_dir.glob("**/*.html"))

    table_number_pattern = re.compile(r"table[_-](\d+)", re.IGNORECASE)

    def table_sort_key(path):
        match = table_number_pattern.search(path.name)
        number = int(match.group(1)) if match else float('inf')
        return (number, path.name)

    html_files.sort(key=table_sort_key)

    if not html_files:
        st.warning(f"No HTML files found in {method_output_dir}")
        return

    st.markdown(f"**Found {len(html_files)} HTML table(s):**")

    # NOTE(review): the markdown strings below are empty or a bare newline in
    # this file; they look like wrapper markup that was lost - confirm.
    st.markdown('', unsafe_allow_html=True)

    for i, html_file in enumerate(html_files):
        st.markdown("\n", unsafe_allow_html=True)

        # Read once; the same text feeds the preview and the download.
        html_content = None
        try:
            html_content = html_file.read_text(encoding='utf-8')
            st.components.v1.html(html_content, height=300, scrolling=True)
        except Exception as e:
            st.error(f"Error displaying HTML file: {e}")

        # Download button for individual HTML file
        download_col, _, _ = st.columns([1, 1, 2])
        with download_col:
            if html_content is not None:
                st.download_button(
                    label=f"⬇️ Table {i+1}",
                    data=html_content,
                    file_name=f"table_{i+1}_{method}.html",
                    mime="text/html",
                    key=f"download_html_{method}_{i}",
                    use_container_width=True
                )

        if i < len(html_files) - 1:
            st.markdown("---")

    # Was an unterminated quote spanning two lines; the literal content is a
    # single newline.
    st.markdown('\n', unsafe_allow_html=True)
def show_excel_files(method):
    """Display the spreadsheet files in *method*'s output directory.

    For each file returned by ``get_excel_files`` this shows name, size and
    modification time, a five-row preview with the table dimensions, and a
    download button. The download MIME type follows the file suffix, so
    ``.xls`` and ``.csv`` files are no longer served as ``.xlsx``.

    Args:
        method: Name of the extraction method; also the sub-directory of
            ``OUTPUT_BASE_PATH`` that is searched.
    """
    method_output_dir = OUTPUT_BASE_PATH / method
    excel_files = get_excel_files(method_output_dir)

    if not excel_files:
        st.warning(f"No Excel files found in {method_output_dir}")
        return

    mime_by_suffix = {
        '.xlsx': "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        '.xls': "application/vnd.ms-excel",
        '.csv': "text/csv",
    }

    st.markdown(f"**Found {len(excel_files)} Excel file(s):**")

    for i, excel_file in enumerate(excel_files):
        file_info = get_file_info(excel_file)
        file_name = excel_file.name
        suffix = excel_file.suffix.lower()

        # File info card
        st.markdown(f"""
📊 {file_name}
Size: {file_info['size']}
Modified: {file_info['modified']}
""", unsafe_allow_html=True)

        # Preview is best-effort: an unreadable file still gets a download button.
        try:
            if suffix in ['.xlsx', '.xls']:
                df = pd.read_excel(excel_file)
            else:
                df = pd.read_csv(excel_file)

            if not df.empty:
                st.markdown("**Preview (first 5 rows):**")
                st.dataframe(df.head(), use_container_width=True)
                st.markdown(f"**Dimensions:** {df.shape[0]} × {df.shape[1]}")
            else:
                st.info("File is empty")
        except Exception as e:
            st.warning(f"Could not preview file: {e}")

        # Download button for the file, served with the MIME type of its format.
        try:
            file_data = excel_file.read_bytes()
            st.download_button(
                label="⬇️ Download",
                data=file_data,
                file_name=file_name,
                mime=mime_by_suffix.get(suffix, "application/octet-stream"),
                key=f"download_excel_{method}_{i}",
                use_container_width=True
            )
        except Exception as e:
            st.error(f"Error reading file for download: {e}")

        if i < len(excel_files) - 1:
            st.markdown("---")
def process_document(file_path, output_dir, docling, llamaparse, unstructured,
                     endpoint='http://localhost:8000/extract', timeout=(10, None)):
    """Send a document to the extraction service and render what comes back.

    The request is a form-encoded POST. On HTTP 200 the decoded JSON is kept
    in ``st.session_state.results`` and, for every method whose entry under
    ``'results'`` is a dict, the raw result plus the HTML/Excel files found in
    ``<output_dir>/<method>`` are shown in a tab of their own. Tabs are used
    instead of a select box because this function is called from inside a
    button handler: a select box change would rerun the script with the
    button no longer pressed and the results would disappear.

    Args:
        file_path: Path of the PDF as the service should see it.
        output_dir: Directory the service writes its output to.
        docling: Whether to run the Docling extractor.
        llamaparse: Whether to run the LlamaParse extractor.
        unstructured: Whether to run the Unstructured extractor.
        endpoint: URL of the extraction endpoint.
        timeout: ``requests`` timeout, as a number or a ``(connect, read)``
            tuple. The default bounds the connection attempt at 10 s and
            leaves the read unbounded, since extraction may run for a long
            time.

    All failures are reported through ``st.error``; nothing is raised.
    """
    data = {
        'input_file_path': file_path,
        'output_dir': output_dir,
        'docling': docling,
        'llamaparse': llamaparse,
        'unstructured': unstructured
    }

    try:
        with st.spinner('Processing document...'):
            response = requests.post(endpoint, data=data, timeout=timeout)

        if response.status_code != 200:
            st.error(f"Error processing document: {response.text}")
            return

        payload = response.json()
        st.session_state.results = payload
        st.success("Document processed successfully!")

        # Tolerate a payload without a usable 'results' mapping.
        results = payload.get('results') if isinstance(payload, dict) else None
        if not isinstance(results, dict):
            results = {}

        st.markdown("### 📊 View Results")
        available_methods = [
            method
            for method in ['docling', 'llamaparse', 'unstructured']
            if isinstance(results.get(method), dict)
        ]

        if not available_methods:
            st.warning("No successful extractions found.")
            return

        for tab, method in zip(st.tabs(available_methods), available_methods):
            with tab:
                st.json(results[method])

                # List files in output directory
                method_dir = Path(output_dir) / method
                html_files = list(method_dir.glob("**/*.html"))
                excel_files = get_excel_files(method_dir)

                if html_files or excel_files:
                    st.markdown("### 📄 Generated Files")

                    if html_files:
                        st.markdown("**HTML Files:**")
                        for html_file in html_files:
                            st.markdown(f"- {html_file.name}")

                    if excel_files:
                        st.markdown("**Excel Files:**")
                        for excel_file in excel_files:
                            st.markdown(f"- {excel_file.name}")

    except requests.exceptions.Timeout:
        # Listed before ConnectionError because a connect timeout is both.
        st.error("The processing service did not respond in time. Please try again.")
    except requests.exceptions.ConnectionError:
        st.error("Could not connect to the processing service. Please ensure the FastAPI server is running.")
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
def main():
    """Draw the shared navigation header and render the current page."""
    # Navigation header: app title on the left, nav buttons on the right.
    brand_col, nav_col = st.columns([1, 1])

    with brand_col:
        st.markdown("### 📋 PDF Parser")
        st.markdown("*Table Extraction Tool*")

    with nav_col:
        dashboard_slot, history_slot = st.columns(2)
        with dashboard_slot:
            if st.button("Dashboard", use_container_width=True):
                st.session_state.page = 'home'
                st.rerun()
        with history_slot:
            # The button is drawn but its click result is not used.
            st.button("History", use_container_width=True)

    st.markdown("---")

    # Route to appropriate page; an unknown page name renders nothing.
    routes = {
        'home': show_home_page,
        'upload': show_upload_page,
        'demo_setup': show_demo_setup_page,
        'demo': show_demo_page,
    }
    render_page = routes.get(st.session_state.page)
    if render_page is not None:
        render_page()


if __name__ == "__main__":
    main()