import io
import streamlit as st
import requests
import time
import os
from pathlib import Path
import glob
import base64
import pandas as pd
from datetime import datetime
# Page-wide Streamlit settings: tab title/icon, wide layout, collapsed sidebar.
st.set_page_config(
    page_title="PDF Parser - Table Extraction Tool",
    page_icon="📋",
    layout="wide",
    initial_sidebar_state="collapsed",
)

# Styling hook for the grey and white theme.
st.markdown("""
""", unsafe_allow_html=True)

# Seed every session-state key exactly once; values already present from an
# earlier run of the script are left untouched.
_SESSION_DEFAULTS = {
    'page': 'home',
    'processing': False,
    'results': None,
    'show_output_dir': False,
    'selected_method': None,
    'demo_results': None,
    'demo_selected_methods': {'docling': True, 'llamaparse': False, 'unstructured': False},
}
for _state_key, _state_default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default

# Folder that contains this script (src).
SCRIPT_DIR = Path(__file__).parent

# Tesla demo PDF, expected to sit next to this script (adjust if it lives elsewhere).
TESLA_DOC_PATH = SCRIPT_DIR / "tesla_docs_28-41 (1)-9-14.pdf"

# Extraction outputs are read from src/output.
OUTPUT_BASE_PATH = SCRIPT_DIR / "output"
def show_home_page():
    """Render the landing page: headline text, the two entry buttons and the feature blurbs.

    Clicking either entry button stores the target page name in
    ``st.session_state.page`` and reruns the script.
    """
    # Headline block
    st.markdown("""
Transform PDF Tables to
HTML and Excel
Powered by Traversaal.ai
Perfect for financial reports, research papers, and data analysis.
""", unsafe_allow_html=True)

    # The buttons live in the wide middle column; the outer columns are spacers.
    _, centre, _ = st.columns([1, 2, 1])
    with centre:
        upload_slot, demo_slot = st.columns(2)
        with upload_slot:
            if st.button("📄 Upload PDF Document", key="upload_btn", help="Upload your own PDF document"):
                st.session_state.page = 'upload'
                st.rerun()
        with demo_slot:
            if st.button("⚡ Try Tesla 10K Demo", key="demo_btn", help="Try with Tesla's 10K form"):
                st.session_state.page = 'demo_setup'
                st.rerun()

    # Feature blurbs, one per column, below a divider.
    st.markdown("---")
    feature_blurbs = (
        """
⚡ Lightning Fast
Process complex PDFs in seconds with our advanced AI algorithms
""",
        """
🔒 Secure & Private
Your documents are processed securely and never stored permanently
""",
        """
🔄 Batch Processing
Handle multiple documents and tables simultaneously
""",
    )
    for slot, blurb in zip(st.columns(3), feature_blurbs):
        with slot:
            st.markdown(blurb, unsafe_allow_html=True)
def _save_uploaded_pdf(uploaded_file):
    """Write an uploaded file to a fresh temporary directory and return its path.

    Args:
        uploaded_file: Object returned by ``st.file_uploader``; its ``name``
            and ``getvalue()`` are used.

    Returns:
        The path of the written file, as a string.

    Raises:
        OSError: If the directory or file cannot be created.
    """
    import tempfile  # local import: only needed on the upload path

    # The name comes from the client, so keep only its final path component.
    safe_name = Path(uploaded_file.name).name or "uploaded.pdf"
    # NOTE: the temporary directory is not cleaned up here; the file has to
    # outlive this function so the processing request can refer to it by path.
    target = Path(tempfile.mkdtemp(prefix="pdf_parser_upload_")) / safe_name
    target.write_bytes(uploaded_file.getvalue())
    return str(target)


def show_upload_page():
    """Render the upload page and start extraction when the user requests it.

    The user supplies a PDF (upload or explicit path), an output directory and
    at least one extraction method. An explicitly typed path takes precedence
    over an upload. An uploaded file is first written to disk, because
    ``uploaded_file.name`` is only a file name and not a usable path.
    """
    st.markdown("## 📄 Upload Your Document")

    # File upload
    uploaded_file = st.file_uploader(
        "Choose a PDF file",
        type=['pdf'],
        help="Upload a PDF document to extract tables from"
    )

    # Input file path (alternative)
    st.markdown("**Or specify file path:**")
    input_file_path = st.text_input(
        "Input File Path",
        placeholder="C:\\path\\to\\your\\document.pdf",
        help="Enter the full path to your PDF file"
    )

    # Output directory; masked like a password field until the user toggles it.
    output_dir = st.text_input(
        "Output Directory",
        placeholder="C:\\path\\to\\output\\folder",
        help="Directory where extracted tables will be saved",
        type="password" if not st.session_state.show_output_dir else "default"
    )

    # Show/Hide output directory toggle
    col1, col2 = st.columns([3, 1])
    with col2:
        if st.button("👁️ View/Hide Path"):
            st.session_state.show_output_dir = not st.session_state.show_output_dir
            st.rerun()

    # Extraction method selection
    st.markdown("### 🔧 Select Extraction Methods")
    col1, col2, col3 = st.columns(3)
    with col1:
        docling = st.checkbox("Docling", value=True, help="Advanced document processing")
    with col2:
        llamaparse = st.checkbox("LlamaParse", value=False, help="AI-powered parsing")
    with col3:
        unstructured = st.checkbox("Unstructured", value=False, help="General purpose extraction")

    # Process button
    if st.button("🚀 Process Document", type="primary"):
        has_input = bool(uploaded_file or input_file_path)
        has_method = docling or llamaparse or unstructured
        if has_input and output_dir and has_method:
            file_path = input_file_path
            if not file_path:
                try:
                    file_path = _save_uploaded_pdf(uploaded_file)
                except OSError as e:
                    st.error(f"Could not save uploaded file: {e}")
                    file_path = None
            if file_path:
                process_document(file_path, output_dir, docling, llamaparse, unstructured)
        else:
            st.error("Please provide input file, output directory, and select at least one extraction method.")

    # Back button
    if st.button("← Back to Home"):
        st.session_state.page = 'home'
        st.rerun()
def show_demo_setup_page():
    """Render the Tesla demo setup page, where the extraction methods are chosen.

    The checkbox values are written back to
    ``st.session_state.demo_selected_methods`` on every run. Starting the demo
    requires at least one method; it switches to the ``'demo'`` page with
    ``processing`` set to True.
    """
    st.markdown("## ⚡ Tesla 10K Demo Setup")
    st.markdown("*Configure extraction methods for Tesla's 10K document processing*")

    # Document info
    st.markdown("### 📄 Document Information")
    st.info("**Document:** tesla_docs_28-41 (1)-9-14.pdf")

    # One checkbox per method, pre-set from the stored selection.
    st.markdown("### 🔧 Select Extraction Methods")
    method_boxes = (
        ('docling', "Docling", "Advanced document processing"),
        ('llamaparse', "LlamaParse", "AI-powered parsing"),
        ('unstructured', "Unstructured", "General purpose extraction"),
    )
    chosen = {}
    for slot, (method_key, label, hint) in zip(st.columns(3), method_boxes):
        with slot:
            chosen[method_key] = st.checkbox(
                label,
                value=st.session_state.demo_selected_methods[method_key],
                help=hint,
            )

    # Persist the current selection
    st.session_state.demo_selected_methods = chosen

    # Action buttons
    process_slot, back_slot = st.columns([2, 1])
    with process_slot:
        if st.button("🚀 Process Tesla Document", type="primary"):
            if not any(chosen.values()):
                st.error("Please select at least one extraction method.")
            else:
                st.session_state.page = 'demo'
                st.session_state.processing = True
                st.rerun()
    with back_slot:
        if st.button("← Back to Home"):
            st.session_state.page = 'home'
            st.rerun()
def show_demo_page():
    """Show the processing view while ``st.session_state.processing`` is set, else the results."""
    render = show_processing_demo if st.session_state.processing else show_demo_results
    render()
def show_processing_demo():
    """Animate a progress display for the selected methods, then run the demo processing.

    Each selected method gets 30 progress steps with a short sleep per step.
    After the animation, ``process_tesla_demo()`` is called. The success banner
    is shown only afterwards, and only if it stored results in
    ``st.session_state.demo_results``. Finally ``processing`` is cleared and the
    script reruns so that the results view is rendered.
    """
    st.markdown("## ⚡ Processing Tesla 10K Document...")

    # Show selected methods
    selected_methods = [name for name, chosen in st.session_state.demo_selected_methods.items() if chosen]
    st.markdown(f"*Processing with selected methods: {', '.join([m.title() for m in selected_methods])}*")

    # Progress widgets
    progress_bar = st.progress(0)
    status_text = st.empty()
    method_status = st.empty()

    steps_per_method = 30
    total_steps = len(selected_methods) * steps_per_method

    for step in range(total_steps):
        progress = (step + 1) / total_steps
        progress_bar.progress(progress)

        # The quotient selects the method; the remainder is the step within it.
        method_index, method_step = divmod(step, steps_per_method)
        current_method = selected_methods[method_index]
        method_progress = (method_step + 1) / steps_per_method
        percent = int(method_progress * 100)

        # Update status messages
        if method_progress < 0.3:
            status_text.text(f"📄 {current_method.title()}: Reading document... {percent}%")
        elif method_progress < 0.7:
            status_text.text(f"🔍 {current_method.title()}: Extracting tables... {percent}%")
        else:
            status_text.text(f"💾 {current_method.title()}: Generating HTML outputs... {percent}%")
        method_status.markdown(f"**Overall Progress:** {int(progress * 100)}% | **Current Method:** {current_method.title()}")
        time.sleep(0.33)

    # Clear stale results first, so a failure inside process_tesla_demo()
    # (which reports its own error) cannot be mistaken for success.
    st.session_state.demo_results = None
    process_tesla_demo()

    if st.session_state.demo_results is not None:
        st.markdown("""
✅ Document processed successfully!
Tables have been extracted using selected methods and HTML files are ready for viewing.
""", unsafe_allow_html=True)

    st.session_state.processing = False
    time.sleep(2)  # leave the final message visible briefly before rerunning
    st.rerun()
def process_tesla_demo():
    """Store simulated extraction results for the selected demo methods.

    Writes ``{'results': {...}}`` to ``st.session_state.demo_results`` with one
    entry per selected method. Any exception is reported through ``st.error``
    instead of being raised.
    """
    try:
        chosen = st.session_state.demo_selected_methods

        # Demo output folder under the base output path.
        demo_output_dir = OUTPUT_BASE_PATH / "tesla_demo"

        # Request payload for the selected methods. It is not sent yet: the
        # POST to http://localhost:8000/extract is still to be enabled, after
        # which the JSON response would become demo_results.
        data = {
            'input_file_path': str(TESLA_DOC_PATH),
            'output_dir': str(demo_output_dir),
            'docling': chosen['docling'],
            'llamaparse': chosen['llamaparse'],
            'unstructured': chosen['unstructured']
        }

        # Until then, report a fixed table count for each selected method.
        simulated_table_counts = (('docling', 5), ('llamaparse', 3), ('unstructured', 4))
        results = {
            name: {'status': 'success', 'total_tables': table_count}
            for name, table_count in simulated_table_counts
            if chosen[name]
        }
        st.session_state.demo_results = {'results': results}
    except Exception as e:
        st.error(f"Error processing Tesla demo: {str(e)}")
def count_html_files(directory):
    """Count the ``.html`` files in *directory* and all of its subdirectories.

    Args:
        directory: Directory to search, as a string or ``Path``.

    Returns:
        The number of distinct HTML files found, or 0 if *directory* does not exist.
    """
    if not os.path.exists(directory):
        return 0
    # With recursive=True the "**" segment also matches the top-level directory,
    # so this single pattern covers every level; a separate top-level glob
    # would count those files twice.
    pattern = os.path.join(str(directory), "**", "*.html")
    return len(set(glob.glob(pattern, recursive=True)))
def get_excel_files(directory):
    """Return the spreadsheet files (``.xlsx``, ``.xls``, ``.csv``) under *directory*.

    The search covers *directory* and all of its subdirectories for every
    extension. Each file appears once and the result is sorted by path.

    Args:
        directory: Directory to search, as a string or ``Path``.

    Returns:
        A sorted list of file paths; empty if *directory* does not exist.
    """
    if not os.path.exists(directory):
        return []
    root = str(directory)
    found = set()
    for extension in ("xlsx", "xls", "csv"):
        # "**" with recursive=True matches the top-level directory as well, so
        # one pattern per extension is enough and nothing is listed twice.
        found.update(glob.glob(os.path.join(root, "**", f"*.{extension}"), recursive=True))
    return sorted(found)
def get_file_info(file_path):
    """Return display-ready size and modification time for *file_path*.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        A dict with ``"size"`` (for example ``"12.3 KB"``) and ``"modified"``
        (local time, ``YYYY-MM-DD HH:MM``). If the file cannot be inspected,
        returns ``{"size": 0, "modified": "Unknown"}``.
    """
    # Stat directly instead of checking existence first: the file may vanish
    # between the check and the stat call.
    try:
        stat = os.stat(file_path)
    except (OSError, ValueError):
        return {"size": 0, "modified": "Unknown"}

    size_kb = stat.st_size / 1024
    modified = datetime.fromtimestamp(stat.st_mtime)
    return {
        "size": f"{size_kb:.1f} KB",
        "modified": modified.strftime("%Y-%m-%d %H:%M"),
    }
def show_demo_results():
    """Render the Tesla demo results page.

    Shows the document header with a reset button. When more than one method
    was selected, one button per method (labelled with its HTML file count)
    chooses which method's results are displayed below.
    """
    st.markdown("## 📊 Tesla 10K Processing Results")

    # Methods the user ticked on the setup page, in stored order.
    available_methods = [name for name, chosen in st.session_state.demo_selected_methods.items() if chosen]

    # Document header and reset control
    info_slot, reset_slot = st.columns([2, 1])
    with info_slot:
        st.markdown("### 📄 tesla_docs_28-41 (1)-9-14.pdf")
        st.markdown("**Status:** ✅ Complete")
        processed_titles = ', '.join(name.title() for name in available_methods)
        st.markdown(f"**Processed with:** {processed_titles}")
    with reset_slot:
        if st.button("🔄 Reset"):
            # Restore every piece of state to its initial value and go home.
            initial_state = {
                'page': 'home',
                'processing': False,
                'results': None,
                'demo_results': None,
                'selected_method': None,
                'demo_selected_methods': {'docling': True, 'llamaparse': False, 'unstructured': False},
            }
            for state_key, state_value in initial_state.items():
                st.session_state[state_key] = state_value
            st.rerun()

    # Method picker, only needed when there is a choice to make.
    if len(available_methods) > 1:
        st.markdown("### 🔧 Select Extraction Method to View")
        method_labels = {
            'docling': '🔧 Docling',
            'llamaparse': '🦙 LlamaParse',
            'unstructured': '📊 Unstructured'
        }
        for slot, method in zip(st.columns(len(available_methods)), available_methods):
            with slot:
                # Count HTML files the same way show_html_tables finds them.
                method_output_dir = OUTPUT_BASE_PATH / method
                html_count = 0
                if os.path.exists(method_output_dir):
                    pattern = os.path.join(str(method_output_dir), "**", "*.html")
                    html_count = len(set(glob.glob(pattern, recursive=True)))
                button_label = f"{method_labels[method]} ({html_count} HTML files)"
                if st.button(button_label, key=f"tab_{method}", use_container_width=True):
                    st.session_state.selected_method = method

    # Fall back to the first available method when nothing valid is selected.
    current = st.session_state.selected_method
    if current is None or current not in available_methods:
        st.session_state.selected_method = available_methods[0] if available_methods else None

    # Show results for the selected method
    if st.session_state.selected_method:
        show_method_results(st.session_state.selected_method)
def show_method_results(method):
    """Render one method's results: HTML tables on the left, Excel files on the right.

    Args:
        method: Extraction method name; also used as the output folder name
            by the two rendering helpers.
    """
    st.markdown(f"### 📋 Results from {method.title()}")

    # 3:1 width split in favour of the HTML tables.
    tables_slot, excel_slot = st.columns([3, 1])
    with tables_slot:
        st.markdown("#### 📄 HTML Tables")
        show_html_tables(method)
    with excel_slot:
        st.markdown("#### 📊 Excel Files")
        show_excel_files(method)
def show_html_tables(method):
    """Display the HTML tables found in the method's output directory.

    Files are collected recursively from ``OUTPUT_BASE_PATH / method`` and
    ordered by the number in a ``table_<n>`` / ``table-<n>`` file name. Files
    without such a number come last, and ties are broken by path so the order
    is stable between runs. Each file is rendered inline with a download button.

    Args:
        method: Extraction method name, used as the output folder name.
    """
    import re

    method_output_dir = OUTPUT_BASE_PATH / method

    # Get actual HTML files from directory
    html_files = []
    if os.path.exists(method_output_dir):
        # The recursive glob includes the top-level directory.
        pattern = os.path.join(str(method_output_dir), "**", "*.html")
        html_files = list(set(glob.glob(pattern, recursive=True)))

    table_number_pattern = re.compile(r"table[_-](\d+)", re.IGNORECASE)

    def table_sort_key(path):
        match = table_number_pattern.search(os.path.basename(path))
        number = int(match.group(1)) if match else float('inf')
        # The path is a tie-breaker so set iteration order cannot affect the result.
        return (number, path)

    html_files.sort(key=table_sort_key)

    if not html_files:
        st.warning(f"No HTML files found in {method_output_dir}")
        return

    st.markdown(f"**Found {len(html_files)} HTML table(s):**")

    # Opening wrapper for the list of tables.
    st.markdown('', unsafe_allow_html=True)

    for i, html_file in enumerate(html_files):
        st.markdown(f"""
""", unsafe_allow_html=True)

        # Read the file once; the same content feeds the preview and the download.
        html_content = None
        try:
            with open(html_file, 'r', encoding='utf-8') as f:
                html_content = f.read()
            st.components.v1.html(html_content, height=300, scrolling=True)
        except Exception as e:
            st.error(f"Error displaying HTML file: {e}")

        # Download button for the individual HTML file
        if html_content is not None:
            col_download1, col_download2, col_download3 = st.columns([1, 1, 2])
            with col_download1:
                st.download_button(
                    label=f"⬇️ Table {i+1}",
                    data=html_content,
                    file_name=f"table_{i+1}_{method}.html",
                    mime="text/html",
                    key=f"download_html_{method}_{i}",
                    use_container_width=True
                )

        if i < len(html_files) - 1:
            st.markdown("---")

    # Closing wrapper. The original literal was split across two lines, which
    # is a syntax error for a single-quoted string.
    st.markdown('\n', unsafe_allow_html=True)
def _read_preview_frame(path):
    """Load *path* for previewing and report how it was read.

    ``.csv`` files are read as CSV directly. Anything else is read as Excel
    first, with CSV as a fallback for mislabelled files.

    Returns:
        A ``(frame, read_as_csv)`` tuple.

    Raises:
        Exception: The Excel reader's error when both readers fail, since that
            is the relevant one for a workbook.
    """
    if str(path).lower().endswith(".csv"):
        return pd.read_csv(path), True
    try:
        return pd.read_excel(path), False
    except Exception as excel_error:
        try:
            return pd.read_csv(path), True
        except Exception:
            raise excel_error from None


def show_excel_files(method):
    """Display the spreadsheet files found in the method's output directory.

    For every file this shows a small info card, a preview of the first five
    rows with the full dimensions, and a download button.

    Args:
        method: Extraction method name, used as the output folder name.
    """
    method_output_dir = OUTPUT_BASE_PATH / method

    # Drop repeated paths while keeping order, so no file is rendered twice.
    excel_files = list(dict.fromkeys(get_excel_files(method_output_dir)))

    if not excel_files:
        st.warning(f"No Excel files found in {method_output_dir}")
        return

    # Serve each download with the MIME type that matches its extension.
    mime_by_extension = {
        ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ".xls": "application/vnd.ms-excel",
        ".csv": "text/csv",
    }

    st.markdown(f"**Found {len(excel_files)} Excel file(s):**")
    for i, excel_file in enumerate(excel_files):
        file_info = get_file_info(excel_file)
        file_name = os.path.basename(excel_file)

        # File info card
        st.markdown(f"""
📊 {file_name}
Size: {file_info['size']}
Modified: {file_info['modified']}
""", unsafe_allow_html=True)

        # Preview
        try:
            df, read_as_csv = _read_preview_frame(excel_file)
        except Exception as e:
            st.warning(f"Could not preview file as Excel or CSV: {e}")
        else:
            if df.empty:
                st.info("CSV file is empty" if read_as_csv else "Excel file is empty")
            else:
                if read_as_csv:
                    st.markdown("**Preview (first 5 rows, read as CSV):**")
                else:
                    st.markdown("**Preview (first 5 rows):**")
                st.dataframe(df.head(), use_container_width=True)
                st.markdown(f"**Dimensions:** {df.shape[0]} × {df.shape[1]}")

        # Download button
        try:
            with open(excel_file, 'rb') as f:
                excel_data = f.read()
            extension = os.path.splitext(file_name)[1].lower()
            st.download_button(
                label="⬇️ Download",
                data=excel_data,
                file_name=file_name,
                mime=mime_by_extension.get(extension, "application/octet-stream"),
                key=f"download_excel_{method}_{i}",
                use_container_width=True
            )
        except Exception as e:
            st.error(f"Error reading Excel file for download: {e}")

        if i < len(excel_files) - 1:
            st.markdown("---")
def process_document(file_path, output_dir, docling, llamaparse, unstructured):
    """Send a document to the extraction endpoint and display the outcome.

    Posts the arguments as form data to ``http://localhost:8000/extract``. On
    HTTP 200 the JSON response is stored in ``st.session_state.results``, and
    the per-method results and generated files are shown. Errors are reported
    through ``st.error`` instead of being raised.

    Args:
        file_path: Path of the PDF to process.
        output_dir: Directory where extracted tables are saved; generated
            files are listed from ``<output_dir>/<method>``.
        docling: Whether to run the Docling extractor.
        llamaparse: Whether to run the LlamaParse extractor.
        unstructured: Whether to run the Unstructured extractor.
    """
    # Prepare the request data
    data = {
        'input_file_path': file_path,
        'output_dir': output_dir,
        'docling': docling,
        'llamaparse': llamaparse,
        'unstructured': unstructured
    }

    try:
        with st.spinner('Processing document...'):
            # Replace with your actual FastAPI endpoint URL
            response = requests.post('http://localhost:8000/extract', data=data)

        if response.status_code != 200:
            st.error(f"Error processing document: {response.text}")
            return

        payload = response.json()
        st.session_state.results = payload
        st.success("Document processed successfully!")

        # A response without a usable 'results' mapping counts as "nothing
        # extracted" rather than an unexplained error.
        results = payload.get('results') if isinstance(payload, dict) else None
        if not isinstance(results, dict):
            results = {}

        st.markdown("### 📊 View Results")
        available_methods = [method for method in ['docling', 'llamaparse', 'unstructured']
                             if isinstance(results.get(method), dict)]
        if not available_methods:
            st.warning("No successful extractions found.")
            return

        # NOTE(review): this runs inside the "Process Document" button handler,
        # so changing the select box reruns the script with the button no longer
        # pressed. Confirm whether results should be re-rendered from
        # st.session_state.results instead.
        selected_method = st.selectbox(
            "Select extraction method to view:",
            available_methods,
            help="Choose which extraction method results to display"
        )
        if not selected_method or not isinstance(results[selected_method], dict):
            return

        st.json(results[selected_method])

        # List the generated files. "**" with recursive=True already includes
        # the top-level folder, so each file is listed exactly once.
        method_dir = os.path.join(output_dir, selected_method)
        html_files = sorted(set(glob.glob(os.path.join(method_dir, "**", "*.html"), recursive=True)))
        excel_files = list(dict.fromkeys(get_excel_files(method_dir)))

        if html_files or excel_files:
            st.markdown("### 📄 Generated Files")
            if html_files:
                st.markdown("**HTML Files:**")
                for html_file in html_files:
                    st.markdown(f"- {os.path.basename(html_file)}")
            if excel_files:
                st.markdown("**Excel Files:**")
                for excel_file in excel_files:
                    st.markdown(f"- {os.path.basename(excel_file)}")
    except requests.exceptions.ConnectionError:
        st.error("Could not connect to the processing service. Please ensure the FastAPI server is running.")
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
def main():
    """Draw the navigation header, then render the page named in ``st.session_state.page``."""
    # Navigation header: title on the left, navigation buttons on the right.
    title_slot, nav_slot = st.columns([1, 1])
    with title_slot:
        st.markdown("### 📋 PDF Parser")
        st.markdown("*Table Extraction Tool*")
    with nav_slot:
        dashboard_slot, history_slot = st.columns(2)
        with dashboard_slot:
            if st.button("Dashboard", use_container_width=True):
                st.session_state.page = 'home'
                st.rerun()
        with history_slot:
            # Rendered only; no action is attached to this button.
            st.button("History", use_container_width=True)

    st.markdown("---")

    # Route to the appropriate page; an unknown page name renders nothing.
    page_renderers = {
        'home': show_home_page,
        'upload': show_upload_page,
        'demo_setup': show_demo_setup_page,
        'demo': show_demo_page,
    }
    render_page = page_renderers.get(st.session_state.page)
    if render_page is not None:
        render_page()


if __name__ == "__main__":
    main()