"""Streamlit front end for the PDF table-extraction tool.

Pages (selected via ``st.session_state.page``):

* ``home``       - landing page with entry points
* ``upload``     - process a user-supplied PDF through the FastAPI service
* ``demo_setup`` - choose extraction methods for the Tesla 10K demo
* ``demo``       - simulated processing, then a browser for the files found
                   under ``OUTPUT_BASE_PATH/<method>``
"""
import base64
import glob
import io
import os
import re
import time
from datetime import datetime
from pathlib import Path

import pandas as pd
import requests
import streamlit as st

# Extraction back ends; each one writes into OUTPUT_BASE_PATH/<method>.
EXTRACTION_METHODS = ('docling', 'llamaparse', 'unstructured')

# Replace with your actual FastAPI endpoint URL
API_EXTRACT_URL = 'http://localhost:8000/extract'

# (connect, read) timeouts in seconds, so a dead or stuck service cannot hang
# the UI forever. The read timeout is generous because extraction can be slow.
API_TIMEOUT = (10, 600)

# MIME types offered by the spreadsheet download buttons, keyed by suffix.
SPREADSHEET_MIME_TYPES = {
    '.xlsx': "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    '.xls': "application/vnd.ms-excel",
    '.csv': "text/csv",
}

# Configure page (must be the first Streamlit command executed)
st.set_page_config(
    page_title="PDF Parser - Table Extraction Tool",
    page_icon="📋",
    layout="wide",
    initial_sidebar_state="collapsed"
)

# Custom CSS for styling - Grey and White Theme
# NOTE(review): the stylesheet body is empty in this copy of the file -
# confirm whether CSS rules are meant to be injected here.
st.markdown(""" """, unsafe_allow_html=True)


def _default_demo_methods():
    """Return a fresh default demo selection: only Docling enabled."""
    return {method: method == 'docling' for method in EXTRACTION_METHODS}


def _methods_matching(enabled_methods):
    """Return a selection dict that enables exactly ``enabled_methods``."""
    return {method: method in enabled_methods for method in EXTRACTION_METHODS}


# Initialize session state
_SESSION_DEFAULTS = {
    'page': 'home',
    'processing': False,
    'results': None,
    'results_output_dir': None,
    'show_output_dir': False,
    'selected_method': None,
    'demo_results': None,
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
if 'demo_selected_methods' not in st.session_state:
    st.session_state.demo_selected_methods = _default_demo_methods()

# Get the current directory (src) and set output path
CURRENT_DIR = Path(__file__).parent
OUTPUT_BASE_PATH = CURRENT_DIR / "output"

# Create output directory if it doesn't exist
OUTPUT_BASE_PATH.mkdir(exist_ok=True)


def _navigate(page, **state_updates):
    """Switch to ``page``, apply extra session-state updates, and rerun."""
    st.session_state.page = page
    for key, value in state_updates.items():
        st.session_state[key] = value
    st.rerun()


def _selected_demo_methods():
    """Return the names of the methods currently ticked for the demo."""
    return [
        method
        for method, selected in st.session_state.demo_selected_methods.items()
        if selected
    ]


def check_existing_results():
    """Return the extraction methods that already have HTML output on disk.

    A method counts as "existing" when its directory under
    ``OUTPUT_BASE_PATH`` contains at least one ``*.html`` file at any depth.
    """
    existing_methods = []
    for method in EXTRACTION_METHODS:
        method_dir = OUTPUT_BASE_PATH / method
        # any() stops at the first match instead of listing every file.
        if method_dir.exists() and any(method_dir.glob("**/*.html")):
            existing_methods.append(method)
    return existing_methods


def show_home_page():
    """Render the landing page: headline, entry buttons, and feature blurbs."""
    # Check for existing results
    existing_methods = check_existing_results()

    # Header
    st.markdown("""

Transform PDF Tables to

HTML and Excel

Perfect for financial reports, research papers, and data analysis.

""", unsafe_allow_html=True)

    # Show existing results notification if any
    if existing_methods:
        st.info(f"📁 Found existing results from: {', '.join([m.title() for m in existing_methods])}. Click 'View Results' to see them.")

    # Main buttons: three when results exist on disk, otherwise two.
    _, center_col, _ = st.columns([1, 2, 1])
    with center_col:
        if existing_methods:
            upload_col, demo_col, view_col = st.columns(3)
            upload_label, demo_label = "📄 Upload PDF", "⚡ Try Demo"
        else:
            upload_col, demo_col = st.columns(2)
            view_col = None
            upload_label, demo_label = "📄 Upload PDF Document", "⚡ Try Tesla 10K Demo"

        with upload_col:
            if st.button(upload_label, key="upload_btn", help="Upload your own PDF document"):
                _navigate('upload')
        with demo_col:
            if st.button(demo_label, key="demo_btn", help="Try with Tesla's 10K form"):
                _navigate('demo_setup')
        if view_col is not None:
            with view_col:
                if st.button("👁️ View Results", key="view_results_btn", help="View existing results"):
                    _navigate(
                        'demo',
                        processing=False,
                        demo_selected_methods=_methods_matching(existing_methods),
                    )

    # Features section
    st.markdown("---")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.markdown("""

⚡ Lightning Fast

Process complex PDFs in seconds with our advanced AI algorithms

""", unsafe_allow_html=True)
    with col2:
        st.markdown("""

🔒 Secure & Private

Your documents are processed securely and never stored permanently

""", unsafe_allow_html=True)
    with col3:
        st.markdown("""

🔄 Batch Processing

Handle multiple documents and tables simultaneously

""", unsafe_allow_html=True)


def _save_uploaded_file(uploaded_file):
    """Write an uploaded PDF to disk and return its path as a string.

    The object returned by ``st.file_uploader`` only lives in memory and its
    ``name`` is a bare filename, so the bytes must be written somewhere the
    extraction service can read them. Returns ``None`` (after showing an
    error) when the file cannot be written.
    """
    upload_dir = OUTPUT_BASE_PATH / "uploads"
    # Keep only the final path component so the name cannot escape upload_dir.
    target = upload_dir / Path(uploaded_file.name).name
    try:
        upload_dir.mkdir(parents=True, exist_ok=True)
        target.write_bytes(uploaded_file.getvalue())
    except OSError as e:
        st.error(f"Could not save uploaded file: {e}")
        return None
    return str(target)


def show_upload_page():
    """Render the upload form and run extraction on the chosen document."""
    st.markdown("## 📄 Upload Your Document")

    # File upload
    uploaded_file = st.file_uploader(
        "Choose a PDF file",
        type=['pdf'],
        help="Upload a PDF document to extract tables from"
    )

    # Input file path (alternative)
    st.markdown("**Or specify file path:**")
    input_file_path = st.text_input(
        "Input File Path",
        placeholder="path/to/your/document.pdf",
        help="Enter the path to your PDF file"
    )

    # Output directory with show/hide functionality
    output_dir = st.text_input(
        "Output Directory",
        value=str(OUTPUT_BASE_PATH),
        help="Directory where extracted tables will be saved",
        type="password" if not st.session_state.show_output_dir else "default"
    )

    # Show/Hide output directory toggle
    _, toggle_col = st.columns([3, 1])
    with toggle_col:
        if st.button("👁️ View/Hide Path"):
            st.session_state.show_output_dir = not st.session_state.show_output_dir
            st.rerun()

    # Extraction method selection
    st.markdown("### 🔧 Select Extraction Methods")
    col1, col2, col3 = st.columns(3)
    with col1:
        docling = st.checkbox("Docling", value=True, help="Advanced document processing")
    with col2:
        llamaparse = st.checkbox("LlamaParse", value=False, help="AI-powered parsing")
    with col3:
        unstructured = st.checkbox("Unstructured", value=False, help="General purpose extraction")

    # Process button
    if st.button("🚀 Process Document", type="primary"):
        has_input = uploaded_file is not None or bool(input_file_path)
        has_method = docling or llamaparse or unstructured
        if has_input and output_dir and has_method:
            # An explicit path wins; otherwise persist the upload to disk.
            file_path = input_file_path or _save_uploaded_file(uploaded_file)
            if file_path:
                process_document(file_path, output_dir, docling, llamaparse, unstructured)
        else:
            st.error("Please provide input file, output directory, and select at least one extraction method.")
    elif st.session_state.results:
        # st.button is True only on the run right after the click, so later
        # reruns (e.g. changing the method selectbox) redraw the stored results.
        _show_extraction_results(
            st.session_state.results,
            st.session_state.results_output_dir or output_dir,
        )

    # Back button
    if st.button("← Back to Home"):
        _navigate('home')


def _start_demo_processing(any_selected):
    """Enter the processing view, or complain when no method is ticked."""
    if any_selected:
        _navigate('demo', processing=True)
    else:
        st.error("Please select at least one extraction method.")


def show_demo_setup_page():
    """Render the Tesla 10K demo setup: method checkboxes and actions."""
    st.markdown("## ⚡ Tesla 10K Demo Setup")
    st.markdown("*Configure extraction methods for Tesla's 10K document processing*")

    # Check for existing results
    existing_methods = check_existing_results()

    # Document info
    st.markdown("### 📄 Document Information")
    if existing_methods:
        st.success(f"**Found existing results from:** {', '.join([m.title() for m in existing_methods])}")
        st.info("**Note:** You can view existing results or process with different methods")
    else:
        st.info("**Document:** Tesla 10K form - Financial tables extraction demo")

    # Extraction method selection, pre-filled from the previous selection
    st.markdown("### 🔧 Select Extraction Methods")
    previous = st.session_state.demo_selected_methods
    col1, col2, col3 = st.columns(3)
    with col1:
        docling = st.checkbox("Docling", value=previous.get('docling', True), help="Advanced document processing")
    with col2:
        llamaparse = st.checkbox("LlamaParse", value=previous.get('llamaparse', False), help="AI-powered parsing")
    with col3:
        unstructured = st.checkbox("Unstructured", value=previous.get('unstructured', False), help="General purpose extraction")

    # Update session state
    st.session_state.demo_selected_methods = {
        'docling': docling,
        'llamaparse': llamaparse,
        'unstructured': unstructured
    }
    any_selected = docling or llamaparse or unstructured

    # Process button(s)
    action_col, back_col = st.columns([2, 1])
    with action_col:
        if existing_methods:
            # Show two buttons if results exist
            view_col, process_col = st.columns(2)
            with view_col:
                if st.button("👁️ View Existing Results", type="secondary"):
                    _navigate(
                        'demo',
                        processing=False,
                        demo_selected_methods=_methods_matching(existing_methods),
                    )
            with process_col:
                if st.button("🚀 Process New", type="primary"):
                    _start_demo_processing(any_selected)
        else:
            # Show single process button if no results exist
            if st.button("🚀 Process Tesla Document", type="primary"):
                _start_demo_processing(any_selected)

    with back_col:
        if st.button("← Back to Home"):
            _navigate('home')


def show_demo_page():
    """Show the processing animation while processing, else the results."""
    if st.session_state.processing:
        show_processing_demo()
    else:
        show_demo_results()


def show_processing_demo():
    """Animate simulated processing for each selected method, then rerun."""
    st.markdown("## ⚡ Processing Tesla 10K Document...")

    # Show selected methods
    selected_methods = _selected_demo_methods()
    if not selected_methods:
        # Nothing to simulate; do not report success for an empty selection.
        st.session_state.processing = False
        st.error("Please select at least one extraction method.")
        return
    st.markdown(f"*Processing with selected methods: {', '.join([m.title() for m in selected_methods])}*")

    # Progress bar
    progress_bar = st.progress(0)
    status_text = st.empty()
    method_status = st.empty()

    # Calculate total steps based on selected methods
    steps_per_method = 30
    total_steps = len(selected_methods) * steps_per_method

    for step in range(total_steps):
        progress = (step + 1) / total_steps
        progress_bar.progress(progress)

        # Each method owns a consecutive run of `steps_per_method` steps.
        method_index, method_step = divmod(step, steps_per_method)
        current_method = selected_methods[method_index]
        method_progress = (method_step + 1) / steps_per_method

        # Update status messages
        if method_progress < 0.3:
            status_text.text(f"📄 {current_method.title()}: Reading document... {int(method_progress * 100)}%")
        elif method_progress < 0.7:
            status_text.text(f"🔍 {current_method.title()}: Extracting tables... {int(method_progress * 100)}%")
        else:
            status_text.text(f"💾 {current_method.title()}: Generating HTML outputs... {int(method_progress * 100)}%")

        method_status.markdown(f"**Overall Progress:** {int(progress * 100)}% | **Current Method:** {current_method.title()}")
        time.sleep(0.1)  # Reduced sleep time for faster demo

    # Show completion
    st.markdown("""

✅ Document processed successfully!
Tables have been extracted using selected methods and HTML files are ready for viewing.

""", unsafe_allow_html=True)

    # Process Tesla demo
    process_tesla_demo()
    st.session_state.processing = False
    time.sleep(1)
    st.rerun()


def process_tesla_demo():
    """Store simulated per-method results in ``st.session_state.demo_results``."""
    try:
        # For demo purposes, simulate successful processing for selected
        # methods only. The table count comes from the method's position, not
        # hash(), whose value for strings changes between interpreter runs.
        results = {
            method: {'status': 'success', 'total_tables': 3 + index % 3}
            for index, method in enumerate(_selected_demo_methods())
        }
        st.session_state.demo_results = {'results': results}
    except Exception as e:
        st.error(f"Error processing Tesla demo: {str(e)}")


def count_html_files(directory):
    """Return how many ``*.html`` files exist under ``directory`` (recursive)."""
    if not directory.exists():
        return 0
    return sum(1 for _ in directory.glob("**/*.html"))


def get_excel_files(directory):
    """Return all ``.xlsx``, ``.xls`` and ``.csv`` files under ``directory``."""
    if not directory.exists():
        return []
    excel_files = []
    for pattern in ('*.xlsx', '*.xls', '*.csv'):
        excel_files.extend(directory.glob(f"**/{pattern}"))
    return excel_files


def get_file_info(file_path):
    """Return display strings for a file's size and modification time.

    Returns a dict with ``"size"`` (e.g. ``"12.3 KB"``) and ``"modified"``
    (``YYYY-MM-DD HH:MM``). A file that cannot be stat'ed yields
    ``"0.0 KB"`` / ``"Unknown"`` so both values are always strings.
    """
    try:
        stat = file_path.stat()
    except OSError:
        return {"size": "0.0 KB", "modified": "Unknown"}
    modified = datetime.fromtimestamp(stat.st_mtime)
    return {
        "size": f"{stat.st_size / 1024:.1f} KB",
        "modified": modified.strftime("%Y-%m-%d %H:%M")
    }


def show_demo_results():
    """Render the demo results page with a per-method result browser."""
    st.markdown("## 📊 Tesla 10K Processing Results")

    # Check for existing results
    existing_methods = check_existing_results()

    # Document info
    info_col, reset_col = st.columns([2, 1])
    with info_col:
        st.markdown("### 📄 Tesla 10K Document")
        st.markdown("**Status:** ✅ Complete")
        if existing_methods:
            st.markdown(f"**Available results:** {', '.join([m.title() for m in existing_methods])}")
        else:
            st.warning("No results found in output directory")
    with reset_col:
        if st.button("🔄 Reset"):
            _navigate(
                'home',
                processing=False,
                results=None,
                results_output_dir=None,
                demo_results=None,
                selected_method=None,
                demo_selected_methods=_default_demo_methods(),
            )

    # Method selection tabs - only show available methods
    available_methods = existing_methods
    if not available_methods:
        st.info("No results found. Please process a document first.")
        return

    if len(available_methods) > 1:
        st.markdown("### 🔧 Select Extraction Method to View")
        method_labels = {
            'docling': '🔧 Docling',
            'llamaparse': '🦙 LlamaParse',
            'unstructured': '📊 Unstructured'
        }
        # One column (and one button) per available method
        for column, method in zip(st.columns(len(available_methods)), available_methods):
            with column:
                # Show HTML file count for each method
                html_count = count_html_files(OUTPUT_BASE_PATH / method)
                button_label = f"{method_labels[method]} ({html_count} HTML files)"
                if st.button(button_label, key=f"tab_{method}", use_container_width=True):
                    st.session_state.selected_method = method

    # Default to first available method if none (or a stale one) is selected
    if st.session_state.selected_method not in available_methods:
        st.session_state.selected_method = available_methods[0]

    # Show results for selected method
    if st.session_state.selected_method:
        show_method_results(st.session_state.selected_method)


def show_method_results(method):
    """Render HTML tables and spreadsheet files for ``method`` side by side."""
    st.markdown(f"### 📋 Results from {method.title()}")

    # Column ratio 3:1 for HTML tables : Excel files
    tables_col, files_col = st.columns([3, 1])
    with tables_col:
        st.markdown("#### 📄 HTML Tables")
        show_html_tables(method)
    with files_col:
        st.markdown("#### 📊 Excel Files")
        show_excel_files(method)


def _table_sort_key(html_path):
    """Sort key: the number in ``table_<n>`` / ``table-<n>``, else +inf."""
    match = re.search(r"table[_-](\d+)", html_path.name, re.IGNORECASE)
    if match:
        return int(match.group(1))
    return float('inf')


def show_html_tables(method):
    """Display HTML tables from the method's output directory."""
    method_output_dir = OUTPUT_BASE_PATH / method

    # Get actual HTML files from directory, sorted by table number if possible
    html_files = []
    if method_output_dir.exists():
        html_files = sorted(method_output_dir.glob("**/*.html"), key=_table_sort_key)

    if not html_files:
        st.warning(f"No HTML files found in {method_output_dir}")
        return

    st.markdown(f"**Found {len(html_files)} HTML table(s):**")

    # Display all HTML files in one scrollable container
    # NOTE(review): the container's opening markup appears to be missing from
    # this string in this copy of the file - confirm the intended HTML.
    st.markdown('\n\n', unsafe_allow_html=True)

    last_index = len(html_files) - 1
    for i, html_file in enumerate(html_files):
        st.markdown(f"""

📋 Table {i+1}

File: {html_file.name}

""", unsafe_allow_html=True)

        # Read the file once; the same text feeds the preview and the download.
        html_content = None
        try:
            html_content = html_file.read_text(encoding='utf-8')
            st.components.v1.html(html_content, height=300, scrolling=True)
        except Exception as e:
            # Best effort: one unreadable table must not hide the others.
            st.error(f"Error displaying HTML file: {e}")

        # Download button for individual HTML file
        download_col, _, _ = st.columns([1, 1, 2])
        with download_col:
            if html_content is not None:
                st.download_button(
                    label=f"⬇️ Table {i+1}",
                    data=html_content,
                    file_name=f"table_{i+1}_{method}.html",
                    mime="text/html",
                    key=f"download_html_{method}_{i}",
                    use_container_width=True
                )

        if i < last_index:
            st.markdown("---")

    # NOTE(review): closing markup of the container, likewise missing here.
    st.markdown('\n\n', unsafe_allow_html=True)


def show_excel_files(method):
    """Display spreadsheet files from the method's output directory."""
    method_output_dir = OUTPUT_BASE_PATH / method

    # Get actual Excel files from directory
    excel_files = get_excel_files(method_output_dir)
    if not excel_files:
        st.warning(f"No Excel files found in {method_output_dir}")
        return

    st.markdown(f"**Found {len(excel_files)} Excel file(s):**")

    last_index = len(excel_files) - 1
    for i, excel_file in enumerate(excel_files):
        # Get file info
        file_info = get_file_info(excel_file)
        file_name = excel_file.name
        suffix = excel_file.suffix.lower()

        # File info card
        st.markdown(f"""

📊 {file_name}

Size: {file_info['size']}
Modified: {file_info['modified']}

""", unsafe_allow_html=True)

        # Try to read and display a preview of the file
        try:
            if suffix in ('.xlsx', '.xls'):
                df = pd.read_excel(excel_file)
            else:
                df = pd.read_csv(excel_file)

            if not df.empty:
                st.markdown("**Preview (first 5 rows):**")
                st.dataframe(df.head(), use_container_width=True)
                st.markdown(f"**Dimensions:** {df.shape[0]} × {df.shape[1]}")
            else:
                st.info("File is empty")
        except Exception as e:
            # Preview is optional; the download below can still work.
            st.warning(f"Could not preview file: {e}")

        # Download button, with a MIME type that matches the real file format
        try:
            file_data = excel_file.read_bytes()
        except OSError as e:
            st.error(f"Error reading file for download: {e}")
        else:
            st.download_button(
                label="⬇️ Download",
                data=file_data,
                file_name=file_name,
                mime=SPREADSHEET_MIME_TYPES.get(suffix, "application/octet-stream"),
                key=f"download_excel_{method}_{i}",
                use_container_width=True
            )

        if i < last_index:
            st.markdown("---")


def _show_extraction_results(payload, output_dir):
    """Render the service response and list the files it produced.

    ``payload`` is the decoded JSON body of the extraction service; only
    methods whose entry under ``"results"`` is a dict are offered for viewing.
    """
    results = payload.get('results') if isinstance(payload, dict) else None
    if not isinstance(results, dict):
        results = {}

    # Method selection for viewing results
    st.markdown("### 📊 View Results")
    available_methods = [
        method for method in EXTRACTION_METHODS
        if isinstance(results.get(method), dict)
    ]
    if not available_methods:
        st.warning("No successful extractions found.")
        return

    selected_method = st.selectbox(
        "Select extraction method to view:",
        available_methods,
        help="Choose which extraction method results to display",
        key="upload_results_method",
    )
    if not selected_method:
        return

    st.json(results[selected_method])

    # List files in output directory
    method_dir = Path(output_dir) / selected_method
    html_files = list(method_dir.glob("**/*.html"))
    excel_files = get_excel_files(method_dir)

    if html_files or excel_files:
        st.markdown("### 📄 Generated Files")
        if html_files:
            st.markdown("**HTML Files:**")
            for html_file in html_files:
                st.markdown(f"- {html_file.name}")
        if excel_files:
            st.markdown("**Excel Files:**")
            for excel_file in excel_files:
                st.markdown(f"- {excel_file.name}")


def process_document(file_path, output_dir, docling, llamaparse, unstructured):
    """Process a document through the FastAPI endpoint and show the outcome.

    Args:
        file_path: Path of the PDF as seen by the extraction service.
        output_dir: Directory the service should write its output into.
        docling, llamaparse, unstructured: Whether to run each method.

    On success the decoded response and ``output_dir`` are kept in session
    state so the results can be redrawn on later reruns. Failures are
    reported with ``st.error``; nothing is raised to the caller.
    """
    # Prepare the request data
    data = {
        'input_file_path': file_path,
        'output_dir': output_dir,
        'docling': docling,
        'llamaparse': llamaparse,
        'unstructured': unstructured
    }

    try:
        # Show processing message while the request is in flight
        with st.spinner('Processing document...'):
            response = requests.post(API_EXTRACT_URL, data=data, timeout=API_TIMEOUT)

        if response.status_code != 200:
            st.error(f"Error processing document: {response.text}")
            return

        st.session_state.results = response.json()
        st.session_state.results_output_dir = output_dir
        st.success("Document processed successfully!")

        # Show results
        _show_extraction_results(st.session_state.results, output_dir)
    except requests.exceptions.ConnectionError:
        st.error("Could not connect to the processing service. Please ensure the FastAPI server is running.")
    except requests.exceptions.Timeout:
        st.error("The processing service did not respond in time. Please try again.")
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")


def main():
    """Draw the navigation header and dispatch to the current page."""
    # Navigation header
    title_col, nav_col = st.columns([1, 1])
    with title_col:
        st.markdown("### 📋 PDF Parser")
        st.markdown("*Table Extraction Tool*")
    with nav_col:
        dashboard_col, history_col = st.columns(2)
        with dashboard_col:
            if st.button("Dashboard", use_container_width=True):
                _navigate('home')
        with history_col:
            st.button("History", use_container_width=True)

    st.markdown("---")

    # Route to appropriate page; unknown page names render nothing.
    page_renderers = {
        'home': show_home_page,
        'upload': show_upload_page,
        'demo_setup': show_demo_setup_page,
        'demo': show_demo_page,
    }
    renderer = page_renderers.get(st.session_state.page)
    if renderer is not None:
        renderer()


if __name__ == "__main__":
    main()