"""Streamlit front end for the PDF table-extraction tool.

The app has four pages, selected through ``st.session_state.page``:

* ``home``       - landing page with the two entry buttons,
* ``upload``     - process a user-supplied PDF through the extraction service,
* ``demo_setup`` - choose extraction methods for the bundled Tesla 10K sample,
* ``demo``       - simulated processing followed by a results browser.

NOTE(review): several ``st.markdown(..., unsafe_allow_html=True)`` calls carry
text with no HTML/CSS markup in this copy of the file. The markup looks like
it was stripped at some point; the strings are kept as found - restore the
markup from the original source if it is available.
"""

import base64
import glob
import html
import io
import os
import re
import time
from datetime import datetime
from pathlib import Path

import pandas as pd
import requests
import streamlit as st

# Configure page (must be the first Streamlit command executed).
st.set_page_config(
    page_title="PDF Parser - Table Extraction Tool",
    page_icon="📋",
    layout="wide",
    initial_sidebar_state="collapsed",
)

# Custom CSS for styling - Grey and White Theme
# NOTE(review): the stylesheet body is empty in this copy of the file.
st.markdown(""" """, unsafe_allow_html=True)

# Extraction back ends offered in the UI: key -> (checkbox label, help text).
EXTRACTION_METHODS = {
    'docling': ("Docling", "Advanced document processing"),
    'llamaparse': ("LlamaParse", "AI-powered parsing"),
    'unstructured': ("Unstructured", "General purpose extraction"),
}

# Extraction service endpoint; override with the PDF_PARSER_API_URL variable.
EXTRACT_ENDPOINT = os.environ.get(
    "PDF_PARSER_API_URL", "http://localhost:8000/extract"
)

# Upper bound on how long a single extraction request may take.
REQUEST_TIMEOUT_SECONDS = 600

# Matches the running number in names such as "table_3.html" / "Table-12.html".
_TABLE_NUMBER_RE = re.compile(r"table[_-](\d+)", re.IGNORECASE)

# Download MIME types keyed by lower-cased file suffix.
_SPREADSHEET_MIME_TYPES = {
    ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ".xls": "application/vnd.ms-excel",
    ".csv": "text/csv",
}


def _default_demo_methods():
    """Return a fresh copy of the demo's default method selection."""
    return {'docling': True, 'llamaparse': False, 'unstructured': False}


def _init_session_state():
    """Populate ``st.session_state`` with defaults for keys not yet set."""
    defaults = {
        'page': 'home',
        'processing': False,
        'results': None,
        'results_output_dir': None,
        'show_output_dir': False,
        'selected_method': None,
        'demo_results': None,
        'demo_selected_methods': _default_demo_methods(),
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value


# Initialize session state
_init_session_state()

# Get the directory where the script is located (src)
SCRIPT_DIR = Path(__file__).parent

# Tesla demo document path (assumed to live next to this script)
TESLA_DOC_PATH = SCRIPT_DIR / "tesla_docs_28-41 (1)-9-14.pdf"

# Output directory is src/output
OUTPUT_BASE_PATH = SCRIPT_DIR / "output"


def _method_checkboxes(defaults):
    """Render one checkbox per extraction method, side by side.

    Args:
        defaults: mapping of method key -> initial checked state.

    Returns:
        dict mapping each method key to the checkbox's current value.
    """
    selection = {}
    columns = st.columns(len(EXTRACTION_METHODS))
    for column, (method, (label, help_text)) in zip(columns, EXTRACTION_METHODS.items()):
        with column:
            selection[method] = st.checkbox(label, value=defaults[method], help=help_text)
    return selection


def _find_files(directory, patterns):
    """Return the sorted, de-duplicated files under *directory* matching *patterns*.

    The search is recursive. A ``**`` glob already matches the top-level
    directory, so no separate non-recursive pass is needed (running both is
    what used to produce duplicate entries).

    Args:
        directory: directory to search (``str`` or ``Path``).
        patterns: iterable of file-name glob patterns such as ``"*.html"``.

    Returns:
        list of path strings; empty when *directory* does not exist.
    """
    base = str(directory)
    if not os.path.isdir(base):
        return []
    matches = set()
    for pattern in patterns:
        matches.update(glob.glob(os.path.join(base, "**", pattern), recursive=True))
    return sorted(matches)


def _table_sort_key(path):
    """Sort key ordering files by embedded table number, then by name.

    Files without a recognisable number sort after all numbered ones.
    """
    name = os.path.basename(path)
    match = _TABLE_NUMBER_RE.search(name)
    number = int(match.group(1)) if match else float('inf')
    return (number, name)


def _save_uploaded_file(uploaded_file):
    """Persist a Streamlit upload to disk and return the saved ``Path``.

    The extraction service is given a file *path*, but an uploaded file only
    exists in memory, so it is written to ``OUTPUT_BASE_PATH / "uploads"``
    first. Only the final component of the client-supplied name is used so
    the upload cannot be written outside that directory.

    Raises:
        OSError: if the directory or file cannot be written.
    """
    upload_dir = OUTPUT_BASE_PATH / "uploads"
    upload_dir.mkdir(parents=True, exist_ok=True)
    target = upload_dir / Path(uploaded_file.name).name
    target.write_bytes(uploaded_file.getvalue())
    return target


def show_home_page():
    """Render the landing page: headline, entry buttons and feature cards."""
    # Header
    st.markdown(
        "\n\nTransform PDF Tables to\n\nHTML and Excel\n\n"
        "Perfect for financial reports, research papers, and data analysis.\n\n",
        unsafe_allow_html=True,
    )

    # Main buttons, centred in the middle column.
    _, centre, _ = st.columns([1, 2, 1])
    with centre:
        upload_col, demo_col = st.columns(2)
        with upload_col:
            if st.button("📄 Upload PDF Document", key="upload_btn", help="Upload your own PDF document"):
                # Start the upload page without results from an earlier run.
                st.session_state.results = None
                st.session_state.page = 'upload'
                st.rerun()
        with demo_col:
            if st.button("⚡ Try Tesla 10K Demo", key="demo_btn", help="Try with Tesla's 10K form"):
                st.session_state.page = 'demo_setup'
                st.rerun()

    # Features section
    st.markdown("---")
    features = (
        ("⚡ Lightning Fast", "Process complex PDFs in seconds with our advanced AI algorithms"),
        ("🔒 Secure & Private", "Your documents are processed securely and never stored permanently"),
        ("🔄 Batch Processing", "Handle multiple documents and tables simultaneously"),
    )
    for column, (title, description) in zip(st.columns(len(features)), features):
        with column:
            st.markdown(f"\n\n{title}\n\n{description}\n\n", unsafe_allow_html=True)


def show_upload_page():
    """Render the upload form and run the extraction when requested.

    The input is either an uploaded PDF or a typed path (the typed path wins
    when both are given). Results of the last successful run are kept in
    session state and re-rendered on later reruns, so changing the
    result-view selection does not make them disappear.
    """
    st.markdown("## 📄 Upload Your Document")

    # File upload
    uploaded_file = st.file_uploader(
        "Choose a PDF file",
        type=['pdf'],
        help="Upload a PDF document to extract tables from",
    )

    # Input file path (alternative)
    st.markdown("**Or specify file path:**")
    input_file_path = st.text_input(
        "Input File Path",
        placeholder="C:\\path\\to\\your\\document.pdf",
        help="Enter the full path to your PDF file",
    ).strip()

    # Output directory; rendered masked until the user chooses to reveal it.
    output_dir = st.text_input(
        "Output Directory",
        placeholder="C:\\path\\to\\output\\folder",
        help="Directory where extracted tables will be saved",
        type="default" if st.session_state.show_output_dir else "password",
    ).strip()

    # Show/Hide output directory toggle
    _, toggle_col = st.columns([3, 1])
    with toggle_col:
        if st.button("👁️ View/Hide Path"):
            st.session_state.show_output_dir = not st.session_state.show_output_dir
            st.rerun()

    # Extraction method selection
    st.markdown("### 🔧 Select Extraction Methods")
    methods = _method_checkboxes(_default_demo_methods())

    # Process button
    if st.button("🚀 Process Document", type="primary"):
        has_input = uploaded_file is not None or bool(input_file_path)
        if has_input and output_dir and any(methods.values()):
            file_path = input_file_path
            if not file_path:
                # An upload has no path on disk yet; save it so the service can read it.
                try:
                    file_path = str(_save_uploaded_file(uploaded_file))
                except OSError as e:
                    st.error(f"Could not save the uploaded file: {e}")
            if file_path:
                process_document(
                    file_path,
                    output_dir,
                    methods['docling'],
                    methods['llamaparse'],
                    methods['unstructured'],
                )
        else:
            st.error("Please provide input file, output directory, and select at least one extraction method.")
    elif st.session_state.results is not None:
        # Rerun triggered by another widget: redraw the stored results.
        _show_document_results(st.session_state.results, st.session_state.results_output_dir)

    # Back button
    if st.button("← Back to Home"):
        st.session_state.page = 'home'
        st.rerun()


def show_demo_setup_page():
    """Render the method-selection page for the Tesla 10K demo."""
    st.markdown("## ⚡ Tesla 10K Demo Setup")
    st.markdown("*Configure extraction methods for Tesla's 10K document processing*")

    # Document info
    st.markdown("### 📄 Document Information")
    st.info("**Document:** tesla_docs_28-41 (1)-9-14.pdf")

    # Extraction method selection
    st.markdown("### 🔧 Select Extraction Methods")
    selection = _method_checkboxes(st.session_state.demo_selected_methods)

    # Remember the selection for the processing and results pages.
    st.session_state.demo_selected_methods = selection

    # Process button
    process_col, back_col = st.columns([2, 1])
    with process_col:
        if st.button("🚀 Process Tesla Document", type="primary"):
            if any(selection.values()):
                st.session_state.page = 'demo'
                st.session_state.processing = True
                st.rerun()
            else:
                st.error("Please select at least one extraction method.")
    with back_col:
        if st.button("← Back to Home"):
            st.session_state.page = 'home'
            st.rerun()


def show_demo_page():
    """Show the progress view while processing, otherwise the results."""
    if st.session_state.processing:
        show_processing_demo()
    else:
        show_demo_results()


def show_processing_demo():
    """Animate a simulated progress bar, then run the demo processing.

    Each selected method gets 30 progress steps of 0.33 s. The success banner
    is shown only after ``process_tesla_demo`` has actually succeeded.
    """
    st.markdown("## ⚡ Processing Tesla 10K Document...")

    # Show selected methods
    selected_methods = [
        method
        for method, selected in st.session_state.demo_selected_methods.items()
        if selected
    ]
    st.markdown(f"*Processing with selected methods: {', '.join([m.title() for m in selected_methods])}*")

    # Progress bar
    progress_bar = st.progress(0)
    status_text = st.empty()
    method_status = st.empty()

    # Total steps scale with the number of selected methods.
    steps_per_method = 30
    total_steps = len(selected_methods) * steps_per_method

    for step in range(total_steps):
        progress = (step + 1) / total_steps
        progress_bar.progress(progress)

        # Which method this step belongs to, and how far into it we are.
        method_index, method_step = divmod(step, steps_per_method)
        current_method = selected_methods[method_index]
        method_progress = (method_step + 1) / steps_per_method
        percent = int(method_progress * 100)

        # Update status messages
        if method_progress < 0.3:
            status_text.text(f"📄 {current_method.title()}: Reading document... {percent}%")
        elif method_progress < 0.7:
            status_text.text(f"🔍 {current_method.title()}: Extracting tables... {percent}%")
        else:
            status_text.text(f"💾 {current_method.title()}: Generating HTML outputs... {percent}%")

        method_status.markdown(f"**Overall Progress:** {int(progress * 100)}% | **Current Method:** {current_method.title()}")
        time.sleep(0.33)

    # Process Tesla demo, and only announce success if it succeeded.
    if process_tesla_demo():
        st.markdown(
            "\n\n✅ Document processed successfully!\n"
            "Tables have been extracted using selected methods and HTML files are ready for viewing.\n\n",
            unsafe_allow_html=True,
        )

    st.session_state.processing = False
    time.sleep(2)  # leave the final message on screen briefly before switching views
    st.rerun()


def process_tesla_demo():
    """Process Tesla demo document using selected extraction methods.

    The extraction service is not called yet; a successful result is
    simulated for each selected method and stored in
    ``st.session_state.demo_results``.

    TODO: replace the simulation with a POST to the extraction endpoint,
    sending ``input_file_path=str(TESLA_DOC_PATH)``,
    ``output_dir=str(OUTPUT_BASE_PATH / "tesla_demo")`` and the three method
    flags, then store ``response.json()`` as the demo results.
    NOTE(review): the results pages read from ``OUTPUT_BASE_PATH / <method>``,
    not from a ``tesla_demo`` sub-directory - confirm the intended layout.

    Returns:
        True when results were stored, False when an error was reported.
    """
    simulated_table_counts = {'docling': 5, 'llamaparse': 3, 'unstructured': 4}
    try:
        selected = st.session_state.demo_selected_methods
        results = {
            method: {'status': 'success', 'total_tables': count}
            for method, count in simulated_table_counts.items()
            if selected[method]
        }
        st.session_state.demo_results = {'results': results}
    except Exception as e:  # UI boundary: report the problem instead of crashing the page
        st.error(f"Error processing Tesla demo: {str(e)}")
        return False
    return True


def count_html_files(directory):
    """Count the HTML files under *directory* (recursively, each file once)."""
    return len(_find_files(directory, ("*.html",)))


def get_excel_files(directory):
    """Get all spreadsheet files (.xlsx, .xls, .csv) under *directory*.

    Returns:
        Sorted list of path strings with no duplicates; empty when the
        directory does not exist.
    """
    return _find_files(directory, ("*.xlsx", "*.xls", "*.csv"))


def get_file_info(file_path):
    """Get file information including size and modification time.

    Returns:
        dict with ``"size"`` (e.g. ``"12.3 KB"``) and ``"modified"``
        (``"YYYY-MM-DD HH:MM"``). When the file cannot be inspected the
        fallback ``{"size": 0, "modified": "Unknown"}`` is returned.
    """
    try:
        stat = os.stat(file_path)
    except OSError:
        return {"size": 0, "modified": "Unknown"}
    return {
        "size": f"{stat.st_size / 1024:.1f} KB",
        "modified": datetime.fromtimestamp(stat.st_mtime).strftime("%Y-%m-%d %H:%M"),
    }


def show_demo_results():
    """Render the demo results page: summary, method switcher and outputs."""
    st.markdown("## 📊 Tesla 10K Processing Results")

    # Methods that were selected on the setup page.
    available_methods = [
        method
        for method, selected in st.session_state.demo_selected_methods.items()
        if selected
    ]

    # Document info
    info_col, reset_col = st.columns([2, 1])
    with info_col:
        st.markdown("### 📄 tesla_docs_28-41 (1)-9-14.pdf")
        st.markdown("**Status:** ✅ Complete")
        processed_methods = [method.title() for method in available_methods]
        st.markdown(f"**Processed with:** {', '.join(processed_methods)}")
    with reset_col:
        if st.button("🔄 Reset"):
            st.session_state.page = 'home'
            st.session_state.processing = False
            st.session_state.results = None
            st.session_state.demo_results = None
            st.session_state.selected_method = None
            st.session_state.demo_selected_methods = _default_demo_methods()
            st.rerun()

    # Method selection tabs - only needed when more than one method ran.
    if len(available_methods) > 1:
        st.markdown("### 🔧 Select Extraction Method to View")
        method_labels = {
            'docling': '🔧 Docling',
            'llamaparse': '🦙 LlamaParse',
            'unstructured': '📊 Unstructured',
        }
        for column, method in zip(st.columns(len(available_methods)), available_methods):
            with column:
                # Same file discovery as show_html_tables, so the count matches the list.
                html_count = count_html_files(OUTPUT_BASE_PATH / method)
                button_label = f"{method_labels[method]} ({html_count} HTML files)"
                if st.button(button_label, key=f"tab_{method}", use_container_width=True):
                    st.session_state.selected_method = method

    # Default to first available method if no valid method is selected.
    if st.session_state.selected_method not in available_methods:
        st.session_state.selected_method = available_methods[0] if available_methods else None

    # Show results for selected method
    if st.session_state.selected_method:
        show_method_results(st.session_state.selected_method)


def show_method_results(method):
    """Render one method's HTML tables and spreadsheet files side by side."""
    st.markdown(f"### 📋 Results from {method.title()}")

    # 3:1 width ratio for HTML tables : Excel files
    tables_col, files_col = st.columns([3, 1])
    with tables_col:
        st.markdown("#### 📄 HTML Tables")
        show_html_tables(method)
    with files_col:
        st.markdown("#### 📊 Excel Files")
        show_excel_files(method)


def show_html_tables(method):
    """Display HTML tables from the method's output directory.

    Files are ordered by the table number in their name (``table_1``,
    ``table_2``, ...), un-numbered files last. Each table is rendered inline
    with a download button.
    """
    method_output_dir = OUTPUT_BASE_PATH / method
    html_files = sorted(_find_files(method_output_dir, ("*.html",)), key=_table_sort_key)

    if not html_files:
        st.warning(f"No HTML files found in {method_output_dir}")
        return

    st.markdown(f"**Found {len(html_files)} HTML table(s):**")

    # Opening wrapper of the scrollable container (markup missing in this copy).
    st.markdown('\n\n', unsafe_allow_html=True)

    last_index = len(html_files) - 1
    for i, html_file in enumerate(html_files):
        # Escaped because the name is interpolated into markup rendered as HTML.
        display_name = html.escape(os.path.basename(html_file))
        st.markdown(f"\n\n📋 Table {i+1}\n\nFile: {display_name}\n\n", unsafe_allow_html=True)

        # Read once; the same content feeds both the preview and the download.
        try:
            with open(html_file, 'r', encoding='utf-8') as f:
                html_content = f.read()
        except (OSError, UnicodeDecodeError) as e:
            st.error(f"Error displaying HTML file: {e}")
            html_content = None

        if html_content is not None:
            st.components.v1.html(html_content, height=300, scrolling=True)

            # Download button for individual HTML file
            download_col, _, _ = st.columns([1, 1, 2])
            with download_col:
                st.download_button(
                    label=f"⬇️ Table {i+1}",
                    data=html_content,
                    file_name=f"table_{i+1}_{method}.html",
                    mime="text/html",
                    key=f"download_html_{method}_{i}",
                    use_container_width=True,
                )

        if i < last_index:
            st.markdown("---")

    # Closing wrapper of the scrollable container (markup missing in this copy).
    st.markdown('\n\n', unsafe_allow_html=True)


def _read_table_preview(path):
    """Load a spreadsheet for preview.

    ``.csv`` files are read as CSV directly. Other files are read as Excel,
    falling back to CSV when that fails.

    Returns:
        ``(dataframe, read_as_csv)``.

    Raises:
        Exception: whatever pandas raises when the file cannot be parsed.
    """
    if Path(path).suffix.lower() == ".csv":
        return pd.read_csv(path), True
    try:
        return pd.read_excel(path), False
    except Exception:  # pandas/engine errors vary by engine; retry as CSV
        return pd.read_csv(path), True


def show_excel_files(method):
    """Display Excel files from the method's output directory.

    For every spreadsheet: an info card, a five-row preview and a download
    button with a MIME type matching the file's extension.
    """
    method_output_dir = OUTPUT_BASE_PATH / method
    excel_files = get_excel_files(method_output_dir)

    if not excel_files:
        st.warning(f"No Excel files found in {method_output_dir}")
        return

    st.markdown(f"**Found {len(excel_files)} Excel file(s):**")

    last_index = len(excel_files) - 1
    for i, excel_file in enumerate(excel_files):
        file_info = get_file_info(excel_file)
        file_name = os.path.basename(excel_file)

        # File info card
        st.markdown(
            f"\n\n📊 {html.escape(file_name)}\n\n"
            f"Size: {file_info['size']}\nModified: {file_info['modified']}\n\n",
            unsafe_allow_html=True,
        )

        # Preview
        try:
            df, read_as_csv = _read_table_preview(excel_file)
        except Exception as e:  # best effort: a broken file must not break the page
            st.warning(f"Could not preview file as Excel or CSV: {e}")
        else:
            if df.empty:
                st.info("CSV file is empty" if read_as_csv else "Excel file is empty")
            else:
                if read_as_csv:
                    st.markdown("**Preview (first 5 rows, read as CSV):**")
                else:
                    st.markdown("**Preview (first 5 rows):**")
                st.dataframe(df.head(), use_container_width=True)
                st.markdown(f"**Dimensions:** {df.shape[0]} × {df.shape[1]}")

        # Download button
        try:
            with open(excel_file, 'rb') as f:
                excel_data = f.read()
        except OSError as e:
            st.error(f"Error reading Excel file for download: {e}")
        else:
            suffix = Path(excel_file).suffix.lower()
            st.download_button(
                label="⬇️ Download",
                data=excel_data,
                file_name=file_name,
                mime=_SPREADSHEET_MIME_TYPES.get(suffix, "application/octet-stream"),
                key=f"download_excel_{method}_{i}",
                use_container_width=True,
            )

        if i < last_index:
            st.markdown("---")


def _show_document_results(payload, output_dir):
    """Render the result browser for a processed upload.

    Args:
        payload: decoded JSON response of the extraction service; expected to
            hold a ``'results'`` dict keyed by method name.
        output_dir: directory the service was asked to write to; generated
            files are looked up in ``<output_dir>/<method>``.
    """
    results = payload.get('results') if isinstance(payload, dict) else None
    if not isinstance(results, dict):
        results = {}

    # Method selection for viewing results
    st.markdown("### 📊 View Results")
    available_methods = [
        method for method in EXTRACTION_METHODS if isinstance(results.get(method), dict)
    ]
    if not available_methods:
        st.warning("No successful extractions found.")
        return

    selected_method = st.selectbox(
        "Select extraction method to view:",
        available_methods,
        help="Choose which extraction method results to display",
    )
    if not selected_method:
        return

    st.json(results[selected_method])

    if not output_dir:
        return

    # List files in output directory
    method_dir = os.path.join(output_dir, selected_method)
    html_files = _find_files(method_dir, ("*.html",))
    excel_files = get_excel_files(method_dir)

    if html_files or excel_files:
        st.markdown("### 📄 Generated Files")
    if html_files:
        st.markdown("**HTML Files:**")
        for html_file in html_files:
            st.markdown(f"- {os.path.basename(html_file)}")
    if excel_files:
        st.markdown("**Excel Files:**")
        for excel_file in excel_files:
            st.markdown(f"- {os.path.basename(excel_file)}")


def process_document(file_path, output_dir, docling, llamaparse, unstructured):
    """Process document using the FastAPI endpoint.

    Posts the request as form data, stores the decoded response in
    ``st.session_state.results`` and renders the result browser. Every
    failure is reported through ``st.error``; nothing is raised.

    Args:
        file_path: path of the PDF as seen by the extraction service.
        output_dir: directory where the service should write its output.
        docling, llamaparse, unstructured: whether to run each method.
    """
    # Drop results of an earlier run so a failure cannot leave stale output on screen.
    st.session_state.results = None

    # Prepare the request data
    data = {
        'input_file_path': file_path,
        'output_dir': output_dir,
        'docling': docling,
        'llamaparse': llamaparse,
        'unstructured': unstructured,
    }

    try:
        with st.spinner('Processing document...'):
            response = requests.post(
                EXTRACT_ENDPOINT, data=data, timeout=REQUEST_TIMEOUT_SECONDS
            )
    except requests.exceptions.ConnectionError:
        st.error("Could not connect to the processing service. Please ensure the FastAPI server is running.")
        return
    except requests.exceptions.Timeout:
        st.error(f"The processing service did not respond within {REQUEST_TIMEOUT_SECONDS} seconds.")
        return
    except requests.exceptions.RequestException as e:
        st.error(f"An error occurred: {str(e)}")
        return

    if response.status_code != 200:
        st.error(f"Error processing document: {response.text}")
        return

    try:
        payload = response.json()
    except ValueError as e:  # body was not valid JSON
        st.error(f"An error occurred: {str(e)}")
        return

    # Keep the outcome so later reruns of the upload page can redraw it.
    st.session_state.results = payload
    st.session_state.results_output_dir = output_dir
    st.success("Document processed successfully!")

    try:
        _show_document_results(payload, output_dir)
    except Exception as e:  # UI boundary: report the problem instead of crashing the page
        st.error(f"An error occurred: {str(e)}")


def main():
    """Render the navigation header and dispatch to the current page."""
    # Navigation header
    title_col, nav_col = st.columns([1, 1])
    with title_col:
        st.markdown("### 📋 PDF Parser")
        st.markdown("*Table Extraction Tool*")
    with nav_col:
        dashboard_col, history_col = st.columns(2)
        with dashboard_col:
            if st.button("Dashboard", use_container_width=True):
                st.session_state.page = 'home'
                st.rerun()
        with history_col:
            # Placeholder: no history view is implemented in this file.
            st.button("History", use_container_width=True)

    st.markdown("---")

    # Route to appropriate page; unknown page names fall back to the home page.
    pages = {
        'home': show_home_page,
        'upload': show_upload_page,
        'demo_setup': show_demo_setup_page,
        'demo': show_demo_page,
    }
    pages.get(st.session_state.page, show_home_page)()


if __name__ == "__main__":
    main()