AreejMehboob committed on
Commit
4c10590
·
verified ·
1 Parent(s): 452d262

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +824 -38
src/streamlit_app.py CHANGED
@@ -1,40 +1,826 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
1
+ import io
 
 
2
  import streamlit as st
3
+ import requests
4
+ import time
5
+ import os
6
+ from pathlib import Path
7
+ import glob
8
+ import base64
9
+ import pandas as pd
10
+ from datetime import datetime
11
+
12
# Page-level settings for the Streamlit app (title, icon, layout, sidebar).
_PAGE_SETTINGS = {
    "page_title": "PDF Parser - Table Extraction Tool",
    "page_icon": "πŸ“‹",
    "layout": "wide",
    "initial_sidebar_state": "collapsed",
}
st.set_page_config(**_PAGE_SETTINGS)
19
+
20
# Grey-and-white theme: custom classes used by the HTML snippets in this file
# plus overrides for Streamlit's own widget classes. Kept in a named constant
# so the injection call below stays a single readable line.
_CUSTOM_CSS = """
<style>
.main-header {
    text-align: center;
    padding: 2rem 0;
    background: linear-gradient(135deg, #6c757d 0%, #495057 100%);
    border-radius: 10px;
    margin-bottom: 2rem;
    color: white;
}

.feature-card {
    background: #f8f9fa;
    padding: 1.5rem;
    border-radius: 10px;
    box-shadow: 0 2px 10px rgba(0,0,0,0.1);
    text-align: center;
    margin: 1rem 0;
    border: 1px solid #dee2e6;
}

.demo-button {
    background: linear-gradient(45deg, #6c757d, #495057);
    color: white;
    border: none;
    padding: 12px 24px;
    border-radius: 25px;
    font-weight: bold;
    cursor: pointer;
    margin: 10px;
}

.upload-button {
    background: #495057;
    color: white;
    border: none;
    padding: 12px 24px;
    border-radius: 25px;
    font-weight: bold;
    cursor: pointer;
    margin: 10px;
}

.success-message {
    background: #f8f9fa;
    color: #495057;
    padding: 15px;
    border-radius: 5px;
    border-left: 4px solid #6c757d;
    margin: 20px 0;
}

.processing-message {
    background: #f8f9fa;
    color: #495057;
    padding: 15px;
    border-radius: 5px;
    border-left: 4px solid #adb5bd;
    margin: 20px 0;
}

.method-tab {
    background: #f8f9fa;
    padding: 10px 15px;
    border-radius: 5px;
    margin: 5px;
    cursor: pointer;
    border: 2px solid #dee2e6;
}

.method-tab-active {
    background: #6c757d;
    color: white;
    border: 2px solid #495057;
}

.html-file-card {
    background: #f8f9fa;
    padding: 15px;
    border-radius: 8px;
    margin: 10px 0;
    border-left: 4px solid #6c757d;
}

.file-info-card {
    background: #f8f9fa;
    padding: 12px;
    border-radius: 8px;
    margin: 5px 0;
    border-left: 4px solid #6c757d;
    font-size: 0.9em;
}

.file-stats {
    color: #6c757d;
    font-size: 0.85em;
    margin-top: 5px;
}

.stSelectbox > div > div {
    background-color: #f8f9fa;
}

.hidden-text {
    color: #adb5bd;
    font-style: italic;
}

.table-container {
    max-height: 400px;
    overflow-y: auto;
    border: 1px solid #dee2e6;
    border-radius: 5px;
    padding: 10px;
    margin: 10px 0;
    background-color: white;
}

.table-header {
    background: #f8f9fa;
    padding: 10px;
    border-radius: 5px;
    margin-bottom: 10px;
    border-left: 4px solid #6c757d;
}

/* Override Streamlit button styles */
.stButton > button {
    background-color: #6c757d !important;
    color: white !important;
    border: 1px solid #495057 !important;
    border-radius: 5px !important;
}

.stButton > button:hover {
    background-color: #495057 !important;
    border-color: #343a40 !important;
}

/* Override primary button styles */
.stButton > button[kind="primary"] {
    background-color: #495057 !important;
    color: white !important;
    border: 1px solid #343a40 !important;
}

.stButton > button[kind="primary"]:hover {
    background-color: #343a40 !important;
}

/* Style checkboxes */
.stCheckbox > label {
    color: #495057 !important;
}

/* Style text inputs */
.stTextInput > div > div > input {
    background-color: #f8f9fa !important;
    border-color: #dee2e6 !important;
}

/* Style file uploader */
.stFileUploader > div {
    background-color: #f8f9fa !important;
    border-color: #dee2e6 !important;
}

/* Style dataframes */
.stDataFrame {
    background-color: white !important;
    border: 1px solid #dee2e6 !important;
}

/* Style selectbox */
.stSelectbox > div > div {
    background-color: #f8f9fa !important;
    border-color: #dee2e6 !important;
}

/* Style progress bar */
.stProgress > div > div > div {
    background-color: #6c757d !important;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
206
+
207
# Seed st.session_state with a default for every key the pages below read.
# A key that is already present is left untouched.
_SESSION_DEFAULTS = {
    'page': 'home',
    'processing': False,
    'results': None,
    'show_output_dir': False,
    'selected_method': None,
    'demo_results': None,
    'demo_selected_methods': {'docling': True, 'llamaparse': False, 'unstructured': False},
}
for _state_key, _state_default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
222
+
223
# Location of the Tesla demo PDF and the root of the extraction output tree.
# The defaults are the original developer-machine paths; set the
# TESLA_DOC_PATH / OUTPUT_BASE_PATH environment variables to point the app at
# other locations without editing the source.
TESLA_DOC_PATH = os.environ.get(
    "TESLA_DOC_PATH",
    r"C:\Users\Areej\Desktop\get-tables-fastapi\tesla_docs_28-41 (1)-9-14.pdf",
)
OUTPUT_BASE_PATH = os.environ.get(
    "OUTPUT_BASE_PATH",
    r"C:\Users\Areej\Desktop\get-tables-fastapi\output",
)
226
+
227
def show_home_page():
    """Render the landing page.

    Draws the hero banner, two entry buttons (upload flow and Tesla demo
    flow) and a row of three feature cards. Clicking an entry button stores
    the target page name in ``st.session_state.page`` and reruns the script.
    """
    hero_html = """
    <div class="main-header">
        <h1 style="font-size: 3rem; margin: 0; color: #f8f9fa;">Transform PDF Tables to</h1>
        <h1 style="font-size: 3rem; margin: 0; color: #ffffff;">HTML and Excel</h1>
        <p style="margin-top: 1rem; font-size: 1.2rem; opacity: 0.9;">Powered by Traversaal.ai</p>
        <p style="margin-top: 0.5rem; opacity: 0.8;">Perfect for financial reports, research papers, and data analysis.</p>
    </div>
    """
    st.markdown(hero_html, unsafe_allow_html=True)

    # (label, widget key, tooltip, page to navigate to)
    entry_buttons = (
        ("πŸ“„ Upload PDF Document", "upload_btn", "Upload your own PDF document", 'upload'),
        ("⚑ Try Tesla 10K Demo", "demo_btn", "Try with Tesla's 10K form", 'demo_setup'),
    )
    # Only the wide middle column is used; the outer two act as margins.
    _, centre, _ = st.columns([1, 2, 1])
    with centre:
        for slot, (label, widget_key, tip, target_page) in zip(st.columns(2), entry_buttons):
            with slot:
                if st.button(label, key=widget_key, help=tip):
                    st.session_state.page = target_page
                    st.rerun()

    st.markdown("---")

    feature_cards = (
        """
        <div class="feature-card">
            <h3 style="color: #495057;">⚑ Lightning Fast</h3>
            <p style="color: #6c757d;">Process complex PDFs in seconds with our advanced AI algorithms</p>
        </div>
        """,
        """
        <div class="feature-card">
            <h3 style="color: #495057;">πŸ”’ Secure & Private</h3>
            <p style="color: #6c757d;">Your documents are processed securely and never stored permanently</p>
        </div>
        """,
        """
        <div class="feature-card">
            <h3 style="color: #495057;">πŸ”„ Batch Processing</h3>
            <p style="color: #6c757d;">Handle multiple documents and tables simultaneously</p>
        </div>
        """,
    )
    for column, card_html in zip(st.columns(3), feature_cards):
        with column:
            st.markdown(card_html, unsafe_allow_html=True)
279
+
280
def _save_upload_to_temp(uploaded_file):
    """Write an uploaded file's bytes to a temporary file and return its path."""
    import tempfile  # local import: only needed on the upload path

    suffix = os.path.splitext(uploaded_file.name)[1] or ".pdf"
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(uploaded_file.getvalue())
    return tmp.name


def show_upload_page():
    """Render the upload page and start processing on request.

    The user supplies a PDF (by upload or by typing a path), an output
    directory and at least one extraction method. An explicitly typed path
    takes precedence over an uploaded file. An uploaded file only exists in
    memory, so it is written to a temporary file first and that path is handed
    to ``process_document``; the temporary file is removed afterwards.
    """
    st.markdown("## πŸ“„ Upload Your Document")

    # File upload
    uploaded_file = st.file_uploader(
        "Choose a PDF file",
        type=['pdf'],
        help="Upload a PDF document to extract tables from"
    )

    # Input file path (alternative)
    st.markdown("**Or specify file path:**")
    input_file_path = st.text_input(
        "Input File Path",
        placeholder="C:\\path\\to\\your\\document.pdf",
        help="Enter the full path to your PDF file"
    )

    # Output directory; rendered masked until the user toggles visibility.
    output_dir = st.text_input(
        "Output Directory",
        placeholder="C:\\path\\to\\output\\folder",
        help="Directory where extracted tables will be saved",
        type="password" if not st.session_state.show_output_dir else "default"
    )

    # Show/Hide output directory toggle
    col1, col2 = st.columns([3, 1])
    with col2:
        if st.button("πŸ‘οΈ View/Hide Path"):
            st.session_state.show_output_dir = not st.session_state.show_output_dir
            st.rerun()

    # Extraction method selection
    st.markdown("### πŸ”§ Select Extraction Methods")
    col1, col2, col3 = st.columns(3)

    with col1:
        docling = st.checkbox("Docling", value=True, help="Advanced document processing")
    with col2:
        llamaparse = st.checkbox("LlamaParse", value=False, help="AI-powered parsing")
    with col3:
        unstructured = st.checkbox("Unstructured", value=False, help="General purpose extraction")

    # Process button
    if st.button("πŸš€ Process Document", type="primary"):
        has_input = uploaded_file or input_file_path
        has_method = docling or llamaparse or unstructured
        if has_input and output_dir and has_method:
            if input_file_path:
                process_document(input_file_path, output_dir, docling, llamaparse, unstructured)
            else:
                # uploaded_file.name is only a display name, not a path on
                # disk, so persist the bytes before handing over a path.
                temp_path = _save_upload_to_temp(uploaded_file)
                try:
                    process_document(temp_path, output_dir, docling, llamaparse, unstructured)
                finally:
                    try:
                        os.remove(temp_path)
                    except OSError:
                        pass  # best-effort cleanup of the temporary copy
        else:
            st.error("Please provide input file, output directory, and select at least one extraction method.")

    # Back button
    if st.button("← Back to Home"):
        st.session_state.page = 'home'
        st.rerun()
336
+
337
def show_demo_setup_page():
    """Render the Tesla demo setup page.

    Shows the demo document name and one checkbox per extraction method,
    stores the current selection in ``st.session_state.demo_selected_methods``
    and, when the process button is clicked with at least one method ticked,
    switches to the 'demo' page with ``processing`` set to True.
    """
    st.markdown("## ⚑ Tesla 10K Demo Setup")
    st.markdown("*Configure extraction methods for Tesla's 10K document processing*")

    st.markdown("### πŸ“„ Document Information")
    st.info("**Document:** tesla_docs_28-41 (1)-9-14.pdf")

    st.markdown("### πŸ”§ Select Extraction Methods")
    # (session key, checkbox label, tooltip)
    method_options = (
        ('docling', "Docling", "Advanced document processing"),
        ('llamaparse', "LlamaParse", "AI-powered parsing"),
        ('unstructured', "Unstructured", "General purpose extraction"),
    )
    selection = {}
    for column, (method_key, label, tip) in zip(st.columns(3), method_options):
        with column:
            selection[method_key] = st.checkbox(
                label,
                value=st.session_state.demo_selected_methods[method_key],
                help=tip,
            )

    # Persist the selection so the later demo pages can read it.
    st.session_state.demo_selected_methods = selection

    process_col, back_col = st.columns([2, 1])
    with process_col:
        if st.button("πŸš€ Process Tesla Document", type="primary"):
            if any(selection.values()):
                st.session_state.page = 'demo'
                st.session_state.processing = True
                st.rerun()
            else:
                st.error("Please select at least one extraction method.")

    with back_col:
        if st.button("← Back to Home"):
            st.session_state.page = 'home'
            st.rerun()
384
+
385
def show_demo_page():
    """Show the progress view while ``processing`` is set, otherwise the results."""
    render = show_processing_demo if st.session_state.processing else show_demo_results
    render()
390
+
391
def show_processing_demo():
    """Animate a progress display for the selected methods, then run the demo.

    Each selected method gets 30 progress steps with a short sleep per step.
    When the loop ends a success banner is shown, ``process_tesla_demo`` is
    called, ``processing`` is cleared and the script is rerun so the results
    view takes over.
    """
    st.markdown("## ⚑ Processing Tesla 10K Document...")

    selected_methods = [
        name for name, enabled in st.session_state.demo_selected_methods.items() if enabled
    ]
    st.markdown(f"*Processing with selected methods: {', '.join([m.title() for m in selected_methods])}*")

    progress_bar = st.progress(0)
    status_text = st.empty()
    method_status = st.empty()

    steps_per_method = 30
    total_steps = len(selected_methods) * steps_per_method

    for step in range(total_steps):
        overall = (step + 1) / total_steps
        progress_bar.progress(overall)

        # The quotient picks the method, the remainder is the step within it.
        method_index, step_in_method = divmod(step, steps_per_method)
        method_title = selected_methods[method_index].title()
        method_progress = (step_in_method + 1) / steps_per_method
        percent = int(method_progress * 100)

        if method_progress < 0.3:
            message = f"πŸ“„ {method_title}: Reading document... {percent}%"
        elif method_progress < 0.7:
            message = f"πŸ” {method_title}: Extracting tables... {percent}%"
        else:
            message = f"πŸ’Ύ {method_title}: Generating HTML outputs... {percent}%"
        status_text.text(message)

        method_status.markdown(
            f"**Overall Progress:** {int(overall * 100)}% | **Current Method:** {method_title}"
        )

        time.sleep(0.33)

    st.markdown("""
    <div class="success-message">
        βœ… <strong>Document processed successfully!</strong><br>
        Tables have been extracted using selected methods and HTML files are ready for viewing.
    </div>
    """, unsafe_allow_html=True)

    process_tesla_demo()

    st.session_state.processing = False
    time.sleep(2)  # leave the banner on screen briefly before switching views
    st.rerun()
447
+
448
def process_tesla_demo():
    """Store simulated extraction results for the selected demo methods.

    Builds the request payload for the extraction endpoint, but the request
    itself is not sent yet. Instead a fixed 'success' entry is recorded in
    ``st.session_state.demo_results`` for every selected method. Any exception
    is reported through ``st.error``.
    """
    try:
        chosen = st.session_state.demo_selected_methods

        # Payload for the FastAPI endpoint; unused until the real call
        # (POST http://localhost:8000/extract) is enabled.
        data = {
            'input_file_path': TESLA_DOC_PATH,
            'output_dir': os.path.join(OUTPUT_BASE_PATH, "tesla_demo"),
            'docling': chosen['docling'],
            'llamaparse': chosen['llamaparse'],
            'unstructured': chosen['unstructured'],
        }

        # Simulated table counts per method, for demo purposes only.
        simulated_table_counts = (('docling', 5), ('llamaparse', 3), ('unstructured', 4))
        results = {
            method: {'status': 'success', 'total_tables': table_count}
            for method, table_count in simulated_table_counts
            if chosen[method]
        }

        st.session_state.demo_results = {'results': results}

    except Exception as exc:
        st.error(f"Error processing Tesla demo: {str(exc)}")
481
+
482
def count_html_files(directory):
    """Return the number of ``*.html`` files under ``directory``, recursively.

    Returns 0 when ``directory`` does not exist. Each file is counted once:
    the recursive ``**`` pattern already matches files directly inside
    ``directory``, so a separate top-level glob would count them twice.
    """
    if not os.path.exists(directory):
        return 0

    pattern = os.path.join(directory, "**", "*.html")
    return len(set(glob.glob(pattern, recursive=True)))
490
+
491
def get_excel_files(directory):
    """Return spreadsheet files found under ``directory`` without duplicates.

    Looks for ``.xlsx`` and ``.xls`` files at any depth and ``.csv`` files
    directly inside ``directory``. Returns an empty list when ``directory``
    does not exist. Paths keep the order in which they are first found; the
    recursive patterns also match top-level files, so repeats are dropped.
    """
    if not os.path.exists(directory):
        return []

    patterns = (
        os.path.join(directory, "*.xlsx"),
        os.path.join(directory, "*.xls"),
        os.path.join(directory, "*.csv"),
        os.path.join(directory, "**", "*.xlsx"),
        os.path.join(directory, "**", "*.xls"),
    )
    # A dict keeps insertion order, giving an order-preserving de-duplication.
    unique_paths = {}
    for pattern in patterns:
        for path in glob.glob(pattern, recursive=True):
            unique_paths.setdefault(path, None)
    return list(unique_paths)
502
+
503
def get_file_info(file_path):
    """Return display-ready size and modification time for ``file_path``.

    Returns a dict with ``"size"`` formatted as ``"<n.n> KB"`` and
    ``"modified"`` formatted as ``"YYYY-MM-DD HH:MM"``. When the file cannot
    be stat'ed (missing, removed in the meantime, not accessible) the
    fallback ``{"size": 0, "modified": "Unknown"}`` is returned.
    """
    # Stat directly instead of checking existence first, so a file that
    # disappears between the check and the stat cannot raise.
    try:
        stat = os.stat(file_path)
    except OSError:
        return {"size": 0, "modified": "Unknown"}

    size_kb = stat.st_size / 1024
    modified = datetime.fromtimestamp(stat.st_mtime)

    return {
        "size": f"{size_kb:.1f} KB",
        "modified": modified.strftime("%Y-%m-%d %H:%M"),
    }
516
+
517
def show_demo_results():
    """Render the Tesla demo results page.

    Shows the document summary and a reset button. When more than one method
    was selected, a row of buttons (labelled with each method's HTML file
    count) lets the user pick which method to view. The chosen method's
    results are then rendered by ``show_method_results``.
    """
    st.markdown("## πŸ“Š Tesla 10K Processing Results")

    # Methods the user ticked on the setup page, in dictionary order.
    available_methods = [
        name for name, enabled in st.session_state.demo_selected_methods.items() if enabled
    ]

    summary_col, reset_col = st.columns([2, 1])
    with summary_col:
        st.markdown("### πŸ“„ tesla_docs_28-41 (1)-9-14.pdf")
        st.markdown("**Status:** βœ… Complete")
        processed_titles = [name.title() for name in available_methods]
        st.markdown(f"**Processed with:** {', '.join(processed_titles)}")

    with reset_col:
        if st.button("πŸ”„ Reset"):
            # Restore the initial state and go back to the landing page.
            reset_values = {
                'page': 'home',
                'processing': False,
                'results': None,
                'demo_results': None,
                'selected_method': None,
                'demo_selected_methods': {'docling': True, 'llamaparse': False, 'unstructured': False},
            }
            for state_key, state_value in reset_values.items():
                st.session_state[state_key] = state_value
            st.rerun()

    if len(available_methods) > 1:
        st.markdown("### πŸ”§ Select Extraction Method to View")

        method_labels = {
            'docling': 'πŸ”§ Docling',
            'llamaparse': 'πŸ¦™ LlamaParse',
            'unstructured': 'πŸ“Š Unstructured'
        }

        for column, method in zip(st.columns(len(available_methods)), available_methods):
            with column:
                # Count HTML files with the same lookup show_html_tables uses.
                method_output_dir = os.path.join(OUTPUT_BASE_PATH, method)
                if os.path.exists(method_output_dir):
                    pattern = os.path.join(method_output_dir, "**", "*.html")
                    html_count = len(set(glob.glob(pattern, recursive=True)))
                else:
                    html_count = 0
                button_label = f"{method_labels[method]} ({html_count} HTML files)"

                if st.button(button_label, key=f"tab_{method}", use_container_width=True):
                    st.session_state.selected_method = method

    # Fall back to the first available method when nothing valid is selected.
    current = st.session_state.selected_method
    if current is None or current not in available_methods:
        st.session_state.selected_method = available_methods[0] if available_methods else None

    if st.session_state.selected_method:
        show_method_results(st.session_state.selected_method)
574
+
575
def show_method_results(method):
    """Render one method's output: HTML tables (wide) beside Excel files (narrow)."""
    st.markdown(f"### πŸ“‹ Results from {method.title()}")

    # 3:1 width ratio between the HTML tables and the Excel file list.
    html_column, excel_column = st.columns([3, 1])
    sections = (
        (html_column, "#### πŸ“„ HTML Tables", show_html_tables),
        (excel_column, "#### πŸ“Š Excel Files", show_excel_files),
    )
    for column, heading, render in sections:
        with column:
            st.markdown(heading)
            render(method)
588
+
589
def show_html_tables(method):
    """Display every HTML table found in the method's output directory.

    Files are collected recursively from ``OUTPUT_BASE_PATH/<method>`` and
    ordered by the number in a ``table_<n>`` / ``table-<n>`` file name; files
    without a number come last. Ties are broken by path, so the "Table N"
    labels and widget keys stay the same from one rerun to the next. Each file
    is read once and the same content feeds both the inline preview and the
    download button.
    """
    import re

    method_output_dir = os.path.join(OUTPUT_BASE_PATH, method)

    html_files = []
    if os.path.exists(method_output_dir):
        # The recursive glob also covers the top-level directory.
        pattern = os.path.join(method_output_dir, "**", "*.html")
        html_files = list(set(glob.glob(pattern, recursive=True)))

    def extract_table_number(filename):
        match = re.search(r"table[_-](\d+)", filename, re.IGNORECASE)
        if match:
            return int(match.group(1))
        return float('inf')  # Put files without a number at the end

    # The path as second key makes the order deterministic for equal numbers.
    html_files.sort(key=lambda f: (extract_table_number(os.path.basename(f)), f))

    if not html_files:
        st.warning(f"No HTML files found in {method_output_dir}")
        return

    st.markdown(f"**Found {len(html_files)} HTML table(s):**")

    # Display all HTML files in one scrollable container
    st.markdown('<div class="table-container">', unsafe_allow_html=True)

    last_index = len(html_files) - 1
    for i, html_file in enumerate(html_files):
        st.markdown(f"""
        <div class="table-header">
            <h4 style="color: #495057;">πŸ“‹ Table {i+1}</h4>
            <small style="color: #6c757d;">File: {os.path.basename(html_file)}</small>
        </div>
        """, unsafe_allow_html=True)

        # Read once; html_content stays None if the file could not be read.
        html_content = None
        try:
            with open(html_file, 'r', encoding='utf-8') as f:
                html_content = f.read()
            st.components.v1.html(html_content, height=300, scrolling=True)
        except Exception as e:
            st.error(f"Error displaying HTML file: {e}")

        # Download button for the individual HTML file; skipped when the read
        # failed, because that error has already been reported above.
        col_download1, _, _ = st.columns([1, 1, 2])
        with col_download1:
            if html_content is not None:
                try:
                    st.download_button(
                        label=f"⬇️ Table {i+1}",
                        data=html_content,
                        file_name=f"table_{i+1}_{method}.html",
                        mime="text/html",
                        key=f"download_html_{method}_{i}",
                        use_container_width=True
                    )
                except Exception as e:
                    st.error(f"Error reading file for download: {e}")

        if i < last_index:
            st.markdown("---")

    st.markdown('</div>', unsafe_allow_html=True)
657
+
658
def show_excel_files(method):
    """Display spreadsheet files found in the method's output directory.

    For each file returned by ``get_excel_files`` this shows an info card
    (name, size, modification time), a preview of the first rows and a
    download button. ``.csv`` files are read with ``pd.read_csv`` directly;
    other files are read with ``pd.read_excel`` and fall back to CSV parsing
    if that fails. The download MIME type follows the file extension.
    """
    mime_by_extension = {
        ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ".xls": "application/vnd.ms-excel",
        ".csv": "text/csv",
    }

    method_output_dir = os.path.join(OUTPUT_BASE_PATH, method)
    excel_files = get_excel_files(method_output_dir)

    if not excel_files:
        st.warning(f"No Excel files found in {method_output_dir}")
        return

    st.markdown(f"**Found {len(excel_files)} Excel file(s):**")

    last_index = len(excel_files) - 1
    for i, excel_file in enumerate(excel_files):
        file_info = get_file_info(excel_file)
        file_name = os.path.basename(excel_file)
        extension = os.path.splitext(file_name)[1].lower()

        # File info card
        st.markdown(f"""
        <div class="file-info-card">
            <strong style="color: #495057;">πŸ“Š {file_name}</strong>
            <div class="file-stats">
                <strong>Size:</strong> {file_info['size']}<br>
                <strong>Modified:</strong> {file_info['modified']}
            </div>
        </div>
        """, unsafe_allow_html=True)

        # Load a DataFrame for the preview, choosing the reader by extension.
        df = None
        read_as_csv = extension == ".csv"
        if not read_as_csv:
            try:
                df = pd.read_excel(excel_file)
            except Exception:
                read_as_csv = True  # fall back to CSV parsing below
        if read_as_csv:
            try:
                df = pd.read_csv(excel_file)
            except Exception as e2:
                st.warning(f"Could not preview file as Excel or CSV: {e2}")

        if df is not None:
            if not df.empty:
                if read_as_csv:
                    st.markdown("**Preview (first 5 rows, read as CSV):**")
                else:
                    st.markdown("**Preview (first 5 rows):**")
                st.dataframe(df.head(), use_container_width=True)
                st.markdown(f"**Dimensions:** {df.shape[0]} Γ— {df.shape[1]}")
            else:
                st.info("CSV file is empty" if read_as_csv else "Excel file is empty")

        # Download button, served with a MIME type matching the file type.
        try:
            with open(excel_file, 'rb') as f:
                excel_data = f.read()
            st.download_button(
                label=f"⬇️ Download",
                data=excel_data,
                file_name=file_name,
                mime=mime_by_extension.get(extension, "application/octet-stream"),
                key=f"download_excel_{method}_{i}",
                use_container_width=True
            )
        except Exception as e:
            st.error(f"Error reading Excel file for download: {e}")

        if i < last_index:
            st.markdown("---")
725
+
726
def process_document(file_path, output_dir, docling, llamaparse, unstructured):
    """Send a document to the FastAPI extraction endpoint and show the outcome.

    Args:
        file_path: Path of the PDF to process, as seen by the backend.
        output_dir: Directory the backend should write results into.
        docling: Whether to run the Docling extractor.
        llamaparse: Whether to run the LlamaParse extractor.
        unstructured: Whether to run the Unstructured extractor.

    On success the JSON response is stored in ``st.session_state.results`` and
    the per-method result plus the generated HTML/Excel file names are listed.
    Connection failures, timeouts and any other error are reported through
    ``st.error``; nothing is raised to the caller.
    """
    # (connect, read) timeouts in seconds. Extraction can be slow, so the read
    # timeout is generous, but the request can no longer hang forever.
    request_timeout = (10, 600)

    try:
        data = {
            'input_file_path': file_path,
            'output_dir': output_dir,
            'docling': docling,
            'llamaparse': llamaparse,
            'unstructured': unstructured
        }

        with st.spinner('Processing document...'):
            # Replace with your actual FastAPI endpoint URL
            response = requests.post(
                'http://localhost:8000/extract',
                data=data,
                timeout=request_timeout,
            )

        if response.status_code != 200:
            st.error(f"Error processing document: {response.text}")
            return

        st.session_state.results = response.json()
        st.success("Document processed successfully!")

        results = st.session_state.results['results']

        # Method selection for viewing results
        st.markdown("### πŸ“Š View Results")
        available_methods = [
            method for method in ['docling', 'llamaparse', 'unstructured']
            if method in results and isinstance(results[method], dict)
        ]

        if not available_methods:
            st.warning("No successful extractions found.")
            return

        selected_method = st.selectbox(
            "Select extraction method to view:",
            available_methods,
            help="Choose which extraction method results to display"
        )

        if selected_method and isinstance(results[selected_method], dict):
            st.json(results[selected_method])

            # List files in the method's output directory.
            method_dir = os.path.join(output_dir, selected_method)

            # The recursive pattern already includes top-level files, so a
            # single glob avoids listing those twice.
            html_pattern = os.path.join(method_dir, "**", "*.html")
            html_files = sorted(set(glob.glob(html_pattern, recursive=True)))

            excel_files = get_excel_files(method_dir)

            if html_files or excel_files:
                st.markdown("### πŸ“„ Generated Files")

                if html_files:
                    st.markdown("**HTML Files:**")
                    for html_file in html_files:
                        st.markdown(f"- {os.path.basename(html_file)}")

                if excel_files:
                    st.markdown("**Excel Files:**")
                    for excel_file in excel_files:
                        st.markdown(f"- {os.path.basename(excel_file)}")

    except requests.exceptions.ConnectionError:
        st.error("Could not connect to the processing service. Please ensure the FastAPI server is running.")
    except requests.exceptions.Timeout:
        st.error("The processing service did not respond in time. Please try again.")
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
799
+
800
def main():
    """Draw the navigation header and render the page named in session state."""
    title_col, nav_col = st.columns([1, 1])
    with title_col:
        st.markdown("### πŸ“‹ PDF Parser")
        st.markdown("*Table Extraction Tool*")
    with nav_col:
        dashboard_slot, history_slot = st.columns(2)
        with dashboard_slot:
            if st.button("Dashboard", use_container_width=True):
                st.session_state.page = 'home'
                st.rerun()
        with history_slot:
            # Rendered only; no action is attached to this button.
            st.button("History", use_container_width=True)
    st.markdown("---")

    # Page name -> render function. An unknown page name renders nothing.
    page_renderers = {
        'home': show_home_page,
        'upload': show_upload_page,
        'demo_setup': show_demo_setup_page,
        'demo': show_demo_page,
    }
    render_page = page_renderers.get(st.session_state.page)
    if render_page is not None:
        render_page()


if __name__ == "__main__":
    main()