akshansh36 committed on
Commit eef9e83 · verified · 1 Parent(s): a1fc950

Upload 10 files
app.py ADDED
@@ -0,0 +1,275 @@
+ import streamlit as st
+ from upload_file_to_s3 import upload_file
+ import base64
+ import httpx
+ from extract_table_from_image import process_image_using_llm
+ from process_pdf import process_pdf
+ from pymongo import MongoClient
+ from datetime import datetime
+ from table_analysis_for_image import view_table_analysis_page
+ from table_analysis_for_pdf import view_pdf_table_analysis_page
+ from table_analysis_for_excel import display_csv_analysis
+ from view_excel import view_excel
+ from copy import deepcopy
+
+ import uuid
+ import os
+ import csv
+ from view_pdf import view_pdfs
+ from view_image import view_images
+ from io import StringIO, BytesIO
+ from dotenv import load_dotenv
+ import boto3
+ import pandas as pd
+
+ st.set_page_config(layout='wide', page_title="MoSPI", page_icon="📄")
+ load_dotenv()
+
+ AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
+ AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
+ AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
+ MONGO_URI = os.getenv("MONGO_URI")
+ DB_NAME = os.getenv("DB_NAME")
+ COLLECTION_NAME = os.getenv("COLLECTION_NAME")
+
+ mongo_client = MongoClient(MONGO_URI)
+ db = mongo_client[DB_NAME]
+ collection = db[COLLECTION_NAME]
+
+ s3 = boto3.client(
+     's3',
+     aws_access_key_id=AWS_ACCESS_KEY_ID,
+     aws_secret_access_key=AWS_SECRET_ACCESS_KEY
+ )
+
+ if "page" not in st.session_state:
+     st.session_state.page = "home"
+
+
+ def upload_csv_file(file, csv_filename, content_type):
+     try:
+         # Generate a unique key for the file using UUID
+         uuid_str = str(uuid.uuid4())
+         s3_key = f'MoSPI_csv_files/{uuid_str}-{csv_filename}'
+
+         # Upload the CSV to S3
+         s3.upload_fileobj(
+             file,
+             AWS_BUCKET_NAME,
+             s3_key,
+             ExtraArgs={'ContentType': content_type}  # Set the MIME type of the uploaded file
+         )
+
+         upload_time = datetime.now()
+
+         # Metadata for MongoDB
+         metadata = {
+             'name': csv_filename,
+             'type': content_type,
+             's3_url': f's3://{AWS_BUCKET_NAME}/{s3_key}',
+             's3_key': s3_key,
+             'object_url': f'https://{AWS_BUCKET_NAME}.s3.amazonaws.com/{s3_key}',
+             'date_uploaded': upload_time.strftime('%Y-%m-%d'),
+             'time_uploaded': upload_time.strftime('%H:%M:%S')
+         }
+
+         return metadata
+
+     except Exception as e:
+         print(f"An error occurred during upload: {e}")
+         return None
+
+
+ def process_image(url, filename):
+     try:
+         # Download the image and base64-encode it for the LLM
+         image_data = base64.b64encode(httpx.get(url).content).decode("utf-8")
+         if image_data:
+             result = process_image_using_llm(image_data, 1, 3)
+             has_table_data = result.get("has_table_data")
+             if has_table_data:
+                 table_data = result.get("table_data")
+                 page_number = result.get("page_number")
+                 description = result.get("description")
+                 column_summary = result.get("column_summary")
+                 best_col1 = result.get("best_col1")
+                 best_col2 = result.get("best_col2")
+
+                 data = {
+                     "table_data": table_data,
+                     "page_number": page_number,
+                     "description": description,
+                     "column_summary": column_summary,
+                     "best_col1": best_col1,
+                     "best_col2": best_col2
+                 }
+
+                 collection.update_one({"object_url": url}, {"$set": {"table_data": data}})
+                 print("Successfully extracted data from image and inserted into MongoDB")
+
+                 # Generate CSV from table data
+                 csv_buffer = StringIO()
+                 csv_writer = csv.DictWriter(csv_buffer, fieldnames=table_data[0].keys())
+                 csv_writer.writeheader()
+                 csv_writer.writerows(table_data)
+
+                 # Convert CSV text to bytes for uploading
+                 csv_bytes = BytesIO(csv_buffer.getvalue().encode("utf-8"))
+
+                 # Upload CSV to S3
+                 csv_filename = f"{filename}.csv"
+                 s3_metadata = upload_csv_file(csv_bytes, csv_filename, content_type="text/csv")
+
+                 if s3_metadata:
+                     # Update MongoDB with CSV S3 URL
+                     collection.update_one(
+                         {"object_url": url},
+                         {"$set": {
+                             "csv_object_url": s3_metadata.get("object_url"),
+                             "csv_s3_url": s3_metadata.get("s3_url")
+                         }}
+                     )
+                     print("CSV file uploaded to S3 and URL saved in MongoDB")
+
+                 return True
+             else:
+                 print(f"No table data was found in the image {url}")
+                 return False
+
+         else:
+             print("No image data found in uploaded image")
+             return False
+
+     except Exception as e:
+         print(f"Error occurred in processing image: {e}")
+         return False
+
+
+ def convert_excel_to_csv(file, filename):
+     # Determine the appropriate engine based on file extension
+     file_extension = filename.split('.')[-1].lower()
+     if file_extension == 'xlsx':
+         engine = 'openpyxl'
+     elif file_extension == 'xls':
+         engine = 'xlrd'
+     else:
+         raise ValueError("Unsupported file format for Excel. Please upload an .xls or .xlsx file.")
+
+     # Load the Excel file into a DataFrame
+     df = pd.read_excel(file, engine=engine)
+
+     # Convert the DataFrame to CSV format in memory
+     csv_buffer = BytesIO()
+     df.to_csv(csv_buffer, index=False)
+     csv_buffer.seek(0)  # Move to the start of the buffer
+
+     # Generate a new filename for CSV
+     csv_filename = filename.replace(".xlsx", ".csv").replace(".xls", ".csv")
+     return csv_buffer, csv_filename
+
+
+ if st.session_state.page == "home":
+     st.title("Smart Data Extraction and Analysis")
+
+     uploaded_file = st.file_uploader(
+         "Upload a file",
+         type=["png", "jpg", "jpeg", "pdf", "xlsx", "xls", "csv"],
+         accept_multiple_files=False,
+         help="Please upload only one file of type image, PDF, Excel, or CSV."
+     )
+
+     if uploaded_file and st.button("Upload"):
+         with st.spinner("Processing your file"):
+             file_copy = BytesIO(uploaded_file.getvalue())
+             file_type = uploaded_file.type
+             metadata = upload_file(uploaded_file, file_type)
+             if metadata:
+                 object_url = metadata.get("object_url")
+                 filename = metadata.get("name")
+
+                 if "image" in file_type:  # Process image files
+                     processed = process_image(object_url, filename)
+                     if processed:
+                         collection.update_one({"object_url": object_url}, {"$set": {"status": "processed"}})
+                         st.success("Image processed and CSV file uploaded to S3 successfully.")
+                     else:
+                         collection.update_one({"object_url": object_url}, {"$set": {"status": "failed"}})
+                         st.error("Error occurred in processing image, please try again later.")
+
+                 elif "pdf" in file_type:
+                     processed = process_pdf(object_url, filename)
+                     if processed:
+                         collection.update_one({"object_url": object_url}, {"$set": {"status": "processed"}})
+                         st.success("Successfully processed PDF.")
+                     else:
+                         collection.update_one({"object_url": object_url}, {"$set": {"status": "failed"}})
+                         st.error("Error occurred in processing PDF.")
+
+                 elif file_type in ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "application/vnd.ms-excel"]:
+                     csv_buffer, csv_filename = convert_excel_to_csv(file_copy, filename)
+                     s3_metadata = upload_csv_file(csv_buffer, csv_filename, content_type="text/csv")
+                     if s3_metadata:
+                         collection.update_one({"object_url": object_url}, {
+                             "$set": {"csv_object_url": s3_metadata["object_url"], "csv_s3_url": s3_metadata["s3_url"],
+                                      "filetype": "excel", "status": "processed"}
+                         })
+                         st.success("Excel file uploaded to S3 successfully.")
+                     else:
+                         collection.update_one({"object_url": object_url}, {"$set": {"status": "failed"}})
+
+                 elif "csv" in file_type:
+                     # CSV needs no conversion step: the uploaded file itself is already the CSV
+                     collection.update_one({"object_url": object_url}, {
+                         "$set": {"csv_object_url": object_url, "filetype": "csv", "status": "processed"}})
+                     st.success("CSV file uploaded to S3 successfully.")
+
+     st.markdown("<hr>", unsafe_allow_html=True)
+     col1, col2, col3 = st.columns([1, 1, 1], gap="small")
+
+     with col1:
+         if st.button("View PDFs", key="View pdf button"):
+             st.session_state.page = "view_pdf"
+             st.rerun()
+
+     with col2:
+         if st.button("View Images", key="View image button"):
+             st.session_state.page = "view_image"
+             st.rerun()
+
+     with col3:
+         if st.button("View Excel", key="View excel button"):
+             st.session_state.page = "view_excel"
+             st.rerun()
+
+ if st.session_state.page == "view_pdf":
+     view_pdfs()
+
+ elif st.session_state.page == "view_image":
+     view_images()
+
+ elif st.session_state.page == "view_excel":
+     view_excel()
+
+ if st.session_state.page == "view_image_analysis" and "image_url" in st.session_state:
+     image_url = st.session_state.image_url
+     view_table_analysis_page(image_url)
+
+ if st.session_state.page == "pdf_analysis" and "pdf_url" in st.session_state:
+     pdf_url = st.session_state.pdf_url
+     view_pdf_table_analysis_page(pdf_url)
+
+ if st.session_state.page == "view_excel_analysis" and "excel_url" in st.session_state:
+     excel_url = st.session_state.excel_url
+     display_csv_analysis(excel_url)
+
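
Note: every module in this commit loads its configuration from the environment via load_dotenv(). A minimal .env template assembled from the os.getenv calls in these files — all values below are placeholders, not real defaults (FLASH_API and the PINECONE_* variables are read by extract_table_from_image.py further down):

AWS_ACCESS_KEY_ID=<your-aws-access-key-id>
AWS_SECRET_ACCESS_KEY=<your-aws-secret-access-key>
AWS_BUCKET_NAME=<your-s3-bucket>
MONGO_URI=<your-mongodb-connection-string>
DB_NAME=<your-database-name>
COLLECTION_NAME=<your-collection-name>
FLASH_API=<your-google-generative-ai-api-key>
PINECONE_API=<your-pinecone-api-key>
PINECONE_INDEX=<your-pinecone-index-name>
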
extract_table_from_image.py ADDED
@@ -0,0 +1,120 @@
+ from langchain_google_genai import ChatGoogleGenerativeAI
+ from langchain_core.messages import HumanMessage
+ import os
+ import re
+ import json
+
+ from dotenv import load_dotenv
+
+ load_dotenv()
+ MONGO_URI = os.getenv("MONGO_URI")
+ DB_NAME = os.getenv("DB_NAME")
+ COLLECTION_NAME = os.getenv("COLLECTION_NAME")
+ FLASH_API = os.getenv("FLASH_API")
+ PINECONE_API = os.getenv("PINECONE_API")
+ PINECONE_INDEX = os.getenv("PINECONE_INDEX")
+
+ model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-002", temperature=0.2, max_tokens=None, google_api_key=FLASH_API)
+ system_prompt_text = f"""Please extract the table from the image and return the table data in JSON format, with each row represented as an object containing column headers as keys. Ensure that each cell's content corresponds accurately to its column header. If a cell is empty, keep None as its value.
+ Go through the data and give a summary of the table, describing what the data is about, in the description field.
+ Go through each column and give a column summary telling what each column header means.
+ Analyze the data to suggest two columns which can be used to plot the best graph for this table.
+ If a table contains both Hindi and English translations for a header or cell, then only give the English translations.
+ Remember to give the response in correct JSON format.
+
+ Expected output format : {{
+     "table_data": [
+         {{
+             "column_1": "Value 1-1",
+             "column_2": "Value 1-2",
+             "column_3": "Value 1-3"
+         }},
+         {{
+             "column_1": "Value 2-1",
+             "column_2": "Value 2-2",
+             "column_3": "Value 2-3"
+         }}
+         // Additional rows as needed
+     ],
+     "description": "Table Description",
+     "column_summary": {{
+         "column_1": "column description",
+         "column_2": "column description",
+         "column_3": "column description"
+     }},
+     "best_column1": "Column 1 name",
+     "best_column2": "Column 2 name"
+ }}
+ """
+
+
+ def process_image_using_llm(image, page_number, max_retries=3):
+     for attempt in range(1, max_retries + 1):
+         try:
+             # Send the image and system prompt to the LLM
+             message = HumanMessage(
+                 content=[
+                     {"type": "text", "text": system_prompt_text},
+                     {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image}"}},
+                 ],
+             )
+             response = model.invoke([message])
+
+             # Clean up the response content (drop the ```json fences the model tends to emit)
+             response_content = response.content.strip("```").replace("json", "").replace("\\n", "").strip()
+             print(response_content)
+             response_content = response_content.strip("```")
+
+             try:
+                 # Attempt direct JSON parsing
+                 data = json.loads(response_content)
+
+                 # Extract table data and additional notes
+                 table_data = data.get("table_data", [])
+                 description = data.get("description", "").strip() if data.get("description") else ""
+                 column_summary = data.get("column_summary", {})
+                 best_col1 = data.get("best_column1", "").strip() if data.get("best_column1") else ""
+                 best_col2 = data.get("best_column2", "").strip() if data.get("best_column2") else ""
+
+                 # Verify that we have valid table data
+                 has_table_data = bool(table_data)
+
+                 return {
+                     "page_number": page_number,
+                     "table_data": table_data if has_table_data else None,
+                     "description": description if description else None,
+                     "column_summary": column_summary if column_summary else None,
+                     "best_col1": best_col1 if best_col1 else None,
+                     "best_col2": best_col2 if best_col2 else None,
+                     "has_table_data": has_table_data
+                 }
+             except json.JSONDecodeError as e:
+                 # Retry on malformed JSON; give up after max_retries attempts
+                 print(f"JSON decode error on attempt {attempt} for page {page_number}: {e}")
+                 if attempt == max_retries:
+                     return {
+                         "page_number": page_number,
+                         "table_data": None,
+                         "description": None,
+                         "column_summary": None,
+                         "best_col1": None,
+                         "best_col2": None,
+                         "has_table_data": False
+                     }
+
+         # Handle any other exceptions without retrying
+         except Exception as e:
+             print(f"Outer exception for page {page_number}: {e}")
+             return {
+                 "page_number": page_number,
+                 "table_data": None,
+                 "description": None,
+                 "column_summary": None,
+                 "best_col1": None,
+                 "best_col2": None,
+                 "has_table_data": False
+             }
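
A minimal local test of the extractor might look like this — the image path is a placeholder, and FLASH_API must be set in the environment before the module is imported:

import base64
from extract_table_from_image import process_image_using_llm

with open("sample_table.png", "rb") as f:  # placeholder path
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

result = process_image_using_llm(image_b64, page_number=1, max_retries=3)
if result["has_table_data"]:
    print(result["description"])
    print(result["table_data"][:2])  # first two extracted rows
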
process_pdf.py ADDED
@@ -0,0 +1,173 @@
+ import base64
+ from pdf2image import convert_from_path
+ from extract_table_from_image import process_image_using_llm
+ from pymongo import MongoClient
+ from datetime import datetime
+ import uuid
+ import os
+ import re
+ import csv
+ import requests
+ from io import StringIO, BytesIO
+ from dotenv import load_dotenv
+ import boto3
+
+ load_dotenv()
+
+ AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
+ AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
+ AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
+ MONGO_URI = os.getenv("MONGO_URI")
+ DB_NAME = os.getenv("DB_NAME")
+ COLLECTION_NAME = os.getenv("COLLECTION_NAME")
+
+ mongo_client = MongoClient(MONGO_URI)
+ db = mongo_client[DB_NAME]
+ collection = db[COLLECTION_NAME]
+
+ s3 = boto3.client(
+     's3',
+     aws_access_key_id=AWS_ACCESS_KEY_ID,
+     aws_secret_access_key=AWS_SECRET_ACCESS_KEY
+ )
+
+ pdf_temp_dir = 'temp/pdf_files'
+ image_temp_dir = 'temp/page_images'
+ os.makedirs(pdf_temp_dir, exist_ok=True)
+ os.makedirs(image_temp_dir, exist_ok=True)
+ pdf_path = os.path.join(pdf_temp_dir, 'downloaded_file.pdf')
+
+
+ def cleanup_directory(directory_path):
+     try:
+         for filename in os.listdir(directory_path):
+             file_path = os.path.join(directory_path, filename)
+             if os.path.isfile(file_path):
+                 os.remove(file_path)
+         print(f"Cleaned up files in {directory_path}")
+     except Exception as e:
+         print(f"Error cleaning up directory {directory_path}: {e}")
+
+
+ def download_and_split_pdf_to_image(url):
+     try:
+         response = requests.get(url)
+         with open(pdf_path, 'wb') as pdf_file:
+             pdf_file.write(response.content)
+
+     except Exception as e:
+         print(f"Error occurred while downloading PDF from object URL: {e}")
+         return None
+
+     try:
+         images = convert_from_path(pdf_path)
+         for i, image in enumerate(images):
+             image_path = os.path.join(image_temp_dir, f'page_{i + 1}.png')
+             image.save(image_path, 'PNG')
+             print(f'Saved image: {image_path}')
+         return True
+
+     except Exception as e:
+         print(f"Error occurred while converting PDF pages to images: {e}")
+         return None
+
+
+ def upload_csv_file(file, csv_filename, content_type):
+     try:
+         # Generate a unique key for the file using UUID
+         uuid_str = str(uuid.uuid4())
+         s3_key = f'MoSPI_csv_files/{uuid_str}-{csv_filename}'
+
+         # Upload the CSV to S3
+         s3.upload_fileobj(
+             file,
+             AWS_BUCKET_NAME,
+             s3_key,
+             ExtraArgs={'ContentType': content_type}  # Set the MIME type of the uploaded file
+         )
+
+         upload_time = datetime.now()
+
+         # Metadata for MongoDB
+         metadata = {
+             'name': csv_filename,
+             'type': content_type,
+             's3_url': f's3://{AWS_BUCKET_NAME}/{s3_key}',
+             's3_key': s3_key,
+             'object_url': f'https://{AWS_BUCKET_NAME}.s3.amazonaws.com/{s3_key}',
+             'date_uploaded': upload_time.strftime('%Y-%m-%d'),
+             'time_uploaded': upload_time.strftime('%H:%M:%S')
+         }
+
+         return metadata
+
+     except Exception as e:
+         print(f"An error occurred during upload: {e}")
+         return None
+
+
+ def process_pdf(url, filename):
+     split = download_and_split_pdf_to_image(url)
+     if split:
+         image_files = sorted(
+             os.listdir(image_temp_dir),
+             key=lambda x: int(re.search(r'page_(\d+)', x).group(1))
+         )
+
+         table_datas = []
+         for count, image_name in enumerate(image_files, start=1):
+             print(f"Processing page {count} of the PDF")
+             image_path = os.path.join(image_temp_dir, image_name)
+             with open(image_path, "rb") as image_file:
+                 image_data = base64.b64encode(image_file.read()).decode("utf-8")
+             result = process_image_using_llm(image_data, count, 3)
+
+             has_table_data = result.get("has_table_data")
+             if has_table_data:
+                 table_data = result.get("table_data")
+                 page_number = result.get("page_number")
+                 description = result.get("description")
+                 column_summary = result.get("column_summary")
+                 best_col1 = result.get("best_col1")
+                 best_col2 = result.get("best_col2")
+
+                 csv_buffer = StringIO()
+                 csv_writer = csv.DictWriter(csv_buffer, fieldnames=table_data[0].keys())
+                 csv_writer.writeheader()
+                 csv_writer.writerows(table_data)
+
+                 csv_bytes = BytesIO(csv_buffer.getvalue().encode("utf-8"))
+                 csv_filename = f"{filename}_pageNumber_{str(page_number)}.csv"
+                 s3_metadata = upload_csv_file(csv_bytes, csv_filename, "text/csv")
+
+                 if s3_metadata:
+                     object_url = s3_metadata.get("object_url")
+                     s3_url = s3_metadata.get("s3_url")
+                     data = {
+                         "table_data": table_data,
+                         "description": description,
+                         "column_summary": column_summary,
+                         "page_number": page_number,
+                         "csv_object_url": object_url,
+                         "csv_s3_url": s3_url,
+                         "best_col1": best_col1,
+                         "best_col2": best_col2
+                     }
+
+                     table_datas.append(data)
+
+             else:
+                 print(f"No table data found on page {count}")
+
+         if table_datas:
+             collection.update_one({"object_url": url}, {"$set": {"table_data": table_datas}})
+
+             cleanup_directory(pdf_temp_dir)
+             cleanup_directory(image_temp_dir)
+             return True
+
+         else:
+             print("Found no table data in the whole PDF")
+             cleanup_directory(pdf_temp_dir)
+             cleanup_directory(image_temp_dir)
+             return False
+
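
One deployment note: pdf2image's convert_from_path shells out to the Poppler utilities (pdftoppm/pdftocairo), so Poppler must be installed on the host — e.g. the poppler-utils package on Debian/Ubuntu. A quick sanity check, with a placeholder PDF path:

from pdf2image import convert_from_path

pages = convert_from_path("sample.pdf", dpi=200)  # raises if Poppler is missing
print(f"Rendered {len(pages)} page(s)")
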
table_analysis_for_excel.py ADDED
@@ -0,0 +1,81 @@
+ import pandas as pd
+ from pygwalker.api.streamlit import StreamlitRenderer
+ from io import BytesIO
+ import requests
+ import streamlit as st
+ from pymongo import MongoClient
+ import os
+ from dotenv import load_dotenv
+ import json
+
+ # Load environment variables
+ load_dotenv()
+ MONGO_URI = os.getenv("MONGO_URI")
+ DB_NAME = os.getenv("DB_NAME")
+ COLLECTION_NAME = os.getenv("COLLECTION_NAME")
+
+ mongo_client = MongoClient(MONGO_URI)
+ db = mongo_client[DB_NAME]
+ collection = db[COLLECTION_NAME]
+
+
+ def load_csv_from_url(csv_url):
+     response = requests.get(csv_url)
+     response.raise_for_status()  # Ensure the request was successful
+     return pd.read_csv(BytesIO(response.content))
+
+
+ # Column Analysis Function
+ def analyze_column_data(df):
+     analysis = {}
+     for col in df.columns:
+         if pd.api.types.is_numeric_dtype(df[col]):
+             analysis[col] = {
+                 "Mean": df[col].mean(),
+                 "Median": df[col].median(),
+                 "Mode": df[col].mode()[0] if not df[col].mode().empty else None,
+                 "Unique Values": df[col].nunique(),
+                 "Null Values": df[col].isnull().sum()
+             }
+         else:
+             analysis[col] = {
+                 "Unique Values": df[col].nunique(),
+                 "Null Values": df[col].isnull().sum(),
+                 "Top Categories": df[col].value_counts().head(5).to_dict()
+             }
+     return analysis
+
+
+ # Streamlit Interface
+ def display_csv_analysis(object_url):
+     if st.button("Back", key="back_button"):
+         st.session_state.page = "view_excel"
+         st.rerun()
+
+     csv_url = collection.find_one({"object_url": object_url}).get("csv_object_url")
+     st.title("CSV File Analysis")
+
+     # Load and display CSV data
+     df = load_csv_from_url(csv_url)
+     st.subheader("CSV Preview")
+     st.dataframe(df)
+
+     # Perform and display analysis
+     st.subheader("Column Analysis")
+     column_analysis = analyze_column_data(df)
+
+     col1, col2 = st.columns(2)
+     for idx, (col_name, col_data) in enumerate(column_analysis.items()):
+         with col1 if idx % 2 == 0 else col2:
+             st.markdown(f"**{col_name}**")
+             st.write(col_data)
+
+     st.markdown("<hr>", unsafe_allow_html=True)
+     st.subheader("Graphical Analysis of Table")
+     pyg_app = StreamlitRenderer(df)
+     pyg_app.explorer()
+
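
analyze_column_data is pure pandas and easy to exercise on its own; a tiny illustration with made-up data:

import pandas as pd
from table_analysis_for_excel import analyze_column_data

df = pd.DataFrame({"state": ["UP", "MH", "UP"], "value": [10, 12, 11]})
print(analyze_column_data(df))
# numeric columns -> Mean/Median/Mode plus unique and null counts;
# other columns   -> unique/null counts and the five most frequent values
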
table_analysis_for_image.py ADDED
@@ -0,0 +1,160 @@
+ import pandas as pd
+ from pygwalker.api.streamlit import StreamlitRenderer
+ from io import BytesIO
+ import requests
+ import streamlit as st
+ from pymongo import MongoClient
+ import os
+ from dotenv import load_dotenv
+ import json
+
+ # Load environment variables
+ load_dotenv()
+ MONGO_URI = os.getenv("MONGO_URI")
+ DB_NAME = os.getenv("DB_NAME")
+ COLLECTION_NAME = os.getenv("COLLECTION_NAME")
+
+ mongo_client = MongoClient(MONGO_URI)
+ db = mongo_client[DB_NAME]
+ collection = db[COLLECTION_NAME]
+
+
+ # Load the CSV from a URL (the CSV generated from the image lives on S3)
+ def load_csv_from_url(object_url):
+     response = requests.get(object_url)
+     response.raise_for_status()  # Ensure the request was successful
+     csv_data = pd.read_csv(BytesIO(response.content))
+     return csv_data
+
+
+ # Analyzing each column based on data type
+ def analyze_column_data(df):
+     analysis = {}
+     for col in df.columns:
+         if pd.api.types.is_numeric_dtype(df[col]):
+             analysis[col] = {
+                 "Mean": df[col].mean(),
+                 "Median": df[col].median(),
+                 "Mode": df[col].mode()[0] if not df[col].mode().empty else None,
+                 "Unique Values": df[col].nunique(),
+                 "Null Values": df[col].isnull().sum()
+             }
+         else:
+             analysis[col] = {
+                 "Unique Values": df[col].nunique(),
+                 "Null Values": df[col].isnull().sum(),
+                 "Top Categories": df[col].value_counts().head(5).to_dict()
+             }
+     return analysis
+
+
+ # Main function to render the View Table Analysis page
+ def view_table_analysis_page(url):
+     if st.button("Back", key="back_button"):
+         st.session_state.page = "view_image"
+         st.rerun()
+
+     image = collection.find_one({"object_url": url})
+     csv_url = image.get("csv_object_url")
+
+     # Load CSV data
+     df = load_csv_from_url(csv_url)
+     # Check if the last row has any cell containing the word "total" (case-insensitive)
+     if df.iloc[-1].apply(lambda x: "total" in str(x).lower()).any():
+         df = df.iloc[:-1]  # Drop the last row if "total" is found in any cell
+
+     # Page title
+     st.title("Table Analysis")
+
+     # CSV Preview
+     st.subheader("CSV Preview")
+     st.write("Below is a preview of the uploaded CSV file:")
+     st.dataframe(df)  # Interactive, scrollable table
+
+     # Build an in-memory Excel workbook for download
+     excel_buffer = BytesIO()
+     with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer:
+         df.to_excel(writer, index=False, sheet_name="Sheet1")
+     excel_buffer.seek(0)  # Reset buffer position
+
+     # Download Button
+     st.download_button(
+         label="Download Full Excel Sheet",
+         data=excel_buffer,
+         file_name="table_data.xlsx",
+         mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+     )
+
+     st.markdown("<hr>", unsafe_allow_html=True)
+     table_description = image.get("table_data", {}).get("description", None)
+
+     if table_description:
+         # Table Description
+         st.subheader("Table Description")
+         st.write(table_description)
+
+     # Column Summary
+     st.markdown("<hr>", unsafe_allow_html=True)
+     st.subheader("Column Summary")
+     with st.container(height=400, border=False):
+         column_summary = image.get("table_data", {}).get("column_summary", None)
+
+         if column_summary:
+             # Column-level descriptions and analysis
+             column_analysis = analyze_column_data(df)
+
+             col1, col2 = st.columns(2)
+             for idx, (col_name, col_description) in enumerate(column_summary.items()):
+                 # Alternate between the two layout columns based on the index
+                 with col1 if idx % 2 == 0 else col2:
+                     st.markdown(f"Column Name : **{col_name}**")
+                     st.write(f"Column Description : {col_description}")
+
+                     # Display basic analysis
+                     analysis = column_analysis.get(col_name, {})
+                     if pd.api.types.is_numeric_dtype(df[col_name]):
+                         # Numeric column analysis
+                         st.write({
+                             "Mean": analysis.get("Mean"),
+                             "Median": analysis.get("Median"),
+                             "Mode": analysis.get("Mode"),
+                             "Unique Values": analysis.get("Unique Values"),
+                             "Null Values": analysis.get("Null Values")
+                         })
+                     else:
+                         # Categorical column analysis
+                         st.write({
+                             "Unique Values": analysis.get("Unique Values"),
+                             "Null Values": analysis.get("Null Values"),
+                             "Top Categories": analysis.get("Top Categories")
+                         })
+
+     st.markdown("<hr>", unsafe_allow_html=True)
+     st.subheader("Graphical Analysis of Table")
+
+     # Default configuration for initial visualization
+     best_col1 = image.get("table_data", {}).get("best_col1")
+     best_col2 = image.get("table_data", {}).get("best_col2")
+     default_chart_config = {
+         "mark": "bar",
+         "encoding": {
+             "x": {"field": best_col1, "type": "nominal"},
+             "y": {"field": best_col2, "type": "quantitative"}
+         }
+     }
+
+     # Convert default_chart_config to a JSON string for the Pygwalker spec parameter
+     pyg_app = StreamlitRenderer(df, spec=json.dumps(default_chart_config))
+     pyg_app.explorer()
+
table_analysis_for_pdf.py ADDED
@@ -0,0 +1,228 @@
+ import pandas as pd
+ from io import BytesIO
+ import requests
+ import streamlit as st
+ from pymongo import MongoClient
+ import os
+ from dotenv import load_dotenv
+ import json
+ from pygwalker.api.streamlit import StreamlitRenderer
+
+ # Load environment variables
+ load_dotenv()
+ MONGO_URI = os.getenv("MONGO_URI")
+ DB_NAME = os.getenv("DB_NAME")
+ COLLECTION_NAME = os.getenv("COLLECTION_NAME")
+
+ mongo_client = MongoClient(MONGO_URI)
+ db = mongo_client[DB_NAME]
+ collection = db[COLLECTION_NAME]
+
+
+ # Load CSV from S3 URL
+ def load_csv_from_url(object_url):
+     response = requests.get(object_url)
+     response.raise_for_status()
+     return pd.read_csv(BytesIO(response.content))
+
+
+ # Analyze column data
+ def analyze_column_data(df):
+     analysis = {}
+     for col in df.columns:
+         if pd.api.types.is_numeric_dtype(df[col]):
+             analysis[col] = {
+                 "Mean": df[col].mean(),
+                 "Median": df[col].median(),
+                 "Mode": df[col].mode()[0] if not df[col].mode().empty else None,
+                 "Unique Values": df[col].nunique(),
+                 "Null Values": df[col].isnull().sum()
+             }
+         else:
+             analysis[col] = {
+                 "Unique Values": df[col].nunique(),
+                 "Null Values": df[col].isnull().sum(),
+                 "Top Categories": df[col].value_counts().head(5).to_dict()
+             }
+     return analysis
+
+
+ # Display analysis for a selected table
+ def display_table_analysis(table):
+     # Load CSV data
+     df = load_csv_from_url(table['csv_object_url'])
+
+     # Check for "total" row
+     if df.iloc[-1].astype(str).str.contains("total", case=False).any():
+         df = df.iloc[:-1]  # Drop last row if "total" found
+
+     # Table preview
+     st.subheader("CSV Preview")
+     st.dataframe(df, height=300)
+
+     # Download Button
+     st.download_button(
+         label="Download CSV",
+         data=requests.get(table['csv_object_url']).content,
+         file_name="table_data.csv",
+         mime="text/csv"
+     )
+
+     # Table Description
+     if 'description' in table:
+         st.subheader("Table Description")
+         st.write(table['description'])
+
+     # Column Summary
+     st.subheader("Column Summary")
+     column_summary = table.get('column_summary', {})
+     column_analysis = analyze_column_data(df)
+
+     col1, col2 = st.columns(2)
+     for idx, (col_name, col_description) in enumerate(column_summary.items()):
+         with col1 if idx % 2 == 0 else col2:
+             st.markdown(f"Column Name: **{col_name}**")
+             st.write(f"Description: {col_description}")
+             analysis = column_analysis.get(col_name, {})
+             if pd.api.types.is_numeric_dtype(df[col_name]):
+                 st.write({
+                     "Mean": analysis.get("Mean"),
+                     "Median": analysis.get("Median"),
+                     "Mode": analysis.get("Mode"),
+                     "Unique Values": analysis.get("Unique Values"),
+                     "Null Values": analysis.get("Null Values")
+                 })
+             else:
+                 st.write({
+                     "Unique Values": analysis.get("Unique Values"),
+                     "Null Values": analysis.get("Null Values"),
+                     "Top Categories": analysis.get("Top Categories")
+                 })
+
+     # Graphical Analysis using Pygwalker
+     st.subheader("Graphical Analysis of Table")
+     pyg_app = StreamlitRenderer(df)
+     pyg_app.explorer()
+
+
+ # Main function to render the View Table Analysis page for PDF tables
+ def view_pdf_table_analysis_page(url):
+     if st.button("Back", key="back_button"):
+         st.session_state.page = "view_pdf"
+         st.rerun()
+
+     # Retrieve table data for the PDF
+     pdf_data = collection.find_one({"object_url": url})
+     tables = pdf_data.get("table_data", [])
+
+     # Display the total number of tables
+     st.title("PDF Table Analysis")
+     st.write(f"Total tables found: {len(tables)}")
+
+     if "selected_table" not in st.session_state or st.session_state.selected_table is None or st.session_state.selected_table >= len(tables):
+         st.session_state.selected_table = 0
+
+     selected_table_idx = st.radio(
+         "Select a table to analyze",
+         options=range(len(tables)),
+         format_func=lambda x: f"Analyze Table {x + 1}",
+         index=st.session_state.selected_table  # Safely use the default if uninitialized
+     )
+
+     st.session_state.selected_table = selected_table_idx
+
+     if st.session_state.selected_table is not None:
+         selected_table_data = tables[st.session_state.selected_table]
+         st.subheader(f"Analysis for Table {st.session_state.selected_table + 1}")
+
+         csv_url = selected_table_data['csv_object_url']
+         df = load_csv_from_url(csv_url)
+         if df.iloc[-1].apply(lambda x: "total" in str(x).lower()).any():
+             df = df.iloc[:-1]
+
+         st.dataframe(df)  # Interactive, scrollable table
+
+         excel_buffer = BytesIO()
+         with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer:
+             df.to_excel(writer, index=False, sheet_name="Sheet1")
+         excel_buffer.seek(0)  # Reset buffer position
+
+         # Download Button
+         st.download_button(
+             label="Download Full Excel Sheet",
+             data=excel_buffer,
+             file_name="table_data.xlsx",
+             mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+         )
+
+         st.markdown("<hr>", unsafe_allow_html=True)
+         table_description = selected_table_data.get("description", None)
+
+         if table_description:
+             # Table Description
+             st.subheader("Table Description")
+             st.write(table_description)
+
+         # Column Summary
+         st.markdown("<hr>", unsafe_allow_html=True)
+         st.subheader("Column Summary")
+         with st.container(height=400, border=False):
+             column_summary = selected_table_data.get("column_summary", None)
+
+             if column_summary:
+                 # Column-level descriptions and analysis
+                 column_analysis = analyze_column_data(df)
+
+                 col1, col2 = st.columns(2)
+                 for idx, (col_name, col_description) in enumerate(column_summary.items()):
+                     # Alternate between the two layout columns based on the index
+                     with col1 if idx % 2 == 0 else col2:
+                         st.markdown(f"Column Name : **{col_name}**")
+                         st.write(f"Column Description : {col_description}")
+
+                         # Display basic analysis
+                         analysis = column_analysis.get(col_name, {})
+                         if pd.api.types.is_numeric_dtype(df[col_name]):
+                             # Numeric column analysis
+                             st.write({
+                                 "Mean": analysis.get("Mean"),
+                                 "Median": analysis.get("Median"),
+                                 "Mode": analysis.get("Mode"),
+                                 "Unique Values": analysis.get("Unique Values"),
+                                 "Null Values": analysis.get("Null Values")
+                             })
+                         else:
+                             # Categorical column analysis
+                             st.write({
+                                 "Unique Values": analysis.get("Unique Values"),
+                                 "Null Values": analysis.get("Null Values"),
+                                 "Top Categories": analysis.get("Top Categories")
+                             })
+
+         st.markdown("<hr>", unsafe_allow_html=True)
+         st.subheader("Graphical Analysis of Table")
+
+         best_col1 = selected_table_data.get("best_col1")
+         best_col2 = selected_table_data.get("best_col2")
+         default_chart_config = {
+             "mark": "bar",
+             "encoding": {
+                 "x": {"field": best_col1, "type": "nominal"},
+                 "y": {"field": best_col2, "type": "quantitative"}
+             }
+         }
+
+         # Convert default_chart_config to a JSON string for the Pygwalker spec parameter
+         pyg_app = StreamlitRenderer(df, spec=json.dumps(default_chart_config))
+         pyg_app.explorer()
+
upload_file_to_s3.py ADDED
@@ -0,0 +1,69 @@
+ from pymongo import MongoClient
+ from datetime import datetime
+ import boto3
+ import uuid
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
+ AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
+ AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
+ MONGO_URI = os.getenv("MONGO_URI")
+ DB_NAME = os.getenv("DB_NAME")
+ COLLECTION_NAME = os.getenv("COLLECTION_NAME")
+
+ mongo_client = MongoClient(MONGO_URI)
+ db = mongo_client[DB_NAME]
+ collection = db[COLLECTION_NAME]
+
+ s3 = boto3.client(
+     's3',
+     aws_access_key_id=AWS_ACCESS_KEY_ID,
+     aws_secret_access_key=AWS_SECRET_ACCESS_KEY
+ )
+
+
+ def upload_file(file, filetype):
+     try:
+         # Generate a unique key for the file using UUID
+         uuid_str = str(uuid.uuid4())
+         file_name = file.name
+         s3_key = f'MoSPI_files/{uuid_str}-{file_name}'
+
+         # Upload the file to S3 with its MIME type
+         s3.upload_fileobj(
+             file,
+             AWS_BUCKET_NAME,
+             s3_key,
+             ExtraArgs={'ContentType': file.type}  # Set the MIME type of the uploaded file
+         )
+
+         file_size = file.size
+         upload_time = datetime.now()
+
+         # Extract date and time separately
+         upload_date = upload_time.strftime('%Y-%m-%d')
+         upload_time_only = upload_time.strftime('%H:%M:%S')
+
+         # Metadata for MongoDB
+         metadata = {
+             'name': file_name,
+             'size': file_size,
+             'type': filetype,
+             'status': 'unprocessed',
+             's3_url': f's3://{AWS_BUCKET_NAME}/{s3_key}',
+             's3_key': s3_key,
+             'object_url': f'https://{AWS_BUCKET_NAME}.s3.amazonaws.com/{s3_key}',
+             'date_uploaded': upload_date,
+             'time_uploaded': upload_time_only,
+             'accuracy': None
+         }
+
+         # Insert metadata into MongoDB
+         collection.insert_one(metadata)
+         return metadata
+
+     except Exception as e:
+         print(f"An error occurred during upload: {e}")
+         return None
+
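
upload_file expects a Streamlit UploadedFile-style object: file-like for reading, with name, type, and size attributes. A sketch of driving it outside Streamlit with a hypothetical stand-in class (S3 and MongoDB credentials must be configured for it to actually run):

from io import BytesIO
from upload_file_to_s3 import upload_file

class FakeUpload(BytesIO):
    """Hypothetical stand-in for Streamlit's UploadedFile."""
    def __init__(self, data: bytes, name: str, mime: str):
        super().__init__(data)
        self.name = name
        self.type = mime
        self.size = len(data)

meta = upload_file(FakeUpload(b"col1,col2\n1,2\n", "demo.csv", "text/csv"), "text/csv")
print(meta and meta["object_url"])
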
view_excel.py ADDED
@@ -0,0 +1,68 @@
+ import streamlit as st
+ from pymongo import MongoClient
+ import os
+ from dotenv import load_dotenv
+ from datetime import datetime
+
+ # Load environment variables
+ load_dotenv()
+ MONGO_URI = os.getenv("MONGO_URI")
+ DB_NAME = os.getenv("DB_NAME")
+ COLLECTION_NAME = os.getenv("COLLECTION_NAME")
+
+ mongo_client = MongoClient(MONGO_URI)
+ db = mongo_client[DB_NAME]
+ collection = db[COLLECTION_NAME]
+
+
+ def format_date(timestamp):
+     """Convert timestamp to a readable date format."""
+     return datetime.fromtimestamp(timestamp).strftime("%B %d, %Y")
+
+
+ def view_excel():
+     if st.button("Back"):
+         st.session_state.page = "home"
+         st.rerun()
+     st.title("Your Uploaded Excel Files")
+
+     # Fetch all processed Excel/CSV uploads from MongoDB
+     excel_files = list(collection.find({"filetype": {"$in": ["excel", "csv"]}, "status": "processed"}))
+
+     if not excel_files:
+         st.write("You have not uploaded any Excel files yet.")
+         return
+
+     # Display files in a grid (4 per row)
+     cols = st.columns(4)
+     for idx, doc in enumerate(excel_files):
+         col = cols[idx % 4]
+
+         with col:
+             # Expander for file details
+             with st.expander("View Excel Details"):
+                 st.write(f"**File Name:** {doc.get('name', 'N/A')}")
+                 # 'date_uploaded' is the field written by upload_file_to_s3.py
+                 st.write(f"**Date Uploaded:** {doc.get('date_uploaded', 'N/A')}")
+
+                 st.markdown(
+                     f"<a href='{doc['object_url']}' class='download-link' download>Download File</a>",
+                     unsafe_allow_html=True
+                 )
+
+             if st.button("View Table Analysis", key=f"image_analysis_{idx}"):
+                 st.session_state.page = "view_excel_analysis"
+                 st.session_state.excel_url = doc['object_url']
+                 st.rerun()
+
+         # Move to a new row after every 4 files
+         if (idx + 1) % 4 == 0:
+             st.write("")  # Line break to move to the next row
+
view_image.py ADDED
@@ -0,0 +1,82 @@
+ import streamlit as st
+ from pymongo import MongoClient
+ import os
+ from dotenv import load_dotenv
+ from datetime import datetime
+
+ # Load environment variables
+ load_dotenv()
+ MONGO_URI = os.getenv("MONGO_URI")
+ DB_NAME = os.getenv("DB_NAME")
+ COLLECTION_NAME = os.getenv("COLLECTION_NAME")
+
+ mongo_client = MongoClient(MONGO_URI)
+ db = mongo_client[DB_NAME]
+ collection = db[COLLECTION_NAME]
+
+
+ def format_date(timestamp):
+     """Convert timestamp to a readable date format."""
+     return datetime.fromtimestamp(timestamp).strftime("%B %d, %Y")
+
+
+ def view_images():
+     if st.button("Back"):
+         st.session_state.page = "home"
+         st.rerun()
+     st.title("Your Uploaded Images")
+
+     # Fetch all processed image uploads from MongoDB
+     images = list(collection.find({"type": {"$regex": "Image", "$options": "i"}, "status": "processed"}))
+
+     if not images:
+         st.write("You have not uploaded any images yet.")
+         return
+
+     # Display images in a grid (4 images per row)
+     cols = st.columns(4)
+     for idx, image in enumerate(images):
+         col = cols[idx % 4]
+
+         with col:
+             # Container for each image and its expander
+             st.markdown("<div class='image-wrapper'>", unsafe_allow_html=True)
+
+             # Display the image using HTML
+             st.markdown(
+                 f"""
+                 <div style='text-align: center;'>
+                     <img src='{image['object_url']}' alt='{image.get('name', 'Image')}' style='width:250px; height:250px; object-fit: cover; border-radius: 8px;' />
+                 </div>
+                 """,
+                 unsafe_allow_html=True
+             )
+
+             st.markdown("</div>", unsafe_allow_html=True)  # Close image container
+
+             # Expander for image details
+             with st.expander("View Image Details"):
+                 st.write(f"**File Name:** {image.get('name', 'N/A')}")
+                 # 'date_uploaded' is the field written by upload_file_to_s3.py
+                 st.write(f"**Date Uploaded:** {image.get('date_uploaded', 'N/A')}")
+                 st.write(f"**Table Description**: {image.get('table_data', {}).get('description', '')}")
+
+                 st.markdown(
+                     f"<a href='{image['object_url']}' class='download-link' download>Download Image</a>",
+                     unsafe_allow_html=True
+                 )
+
+             if st.button("View Table Analysis", key=f"image_analysis_{idx}"):
+                 st.session_state.page = "view_image_analysis"
+                 st.session_state.image_url = image['object_url']
+                 st.rerun()
+
+         # Move to a new row after every 4 images
+         if (idx + 1) % 4 == 0:
+             st.write("")  # Line break to move to the next row
+
view_pdf.py ADDED
@@ -0,0 +1,71 @@
+ import streamlit as st
+ from pymongo import MongoClient
+ import os
+ from dotenv import load_dotenv
+ from datetime import datetime
+
+ # Load environment variables
+ load_dotenv()
+ MONGO_URI = os.getenv("MONGO_URI")
+ DB_NAME = os.getenv("DB_NAME")
+ COLLECTION_NAME = os.getenv("COLLECTION_NAME")
+
+ mongo_client = MongoClient(MONGO_URI)
+ db = mongo_client[DB_NAME]
+ collection = db[COLLECTION_NAME]
+
+
+ def format_date(timestamp):
+     """Convert timestamp to a readable date format."""
+     return datetime.fromtimestamp(timestamp).strftime("%B %d, %Y")
+
+
+ def view_pdfs():
+     if st.button("Back"):
+         st.session_state.page = "home"
+         st.rerun()
+     st.title("Your Uploaded PDFs")
+
+     # Fetch all processed PDF uploads from MongoDB
+     pdfs = list(collection.find({"type": {"$regex": "pdf", "$options": "i"}, "status": "processed"}))
+
+     if not pdfs:
+         st.write("You have not uploaded any PDFs yet.")
+         return
+
+     # Display PDFs in a grid (4 per row)
+     cols = st.columns(4)
+     for idx, pdf in enumerate(pdfs):
+         col = cols[idx % 4]
+
+         with col:
+             # Expander for PDF details
+             filename = pdf.get('name', 'N/A')
+             with st.expander(f"{filename}"):
+                 st.write(f"**File Name:** {pdf.get('name', 'N/A')}")
+                 # 'date_uploaded' is the field written by upload_file_to_s3.py
+                 st.write(f"**Date Uploaded:** {pdf.get('date_uploaded', 'N/A')}")
+                 st.write(f"**Total tables found** : {len(pdf.get('table_data', []))}")
+                 # Download link
+                 st.markdown(
+                     f"<a href='{pdf['object_url']}' class='download-link' download>Download PDF</a>",
+                     unsafe_allow_html=True
+                 )
+
+             if st.button("View Table Analysis", key=f"table_analysis_{idx}"):
+                 st.session_state.page = "pdf_analysis"
+                 st.session_state.pdf_url = pdf['object_url']
+                 st.rerun()
+
+         # Move to a new row after every 4 PDFs
+         if (idx + 1) % 4 == 0:
+             st.write("")  # Line break to move to the next row
+