akshansh36 committed on
Commit
51f7b0e
·
verified ·
1 Parent(s): 3e5a1d5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +280 -275
app.py CHANGED
@@ -1,275 +1,280 @@
1
- import streamlit as st
2
- from upload_file_to_s3 import upload_file
3
- import base64
4
- import httpx
5
- from extract_table_from_image import process_image_using_llm
6
- from process_pdf import process_pdf
7
- from pymongo import MongoClient
8
- from datetime import datetime
9
- from table_analysis_for_image import view_table_analysis_page
10
- from table_analysis_for_pdf import view_pdf_table_analysis_page
11
- from table_analysis_for_excel import display_csv_analysis
12
- from view_excel import view_excel
13
- from copy import deepcopy
14
-
15
- import uuid
16
- import os
17
- import csv
18
- from view_pdf import view_pdfs
19
- from view_image import view_images
20
- from io import StringIO, BytesIO
21
- from dotenv import load_dotenv
22
- import boto3
23
- import pandas as pd
24
- st.set_page_config(layout='wide',page_title="MoSPI", page_icon="📄")
25
- load_dotenv()
26
-
27
- AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
28
- AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
29
- AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
30
- MONGO_URI = os.getenv("MONGO_URI")
31
- DB_NAME = os.getenv("DB_NAME")
32
- COLLECTION_NAME = os.getenv("COLLECTION_NAME")
33
-
34
- mongo_client = MongoClient(MONGO_URI)
35
- db = mongo_client[DB_NAME]
36
- collection = db[COLLECTION_NAME]
37
-
38
- s3 = boto3.client(
39
- 's3',
40
- aws_access_key_id=AWS_ACCESS_KEY_ID,
41
- aws_secret_access_key=AWS_SECRET_ACCESS_KEY
42
- )
43
-
44
- if "page" not in st.session_state:
45
- st.session_state.page = "home"
46
- def upload_csv_file(file, csv_filename, content_type):
47
- try:
48
- # Generate a unique key for the file using UUID
49
- uuid_str = str(uuid.uuid4())
50
- s3_key = f'MoSPI_csv_files/{uuid_str}-{csv_filename}'
51
-
52
- # Upload the CSV to S3
53
- s3.upload_fileobj(
54
- file,
55
- AWS_BUCKET_NAME,
56
- s3_key,
57
- ExtraArgs={'ContentType': content_type} # Set the MIME type of the uploaded file
58
- )
59
-
60
- upload_time = datetime.now()
61
-
62
- # Metadata for MongoDB
63
- metadata = {
64
- 'name': csv_filename,
65
- 'type': content_type,
66
- 's3_url': f's3://{AWS_BUCKET_NAME}/{s3_key}',
67
- 's3_key': s3_key,
68
- 'object_url': f'https://{AWS_BUCKET_NAME}.s3.amazonaws.com/{s3_key}',
69
- 'date_uploaded': upload_time.strftime('%Y-%m-%d'),
70
- 'time_uploaded': upload_time.strftime('%H:%M:%S')
71
- }
72
-
73
- return metadata
74
-
75
- except Exception as e:
76
- print(f"An error occurred during upload: {e}")
77
- return None
78
-
79
- def process_image(url, filename):
80
- try:
81
-
82
- image_data = base64.b64encode(httpx.get(url).content).decode("utf-8")
83
- if image_data:
84
-
85
- result = process_image_using_llm(image_data, 1, 3)
86
- has_table_data = result.get("has_table_data")
87
- if has_table_data:
88
- table_data = result.get("table_data")
89
- page_number = result.get("page_number")
90
- description = result.get("description")
91
- column_summary = result.get("column_summary")
92
- best_col1=result.get("best_col1")
93
- best_col2=result.get("best_col2")
94
-
95
- data={
96
- "table_data":table_data,
97
- "page_number":page_number,
98
- "description":description,
99
- "column_summary":column_summary,
100
- "best_col1":best_col1,
101
- "best_col2":best_col2
102
- }
103
-
104
-
105
- collection.update_one({"object_url": url}, {"$set": {"table_data": data}})
106
- print("Successfully extracted data from image and inserted into MongoDB")
107
-
108
- # Generate CSV from table data
109
- csv_buffer = StringIO()
110
- csv_writer = csv.DictWriter(csv_buffer, fieldnames=table_data[0].keys())
111
- csv_writer.writeheader()
112
- csv_writer.writerows(table_data)
113
-
114
- # Convert CSV text to bytes for uploading
115
- csv_bytes = BytesIO(csv_buffer.getvalue().encode("utf-8"))
116
-
117
- # Upload CSV to S3
118
- csv_filename = f"{filename}.csv"
119
- s3_metadata = upload_csv_file(csv_bytes, csv_filename, content_type="text/csv")
120
-
121
- if s3_metadata:
122
- # Update MongoDB with CSV S3 URL
123
- collection.update_one(
124
- {"object_url": url},
125
- {"$set": {
126
- "csv_object_url": s3_metadata.get("object_url"),
127
- "csv_s3_url": s3_metadata.get("s3_url")
128
- }}
129
- )
130
- print("CSV file uploaded to S3 and URL saved in MongoDB")
131
-
132
- return True
133
- else:
134
- print(f"No table data was found in the image {url}")
135
- return False
136
-
137
- else:
138
- print(f"No image data found in uploaded image")
139
- return False
140
-
141
- except Exception as e:
142
- print(f"Error occurred in processing image: {e}")
143
- return False
144
-
145
-
146
- def convert_excel_to_csv(file, filename):
147
- # Determine the appropriate engine based on file extension
148
- file_extension = filename.split('.')[-1].lower()
149
- if file_extension == 'xlsx':
150
- engine = 'openpyxl'
151
- elif file_extension == 'xls':
152
- engine = 'xlrd'
153
- else:
154
- raise ValueError("Unsupported file format for Excel. Please upload an .xls or .xlsx file.")
155
-
156
- # Load the Excel file into a DataFrame
157
- df = pd.read_excel(file, engine=engine)
158
-
159
- # Convert the DataFrame to CSV format in memory
160
- csv_buffer = BytesIO()
161
- df.to_csv(csv_buffer, index=False)
162
- csv_buffer.seek(0) # Move to the start of the buffer
163
-
164
- # Generate a new filename for CSV
165
- csv_filename = filename.replace(".xlsx", ".csv").replace(".xls", ".csv")
166
- return csv_buffer, csv_filename
167
-
168
-
169
-
170
- if st.session_state.page=="home":
171
- st.title("Smart Data Extraction and Analysis")
172
-
173
- uploaded_file = st.file_uploader(
174
- "Upload a file",
175
- type=["png", "jpg", "jpeg", "pdf", "xlsx", "xls", "csv"],
176
- accept_multiple_files=False,
177
- help="Please upload only one file of type image, PDF, Excel, or CSV."
178
- )
179
-
180
- if uploaded_file and st.button("Upload"):
181
- with st.spinner("Processing your file"):
182
- file_copy = BytesIO(uploaded_file.getvalue())
183
- file_type = uploaded_file.type
184
- metadata = upload_file(uploaded_file, file_type)
185
- if metadata:
186
-
187
- object_url = metadata.get("object_url")
188
- filename = metadata.get("name")
189
-
190
- if "image" in file_type: # Process image files
191
- processed = process_image(object_url, filename)
192
- if processed:
193
- collection.update_one({"object_url": object_url}, {"$set": {"status": "processed"}})
194
- st.success("Image processed and CSV file uploaded to S3 successfully.")
195
- else:
196
- collection.update_one({"object_url":object_url},{"$set":{"status":"failed"}})
197
- st.error("Error occured in processing Image, please try again later")
198
-
199
- elif "pdf" in file_type:
200
- processed=process_pdf(object_url,filename)
201
- if processed:
202
- collection.update_one({"object_url": object_url}, {"$set": {"status": "processed"}})
203
- st.success("Successfully processed pdf")
204
- else:
205
- collection.update_one({"object_url": object_url}, {"$set": {"status": "failed"}})
206
- st.error("Error occured in processing pdf")
207
-
208
- elif file_type in ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet","application/vnd.ms-excel"]:
209
- csv_buffer, csv_filename = convert_excel_to_csv(file_copy, filename)
210
- s3_metadata = upload_csv_file(csv_buffer, csv_filename, content_type="text/csv")
211
- if s3_metadata:
212
- collection.update_one({"object_url": object_url}, {
213
- "$set": {"csv_object_url": s3_metadata["object_url"], "csv_s3_url": s3_metadata["s3_url"],
214
- "filetype": "excel","status":"processed"}
215
- })
216
- st.success("Excel file uploaded to S3 successfully.")
217
-
218
- else:
219
- collection.update_one({"object_url": object_url}, {"$set": {"status": "failed"}})
220
-
221
- elif "csv" in file_type:
222
- collection.update_one({"object_url": object_url}, {
223
- "$set": {"csv_object_url": object_url,"filetype": "csv","status":"processed"}})
224
- st.success("CSV file uploaded to S3 successfully.")
225
-
226
- st.markdown("<hr>",unsafe_allow_html=True)
227
- col1, col2, col3 = st.columns([1, 1, 1], gap="small")
228
-
229
- with col1:
230
- if st.button("View PDFs", key="View pdf button"):
231
- st.session_state.page = "view_pdf"
232
- st.rerun()
233
-
234
- with col2:
235
- if st.button("View Images", key="View image button"):
236
- st.session_state.page = "view_image"
237
- st.rerun()
238
-
239
- with col3:
240
- if st.button("View Excel", key="View excel button"):
241
- st.session_state.page = "view_excel"
242
- st.rerun()
243
-
244
- #in case of csv we are already uploading it.
245
-
246
- if st.session_state.page=="view_pdf":
247
- view_pdfs()
248
-
249
-
250
- elif st.session_state.page=="view_image":
251
- view_images()
252
-
253
- elif st.session_state.page=="view_excel":
254
- view_excel()
255
-
256
- if st.session_state.page=="view_image_analysis" and "image_url" in st.session_state:
257
- image_url = st.session_state.image_url
258
- view_table_analysis_page(image_url)
259
-
260
- if st.session_state.page=="pdf_analysis" and "pdf_url" in st.session_state:
261
- pdf_url=st.session_state.pdf_url
262
- view_pdf_table_analysis_page(pdf_url)
263
-
264
-
265
- if st.session_state.page=="view_excel_analysis" and "excel_url" in st.session_state:
266
- excel_url=st.session_state.excel_url
267
- display_csv_analysis(excel_url)
268
-
269
-
270
-
271
-
272
-
273
-
274
-
275
-
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from upload_file_to_s3 import upload_file
3
+ import base64
4
+ import httpx
5
+ from extract_table_from_image import process_image_using_llm
6
+ from process_pdf import process_pdf
7
+ from pymongo import MongoClient
8
+ from datetime import datetime
9
+ from table_analysis_for_image import view_table_analysis_page
10
+ from table_analysis_for_pdf import view_pdf_table_analysis_page
11
+ from table_analysis_for_excel import display_csv_analysis
12
+ from view_excel import view_excel
13
+ from copy import deepcopy
14
+
15
+ import uuid
16
+ import os
17
+ import csv
18
+ from view_pdf import view_pdfs
19
+ from view_image import view_images
20
+ from io import StringIO, BytesIO
21
+ from dotenv import load_dotenv
22
+ import boto3
23
+ import pandas as pd
24
# Must run before any other Streamlit command in the script.
st.set_page_config(layout='wide', page_title="MoSPI", page_icon="📄")
load_dotenv()

# Deployment settings, read from the environment (optionally populated from
# a .env file by load_dotenv() above).
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")


# Streamlit re-executes this whole script on every widget interaction.
# Building the clients through st.cache_resource makes each rerun reuse the
# same client objects instead of opening new connections every time.
@st.cache_resource
def _get_mongo_client(uri):
    """Return the MongoClient for ``uri``, created once and then reused."""
    return MongoClient(uri)


@st.cache_resource
def _get_s3_client(access_key_id, secret_access_key):
    """Return the boto3 S3 client for these credentials, created once and reused."""
    return boto3.client(
        's3',
        aws_access_key_id=access_key_id,
        aws_secret_access_key=secret_access_key,
    )


mongo_client = _get_mongo_client(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]

s3 = _get_s3_client(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)

path_to_logo = 'logo.png'

# First run of a session: start on the home page.
if "page" not in st.session_state:
    st.session_state.page = "home"
46
def upload_csv_file(file, csv_filename, content_type):
    """Upload a file object to the S3 bucket and describe the stored object.

    Args:
        file: Binary file-like object handed to ``s3.upload_fileobj``.
        csv_filename: Name to record for the file; also the tail of the S3 key.
        content_type: MIME type stored as the object's ``ContentType``.

    Returns:
        A dict with the name, type, S3 locations and upload date/time of the
        object, or ``None`` if anything in the upload raised.
    """
    try:
        # A random UUID in front of the name keeps same-named uploads apart.
        object_key = f'MoSPI_csv_files/{uuid.uuid4()}-{csv_filename}'

        extra_args = {'ContentType': content_type}
        s3.upload_fileobj(file, AWS_BUCKET_NAME, object_key, ExtraArgs=extra_args)

        uploaded_at = datetime.now()
        return {
            'name': csv_filename,
            'type': content_type,
            's3_url': f's3://{AWS_BUCKET_NAME}/{object_key}',
            's3_key': object_key,
            'object_url': f'https://{AWS_BUCKET_NAME}.s3.amazonaws.com/{object_key}',
            'date_uploaded': uploaded_at.strftime('%Y-%m-%d'),
            'time_uploaded': uploaded_at.strftime('%H:%M:%S'),
        }
    except Exception as error:
        print(f"An error occurred during upload: {error}")
        return None
78
+
79
def _table_to_csv_bytes(table_data):
    """Serialize a list of row dicts into an in-memory UTF-8 CSV buffer."""
    # Collect column names from every row, in first-seen order. Using only the
    # first row's keys makes DictWriter raise ValueError as soon as a later
    # row carries an extra key; rows missing a key get an empty cell.
    fieldnames = list(dict.fromkeys(key for row in table_data for key in row))

    text_buffer = StringIO()
    writer = csv.DictWriter(text_buffer, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(table_data)
    return BytesIO(text_buffer.getvalue().encode("utf-8"))


def process_image(url, filename):
    """Extract a table from the image at ``url`` and store the results.

    The image is downloaded, base64-encoded and passed to
    ``process_image_using_llm``. When a table is found, the extracted fields
    are saved on the MongoDB document whose ``object_url`` equals ``url``, a
    CSV rendering of the table is uploaded via ``upload_csv_file`` and its
    URLs are saved on the same document.

    Args:
        url: Object URL of the uploaded image; also the MongoDB lookup key.
        filename: Base name for the generated CSV (``"<filename>.csv"``).

    Returns:
        ``True`` when a table was extracted and stored, ``False`` when the
        image is empty, no table was found, or any step raised.
    """
    try:
        response = httpx.get(url)
        # Without this check an HTTP error body (e.g. access denied) would be
        # base64-encoded and sent to the LLM as if it were the image.
        response.raise_for_status()

        image_data = base64.b64encode(response.content).decode("utf-8")
        if not image_data:
            print(f"No image data found in uploaded image")
            return False

        # NOTE(review): the meaning of the literal arguments 1 and 3 is defined
        # by process_image_using_llm, which is outside this file — confirm there.
        result = process_image_using_llm(image_data, 1, 3)
        table_data = result.get("table_data")

        # Also bail out on an empty table: previously that case wrote an empty
        # result to MongoDB and then failed on table_data[0].
        if not result.get("has_table_data") or not table_data:
            print(f"No table data was found in the image {url}")
            return False

        data = {
            "table_data": table_data,
            "page_number": result.get("page_number"),
            "description": result.get("description"),
            "column_summary": result.get("column_summary"),
            "best_col1": result.get("best_col1"),
            "best_col2": result.get("best_col2"),
        }
        collection.update_one({"object_url": url}, {"$set": {"table_data": data}})
        print("Successfully extracted data from image and inserted into MongoDB")

        # Generate the CSV from the table data and upload it to S3.
        csv_bytes = _table_to_csv_bytes(table_data)
        csv_filename = f"{filename}.csv"
        s3_metadata = upload_csv_file(csv_bytes, csv_filename, content_type="text/csv")

        # A failed CSV upload does not fail the whole operation: the extracted
        # table is already stored, so the function still reports success.
        if s3_metadata:
            collection.update_one(
                {"object_url": url},
                {"$set": {
                    "csv_object_url": s3_metadata.get("object_url"),
                    "csv_s3_url": s3_metadata.get("s3_url"),
                }},
            )
            print("CSV file uploaded to S3 and URL saved in MongoDB")

        return True

    except Exception as e:
        print(f"Error occurred in processing image: {e}")
        return False
144
+
145
+
146
def convert_excel_to_csv(file, filename):
    """Convert an Excel workbook to CSV held in memory.

    Args:
        file: Path or binary file-like object accepted by ``pd.read_excel``.
        filename: Original file name; its final extension selects the reader
            engine and is swapped for ``.csv`` in the returned name.

    Returns:
        ``(csv_buffer, csv_filename)``: a ``BytesIO`` positioned at the start
        of the CSV data, and the file name with a ``.csv`` extension.

    Raises:
        ValueError: If the extension is neither ``.xlsx`` nor ``.xls``.
    """
    # Split on the LAST dot only, so the same case-insensitive extension drives
    # both the engine choice and the rename. The old chained .replace() was
    # case-sensitive ("Report.XLSX" kept its name) and also rewrote ".xls"
    # occurring in the middle of a name.
    stem, _, extension = filename.rpartition('.')
    engines = {'xlsx': 'openpyxl', 'xls': 'xlrd'}
    engine = engines.get(extension.lower())
    if engine is None:
        raise ValueError("Unsupported file format for Excel. Please upload an .xls or .xlsx file.")

    # Load the Excel file into a DataFrame
    df = pd.read_excel(file, engine=engine)

    # Convert the DataFrame to CSV format in memory
    csv_buffer = BytesIO()
    df.to_csv(csv_buffer, index=False)
    csv_buffer.seek(0)  # Rewind so the caller reads from the first byte

    csv_filename = f"{stem}.csv"
    return csv_buffer, csv_filename
167
+
168
+
169
+
170
# MIME types that browsers report for .xlsx / .xls workbooks.
_EXCEL_MIME_TYPES = (
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/vnd.ms-excel",
)


def _set_status(object_url, status):
    """Record the processing outcome on the document matching ``object_url``."""
    collection.update_one({"object_url": object_url}, {"$set": {"status": status}})


def _handle_upload(uploaded_file):
    """Upload one file, run the processing that matches its type, report the result."""
    # Copy the bytes before upload_file() receives the handle, so the Excel
    # conversion below reads from its own buffer even if the upload consumed
    # the original stream.
    file_copy = BytesIO(uploaded_file.getvalue())
    file_type = uploaded_file.type
    metadata = upload_file(uploaded_file, file_type)
    if not metadata:
        # Previously a failed upload ended silently with no feedback.
        st.error("Error occurred while uploading the file, please try again later")
        return

    object_url = metadata.get("object_url")
    filename = metadata.get("name")

    # Some browsers report .csv files as "application/vnd.ms-excel". Checking
    # the extension as well keeps those out of the Excel converter, which
    # rejects a .csv name with ValueError.
    is_csv = "csv" in file_type or uploaded_file.name.lower().endswith(".csv")

    if "image" in file_type:  # Process image files
        if process_image(object_url, filename):
            _set_status(object_url, "processed")
            st.success("Image processed and CSV file uploaded to S3 successfully.")
        else:
            _set_status(object_url, "failed")
            st.error("Error occured in processing Image, please try again later")

    elif "pdf" in file_type:
        if process_pdf(object_url, filename):
            _set_status(object_url, "processed")
            st.success("Successfully processed pdf")
        else:
            _set_status(object_url, "failed")
            st.error("Error occured in processing pdf")

    elif is_csv:
        # The uploaded file already is the CSV, so it doubles as csv_object_url.
        collection.update_one({"object_url": object_url}, {
            "$set": {"csv_object_url": object_url, "filetype": "csv", "status": "processed"}})
        st.success("CSV file uploaded to S3 successfully.")

    elif file_type in _EXCEL_MIME_TYPES:
        try:
            csv_buffer, csv_filename = convert_excel_to_csv(file_copy, filename)
        except Exception as e:
            # Reading a workbook can fail in many ways (bad extension, corrupt
            # file, missing reader engine); treat all of them as a failed job
            # instead of letting the traceback reach the page.
            print(f"Error occurred in converting Excel file: {e}")
            s3_metadata = None
        else:
            s3_metadata = upload_csv_file(csv_buffer, csv_filename, content_type="text/csv")

        if s3_metadata:
            collection.update_one({"object_url": object_url}, {
                "$set": {"csv_object_url": s3_metadata["object_url"], "csv_s3_url": s3_metadata["s3_url"],
                         "filetype": "excel", "status": "processed"}
            })
            st.success("Excel file uploaded to S3 successfully.")
        else:
            _set_status(object_url, "failed")
            st.error("Error occurred in processing Excel file, please try again later")

    else:
        _set_status(object_url, "failed")
        st.error(f"Unsupported file type: {file_type}")


if st.session_state.page == "home":

    # Header row: logo on the left, title on the right.
    logo_col, title_col = st.columns([1, 7])
    with logo_col:
        st.image(path_to_logo, width=100)
    with title_col:
        st.title("Smart Data Extraction and Analysis tool")

    uploaded_file = st.file_uploader(
        "Upload a file",
        type=["png", "jpg", "jpeg", "pdf", "xlsx", "xls", "csv"],
        accept_multiple_files=False,
        help="Please upload only one file of type image, PDF, Excel, or CSV."
    )

    if uploaded_file and st.button("Upload"):
        with st.spinner("Processing your file"):
            _handle_upload(uploaded_file)

    st.markdown("<hr>", unsafe_allow_html=True)

    # Navigation row: each button switches the page and reruns the script so
    # the page routing further down picks up the new value.
    pdf_col, image_col, excel_col = st.columns([1, 1, 1], gap="small")

    with pdf_col:
        if st.button("View PDFs", key="View pdf button"):
            st.session_state.page = "view_pdf"
            st.rerun()

    with image_col:
        if st.button("View Images", key="View image button"):
            st.session_state.page = "view_image"
            st.rerun()

    with excel_col:
        if st.button("View Excel", key="View excel button"):
            st.session_state.page = "view_excel"
            st.rerun()
248
+
249
+ #in case of csv we are already uploading it.
250
+
251
# Listing pages: at most one matches, and each renders without arguments.
_listing_pages = {
    "view_pdf": view_pdfs,
    "view_image": view_images,
    "view_excel": view_excel,
}
_render_listing = _listing_pages.get(st.session_state.page)
if _render_listing is not None:
    _render_listing()

# Analysis pages: each needs the URL of the selected item, which must already
# be in session state under the given key. The conditions are evaluated one
# after another, in this order, against the session state at that moment.
for _page_name, _state_key, _render_analysis in (
    ("view_image_analysis", "image_url", view_table_analysis_page),
    ("pdf_analysis", "pdf_url", view_pdf_table_analysis_page),
    ("view_excel_analysis", "excel_url", display_csv_analysis),
):
    if st.session_state.page == _page_name and _state_key in st.session_state:
        _render_analysis(st.session_state[_state_key])
273
+
274
+
275
+
276
+
277
+
278
+
279
+
280
+