# MoSPI Smart Data Extraction and Analysis tool — Streamlit entry point.
import streamlit as st
from upload_file_to_s3 import upload_file
import base64
import httpx
from extract_table_from_image import process_image_using_llm
from process_pdf import process_pdf
from pymongo import MongoClient
from datetime import datetime
from table_analysis_for_image import view_table_analysis_page
from table_analysis_for_pdf import view_pdf_table_analysis_page
from table_analysis_for_excel import display_csv_analysis
from view_excel import view_excel
from copy import deepcopy
import uuid
import os
import csv
from view_pdf import view_pdfs
from view_image import view_images
from io import StringIO, BytesIO
from dotenv import load_dotenv
import boto3
import pandas as pd
# Streamlit page configuration must be the first st.* call in the script.
st.set_page_config(layout='wide',page_title="MoSPI", page_icon="📄")

# Pull credentials and connection settings from a local .env file.
load_dotenv()
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")

# Shared MongoDB collection holding per-file metadata and extracted table data.
mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]

# S3 client used by every upload helper in this module.
s3 = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)

# Logo shown on the home page; expected to sit next to this script.
path_to_logo='logo.png'

# Simple page-router state; "home" is the landing/upload page.
if "page" not in st.session_state:
    st.session_state.page = "home"
def upload_csv_file(file, csv_filename, content_type):
    """Upload a CSV file-like object to S3 under a collision-free key.

    Args:
        file: Binary file-like object positioned at the CSV's start.
        csv_filename: Human-readable name recorded in the metadata and
            appended to the generated S3 key.
        content_type: MIME type stored as the object's ContentType.

    Returns:
        A metadata dict (name, type, s3_url, s3_key, object_url, upload
        date and time) on success, or None if the upload raised.
    """
    try:
        # A UUID prefix guarantees distinct keys for identically named files.
        object_key = f"MoSPI_csv_files/{uuid.uuid4()}-{csv_filename}"
        s3.upload_fileobj(
            file,
            AWS_BUCKET_NAME,
            object_key,
            ExtraArgs={'ContentType': content_type}  # preserve the MIME type on S3
        )
        stamped = datetime.now()
        # Metadata describing the uploaded object, persisted by the caller.
        return {
            'name': csv_filename,
            'type': content_type,
            's3_url': f's3://{AWS_BUCKET_NAME}/{object_key}',
            's3_key': object_key,
            'object_url': f'https://{AWS_BUCKET_NAME}.s3.amazonaws.com/{object_key}',
            'date_uploaded': stamped.strftime('%Y-%m-%d'),
            'time_uploaded': stamped.strftime('%H:%M:%S'),
        }
    except Exception as exc:
        print(f"An error occurred during upload: {exc}")
        return None
def process_image(url, filename):
    """Fetch an uploaded image, extract any table via the LLM, and persist it.

    Downloads the image at ``url``, base64-encodes it, and passes it to
    ``process_image_using_llm``. If table data is found, the extraction is
    stored on the matching MongoDB document, rendered to CSV, uploaded to S3
    via ``upload_csv_file``, and the CSV URLs are saved back to MongoDB.

    Args:
        url: Public object URL of the uploaded image (also the Mongo lookup key).
        filename: Original image filename; the CSV name is derived from it.

    Returns:
        True when table data was extracted and stored, False otherwise
        (no table found, empty download, or any exception).
    """
    try:
        image_data = base64.b64encode(httpx.get(url).content).decode("utf-8")
        if image_data:
            # The 1 and 3 arguments are positional parameters of the LLM helper
            # (page number / attempt count — semantics owned by that module).
            result = process_image_using_llm(image_data, 1, 3)
            has_table_data = result.get("has_table_data")
            if has_table_data:
                table_data = result.get("table_data")
                data = {
                    "table_data": table_data,
                    "page_number": result.get("page_number"),
                    "description": result.get("description"),
                    "column_summary": result.get("column_summary"),
                    "best_col1": result.get("best_col1"),
                    "best_col2": result.get("best_col2")
                }
                collection.update_one({"object_url": url}, {"$set": {"table_data": data}})
                print("Successfully extracted data from image and inserted into MongoDB")
                # Render the extracted rows to CSV text; the first row's keys
                # define the header (all rows are assumed to share them).
                csv_buffer = StringIO()
                csv_writer = csv.DictWriter(csv_buffer, fieldnames=table_data[0].keys())
                csv_writer.writeheader()
                csv_writer.writerows(table_data)
                # Convert CSV text to bytes for uploading.
                csv_bytes = BytesIO(csv_buffer.getvalue().encode("utf-8"))
                # BUG FIX: the CSV name was a placeholder f-string with no
                # fields and the `filename` parameter was unused; derive the
                # CSV name from the original image filename instead.
                csv_filename = f"{os.path.splitext(filename)[0]}.csv"
                s3_metadata = upload_csv_file(csv_bytes, csv_filename, content_type="text/csv")
                if s3_metadata:
                    # Record where the generated CSV lives alongside the image doc.
                    collection.update_one(
                        {"object_url": url},
                        {"$set": {
                            "csv_object_url": s3_metadata.get("object_url"),
                            "csv_s3_url": s3_metadata.get("s3_url")
                        }}
                    )
                    print("CSV file uploaded to S3 and URL saved in MongoDB")
                # Extraction succeeded even if the CSV upload did not.
                return True
            else:
                print(f"No table data was found in the image {url}")
                return False
        else:
            print(f"No image data found in uploaded image")
            return False
    except Exception as e:
        print(f"Error occurred in processing image: {e}")
        return False
def convert_excel_to_csv(file, filename):
    """Convert an uploaded Excel workbook to an in-memory CSV.

    Args:
        file: Binary file-like object containing the Excel data.
        filename: Original filename; its extension selects the pandas engine
            and its stem names the resulting CSV.

    Returns:
        Tuple of (csv_buffer, csv_filename) where csv_buffer is a BytesIO
        positioned at the start of the CSV bytes.

    Raises:
        ValueError: If the extension is neither .xls nor .xlsx.
    """
    # BUG FIX: the old code lower()ed the extension for validation but used a
    # case-sensitive str.replace() to build the CSV name, so "A.XLSX" kept its
    # Excel name (and names containing ".xls" mid-string were corrupted by the
    # chained replace). splitext handles both correctly.
    base, ext = os.path.splitext(filename)
    ext = ext.lower().lstrip('.')
    if ext == 'xlsx':
        engine = 'openpyxl'
    elif ext == 'xls':
        engine = 'xlrd'
    else:
        raise ValueError("Unsupported file format for Excel. Please upload an .xls or .xlsx file.")
    # Load the Excel file into a DataFrame.
    df = pd.read_excel(file, engine=engine)
    # Serialize the DataFrame to CSV bytes in memory.
    csv_buffer = BytesIO()
    df.to_csv(csv_buffer, index=False)
    csv_buffer.seek(0)  # rewind so the caller can stream it straight to S3
    csv_filename = f"{base}.csv"
    return csv_buffer, csv_filename
# ---------------- Home page: upload widget + per-MIME-type processing ----------------
if st.session_state.page=="home":
    # Header row: logo beside the title.
    col1,col2=st.columns([1,13])
    with col1:
        st.image(path_to_logo, width=100)
    with col2:
        st.title("Smart Data Extraction and Analysis tool")
    uploaded_file = st.file_uploader(
        "Upload a file",
        type=["png", "jpg", "jpeg", "pdf", "xlsx", "xls", "csv"],
        accept_multiple_files=False,
        help="Please upload only one file of type image, PDF, Excel, or CSV."
    )
    if uploaded_file and st.button("Upload"):
        with st.spinner("Processing your file"):
            # Independent copy of the bytes: upload_file() consumes the original
            # stream, but the Excel branch needs to re-read the content.
            file_copy = BytesIO(uploaded_file.getvalue())
            file_type = uploaded_file.type
            metadata = upload_file(uploaded_file, file_type)
            if metadata:
                object_url = metadata.get("object_url")
                filename = metadata.get("name")
                # Dispatch on the browser-reported MIME type and record a
                # processed/failed status on the file's MongoDB document.
                if "image" in file_type: # Process image files
                    processed = process_image(object_url, filename)
                    if processed:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "processed"}})
                        st.success("Image processed and CSV file uploaded to S3 successfully.")
                    else:
                        collection.update_one({"object_url":object_url},{"$set":{"status":"failed"}})
                        st.error("Error occured in processing Image, please try again later")
                elif "pdf" in file_type:
                    processed=process_pdf(object_url,filename)
                    if processed:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "processed"}})
                        st.success("Successfully processed pdf")
                    else:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "failed"}})
                        st.error("Error occured in processing pdf")
                elif file_type in ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet","application/vnd.ms-excel"]:
                    # Excel is normalized to CSV before upload so downstream
                    # analysis has a single tabular format to deal with.
                    csv_buffer, csv_filename = convert_excel_to_csv(file_copy, filename)
                    s3_metadata = upload_csv_file(csv_buffer, csv_filename, content_type="text/csv")
                    if s3_metadata:
                        collection.update_one({"object_url": object_url}, {
                            "$set": {"csv_object_url": s3_metadata["object_url"], "csv_s3_url": s3_metadata["s3_url"],
                                     "filetype": "excel","status":"processed"}
                        })
                        st.success("Excel file uploaded to S3 successfully.")
                    else:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "failed"}})
                elif "csv" in file_type:
                    # CSVs need no conversion: the uploaded object itself is the CSV.
                    collection.update_one({"object_url": object_url}, {
                        "$set": {"csv_object_url": object_url,"filetype": "csv","status":"processed"}})
                    st.success("CSV file uploaded to S3 successfully.")
    st.markdown("<hr>",unsafe_allow_html=True)
    # Navigation buttons switch the session-state page and rerun the script.
    col1, col2, col3 = st.columns([1, 1, 1], gap="small")
    with col1:
        if st.button("View PDFs", key="View pdf button"):
            st.session_state.page = "view_pdf"
            st.rerun()
    with col2:
        if st.button("View Images", key="View image button"):
            st.session_state.page = "view_image"
            st.rerun()
    with col3:
        if st.button("View Excel", key="View excel button"):
            st.session_state.page = "view_excel"
            st.rerun()
# In the CSV case the original upload already serves as the CSV, so no
# separate conversion/upload step exists for it above.
# ---------------- Router: render the page selected in session state ----------------
if st.session_state.page=="view_pdf":
    view_pdfs()
elif st.session_state.page=="view_image":
    view_images()
elif st.session_state.page=="view_excel":
    view_excel()
# Analysis pages additionally require the target URL to have been stashed in
# session state by the corresponding list page before navigating here.
if st.session_state.page=="view_image_analysis" and "image_url" in st.session_state:
    image_url = st.session_state.image_url
    view_table_analysis_page(image_url)
if st.session_state.page=="pdf_analysis" and "pdf_url" in st.session_state:
    pdf_url=st.session_state.pdf_url
    view_pdf_table_analysis_page(pdf_url)
if st.session_state.page=="view_excel_analysis" and "excel_url" in st.session_state:
    excel_url=st.session_state.excel_url
    display_csv_analysis(excel_url)