# app.py — MoSPI Smart Data Extraction and Analysis tool (Streamlit entry point).
# Update app.py — commit 7f388d0 (verified), uploaded by akshansh36.
import streamlit as st
from upload_file_to_s3 import upload_file
import base64
import httpx
from extract_table_from_image import process_image_using_llm
from process_pdf import process_pdf
from pymongo import MongoClient
from datetime import datetime
from table_analysis_for_image import view_table_analysis_page
from table_analysis_for_pdf import view_pdf_table_analysis_page
from table_analysis_for_excel import display_csv_analysis
from view_excel import view_excel
from copy import deepcopy
import uuid
import os
import csv
from view_pdf import view_pdfs
from view_image import view_images
from io import StringIO, BytesIO
from dotenv import load_dotenv
import boto3
import pandas as pd
# Streamlit page config must run before any other st.* call.
st.set_page_config(layout='wide',page_title="MoSPI", page_icon="📄")
load_dotenv()  # pull AWS / MongoDB settings from a local .env file
# AWS and MongoDB configuration, all sourced from environment variables.
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")
# Shared MongoDB collection holding per-file metadata and processing status.
mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]
# Single S3 client reused for every upload in this app.
s3 = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)
path_to_logo='logo.png'
# Default landing page for the session-state based router at the bottom of the file.
if "page" not in st.session_state:
    st.session_state.page = "home"
def upload_csv_file(file, csv_filename, content_type):
    """Upload a CSV file object to S3 under a UUID-prefixed key.

    Args:
        file: Binary file-like object containing the CSV bytes.
        csv_filename: Human-readable filename appended to the S3 key.
        content_type: MIME type stored on the S3 object.

    Returns:
        A metadata dict (name, type, S3/object URLs, upload date and time),
        or None if the upload raised.
    """
    try:
        # A UUID prefix keeps repeated uploads of the same filename distinct.
        s3_key = f'MoSPI_csv_files/{uuid.uuid4()}-{csv_filename}'
        s3.upload_fileobj(
            file,
            AWS_BUCKET_NAME,
            s3_key,
            ExtraArgs={'ContentType': content_type}  # preserve the MIME type on S3
        )
        now = datetime.now()
        # Metadata record mirrored into MongoDB by callers.
        return {
            'name': csv_filename,
            'type': content_type,
            's3_url': f's3://{AWS_BUCKET_NAME}/{s3_key}',
            's3_key': s3_key,
            'object_url': f'https://{AWS_BUCKET_NAME}.s3.amazonaws.com/{s3_key}',
            'date_uploaded': now.strftime('%Y-%m-%d'),
            'time_uploaded': now.strftime('%H:%M:%S'),
        }
    except Exception as e:
        print(f"An error occurred during upload: {e}")
        return None
def process_image(url, filename):
    """Extract tabular data from an uploaded image and persist the results.

    Downloads the image at ``url``, asks the LLM helper to extract any table,
    stores the structured result in MongoDB, converts the table to CSV, and
    uploads that CSV to S3 (recording its URLs back onto the same document).

    Args:
        url: Public object URL of the uploaded image; also the MongoDB lookup key.
        filename: Original name of the uploaded file, used to derive the CSV name.

    Returns:
        True when table data was extracted and stored; False otherwise.
    """
    try:
        # Download the image and base64-encode it for the LLM call.
        image_data = base64.b64encode(httpx.get(url).content).decode("utf-8")
        if image_data:
            result = process_image_using_llm(image_data, 1, 3)
            has_table_data = result.get("has_table_data")
            if has_table_data:
                table_data = result.get("table_data")
                page_number = result.get("page_number")
                description = result.get("description")
                column_summary = result.get("column_summary")
                best_col1 = result.get("best_col1")
                best_col2 = result.get("best_col2")
                data = {
                    "table_data": table_data,
                    "page_number": page_number,
                    "description": description,
                    "column_summary": column_summary,
                    "best_col1": best_col1,
                    "best_col2": best_col2
                }
                collection.update_one({"object_url": url}, {"$set": {"table_data": data}})
                print("Successfully extracted data from image and inserted into MongoDB")
                # Generate CSV text from the extracted rows; headers come from
                # the first row's keys (table_data must be non-empty dicts).
                csv_buffer = StringIO()
                csv_writer = csv.DictWriter(csv_buffer, fieldnames=table_data[0].keys())
                csv_writer.writeheader()
                csv_writer.writerows(table_data)
                # Convert CSV text to bytes for uploading.
                csv_bytes = BytesIO(csv_buffer.getvalue().encode("utf-8"))
                # Bug fix: derive the CSV name from the uploaded file's name
                # (the filename parameter was previously unused and the name
                # was a hard-coded placeholder).
                csv_filename = f"{os.path.splitext(filename)[0]}.csv"
                s3_metadata = upload_csv_file(csv_bytes, csv_filename, content_type="text/csv")
                if s3_metadata:
                    # Record the CSV's S3 locations on the image's document.
                    collection.update_one(
                        {"object_url": url},
                        {"$set": {
                            "csv_object_url": s3_metadata.get("object_url"),
                            "csv_s3_url": s3_metadata.get("s3_url")
                        }}
                    )
                    print("CSV file uploaded to S3 and URL saved in MongoDB")
                # NOTE(review): extraction still counts as success even if the
                # CSV upload failed (upload_csv_file only logs the error).
                return True
            else:
                print(f"No table data was found in the image {url}")
                return False
        else:
            print(f"No image data found in uploaded image")
            return False
    except Exception as e:
        print(f"Error occurred in processing image: {e}")
        return False
def convert_excel_to_csv(file, filename):
    """Convert an uploaded Excel workbook to CSV held in memory.

    Args:
        file: Binary file-like object containing the Excel workbook.
        filename: Original filename; its extension selects the pandas engine.

    Returns:
        Tuple ``(csv_buffer, csv_filename)`` — a BytesIO rewound to position 0
        containing the CSV bytes, and the filename with its extension replaced
        by ``.csv``.

    Raises:
        ValueError: if the extension is not .xls or .xlsx (case-insensitive).
    """
    # Choose the reader engine from the (case-insensitive) real extension.
    root, ext = os.path.splitext(filename)
    ext = ext.lower()
    if ext == '.xlsx':
        engine = 'openpyxl'
    elif ext == '.xls':
        engine = 'xlrd'
    else:
        raise ValueError("Unsupported file format for Excel. Please upload an .xls or .xlsx file.")
    # Load the Excel file into a DataFrame and serialize it as CSV in memory.
    df = pd.read_excel(file, engine=engine)
    csv_buffer = BytesIO()
    df.to_csv(csv_buffer, index=False)
    csv_buffer.seek(0)  # rewind so the caller can stream the buffer from the start
    # Bug fix: swap only the actual extension. The previous
    # str.replace(".xlsx", ".csv") missed uppercase extensions (e.g. ".XLSX")
    # and could corrupt names containing ".xls" elsewhere in the string.
    csv_filename = f"{root}.csv"
    return csv_buffer, csv_filename
# Home page: upload widget plus per-filetype processing dispatch.
if st.session_state.page=="home":
    # Header row: logo on the left, title on the right.
    col1,col2=st.columns([1,13])
    with col1:
        st.image(path_to_logo, width=100)
    with col2:
        st.title("Smart Data Extraction and Analysis tool")
    uploaded_file = st.file_uploader(
        "Upload a file",
        type=["png", "jpg", "jpeg", "pdf", "xlsx", "xls", "csv"],
        accept_multiple_files=False,
        help="Please upload only one file of type image, PDF, Excel, or CSV."
    )
    if uploaded_file and st.button("Upload"):
        with st.spinner("Processing your file"):
            # Copy the raw bytes up front: upload_file() consumes the original
            # stream, but the Excel branch needs to re-read the content.
            file_copy = BytesIO(uploaded_file.getvalue())
            file_type = uploaded_file.type
            # Upload the original file to S3; metadata holds its URLs/name.
            metadata = upload_file(uploaded_file, file_type)
            if metadata:
                object_url = metadata.get("object_url")
                filename = metadata.get("name")
                if "image" in file_type:  # Process image files via the LLM extractor
                    processed = process_image(object_url, filename)
                    if processed:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "processed"}})
                        st.success("Image processed and CSV file uploaded to S3 successfully.")
                    else:
                        collection.update_one({"object_url":object_url},{"$set":{"status":"failed"}})
                        st.error("Error occured in processing Image, please try again later")
                elif "pdf" in file_type:
                    # PDF extraction is delegated to process_pdf (see process_pdf.py).
                    processed=process_pdf(object_url,filename)
                    if processed:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "processed"}})
                        st.success("Successfully processed pdf")
                    else:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "failed"}})
                        st.error("Error occured in processing pdf")
                elif file_type in ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet","application/vnd.ms-excel"]:
                    # Excel: convert to CSV in memory, upload the CSV, and link it on the record.
                    csv_buffer, csv_filename = convert_excel_to_csv(file_copy, filename)
                    s3_metadata = upload_csv_file(csv_buffer, csv_filename, content_type="text/csv")
                    if s3_metadata:
                        collection.update_one({"object_url": object_url}, {
                            "$set": {"csv_object_url": s3_metadata["object_url"], "csv_s3_url": s3_metadata["s3_url"],
                                     "filetype": "excel","status":"processed"}
                        })
                        st.success("Excel file uploaded to S3 successfully.")
                    else:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "failed"}})
                elif "csv" in file_type:
                    # CSV needs no conversion: the uploaded object is already the CSV.
                    collection.update_one({"object_url": object_url}, {
                        "$set": {"csv_object_url": object_url,"filetype": "csv","status":"processed"}})
                    st.success("CSV file uploaded to S3 successfully.")
    st.markdown("<hr>",unsafe_allow_html=True)
    # Navigation buttons to the listing pages (handled by the router below).
    col1, col2, col3 = st.columns([1, 1, 1], gap="small")
    with col1:
        if st.button("View PDFs", key="View pdf button"):
            st.session_state.page = "view_pdf"
            st.rerun()
    with col2:
        if st.button("View Images", key="View image button"):
            st.session_state.page = "view_image"
            st.rerun()
    with col3:
        if st.button("View Excel", key="View excel button"):
            st.session_state.page = "view_excel"
            st.rerun()
# in case of csv we are already uploading it.
# Session-state router: the listing pages are mutually exclusive, while each
# analysis page additionally requires its target URL in session state.
current_page = st.session_state.page
if current_page == "view_pdf":
    view_pdfs()
elif current_page == "view_image":
    view_images()
elif current_page == "view_excel":
    view_excel()
if current_page == "view_image_analysis" and "image_url" in st.session_state:
    view_table_analysis_page(st.session_state.image_url)
if current_page == "pdf_analysis" and "pdf_url" in st.session_state:
    view_pdf_table_analysis_page(st.session_state.pdf_url)
if current_page == "view_excel_analysis" and "excel_url" in st.session_state:
    display_csv_analysis(st.session_state.excel_url)