import streamlit as st
from upload_file_to_s3 import upload_file
import base64
import httpx
from extract_table_from_image import process_image_using_llm
from process_pdf import process_pdf
from pymongo import MongoClient
from datetime import datetime
from table_analysis_for_image import view_table_analysis_page
from table_analysis_for_pdf import view_pdf_table_analysis_page
from table_analysis_for_excel import display_csv_analysis
from view_excel import view_excel
from copy import deepcopy
import uuid
import os
import csv
from view_pdf import view_pdfs
from view_image import view_images
from io import StringIO, BytesIO
from dotenv import load_dotenv
import boto3
import pandas as pd

st.set_page_config(layout='wide', page_title="MoSPI", page_icon="📄")

# Load AWS / MongoDB configuration from the environment (.env file).
load_dotenv()

AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")

# Single shared Mongo collection used to track every uploaded file's metadata
# and processing status.
mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]

s3 = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)

path_to_logo = 'logo.png'

# Simple page router kept in Streamlit session state; "home" is the landing page.
if "page" not in st.session_state:
    st.session_state.page = "home"


def upload_csv_file(file, csv_filename, content_type):
    """Upload a CSV file-like object to S3 and return its metadata.

    Args:
        file: Binary file-like object positioned at the start of the data.
        csv_filename: Human-readable file name stored in the metadata and
            appended to the S3 key.
        content_type: MIME type set on the S3 object (e.g. "text/csv").

    Returns:
        dict with name/type/S3 URLs/upload date+time on success, or None if
        the upload failed (the error is printed, not raised).
    """
    try:
        # UUID prefix keeps keys unique even when users upload files with
        # identical names.
        uuid_str = str(uuid.uuid4())
        s3_key = f'MoSPI_csv_files/{uuid_str}-{csv_filename}'

        s3.upload_fileobj(
            file,
            AWS_BUCKET_NAME,
            s3_key,
            ExtraArgs={'ContentType': content_type}  # Set the MIME type of the uploaded file
        )

        upload_time = datetime.now()
        metadata = {
            'name': csv_filename,
            'type': content_type,
            's3_url': f's3://{AWS_BUCKET_NAME}/{s3_key}',
            's3_key': s3_key,
            'object_url': f'https://{AWS_BUCKET_NAME}.s3.amazonaws.com/{s3_key}',
            'date_uploaded': upload_time.strftime('%Y-%m-%d'),
            'time_uploaded': upload_time.strftime('%H:%M:%S')
        }
        return metadata
    except Exception as e:
        print(f"An error occurred during upload: {e}")
        return None


def process_image(url, filename):
    """Extract tabular data from an image via LLM and persist it.

    Downloads the image at *url*, sends it (base64-encoded) to the LLM table
    extractor, stores the extracted table in MongoDB (keyed by object_url),
    converts the table to CSV and uploads that CSV to S3, saving the CSV URLs
    back onto the same Mongo document.

    Args:
        url: Public S3 object URL of the uploaded image.
        filename: Original image file name; used to derive the CSV name.

    Returns:
        True if a table was extracted and saved, False otherwise.
    """
    try:
        image_data = base64.b64encode(httpx.get(url).content).decode("utf-8")
        if not image_data:
            print(f"No image data found in uploaded image")
            return False

        # NOTE(review): the meaning of the literal args (1, 3) is defined in
        # extract_table_from_image — presumably page number / retry count;
        # confirm against that module.
        result = process_image_using_llm(image_data, 1, 3)
        if not result.get("has_table_data"):
            print(f"No table data was found in the image {url}")
            return False

        table_data = result.get("table_data")
        data = {
            "table_data": table_data,
            "page_number": result.get("page_number"),
            "description": result.get("description"),
            "column_summary": result.get("column_summary"),
            "best_col1": result.get("best_col1"),
            "best_col2": result.get("best_col2")
        }
        collection.update_one({"object_url": url}, {"$set": {"table_data": data}})
        print("Successfully extracted data from image and inserted into MongoDB")

        # Generate CSV from table data; header comes from the first row's keys.
        csv_buffer = StringIO()
        csv_writer = csv.DictWriter(csv_buffer, fieldnames=table_data[0].keys())
        csv_writer.writeheader()
        csv_writer.writerows(table_data)

        # Convert CSV text to bytes for uploading.
        csv_bytes = BytesIO(csv_buffer.getvalue().encode("utf-8"))

        # Name the CSV after the source image (previously a hard-coded
        # placeholder, which gave every extracted CSV the same name).
        csv_filename = f"{os.path.splitext(filename)[0]}.csv"
        s3_metadata = upload_csv_file(csv_bytes, csv_filename, content_type="text/csv")
        if s3_metadata:
            # Update MongoDB with CSV S3 URL
            collection.update_one(
                {"object_url": url},
                {"$set": {
                    "csv_object_url": s3_metadata.get("object_url"),
                    "csv_s3_url": s3_metadata.get("s3_url")
                }}
            )
            print("CSV file uploaded to S3 and URL saved in MongoDB")
        return True
    except Exception as e:
        print(f"Error occurred in processing image: {e}")
        return False


def convert_excel_to_csv(file, filename):
    """Convert an Excel file (.xls/.xlsx) to an in-memory CSV.

    Args:
        file: Binary file-like object containing the Excel workbook.
        filename: Original file name; its extension selects the pandas engine.

    Returns:
        (csv_buffer, csv_filename): a BytesIO positioned at 0 holding the CSV
        bytes, and the original name with its extension swapped to ".csv".

    Raises:
        ValueError: if the extension is neither .xls nor .xlsx.
    """
    # Determine the appropriate engine based on file extension.
    file_extension = filename.split('.')[-1].lower()
    if file_extension == 'xlsx':
        engine = 'openpyxl'
    elif file_extension == 'xls':
        engine = 'xlrd'
    else:
        raise ValueError("Unsupported file format for Excel. Please upload an .xls or .xlsx file.")

    df = pd.read_excel(file, engine=engine)

    # Convert the DataFrame to CSV format in memory.
    csv_buffer = BytesIO()
    df.to_csv(csv_buffer, index=False)
    csv_buffer.seek(0)  # Move to the start of the buffer

    csv_filename = filename.replace(".xlsx", ".csv").replace(".xls", ".csv")
    return csv_buffer, csv_filename


if st.session_state.page == "home":
    col1, col2 = st.columns([1, 13])
    with col1:
        st.image(path_to_logo, width=100)
    with col2:
        st.title("Smart Data Extraction and Analysis tool")

    uploaded_file = st.file_uploader(
        "Upload a file",
        type=["png", "jpg", "jpeg", "pdf", "xlsx", "xls", "csv"],
        accept_multiple_files=False,
        help="Please upload only one file of type image, PDF, Excel, or CSV."
    )

    if uploaded_file and st.button("Upload"):
        with st.spinner("Processing your file"):
            # Keep an independent copy of the bytes: upload_file() consumes the
            # original stream, but the Excel branch still needs to read it.
            file_copy = BytesIO(uploaded_file.getvalue())
            file_type = uploaded_file.type
            metadata = upload_file(uploaded_file, file_type)
            if metadata:
                object_url = metadata.get("object_url")
                filename = metadata.get("name")

                if "image" in file_type:
                    # Image: LLM table extraction + CSV upload.
                    processed = process_image(object_url, filename)
                    if processed:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "processed"}})
                        st.success("Image processed and CSV file uploaded to S3 successfully.")
                    else:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "failed"}})
                        st.error("Error occurred in processing Image, please try again later")
                elif "pdf" in file_type:
                    processed = process_pdf(object_url, filename)
                    if processed:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "processed"}})
                        st.success("Successfully processed pdf")
                    else:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "failed"}})
                        st.error("Error occurred in processing pdf")
                elif file_type in ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                                   "application/vnd.ms-excel"]:
                    # Excel: convert to CSV in memory, then upload the CSV.
                    csv_buffer, csv_filename = convert_excel_to_csv(file_copy, filename)
                    s3_metadata = upload_csv_file(csv_buffer, csv_filename, content_type="text/csv")
                    if s3_metadata:
                        collection.update_one({"object_url": object_url}, {
                            "$set": {"csv_object_url": s3_metadata["object_url"],
                                     "csv_s3_url": s3_metadata["s3_url"],
                                     "filetype": "excel", "status": "processed"}
                        })
                        st.success("Excel file uploaded to S3 successfully.")
                    else:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "failed"}})
                elif "csv" in file_type:
                    # CSV needs no conversion; the uploaded object is the CSV.
                    collection.update_one({"object_url": object_url}, {
                        "$set": {"csv_object_url": object_url, "filetype": "csv", "status": "processed"}})
                    st.success("CSV file uploaded to S3 successfully.")

    # Vertical spacer between the uploader and the navigation buttons.
    # NOTE(review): the original markdown string appears truncated in this
    # file; preserved as a bare newline.
    st.markdown("\n", unsafe_allow_html=True)

    col1, col2, col3 = st.columns([1, 1, 1], gap="small")
    with col1:
        if st.button("View PDFs", key="View pdf button"):
            st.session_state.page = "view_pdf"
            st.rerun()
    with col2:
        if st.button("View Images", key="View image button"):
            st.session_state.page = "view_image"
            st.rerun()
    with col3:
        if st.button("View Excel", key="View excel button"):
            st.session_state.page = "view_excel"
            st.rerun()
    # In case of csv we are already uploading it.

# ----- Page routing -----
if st.session_state.page == "view_pdf":
    view_pdfs()
elif st.session_state.page == "view_image":
    view_images()
elif st.session_state.page == "view_excel":
    view_excel()

# Analysis pages require the target URL to have been stashed in session state
# by the corresponding listing page.
if st.session_state.page == "view_image_analysis" and "image_url" in st.session_state:
    image_url = st.session_state.image_url
    view_table_analysis_page(image_url)

if st.session_state.page == "pdf_analysis" and "pdf_url" in st.session_state:
    pdf_url = st.session_state.pdf_url
    view_pdf_table_analysis_page(pdf_url)

if st.session_state.page == "view_excel_analysis" and "excel_url" in st.session_state:
    excel_url = st.session_state.excel_url
    display_csv_analysis(excel_url)