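"""Streamlit front end for the MoSPI Smart Data Extraction and Analysis tool.

Uploaded images, PDFs, Excel workbooks, and CSV files are stored in S3, table
data is extracted from images and PDFs, file metadata is recorded in MongoDB,
and separate pages provide analysis views for each file type.
"""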
import streamlit as st
from upload_file_to_s3 import upload_file
import base64
import httpx
from extract_table_from_image import process_image_using_llm
from process_pdf import process_pdf
from pymongo import MongoClient
from datetime import datetime
from table_analysis_for_image import view_table_analysis_page
from table_analysis_for_pdf import view_pdf_table_analysis_page
from table_analysis_for_excel import display_csv_analysis
from view_excel import view_excel
from copy import deepcopy
import uuid
import os
import csv
from view_pdf import view_pdfs
from view_image import view_images
from io import StringIO, BytesIO
from dotenv import load_dotenv
import boto3
import pandas as pd
st.set_page_config(layout="wide", page_title="MoSPI", page_icon="📄")
load_dotenv()

AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")
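# The settings above are read from a .env file. A sketch of the expected entries
# (all values below are placeholders, not real credentials):
#   AWS_ACCESS_KEY_ID=<your-access-key-id>
#   AWS_SECRET_ACCESS_KEY=<your-secret-access-key>
#   AWS_BUCKET_NAME=<your-bucket-name>
#   MONGO_URI=mongodb://localhost:27017
#   DB_NAME=<your-database-name>
#   COLLECTION_NAME=<your-collection-name>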
mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]

s3 = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)

path_to_logo = 'logo.png'

if "page" not in st.session_state:
    st.session_state.page = "home"
def upload_csv_file(file, csv_filename, content_type):
    try:
        # Generate a unique key for the file using UUID
        uuid_str = str(uuid.uuid4())
        s3_key = f'MoSPI_csv_files/{uuid_str}-{csv_filename}'

        # Upload the CSV to S3
        s3.upload_fileobj(
            file,
            AWS_BUCKET_NAME,
            s3_key,
            ExtraArgs={'ContentType': content_type}  # Set the MIME type of the uploaded file
        )

        upload_time = datetime.now()

        # Metadata for MongoDB
        metadata = {
            'name': csv_filename,
            'type': content_type,
            's3_url': f's3://{AWS_BUCKET_NAME}/{s3_key}',
            's3_key': s3_key,
            'object_url': f'https://{AWS_BUCKET_NAME}.s3.amazonaws.com/{s3_key}',
            'date_uploaded': upload_time.strftime('%Y-%m-%d'),
            'time_uploaded': upload_time.strftime('%H:%M:%S')
        }
        return metadata
    except Exception as e:
        print(f"An error occurred during upload: {e}")
        return None
def process_image(url, filename):
    try:
        image_data = base64.b64encode(httpx.get(url).content).decode("utf-8")
        if image_data:
            result = process_image_using_llm(image_data, 1, 3)
            has_table_data = result.get("has_table_data")
            if has_table_data:
                table_data = result.get("table_data")
                page_number = result.get("page_number")
                description = result.get("description")
                column_summary = result.get("column_summary")
                best_col1 = result.get("best_col1")
                best_col2 = result.get("best_col2")
                data = {
                    "table_data": table_data,
                    "page_number": page_number,
                    "description": description,
                    "column_summary": column_summary,
                    "best_col1": best_col1,
                    "best_col2": best_col2
                }
                # Attach the extracted table data to the file's MongoDB record
                collection.update_one({"object_url": url}, {"$set": {"table_data": data}})
                print("Successfully extracted data from image and inserted into MongoDB")

                # Generate CSV from table data
                csv_buffer = StringIO()
                csv_writer = csv.DictWriter(csv_buffer, fieldnames=table_data[0].keys())
                csv_writer.writeheader()
                csv_writer.writerows(table_data)

                # Convert CSV text to bytes for uploading
                csv_bytes = BytesIO(csv_buffer.getvalue().encode("utf-8"))

                # Upload CSV to S3
                csv_filename = f"{filename}.csv"
                s3_metadata = upload_csv_file(csv_bytes, csv_filename, content_type="text/csv")
                if s3_metadata:
                    # Update MongoDB with CSV S3 URL
                    collection.update_one(
                        {"object_url": url},
                        {"$set": {
                            "csv_object_url": s3_metadata.get("object_url"),
                            "csv_s3_url": s3_metadata.get("s3_url")
                        }}
                    )
                    print("CSV file uploaded to S3 and URL saved in MongoDB")
                return True
            else:
                print(f"No table data was found in the image {url}")
                return False
        else:
            print("No image data found in uploaded image")
            return False
    except Exception as e:
        print(f"Error occurred in processing image: {e}")
        return False
def convert_excel_to_csv(file, filename):
    # Determine the appropriate engine based on file extension
    file_extension = filename.split('.')[-1].lower()
    if file_extension == 'xlsx':
        engine = 'openpyxl'
    elif file_extension == 'xls':
        engine = 'xlrd'
    else:
        raise ValueError("Unsupported file format for Excel. Please upload an .xls or .xlsx file.")

    # Load the Excel file into a DataFrame
    df = pd.read_excel(file, engine=engine)

    # Convert the DataFrame to CSV format in memory
    csv_buffer = BytesIO()
    df.to_csv(csv_buffer, index=False)
    csv_buffer.seek(0)  # Move to the start of the buffer

    # Generate a new filename for CSV
    csv_filename = filename.replace(".xlsx", ".csv").replace(".xls", ".csv")
    return csv_buffer, csv_filename
if st.session_state.page == "home":
    col1, col2 = st.columns([1, 13])
    with col1:
        st.image(path_to_logo, width=100)
    with col2:
        st.title("Smart Data Extraction and Analysis Tool")

    uploaded_file = st.file_uploader(
        "Upload a file",
        type=["png", "jpg", "jpeg", "pdf", "xlsx", "xls", "csv"],
        accept_multiple_files=False,
        help="Please upload only one file of type image, PDF, Excel, or CSV."
    )

    if uploaded_file and st.button("Upload"):
        with st.spinner("Processing your file"):
            file_copy = BytesIO(uploaded_file.getvalue())
            file_type = uploaded_file.type
            metadata = upload_file(uploaded_file, file_type)
            if metadata:
                object_url = metadata.get("object_url")
                filename = metadata.get("name")
                if "image" in file_type:  # Process image files
                    processed = process_image(object_url, filename)
                    if processed:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "processed"}})
                        st.success("Image processed and CSV file uploaded to S3 successfully.")
                    else:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "failed"}})
                        st.error("An error occurred while processing the image, please try again later.")
                elif "pdf" in file_type:  # Process PDF files
                    processed = process_pdf(object_url, filename)
                    if processed:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "processed"}})
                        st.success("Successfully processed the PDF.")
                    else:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "failed"}})
                        st.error("An error occurred while processing the PDF, please try again later.")
                elif file_type in ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                                   "application/vnd.ms-excel"]:  # Process Excel files
                    csv_buffer, csv_filename = convert_excel_to_csv(file_copy, filename)
                    s3_metadata = upload_csv_file(csv_buffer, csv_filename, content_type="text/csv")
                    if s3_metadata:
                        collection.update_one({"object_url": object_url}, {
                            "$set": {"csv_object_url": s3_metadata["object_url"], "csv_s3_url": s3_metadata["s3_url"],
                                     "filetype": "excel", "status": "processed"}
                        })
                        st.success("Excel file uploaded to S3 successfully.")
                    else:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "failed"}})
                        st.error("An error occurred while uploading the converted CSV, please try again later.")
                elif "csv" in file_type:  # CSV files need no conversion; the uploaded file is used as-is
                    collection.update_one({"object_url": object_url}, {
                        "$set": {"csv_object_url": object_url, "filetype": "csv", "status": "processed"}})
                    st.success("CSV file uploaded to S3 successfully.")

    st.markdown("<hr>", unsafe_allow_html=True)
    col1, col2, col3 = st.columns([1, 1, 1], gap="small")
    with col1:
        if st.button("View PDFs", key="View pdf button"):
            st.session_state.page = "view_pdf"
            st.rerun()
    with col2:
        if st.button("View Images", key="View image button"):
            st.session_state.page = "view_image"
            st.rerun()
    with col3:
        if st.button("View Excel", key="View excel button"):
            st.session_state.page = "view_excel"
            st.rerun()
if st.session_state.page == "view_pdf":
    view_pdfs()
elif st.session_state.page == "view_image":
    view_images()
elif st.session_state.page == "view_excel":
    view_excel()

if st.session_state.page == "view_image_analysis" and "image_url" in st.session_state:
    image_url = st.session_state.image_url
    view_table_analysis_page(image_url)

if st.session_state.page == "pdf_analysis" and "pdf_url" in st.session_state:
    pdf_url = st.session_state.pdf_url
    view_pdf_table_analysis_page(pdf_url)

if st.session_state.page == "view_excel_analysis" and "excel_url" in st.session_state:
    excel_url = st.session_state.excel_url
    display_csv_analysis(excel_url)
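# To run the app locally (assuming this script is saved as app.py and the .env
# values described above are configured):
#   streamlit run app.py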