File size: 10,144 Bytes
51f7b0e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7f388d0
51f7b0e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
import streamlit as st
from upload_file_to_s3 import upload_file
import base64
import httpx
from extract_table_from_image import process_image_using_llm
from process_pdf import process_pdf
from pymongo import MongoClient
from datetime import datetime
from table_analysis_for_image import view_table_analysis_page
from table_analysis_for_pdf import view_pdf_table_analysis_page
from table_analysis_for_excel import display_csv_analysis
from view_excel import view_excel
from copy import deepcopy

import uuid
import os
import csv
from view_pdf import view_pdfs
from view_image import view_images
from io import StringIO, BytesIO
from dotenv import load_dotenv
import boto3
import pandas as pd
# Streamlit page config must be the first Streamlit call in the script.
st.set_page_config(layout='wide',page_title="MoSPI", page_icon="📄")
load_dotenv()

# Credentials and resource names come from the environment (.env via load_dotenv).
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")

# Shared MongoDB collection holding per-file metadata, keyed by "object_url".
mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]

# Shared S3 client used by all upload helpers below.
s3 = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)
path_to_logo='logo.png'
# Simple session-state router; "home" is the landing page.
if "page" not in st.session_state:
    st.session_state.page = "home"
def upload_csv_file(file, csv_filename, content_type):
    """Upload a CSV file-like object to S3 under a collision-free key.

    Returns a metadata dict (name, MIME type, S3 URLs/key, upload date and
    time) on success, or None if anything goes wrong during the upload.
    """
    try:
        # Prefix the name with a UUID so repeated uploads never overwrite
        # each other in the bucket.
        object_key = f'MoSPI_csv_files/{str(uuid.uuid4())}-{csv_filename}'

        # ContentType is stored with the object so downloads are served with
        # the right MIME type.
        s3.upload_fileobj(
            file,
            AWS_BUCKET_NAME,
            object_key,
            ExtraArgs={'ContentType': content_type}
        )

        now = datetime.now()

        # Metadata document mirrored into MongoDB by the callers.
        return {
            'name': csv_filename,
            'type': content_type,
            's3_url': f's3://{AWS_BUCKET_NAME}/{object_key}',
            's3_key': object_key,
            'object_url': f'https://{AWS_BUCKET_NAME}.s3.amazonaws.com/{object_key}',
            'date_uploaded': now.strftime('%Y-%m-%d'),
            'time_uploaded': now.strftime('%H:%M:%S')
        }

    except Exception as e:
        # Best-effort: callers treat a None return as "upload failed".
        print(f"An error occurred during upload: {e}")
        return None

def process_image(url, filename):
    """Extract tabular data from the image at *url* and persist the results.

    Downloads the image, asks the LLM helper to extract any table it finds,
    writes the extracted data onto the image's MongoDB document (matched by
    ``object_url``), then uploads a CSV rendition of the table to S3 and
    records the CSV URLs on the same document.

    Args:
        url: Public object URL of the uploaded image (also the Mongo lookup key).
        filename: Original upload filename; used to name the generated CSV.

    Returns:
        True if a table was extracted and stored, False otherwise (no table,
        empty download, or any processing error).
    """
    try:
        image_data = base64.b64encode(httpx.get(url).content).decode("utf-8")
        if image_data:
            # NOTE(review): the meaning of the literal arguments (1, 3) —
            # page number / attempt count? — is defined in
            # extract_table_from_image; confirm there.
            result = process_image_using_llm(image_data, 1, 3)
            has_table_data = result.get("has_table_data")
            if has_table_data:
                table_data = result.get("table_data")
                page_number = result.get("page_number")
                description = result.get("description")
                column_summary = result.get("column_summary")
                best_col1 = result.get("best_col1")
                best_col2 = result.get("best_col2")

                data = {
                    "table_data": table_data,
                    "page_number": page_number,
                    "description": description,
                    "column_summary": column_summary,
                    "best_col1": best_col1,
                    "best_col2": best_col2
                }

                collection.update_one({"object_url": url}, {"$set": {"table_data": data}})
                print("Successfully extracted data from image and inserted into MongoDB")

                # Generate CSV from table data (keys of the first row become
                # the header; all rows are assumed to share those keys).
                csv_buffer = StringIO()
                csv_writer = csv.DictWriter(csv_buffer, fieldnames=table_data[0].keys())
                csv_writer.writeheader()
                csv_writer.writerows(table_data)

                # Convert CSV text to bytes for uploading
                csv_bytes = BytesIO(csv_buffer.getvalue().encode("utf-8"))

                # FIX: derive the CSV name from the original upload's filename
                # instead of a hard-coded constant — previously every CSV was
                # uploaded under the same meaningless name and the `filename`
                # parameter was ignored.
                csv_filename = f"{os.path.splitext(filename)[0]}.csv"
                s3_metadata = upload_csv_file(csv_bytes, csv_filename, content_type="text/csv")

                if s3_metadata:
                    # Update MongoDB with CSV S3 URL
                    collection.update_one(
                        {"object_url": url},
                        {"$set": {
                            "csv_object_url": s3_metadata.get("object_url"),
                            "csv_s3_url": s3_metadata.get("s3_url")
                        }}
                    )
                    print("CSV file uploaded to S3 and URL saved in MongoDB")

                return True
            else:
                print(f"No table data was found in the image {url}")
                return False

        else:
            print(f"No image data found in uploaded image")
            return False

    except Exception as e:
        print(f"Error occurred in processing image: {e}")
        return False


def convert_excel_to_csv(file, filename):
    """Convert an uploaded Excel workbook to an in-memory CSV.

    Args:
        file: File-like object containing the workbook bytes.
        filename: Original filename; its extension selects the pandas engine.

    Returns:
        (csv_buffer, csv_filename): a BytesIO positioned at offset 0 holding
        the CSV content, and the original name with its extension replaced
        by ".csv".

    Raises:
        ValueError: if the extension is neither .xls nor .xlsx.
    """
    # Use splitext instead of split('.')/chained .replace(): it handles names
    # containing extra dots and, together with .lower(), keeps the engine
    # check and the output filename consistent for upper-case extensions
    # (e.g. "Report.XLSX" previously kept its .XLSX name).
    stem, extension = os.path.splitext(filename)
    extension = extension.lower()
    if extension == '.xlsx':
        engine = 'openpyxl'
    elif extension == '.xls':
        engine = 'xlrd'
    else:
        raise ValueError("Unsupported file format for Excel. Please upload an .xls or .xlsx file.")

    # Load the Excel file into a DataFrame (first sheet only — pandas default).
    df = pd.read_excel(file, engine=engine)

    # Convert the DataFrame to CSV format in memory
    csv_buffer = BytesIO()
    df.to_csv(csv_buffer, index=False)
    csv_buffer.seek(0)  # Move to the start of the buffer

    # Swap only the real extension for ".csv".
    csv_filename = f"{stem}.csv"
    return csv_buffer, csv_filename



# ------------- Home page: upload a file and dispatch by MIME type -------------
if st.session_state.page=="home":

    # Header: logo beside the app title.
    col1,col2=st.columns([1,13])
    with col1:
        st.image(path_to_logo, width=100)
    with col2:
        st.title("Smart Data Extraction and Analysis tool")

    uploaded_file = st.file_uploader(
        "Upload a file",
        type=["png", "jpg", "jpeg", "pdf", "xlsx", "xls", "csv"],
        accept_multiple_files=False,
        help="Please upload only one file of type image, PDF, Excel, or CSV."
    )

    if uploaded_file and st.button("Upload"):
        with st.spinner("Processing your file"):
            # Copy the raw bytes up front — presumably upload_file() consumes
            # the stream, and the Excel branch below re-reads the same content
            # from file_copy (TODO confirm in upload_file_to_s3).
            file_copy = BytesIO(uploaded_file.getvalue())
            file_type = uploaded_file.type
            metadata = upload_file(uploaded_file, file_type)
            if metadata:

                object_url = metadata.get("object_url")
                filename = metadata.get("name")

                if "image" in file_type:  # Process image files
                    # Extract table data + CSV; record processed/failed status
                    # on the file's Mongo document either way.
                    processed = process_image(object_url, filename)
                    if processed:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "processed"}})
                        st.success("Image processed and CSV file uploaded to S3 successfully.")
                    else:
                        collection.update_one({"object_url":object_url},{"$set":{"status":"failed"}})
                        st.error("Error occured in processing Image, please try again later")

                elif "pdf" in file_type:
                    # PDF pipeline lives in process_pdf; same status bookkeeping.
                    processed=process_pdf(object_url,filename)
                    if processed:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "processed"}})
                        st.success("Successfully processed pdf")
                    else:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "failed"}})
                        st.error("Error occured in processing pdf")

                elif file_type in ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet","application/vnd.ms-excel"]:
                    # Excel: convert to CSV in memory (from the unconsumed copy)
                    # and upload the CSV alongside the original workbook.
                    csv_buffer, csv_filename = convert_excel_to_csv(file_copy, filename)
                    s3_metadata = upload_csv_file(csv_buffer, csv_filename, content_type="text/csv")
                    if s3_metadata:
                        collection.update_one({"object_url": object_url}, {
                            "$set": {"csv_object_url": s3_metadata["object_url"], "csv_s3_url": s3_metadata["s3_url"],
                                     "filetype": "excel","status":"processed"}
                        })
                        st.success("Excel file uploaded to S3 successfully.")

                    else:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "failed"}})

                elif "csv" in file_type:
                    # CSV needs no conversion: the uploaded object itself is
                    # recorded as the csv_object_url.
                    collection.update_one({"object_url": object_url}, {
                        "$set": {"csv_object_url": object_url,"filetype": "csv","status":"processed"}})
                    st.success("CSV file uploaded to S3 successfully.")

    st.markdown("<hr>",unsafe_allow_html=True)
    # Navigation buttons: each sets the router state and reruns the script.
    col1, col2, col3 = st.columns([1, 1, 1], gap="small")

    with col1:
        if st.button("View PDFs", key="View pdf button"):
            st.session_state.page = "view_pdf"
            st.rerun()

    with col2:
        if st.button("View Images", key="View image button"):
            st.session_state.page = "view_image"
            st.rerun()

    with col3:
        if st.button("View Excel", key="View excel button"):
            st.session_state.page = "view_excel"
            st.rerun()

            #in case of csv we are already uploading it.

# -------- Page router: render whichever page session_state selects --------
if st.session_state.page=="view_pdf":
    view_pdfs()


elif st.session_state.page=="view_image":
    view_images()

elif st.session_state.page=="view_excel":
    view_excel()

# Analysis pages additionally require their target URL in session_state —
# presumably stored by the corresponding view_* page before navigating here
# (TODO confirm against view_pdf/view_image/view_excel).
if st.session_state.page=="view_image_analysis" and "image_url" in st.session_state:
    image_url = st.session_state.image_url
    view_table_analysis_page(image_url)

if st.session_state.page=="pdf_analysis" and "pdf_url" in st.session_state:
    pdf_url=st.session_state.pdf_url
    view_pdf_table_analysis_page(pdf_url)


if st.session_state.page=="view_excel_analysis" and "excel_url" in st.session_state:
    excel_url=st.session_state.excel_url
    display_csv_analysis(excel_url)