Upload 10 files
- app.py +275 -0
- extract_table_from_image.py +120 -0
- process_pdf.py +173 -0
- table_analysis_for_excel.py +81 -0
- table_analysis_for_image.py +160 -0
- table_analysis_for_pdf.py +228 -0
- upload_file_to_s3.py +69 -0
- view_excel.py +68 -0
- view_image.py +82 -0
- view_pdf.py +71 -0
app.py
ADDED
@@ -0,0 +1,275 @@
import streamlit as st
from upload_file_to_s3 import upload_file
import base64
import httpx
from extract_table_from_image import process_image_using_llm
from process_pdf import process_pdf
from pymongo import MongoClient
from datetime import datetime
from table_analysis_for_image import view_table_analysis_page
from table_analysis_for_pdf import view_pdf_table_analysis_page
from table_analysis_for_excel import display_csv_analysis
from view_excel import view_excel
from copy import deepcopy

import uuid
import os
import csv
from view_pdf import view_pdfs
from view_image import view_images
from io import StringIO, BytesIO
from dotenv import load_dotenv
import boto3
import pandas as pd

st.set_page_config(layout='wide', page_title="MoSPI", page_icon="📄")
load_dotenv()

AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")

mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]

s3 = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)

if "page" not in st.session_state:
    st.session_state.page = "home"


def upload_csv_file(file, csv_filename, content_type):
    try:
        # Generate a unique key for the file using UUID
        uuid_str = str(uuid.uuid4())
        s3_key = f'MoSPI_csv_files/{uuid_str}-{csv_filename}'

        # Upload the CSV to S3
        s3.upload_fileobj(
            file,
            AWS_BUCKET_NAME,
            s3_key,
            ExtraArgs={'ContentType': content_type}  # Set the MIME type of the uploaded file
        )

        upload_time = datetime.now()

        # Metadata for MongoDB
        metadata = {
            'name': csv_filename,
            'type': content_type,
            's3_url': f's3://{AWS_BUCKET_NAME}/{s3_key}',
            's3_key': s3_key,
            'object_url': f'https://{AWS_BUCKET_NAME}.s3.amazonaws.com/{s3_key}',
            'date_uploaded': upload_time.strftime('%Y-%m-%d'),
            'time_uploaded': upload_time.strftime('%H:%M:%S')
        }

        return metadata

    except Exception as e:
        print(f"An error occurred during upload: {e}")
        return None


def process_image(url, filename):
    try:
        image_data = base64.b64encode(httpx.get(url).content).decode("utf-8")
        if image_data:
            result = process_image_using_llm(image_data, 1, 3)
            has_table_data = result.get("has_table_data")
            if has_table_data:
                table_data = result.get("table_data")
                page_number = result.get("page_number")
                description = result.get("description")
                column_summary = result.get("column_summary")
                best_col1 = result.get("best_col1")
                best_col2 = result.get("best_col2")

                data = {
                    "table_data": table_data,
                    "page_number": page_number,
                    "description": description,
                    "column_summary": column_summary,
                    "best_col1": best_col1,
                    "best_col2": best_col2
                }

                collection.update_one({"object_url": url}, {"$set": {"table_data": data}})
                print("Successfully extracted data from image and inserted into MongoDB")

                # Generate CSV from table data
                csv_buffer = StringIO()
                csv_writer = csv.DictWriter(csv_buffer, fieldnames=table_data[0].keys())
                csv_writer.writeheader()
                csv_writer.writerows(table_data)

                # Convert CSV text to bytes for uploading
                csv_bytes = BytesIO(csv_buffer.getvalue().encode("utf-8"))

                # Upload CSV to S3
                csv_filename = f"{filename}.csv"
                s3_metadata = upload_csv_file(csv_bytes, csv_filename, content_type="text/csv")

                if s3_metadata:
                    # Update MongoDB with CSV S3 URL
                    collection.update_one(
                        {"object_url": url},
                        {"$set": {
                            "csv_object_url": s3_metadata.get("object_url"),
                            "csv_s3_url": s3_metadata.get("s3_url")
                        }}
                    )
                    print("CSV file uploaded to S3 and URL saved in MongoDB")

                return True
            else:
                print(f"No table data was found in the image {url}")
                return False

        else:
            print("No image data found in uploaded image")
            return False

    except Exception as e:
        print(f"Error occurred in processing image: {e}")
        return False


def convert_excel_to_csv(file, filename):
    # Determine the appropriate engine based on file extension
    file_extension = filename.split('.')[-1].lower()
    if file_extension == 'xlsx':
        engine = 'openpyxl'
    elif file_extension == 'xls':
        engine = 'xlrd'
    else:
        raise ValueError("Unsupported file format for Excel. Please upload an .xls or .xlsx file.")

    # Load the Excel file into a DataFrame
    df = pd.read_excel(file, engine=engine)

    # Convert the DataFrame to CSV format in memory
    csv_buffer = BytesIO()
    df.to_csv(csv_buffer, index=False)
    csv_buffer.seek(0)  # Move to the start of the buffer

    # Generate a new filename for CSV
    csv_filename = filename.replace(".xlsx", ".csv").replace(".xls", ".csv")
    return csv_buffer, csv_filename


if st.session_state.page == "home":
    st.title("Smart Data Extraction and Analysis")

    uploaded_file = st.file_uploader(
        "Upload a file",
        type=["png", "jpg", "jpeg", "pdf", "xlsx", "xls", "csv"],
        accept_multiple_files=False,
        help="Please upload only one file of type image, PDF, Excel, or CSV."
    )

    if uploaded_file and st.button("Upload"):
        with st.spinner("Processing your file"):
            file_copy = BytesIO(uploaded_file.getvalue())
            file_type = uploaded_file.type
            metadata = upload_file(uploaded_file, file_type)
            if metadata:
                object_url = metadata.get("object_url")
                filename = metadata.get("name")

                if "image" in file_type:  # Process image files
                    processed = process_image(object_url, filename)
                    if processed:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "processed"}})
                        st.success("Image processed and CSV file uploaded to S3 successfully.")
                    else:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "failed"}})
                        st.error("Error occurred in processing image, please try again later")

                elif "pdf" in file_type:
                    processed = process_pdf(object_url, filename)
                    if processed:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "processed"}})
                        st.success("Successfully processed PDF")
                    else:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "failed"}})
                        st.error("Error occurred in processing PDF")

                elif file_type in ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "application/vnd.ms-excel"]:
                    csv_buffer, csv_filename = convert_excel_to_csv(file_copy, filename)
                    s3_metadata = upload_csv_file(csv_buffer, csv_filename, content_type="text/csv")
                    if s3_metadata:
                        collection.update_one({"object_url": object_url}, {
                            "$set": {"csv_object_url": s3_metadata["object_url"], "csv_s3_url": s3_metadata["s3_url"],
                                     "filetype": "excel", "status": "processed"}
                        })
                        st.success("Excel file uploaded to S3 successfully.")
                    else:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "failed"}})

                elif "csv" in file_type:
                    collection.update_one({"object_url": object_url}, {
                        "$set": {"csv_object_url": object_url, "filetype": "csv", "status": "processed"}})
                    st.success("CSV file uploaded to S3 successfully.")

    st.markdown("<hr>", unsafe_allow_html=True)
    col1, col2, col3 = st.columns([1, 1, 1], gap="small")

    with col1:
        if st.button("View PDFs", key="View pdf button"):
            st.session_state.page = "view_pdf"
            st.rerun()

    with col2:
        if st.button("View Images", key="View image button"):
            st.session_state.page = "view_image"
            st.rerun()

    with col3:
        if st.button("View Excel", key="View excel button"):
            st.session_state.page = "view_excel"
            st.rerun()

    # In case of CSV we are already uploading it.

if st.session_state.page == "view_pdf":
    view_pdfs()

elif st.session_state.page == "view_image":
    view_images()

elif st.session_state.page == "view_excel":
    view_excel()

if st.session_state.page == "view_image_analysis" and "image_url" in st.session_state:
    image_url = st.session_state.image_url
    view_table_analysis_page(image_url)

if st.session_state.page == "pdf_analysis" and "pdf_url" in st.session_state:
    pdf_url = st.session_state.pdf_url
    view_pdf_table_analysis_page(pdf_url)

if st.session_state.page == "view_excel_analysis" and "excel_url" in st.session_state:
    excel_url = st.session_state.excel_url
    display_csv_analysis(excel_url)
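
Note: the modules in this commit read their configuration through python-dotenv. A minimal .env sketch follows; the variable names are taken from the os.getenv calls in these files, while the values are placeholders, not real credentials. FLASH_API, PINECONE_API, and PINECONE_INDEX are read in extract_table_from_image.py rather than app.py.

AWS_ACCESS_KEY_ID=your-access-key-id
AWS_SECRET_ACCESS_KEY=your-secret-access-key
AWS_BUCKET_NAME=your-bucket-name
MONGO_URI=mongodb://localhost:27017
DB_NAME=your-db-name
COLLECTION_NAME=your-collection-name
FLASH_API=your-gemini-api-key
PINECONE_API=your-pinecone-api-key
PINECONE_INDEX=your-pinecone-index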
extract_table_from_image.py
ADDED
@@ -0,0 +1,120 @@
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage
import os
import re
import json

from dotenv import load_dotenv


load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")
FLASH_API = os.getenv("FLASH_API")
PINECONE_API = os.getenv("PINECONE_API")
PINECONE_INDEX = os.getenv("PINECONE_INDEX")

model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-002", temperature=0.2, max_tokens=None, google_api_key=FLASH_API)

system_prompt_text = f"""Please extract the table from the image and return the table data in JSON format, with each row represented as an object containing column headers as keys. Ensure that each cell's content corresponds accurately to its column header. If a cell is empty, keep None as its value.
Go through the data and give a summary of the table, describing what the data is about, in the description field.
Go through each column and give a column summary telling what each column header means.
Analyze the data to suggest two columns which can be used to plot the best graph for this table.
If a table contains both Hindi and English translations for a header or cell, only give the English translation.
Remember to give the response in correct JSON format.

Expected output format : {{
  "table_data": [
    {{
      "column_1": "Value 1-1",
      "column_2": "Value 1-2",
      "column_3": "Value 1-3"
    }},
    {{
      "column_1": "Value 2-1",
      "column_2": "Value 2-2",
      "column_3": "Value 2-3"
    }}
    // Additional rows as needed
  ],
  "description": "Table Description",
  "column_summary": {{
    "column_1": "column description",
    "column_2": "column description",
    "column_3": "column description"
  }},
  "best_column1": "Column 1 name",
  "best_column2": "Column 2 name"
}}
"""


def process_image_using_llm(image, page_number, max_retries=3):
    for attempt in range(1, max_retries + 1):
        try:
            # Send the image and system prompt to the LLM
            message = HumanMessage(
                content=[
                    {"type": "text", "text": system_prompt_text},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image}"}},
                ],
            )
            response = model.invoke([message])

            # Clean up the response content
            response_content = response.content.strip("```").replace("json", "").replace("\\n", "").strip()
            print(response_content)
            response_content = response_content.strip("```")

            try:
                # Attempt direct JSON parsing
                data = json.loads(response_content)

                # Extract table data and additional notes
                table_data = data.get("table_data", [])
                description = data.get("description", "").strip() if data.get("description") else ""
                column_summary = data.get("column_summary", {})
                best_col1 = data.get("best_column1", "").strip() if data.get("best_column1") else ""
                best_col2 = data.get("best_column2", "").strip() if data.get("best_column2") else ""

                # Verify that we have valid table data
                has_table_data = bool(table_data)

                return {
                    "page_number": page_number,
                    "table_data": table_data if has_table_data else None,
                    "description": description if description else None,
                    "column_summary": column_summary if column_summary else None,
                    "best_col1": best_col1 if best_col1 else None,
                    "best_col2": best_col2 if best_col2 else None,
                    "has_table_data": has_table_data
                }
            except json.JSONDecodeError as e:
                print(f"JSON decode error on attempt {attempt} for page {page_number}: {e}")
                if attempt == max_retries:
                    return {
                        "page_number": page_number,
                        "table_data": None,
                        "description": None,
                        "column_summary": None,
                        "best_col1": None,
                        "best_col2": None,
                        "has_table_data": False
                    }

        # Handle any other exceptions without retrying
        except Exception as e:
            print(f"Outer exception for page {page_number}: {e}")
            return {
                "page_number": page_number,
                "table_data": None,
                "description": None,
                "column_summary": None,
                "best_col1": None,
                "best_col2": None,
                "has_table_data": False
            }
process_pdf.py
ADDED
@@ -0,0 +1,173 @@
import base64
from pdf2image import convert_from_path
from extract_table_from_image import process_image_using_llm
from pymongo import MongoClient
from datetime import datetime
import uuid
import os
import re
import csv
import requests
from io import StringIO, BytesIO
from dotenv import load_dotenv
import boto3

load_dotenv()

AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")

mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]

s3 = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)

pdf_temp_dir = 'temp/pdf_files'
image_temp_dir = 'temp/page_images'
os.makedirs(pdf_temp_dir, exist_ok=True)
os.makedirs(image_temp_dir, exist_ok=True)
pdf_path = os.path.join(pdf_temp_dir, 'downloaded_file.pdf')


def cleanup_directory(directory_path):
    try:
        for filename in os.listdir(directory_path):
            file_path = os.path.join(directory_path, filename)
            if os.path.isfile(file_path):
                os.remove(file_path)
        print(f"Cleaned up files in {directory_path}")
    except Exception as e:
        print(f"Error cleaning up directory {directory_path}: {e}")


def download_and_split_pdf_to_image(url):
    try:
        response = requests.get(url)
        with open(pdf_path, 'wb') as pdf_file:
            pdf_file.write(response.content)

    except Exception as e:
        print(f"Error occurred while downloading PDF from object URL: {e}")
        return None

    try:
        images = convert_from_path(pdf_path)
        for i, image in enumerate(images):
            image_path = os.path.join(image_temp_dir, f'page_{i + 1}.png')
            image.save(image_path, 'PNG')
            print(f'Saved image: {image_path}')
        return True

    except Exception as e:
        print(f"Error occurred while converting PDF pages to images: {e}")
        return None


def upload_csv_file(file, csv_filename, content_type):
    try:
        # Generate a unique key for the file using UUID
        uuid_str = str(uuid.uuid4())
        s3_key = f'MoSPI_csv_files/{uuid_str}-{csv_filename}'

        # Upload the CSV to S3
        s3.upload_fileobj(
            file,
            AWS_BUCKET_NAME,
            s3_key,
            ExtraArgs={'ContentType': content_type}  # Set the MIME type of the uploaded file
        )

        upload_time = datetime.now()

        # Metadata for MongoDB
        metadata = {
            'name': csv_filename,
            'type': content_type,
            's3_url': f's3://{AWS_BUCKET_NAME}/{s3_key}',
            's3_key': s3_key,
            'object_url': f'https://{AWS_BUCKET_NAME}.s3.amazonaws.com/{s3_key}',
            'date_uploaded': upload_time.strftime('%Y-%m-%d'),
            'time_uploaded': upload_time.strftime('%H:%M:%S')
        }

        return metadata

    except Exception as e:
        print(f"An error occurred during upload: {e}")
        return None


def process_pdf(url, filename):
    split = download_and_split_pdf_to_image(url)
    if split:
        image_files = sorted(
            os.listdir(image_temp_dir),
            key=lambda x: int(re.search(r'page_(\d+)', x).group(1))
        )

        table_datas = []
        for count, image_name in enumerate(image_files, start=1):
            print(f"Processing page {count} of the PDF")
            image_path = os.path.join(image_temp_dir, image_name)
            with open(image_path, "rb") as image_file:
                image_data = base64.b64encode(image_file.read()).decode("utf-8")
            result = process_image_using_llm(image_data, count, 3)

            has_table_data = result.get("has_table_data")
            if has_table_data:
                table_data = result.get("table_data")
                page_number = result.get("page_number")
                description = result.get("description")
                column_summary = result.get("column_summary")
                best_col1 = result.get("best_col1")
                best_col2 = result.get("best_col2")

                csv_buffer = StringIO()
                csv_writer = csv.DictWriter(csv_buffer, fieldnames=table_data[0].keys())
                csv_writer.writeheader()
                csv_writer.writerows(table_data)

                csv_bytes = BytesIO(csv_buffer.getvalue().encode("utf-8"))
                csv_filename = f"{filename}_pageNumber_{str(page_number)}.csv"
                s3_metadata = upload_csv_file(csv_bytes, csv_filename, "text/csv")

                if s3_metadata:
                    object_url = s3_metadata.get("object_url")
                    s3_url = s3_metadata.get("s3_url")
                    data = {
                        "table_data": table_data,
                        "description": description,
                        "column_summary": column_summary,
                        "page_number": page_number,
                        "csv_object_url": object_url,
                        "csv_s3_url": s3_url,
                        "best_col1": best_col1,
                        "best_col2": best_col2
                    }

                    table_datas.append(data)

            else:
                print(f"No table data found on page {count}")

        if table_datas:
            collection.update_one({"object_url": url}, {"$set": {"table_data": table_datas}})

            cleanup_directory(pdf_temp_dir)
            cleanup_directory(image_temp_dir)
            return True

        else:
            print("Found no table data in the whole PDF")
            cleanup_directory(pdf_temp_dir)
            cleanup_directory(image_temp_dir)
            return False
table_analysis_for_excel.py
ADDED
@@ -0,0 +1,81 @@
import pandas as pd
from pygwalker.api.streamlit import StreamlitRenderer
from io import BytesIO
import requests
import streamlit as st
from pymongo import MongoClient
import os
from dotenv import load_dotenv
import json

# Load environment variables
load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")

mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]


def load_csv_from_url(csv_url):
    response = requests.get(csv_url)
    response.raise_for_status()  # Ensure the request was successful
    return pd.read_csv(BytesIO(response.content))


# Column Analysis Function
def analyze_column_data(df):
    analysis = {}
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            analysis[col] = {
                "Mean": df[col].mean(),
                "Median": df[col].median(),
                "Mode": df[col].mode()[0] if not df[col].mode().empty else None,
                "Unique Values": df[col].nunique(),
                "Null Values": df[col].isnull().sum()
            }
        else:
            analysis[col] = {
                "Unique Values": df[col].nunique(),
                "Null Values": df[col].isnull().sum(),
                "Top Categories": df[col].value_counts().head(5).to_dict()
            }
    return analysis


# Streamlit Interface
def display_csv_analysis(object_url):

    if st.button("Back", key="back_button"):
        st.session_state.page = "view_excel"
        st.rerun()

    csv_url = collection.find_one({"object_url": object_url}).get("csv_object_url")
    st.title("CSV File Analysis")

    # Load and display CSV data
    df = load_csv_from_url(csv_url)
    st.subheader("CSV Preview")
    st.dataframe(df)

    # Perform and display analysis
    st.subheader("Column Analysis")
    column_analysis = analyze_column_data(df)

    col1, col2 = st.columns(2)
    for idx, (col_name, col_data) in enumerate(column_analysis.items()):
        with col1 if idx % 2 == 0 else col2:
            st.markdown(f"**{col_name}**")
            st.write(col_data)

    st.markdown("<hr>", unsafe_allow_html=True)
    st.subheader("Graphical Analysis of Table")
    pyg_app = StreamlitRenderer(df)
    pyg_app.explorer()
table_analysis_for_image.py
ADDED
@@ -0,0 +1,160 @@
import pandas as pd
from pygwalker.api.streamlit import StreamlitRenderer
from io import BytesIO
import requests
import streamlit as st
from pymongo import MongoClient
import os
from dotenv import load_dotenv
import json

# Load environment variables
load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")

mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]


# Load the CSV from a URL (replace with actual CSV download from S3)
def load_csv_from_url(object_url):
    response = requests.get(object_url)
    response.raise_for_status()  # Ensure the request was successful
    csv_data = pd.read_csv(BytesIO(response.content))
    return csv_data


# Analyzing each column based on data type
def analyze_column_data(df):
    analysis = {}
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            analysis[col] = {
                "Mean": df[col].mean(),
                "Median": df[col].median(),
                "Mode": df[col].mode()[0] if not df[col].mode().empty else None,
                "Unique Values": df[col].nunique(),
                "Null Values": df[col].isnull().sum()
            }
        else:
            analysis[col] = {
                "Unique Values": df[col].nunique(),
                "Null Values": df[col].isnull().sum(),
                "Top Categories": df[col].value_counts().head(5).to_dict()
            }
    return analysis


# Main function to render the View Table Analysis page
def view_table_analysis_page(url):

    if st.button("Back", key="back_button"):
        st.session_state.page = "view_image"
        st.rerun()

    image = collection.find_one({"object_url": url})
    csv_url = image.get("csv_object_url")

    # Load CSV data
    df = load_csv_from_url(csv_url)
    # Check if the last row has any cell containing the word "total" (case-insensitive)
    if df.iloc[-1].apply(lambda x: "total" in str(x).lower()).any():
        df = df.iloc[:-1]  # Drop the last row if "total" is found in any cell

    # Page title
    st.title("Table Analysis")

    # CSV Preview
    st.subheader("CSV Preview")
    st.write("Below is a preview of the uploaded CSV file:")
    st.dataframe(df)  # Interactive, scrollable table

    # Build an Excel copy of the table in memory
    excel_buffer = BytesIO()
    with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name="Sheet1")
    excel_buffer.seek(0)  # Reset buffer position

    # Download Button
    st.download_button(
        label="Download Full Excel Sheet",
        data=excel_buffer,
        file_name="table_data.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )

    st.markdown("<hr>", unsafe_allow_html=True)
    table_description = image.get("table_data").get("description", None)

    if table_description:
        # Table Description
        st.subheader("Table Description")
        st.write(table_description)

    # Column Summary
    st.markdown("<hr>", unsafe_allow_html=True)
    st.subheader("Column Summary")
    with st.container(height=400, border=False):

        column_summary = image.get("table_data").get("column_summary", None)

        if column_summary:
            # Column-level descriptions and analysis
            column_analysis = analyze_column_data(df)

            col1, col2 = st.columns(2)
            for idx, (col_name, col_description) in enumerate(column_summary.items()):
                # Determine which column to use based on the index
                with col1 if idx % 2 == 0 else col2:
                    st.markdown(f"Column Name : **{col_name}**")
                    st.write(f"Column Description : {col_description}")

                    # Display basic analysis
                    analysis = column_analysis.get(col_name, {})
                    if pd.api.types.is_numeric_dtype(df[col_name]):
                        # Numeric column analysis
                        st.write({
                            "Mean": analysis.get("Mean"),
                            "Median": analysis.get("Median"),
                            "Mode": analysis.get("Mode"),
                            "Unique Values": analysis.get("Unique Values"),
                            "Null Values": analysis.get("Null Values")
                        })
                    else:
                        # Categorical column analysis
                        st.write({
                            "Unique Values": analysis.get("Unique Values"),
                            "Null Values": analysis.get("Null Values"),
                            "Top Categories": analysis.get("Top Categories")
                        })

    st.markdown("<hr>", unsafe_allow_html=True)
    st.subheader("Graphical Analysis of Table")

    # Default configuration for initial visualization
    best_col1 = image.get("table_data").get("best_col1")
    best_col2 = image.get("table_data").get("best_col2")
    default_chart_config = {
        "mark": "bar",
        "encoding": {
            "x": {"field": best_col1, "type": "nominal"},
            "y": {"field": best_col2, "type": "quantitative"}
        }
    }

    # Convert default_chart_config to JSON string for Pygwalker spec parameter
    pyg_app = StreamlitRenderer(df, spec=json.dumps(default_chart_config))
    pyg_app.explorer()
table_analysis_for_pdf.py
ADDED
@@ -0,0 +1,228 @@
import pandas as pd
from io import BytesIO
import requests
import streamlit as st
from pymongo import MongoClient
import os
from dotenv import load_dotenv
import json
from pygwalker.api.streamlit import StreamlitRenderer

# Load environment variables
load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")

mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]


# Load CSV from S3 URL
def load_csv_from_url(object_url):
    response = requests.get(object_url)
    response.raise_for_status()
    return pd.read_csv(BytesIO(response.content))


# Analyze column data
def analyze_column_data(df):
    analysis = {}
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            analysis[col] = {
                "Mean": df[col].mean(),
                "Median": df[col].median(),
                "Mode": df[col].mode()[0] if not df[col].mode().empty else None,
                "Unique Values": df[col].nunique(),
                "Null Values": df[col].isnull().sum()
            }
        else:
            analysis[col] = {
                "Unique Values": df[col].nunique(),
                "Null Values": df[col].isnull().sum(),
                "Top Categories": df[col].value_counts().head(5).to_dict()
            }
    return analysis


# Display analysis for a selected table
def display_table_analysis(table):
    # Load CSV data
    df = load_csv_from_url(table['csv_object_url'])

    # Check for "total" row
    if df.iloc[-1].astype(str).str.contains("total", case=False).any():
        df = df.iloc[:-1]  # Drop last row if "total" found

    # Table preview
    st.subheader("CSV Preview")
    st.dataframe(df, height=300)

    # Download Button
    st.download_button(
        label="Download CSV",
        data=requests.get(table['csv_object_url']).content,
        file_name="table_data.csv",
        mime="text/csv"
    )

    # Table Description
    if 'description' in table:
        st.subheader("Table Description")
        st.write(table['description'])

    # Column Summary
    st.subheader("Column Summary")
    column_summary = table.get('column_summary', {})
    column_analysis = analyze_column_data(df)

    col1, col2 = st.columns(2)
    for idx, (col_name, col_description) in enumerate(column_summary.items()):
        with col1 if idx % 2 == 0 else col2:
            st.markdown(f"Column Name: **{col_name}**")
            st.write(f"Description: {col_description}")
            analysis = column_analysis.get(col_name, {})
            if pd.api.types.is_numeric_dtype(df[col_name]):
                st.write({
                    "Mean": analysis.get("Mean"),
                    "Median": analysis.get("Median"),
                    "Mode": analysis.get("Mode"),
                    "Unique Values": analysis.get("Unique Values"),
                    "Null Values": analysis.get("Null Values")
                })
            else:
                st.write({
                    "Unique Values": analysis.get("Unique Values"),
                    "Null Values": analysis.get("Null Values"),
                    "Top Categories": analysis.get("Top Categories")
                })

    # Graphical Analysis using Pygwalker
    st.subheader("Graphical Analysis of Table")
    pyg_app = StreamlitRenderer(df)
    pyg_app.explorer()


# Main function to render the View Table Analysis page for PDF tables
def view_pdf_table_analysis_page(url):
    if st.button("Back", key="back_button"):
        st.session_state.page = "view_pdf"
        st.rerun()

    # Retrieve table data for the PDF
    pdf_data = collection.find_one({"object_url": url})
    tables = pdf_data.get("table_data", [])

    # Display the total number of tables
    st.title("PDF Table Analysis")
    st.write(f"Total tables found: {len(tables)}")

    if "selected_table" not in st.session_state or st.session_state.selected_table is None or st.session_state.selected_table >= len(tables):
        st.session_state.selected_table = 0

    selected_table_idx = st.radio(
        "Select a table to analyze",
        options=range(len(tables)),
        format_func=lambda x: f"Analyze Table {x + 1}",
        index=st.session_state.selected_table  # Safely use the default if uninitialized
    )

    st.session_state.selected_table = selected_table_idx

    if st.session_state.selected_table is not None:
        selected_table_data = tables[st.session_state.selected_table]
        st.subheader(f"Analysis for Table {st.session_state.selected_table + 1}")

        csv_url = selected_table_data['csv_object_url']
        df = load_csv_from_url(csv_url)
        if df.iloc[-1].apply(lambda x: "total" in str(x).lower()).any():
            df = df.iloc[:-1]

        st.dataframe(df)  # Interactive, scrollable table

        excel_buffer = BytesIO()
        with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer:
            df.to_excel(writer, index=False, sheet_name="Sheet1")
        excel_buffer.seek(0)  # Reset buffer position

        # Download Button
        st.download_button(
            label="Download Full Excel Sheet",
            data=excel_buffer,
            file_name="table_data.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )

        st.markdown("<hr>", unsafe_allow_html=True)
        table_description = selected_table_data.get("description", None)

        if table_description:
            # Table Description
            st.subheader("Table Description")
            st.write(table_description)

        # Column Summary
        st.markdown("<hr>", unsafe_allow_html=True)
        st.subheader("Column Summary")
        with st.container(height=400, border=False):

            column_summary = selected_table_data.get("column_summary", None)

            if column_summary:
                # Column-level descriptions and analysis
                column_analysis = analyze_column_data(df)

                col1, col2 = st.columns(2)
                for idx, (col_name, col_description) in enumerate(column_summary.items()):
                    # Determine which column to use based on the index
                    with col1 if idx % 2 == 0 else col2:
                        st.markdown(f"Column Name : **{col_name}**")
                        st.write(f"Column Description : {col_description}")

                        # Display basic analysis
                        analysis = column_analysis.get(col_name, {})
                        if pd.api.types.is_numeric_dtype(df[col_name]):
                            # Numeric column analysis
                            st.write({
                                "Mean": analysis.get("Mean"),
                                "Median": analysis.get("Median"),
                                "Mode": analysis.get("Mode"),
                                "Unique Values": analysis.get("Unique Values"),
                                "Null Values": analysis.get("Null Values")
                            })
                        else:
                            # Categorical column analysis
                            st.write({
                                "Unique Values": analysis.get("Unique Values"),
                                "Null Values": analysis.get("Null Values"),
                                "Top Categories": analysis.get("Top Categories")
                            })

        st.markdown("<hr>", unsafe_allow_html=True)
        st.subheader("Graphical Analysis of Table")

        best_col1 = selected_table_data.get("best_col1")
        best_col2 = selected_table_data.get("best_col2")
        default_chart_config = {
            "mark": "bar",
            "encoding": {
                "x": {"field": best_col1, "type": "nominal"},
                "y": {"field": best_col2, "type": "quantitative"}
            }
        }

        # Convert default_chart_config to JSON string for Pygwalker spec parameter
        pyg_app = StreamlitRenderer(df, spec=json.dumps(default_chart_config))
        pyg_app.explorer()
upload_file_to_s3.py
ADDED
@@ -0,0 +1,69 @@
from pymongo import MongoClient
from datetime import datetime
import boto3
import uuid
import os
from dotenv import load_dotenv

load_dotenv()

AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")

mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]

s3 = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)


def upload_file(file, filetype):
    try:
        # Generate a unique key for the file using UUID
        uuid_str = str(uuid.uuid4())
        file_name = file.name
        s3_key = f'MoSPI_files/{uuid_str}-{file_name}'

        # Upload the file to S3 with its ContentType
        s3.upload_fileobj(
            file,
            AWS_BUCKET_NAME,
            s3_key,
            ExtraArgs={'ContentType': file.type}  # Set the MIME type of the uploaded file
        )

        file_size = file.size
        upload_time = datetime.now()

        # Extract date and time separately
        upload_date = upload_time.strftime('%Y-%m-%d')
        upload_time_only = upload_time.strftime('%H:%M:%S')

        # Metadata for MongoDB
        metadata = {
            'name': file_name,
            'size': file_size,
            'type': filetype,
            'status': 'unprocessed',
            's3_url': f's3://{AWS_BUCKET_NAME}/{s3_key}',
            's3_key': s3_key,
            'object_url': f'https://{AWS_BUCKET_NAME}.s3.amazonaws.com/{s3_key}',
            'date_uploaded': upload_date,
            'time_uploaded': upload_time_only,
            'accuracy': None
        }

        # Insert metadata into MongoDB
        collection.insert_one(metadata)
        return metadata

    except Exception as e:
        print(f"An error occurred during upload: {e}")
        return None
view_excel.py
ADDED
@@ -0,0 +1,68 @@
import streamlit as st
from pymongo import MongoClient
import os
from dotenv import load_dotenv
from datetime import datetime

# Load environment variables
load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")

mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]


def format_date(timestamp):
    """Convert timestamp to a readable date format."""
    return datetime.fromtimestamp(timestamp).strftime("%B %d, %Y")


def view_excel():
    if st.button("Back"):
        st.session_state.page = "home"
        st.rerun()
    st.title("Your Uploaded Excel Files")

    # Fetch all processed Excel/CSV uploads from MongoDB
    files = list(collection.find({"filetype": {"$in": ["excel", "csv"]}, "status": "processed"}))

    if not files:
        st.write("You have not uploaded any Excel files yet.")
        return

    # Display entries in a grid (4 per row)
    cols = st.columns(4)
    for idx, file in enumerate(files):
        col = cols[idx % 4]

        with col:
            # Expander for file details
            with st.expander("View Excel Details"):
                st.write(f"**File Name:** {file.get('name', 'N/A')}")
                st.write(f"**Date Uploaded:** {format_date(file.get('upload_date', datetime.now().timestamp()))}")

                st.markdown(
                    f"<a href='{file['object_url']}' class='download-link' download>Download File</a>",
                    unsafe_allow_html=True
                )

                if st.button("View Table Analysis", key=f"excel_analysis_{idx}"):
                    st.session_state.page = "view_excel_analysis"
                    st.session_state.excel_url = file['object_url']
                    st.rerun()

        # Move to a new row after every 4 entries
        if (idx + 1) % 4 == 0:
            st.write("")  # Line break to move to the next row
view_image.py
ADDED
@@ -0,0 +1,82 @@
import streamlit as st
from pymongo import MongoClient
import os
from dotenv import load_dotenv
from datetime import datetime

# Load environment variables
load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")

mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]


def format_date(timestamp):
    """Convert timestamp to a readable date format."""
    return datetime.fromtimestamp(timestamp).strftime("%B %d, %Y")


def view_images():
    if st.button("Back"):
        st.session_state.page = "home"
        st.rerun()
    st.title("Your Uploaded Images")

    # Fetch all uploaded images from MongoDB
    images = list(collection.find({"type": {"$regex": "Image", "$options": "i"}, "status": "processed"}))

    if not images:
        st.write("You have not uploaded any images yet.")
        return

    # Display images in a grid (4 images per row)
    cols = st.columns(4)
    for idx, image in enumerate(images):
        col = cols[idx % 4]

        with col:
            # Container for each image and its expander
            st.markdown("<div class='image-wrapper'>", unsafe_allow_html=True)

            # Display the image using HTML
            st.markdown(
                f"""
                <div style='text-align: center;'>
                    <img src='{image['object_url']}' alt='{image.get('name', 'Image')}' style='width:250px; height:250px; object-fit: cover; border-radius: 8px;' />
                </div>
                """,
                unsafe_allow_html=True
            )

            st.markdown("</div>", unsafe_allow_html=True)  # Close image container

            # Expander for image details
            with st.expander("View Image Details"):
                st.write(f"**File Name:** {image.get('name', 'N/A')}")
                st.write(f"**Date Uploaded:** {format_date(image.get('upload_date', datetime.now().timestamp()))}")
                st.write(f"**Table Description**: {image.get('table_data').get('description', '')}")

                st.markdown(
                    f"<a href='{image['object_url']}' class='download-link' download>Download Image</a>",
                    unsafe_allow_html=True
                )

                if st.button("View Table Analysis", key=f"image_analysis_{idx}"):
                    st.session_state.page = "view_image_analysis"
                    st.session_state.image_url = image['object_url']
                    st.rerun()

        # Move to a new row after every 4 images
        if (idx + 1) % 4 == 0:
            st.write("")  # Line break to move to the next row
view_pdf.py
ADDED
@@ -0,0 +1,71 @@
import streamlit as st
from pymongo import MongoClient
import os
from dotenv import load_dotenv
from datetime import datetime

# Load environment variables
load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")

mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]


def format_date(timestamp):
    """Convert timestamp to a readable date format."""
    return datetime.fromtimestamp(timestamp).strftime("%B %d, %Y")


def view_pdfs():
    if st.button("Back"):
        st.session_state.page = "home"
        st.rerun()
    st.title("Your Uploaded PDFs")

    # Fetch all processed PDFs from MongoDB
    pdfs = list(collection.find({"type": {"$regex": "pdf", "$options": "i"}, "status": "processed"}))

    if not pdfs:
        st.write("You have not uploaded any PDFs yet.")
        return

    # Display PDFs in a grid (4 per row)
    cols = st.columns(4)
    for idx, pdf in enumerate(pdfs):
        col = cols[idx % 4]

        with col:
            # Expander for PDF details
            filename = pdf.get('name', 'N/A')
            with st.expander(f"{filename}"):
                st.write(f"**File Name:** {pdf.get('name', 'N/A')}")
                st.write(f"**Date Uploaded:** {format_date(pdf.get('upload_date', datetime.now().timestamp()))}")
                st.write(f"**Total tables found** : {len(pdf.get('table_data'))}")
                # Download link
                st.markdown(
                    f"<a href='{pdf['object_url']}' class='download-link' download>Download PDF</a>",
                    unsafe_allow_html=True
                )

                if st.button("View Table Analysis", key=f"table_analysis_{idx}"):
                    st.session_state.page = "pdf_analysis"
                    st.session_state.pdf_url = pdf['object_url']
                    st.rerun()

        # Move to a new row after every 4 PDFs
        if (idx + 1) % 4 == 0:
            st.write("")  # Line break to move to the next row