import base64
import csv
import os
import re
import uuid
from datetime import datetime
from io import BytesIO, StringIO

import boto3
import requests
from dotenv import load_dotenv
from pdf2image import convert_from_path
from pymongo import MongoClient

from extract_table_from_image import process_image_using_llm

load_dotenv()

AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")

mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]

s3 = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)

# Temporary working directories for the downloaded PDF and its page images
pdf_temp_dir = 'temp/pdf_files'
image_temp_dir = 'temp/page_images'
os.makedirs(pdf_temp_dir, exist_ok=True)
os.makedirs(image_temp_dir, exist_ok=True)
pdf_path = os.path.join(pdf_temp_dir, 'downloaded_file.pdf')


def cleanup_directory(directory_path):
    """Remove all files in a directory (the directory itself is kept)."""
    try:
        for filename in os.listdir(directory_path):
            file_path = os.path.join(directory_path, filename)
            if os.path.isfile(file_path):
                os.remove(file_path)
        print(f"Cleaned up files in {directory_path}")
    except Exception as e:
        print(f"Error cleaning up directory {directory_path}: {e}")


def download_and_split_pdf_to_image(url):
    """Download the PDF at `url` and render each page as a PNG in image_temp_dir."""
    try:
        response = requests.get(url, timeout=60)
        response.raise_for_status()  # Fail early on HTTP errors instead of saving an error page as a PDF
        with open(pdf_path, 'wb') as pdf_file:
            pdf_file.write(response.content)
    except Exception as e:
        print(f"Error occurred while downloading the PDF from the object URL: {e}")
        return None

    try:
        images = convert_from_path(pdf_path)
        for i, image in enumerate(images):
            image_path = os.path.join(image_temp_dir, f'page_{i + 1}.png')
            image.save(image_path, 'PNG')
            print(f'Saved image: {image_path}')
        return True
    except Exception as e:
        print(f"Error occurred while converting PDF pages to images: {e}")
        return None


def upload_csv_file(file, csv_filename, content_type):
    """Upload a file-like object to S3 and return its metadata for MongoDB."""
    try:
        # Generate a unique key for the file using UUID
        uuid_str = str(uuid.uuid4())
        s3_key = f'MoSPI_csv_files/{uuid_str}-{csv_filename}'

        # Upload the CSV to S3
        s3.upload_fileobj(
            file,
            AWS_BUCKET_NAME,
            s3_key,
            ExtraArgs={'ContentType': content_type}  # Set the MIME type of the uploaded file
        )

        upload_time = datetime.now()

        # Metadata for MongoDB
        metadata = {
            'name': csv_filename,
            'type': content_type,
            's3_url': f's3://{AWS_BUCKET_NAME}/{s3_key}',
            's3_key': s3_key,
            'object_url': f'https://{AWS_BUCKET_NAME}.s3.amazonaws.com/{s3_key}',
            'date_uploaded': upload_time.strftime('%Y-%m-%d'),
            'time_uploaded': upload_time.strftime('%H:%M:%S')
        }
        return metadata
    except Exception as e:
        print(f"An error occurred during upload: {e}")
        return None


def process_pdf(url, filename):
    """Extract table data from every page of the PDF at `url`, upload one CSV
    per table to S3, and store the combined results on the document's MongoDB record."""
    split = download_and_split_pdf_to_image(url)
    if not split:
        cleanup_directory(pdf_temp_dir)
        cleanup_directory(image_temp_dir)
        return False

    # Sort page images numerically (page_2.png before page_10.png) and
    # ignore any stray files that do not match the page_<n> pattern.
    image_files = sorted(
        (f for f in os.listdir(image_temp_dir) if re.search(r'page_(\d+)', f)),
        key=lambda x: int(re.search(r'page_(\d+)', x).group(1))
    )
    table_datas = []
    for count, image_name in enumerate(image_files, start=1):
        print(f"Processing page {count} of the PDF")
        image_path = os.path.join(image_temp_dir, image_name)
        with open(image_path, "rb") as image_file:
            image_data = base64.b64encode(image_file.read()).decode("utf-8")

        # Ask the LLM extractor whether this page contains a table, and if so,
        # return its rows along with descriptive metadata.
        result = process_image_using_llm(image_data, count, 3)
        has_table_data = result.get("has_table_data")
        if has_table_data:
            table_data = result.get("table_data") or []
            if not table_data:
                # Guard against an IndexError on table_data[0] below
                print(f"Table flagged on page {count} but no rows were returned")
                continue
            page_number = result.get("page_number")
            description = result.get("description")
            column_summary = result.get("column_summary")
            best_col1 = result.get("best_col1")
            best_col2 = result.get("best_col2")

            # Serialize the extracted rows to CSV in memory, using the first
            # row's keys as the header
            csv_buffer = StringIO()
            csv_writer = csv.DictWriter(csv_buffer, fieldnames=table_data[0].keys())
            csv_writer.writeheader()
            csv_writer.writerows(table_data)
            csv_bytes = BytesIO(csv_buffer.getvalue().encode("utf-8"))

            csv_filename = f"{filename}_pageNumber_{page_number}.csv"
            s3_metadata = upload_csv_file(csv_bytes, csv_filename, "text/csv")
            if s3_metadata:
                data = {
                    "table_data": table_data,
                    "description": description,
                    "column_summary": column_summary,
                    "page_number": page_number,
                    "csv_object_url": s3_metadata.get("object_url"),
                    "csv_s3_url": s3_metadata.get("s3_url"),
                    "best_col1": best_col1,
                    "best_col2": best_col2
                }
                table_datas.append(data)
        else:
            print(f"No table data found on page {count}")

    # Always clear the temp directories, whether or not tables were found
    cleanup_directory(pdf_temp_dir)
    cleanup_directory(image_temp_dir)

    if table_datas:
        collection.update_one({"object_url": url}, {"$set": {"table_data": table_datas}})
        return True

    print("No table data found in the entire PDF")
    return False
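

# Example invocation (a minimal sketch): process_pdf expects the object URL of
# a PDF whose document record already exists in MongoDB (the update_one above
# matches on "object_url"), plus a base name used for the per-page CSV files.
# The URL and filename below are hypothetical placeholders, not values from the
# real dataset.
if __name__ == "__main__":
    sample_url = "https://example-bucket.s3.amazonaws.com/sample_report.pdf"  # hypothetical
    if process_pdf(sample_url, "sample_report"):
        print("Table extraction completed and results stored in MongoDB")
    else:
        print("No tables were extracted from the PDF")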