import os
from datasets.filesystems import S3FileSystem
import s3fs
import boto3
from pdf2image import convert_from_path
import csv
from PIL import Image
import gradio as gr

# AWS credentials, region and target bucket, all read from the environment.
aws_access_key_id = os.getenv('AWS_ACCESS_KEY')
aws_secret_access_key = os.getenv('AWS_SECRET_KEY')
region_name = os.getenv('AWS_REGION')
s3_bucket = os.getenv('AWS_BUCKET')

# s3fs filesystem handle (used by upload_file_to_s3), built from the values
# read above instead of querying the environment a second time.
s3 = s3fs.S3FileSystem(
    key=aws_access_key_id,
    secret=aws_secret_access_key,
    client_kwargs={'region_name': region_name},
)

# Textract client created with the same explicit credentials and region.
textract_client = boto3.client(
    'textract',
    region_name=region_name,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

def upload_file_to_s3(file_path, bucket, object_name=None):
    """Upload a local file to ``bucket`` through the module-level ``s3`` handle.

    Args:
        file_path: Path of the local file to upload.
        bucket: Name of the destination bucket.
        object_name: Key to store the file under; when ``None`` the base name
            of ``file_path`` is used.

    Returns:
        The object key that was used, or ``None`` when the upload raised
        ``FileNotFoundError`` (a message is printed in that case).
    """
    key = os.path.basename(file_path) if object_name is None else object_name
    try:
        s3.upload(file_path, f"{bucket}/{key}")
    except FileNotFoundError:
        print("The file was not found")
        return None
    return key


def process_image(file_path, s3_bucket, textract_client):
    """Upload an image to S3 and request table analysis for it.

    Args:
        file_path: Local path of the image to analyse.
        s3_bucket: Bucket the image is uploaded to and then read from.
        textract_client: Client object whose ``analyze_document`` is called.

    Returns:
        The ``analyze_document`` response, or ``None`` when the upload did
        not yield an object key.
    """
    uploaded_key = upload_file_to_s3(file_path, s3_bucket)
    if uploaded_key:
        document = {'S3Object': {'Bucket': s3_bucket, 'Name': uploaded_key}}
        return textract_client.analyze_document(
            Document=document,
            FeatureTypes=["TABLES"],
        )
    return None

def generate_table_csv(tables, blocks_map, csv_output_path):
    """Write the cells of every table to ``csv_output_path`` as CSV rows.

    The file is opened in write mode, so any existing content is replaced.
    Tables are written one after another with no separator line.

    Args:
        tables: Iterable of table blocks to export.
        blocks_map: Mapping of block id to block, used to resolve cell text.
        csv_output_path: Destination path of the CSV file.
    """
    with open(csv_output_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for table in tables:
            rows = get_rows_columns_map(table, blocks_map)
            # Emit rows by ascending row index rather than by the order in
            # which cells happened to be listed in the table's relationships.
            for row_index in sorted(rows):
                cols = rows[row_index]
                # Pad missing columns with "" so cells stay in their column.
                writer.writerow(
                    [cols.get(col_index, "") for col_index in range(1, max(cols) + 1)]
                )

def get_rows_columns_map(table_result, blocks_map):
    """Build a ``{row_index: {column_index: text}}`` map for one table block.

    Only children reached through ``CHILD`` relationships that carry both
    ``RowIndex`` and ``ColumnIndex`` are included.

    Args:
        table_result: The table block; its ``Relationships`` entry is optional.
        blocks_map: Mapping of block id to block.

    Returns:
        A dict keyed by row index whose values are dicts keyed by column
        index holding the cell text. Empty when the table has no cells.
    """
    rows = {}
    # A table block without children has no 'Relationships' key; treat it as
    # empty instead of raising KeyError (mirrors the guard in get_text).
    for relationship in table_result.get('Relationships') or []:
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            cell = blocks_map[child_id]
            if 'RowIndex' in cell and 'ColumnIndex' in cell:
                row = rows.setdefault(cell['RowIndex'], {})
                row[cell['ColumnIndex']] = get_text(cell, blocks_map)
    return rows

def get_text(result, blocks_map):
    """Return the text of a block's ``CHILD`` blocks joined by single spaces.

    ``WORD`` children contribute their ``Text``; ``SELECTION_ELEMENT``
    children contribute ``X`` when their ``SelectionStatus`` is ``SELECTED``.
    Other children are ignored.

    Args:
        result: The block whose children are read (e.g. a table cell).
        blocks_map: Mapping of block id to block.

    Returns:
        The collected text with surrounding whitespace stripped, or an empty
        string when ``result`` has no ``Relationships`` entry.
    """
    if 'Relationships' not in result:
        return ''

    pieces = []
    for relationship in result['Relationships']:
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            child = blocks_map[child_id]
            kind = child['BlockType']
            if kind == 'WORD':
                pieces.append(child['Text'])
            elif kind == 'SELECTION_ELEMENT' and child['SelectionStatus'] == 'SELECTED':
                pieces.append('X')
    return ' '.join(pieces).strip()
    
def process_file_and_generate_csv(file_path):
    """Extract the tables of an uploaded document into one CSV file.

    Image uploads (PNG, JPEG, TIFF) are opened directly; for a TIFF only the
    frame returned by ``Image.open`` is processed. Any other file is handed
    to ``convert_from_path``, which yields one image per page. Every page is
    saved as a JPEG under ``/tmp``, analysed via ``process_image``, and its
    tables are appended to the combined CSV.

    Args:
        file_path: Path of the uploaded file, an object exposing that path
            as ``.name``, or ``None`` when nothing was uploaded.

    Returns:
        A ``(csv_path, message)`` tuple. ``csv_path`` is ``None`` when no
        file was supplied.
    """
    if file_path is None:
        return None, "No file was uploaded."

    # Accept both a plain path and a temp-file wrapper that carries the path
    # in ``.name`` (which form arrives depends on the Gradio File settings).
    file_path = getattr(file_path, "name", file_path)

    csv_output_path = "/tmp/output.csv"

    if file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')):
        images = [Image.open(file_path)]
    else:
        # Convert PDF or other supported formats to images
        images = convert_from_path(file_path)

    # Open the combined CSV once so that (a) tables from every page are kept
    # instead of each page overwriting the previous one, and (b) a fresh file
    # always exists even when no page produced a response.
    with open(csv_output_path, 'w', newline='') as combined_csv:
        for i, image in enumerate(images):
            image_path = f"/tmp/image_{i}.jpg"
            # JPEG cannot store alpha or palette images (e.g. RGBA PNGs).
            if image.mode not in ('RGB', 'L'):
                image = image.convert('RGB')
            image.save(image_path, 'JPEG')

            response = process_image(image_path, s3_bucket, textract_client)
            if not response:
                continue

            blocks = response['Blocks']
            blocks_map = {block['Id']: block for block in blocks}
            tables = [block for block in blocks if block['BlockType'] == "TABLE"]

            # generate_table_csv truncates its target, so write each page to
            # its own file and append that file's content to the result.
            page_csv_path = f"/tmp/output_page_{i}.csv"
            generate_table_csv(tables, blocks_map, page_csv_path)
            with open(page_csv_path, newline='') as page_csv:
                combined_csv.write(page_csv.read())
            os.remove(page_csv_path)

    # Return the CSV output path and a success message for Gradio to handle
    return csv_output_path, "Processing completed successfully!"


# Gradio Interface
# Wires process_file_and_generate_csv to one file-upload input and two
# outputs: the generated CSV as a downloadable file and a text status message.
iface = gr.Interface(
    fn=process_file_and_generate_csv,
    inputs=gr.File(label="Upload your file (PDF, PNG, JPG, TIFF)"),
    outputs=[gr.File(label="Download Generated CSV"), "text"],
    description="Upload a document to extract tables into a CSV file."
)

# Launch the interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    iface.launch()