import streamlit as st
import torch
from ultralytics import YOLO
import pytesseract
from PIL import Image
import numpy as np
from transformers import pipeline
import os
import time

# Point pytesseract at the system Tesseract binary when it is installed at the
# usual Linux location; otherwise leave pytesseract's own default in place so
# a binary found on PATH still works.
_TESSERACT_CMD = "/usr/bin/tesseract"
if os.path.isfile(_TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = _TESSERACT_CMD


# Streamlit re-executes this script on every widget interaction. Caching the
# loaders keeps one model instance alive instead of rebuilding it per rerun.
@st.cache_resource
def _load_yolo_model():
    """Return the YOLOv8 nano model used for panel and character detection."""
    return YOLO('yolov8n.pt')  # YOLOv8 nano model for lightweight processing


@st.cache_resource
def _load_summarizer():
    """Return the Hugging Face summarization pipeline (library default model)."""
    return pipeline("summarization")


# Module-level handles used by the detection and narration helpers
yolo_model = _load_yolo_model()
summarizer = _load_summarizer()

# Main page heading
st.title("Manga Narration for the Visually Impaired")

# Everything created inside this container is rendered in the sidebar
with st.sidebar:
    # Upload section
    st.title("Upload Manga Images")
    uploaded_files = st.file_uploader(
        "Select up to 60 manga images",
        type=["png", "jpg", "jpeg"],
        accept_multiple_files=True,
    )

    # Progress indicator, advanced while the uploaded images are processed
    progress_bar = st.progress(0)

    # Tunable detection / summarization settings
    st.title("Hyperparameters")
    confidence_threshold = st.slider(
        "YOLO Confidence Threshold", min_value=0.1, max_value=1.0, value=0.25
    )
    iou_threshold = st.slider(
        "YOLO IoU Threshold", min_value=0.1, max_value=1.0, value=0.45
    )
    summarization_length = st.slider(
        "Summary Length (words)", min_value=50, max_value=300, value=100
    )

def detect_panels_and_characters(image):
    """Detect panels and characters on a page with the YOLOv8 model.

    Args:
        image: The page as a NumPy array in PIL channel order (for example the
            result of ``np.array(pil_image)``) or as a PIL image.

    Returns:
        A ``(panels, characters)`` tuple of lists. Each list entry is the
        ``xyxy`` coordinates of one detected box as a NumPy array.
    """
    # Ultralytics treats raw NumPy input as 3-channel BGR, while arrays built
    # from PIL images are RGB (or grayscale / RGBA). Handing over a PIL image
    # lets the library do the RGB conversion and channel reordering itself.
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)

    # Run detection with the thresholds chosen in the sidebar
    results = yolo_model.predict(image, conf=confidence_threshold, iou=iou_threshold)

    # NOTE(review): class ids 0 and 1 are assumed to mean "panel" and
    # "character". The stock yolov8n.pt checkpoint is presumably COCO-trained
    # (0 = person, 1 = bicycle) -- confirm that weights fine-tuned on
    # panel/character classes are the ones being loaded.
    panels = []
    characters = []
    for box in results[0].boxes:
        class_id = int(box.cls.item())
        if class_id == 0:
            panels.append(box.xyxy.cpu().numpy())  # Panel bounding box
        elif class_id == 1:
            characters.append(box.xyxy.cpu().numpy())  # Character bounding box

    return panels, characters

def detect_text(image):
    """Run Tesseract OCR on *image* and return the recognised text.

    The array is wrapped in a PIL image and reduced to single-channel
    grayscale ("L" mode) before it is handed to pytesseract.
    """
    grayscale = Image.fromarray(image).convert("L")
    return pytesseract.image_to_string(grayscale)

def generate_narration(panels, characters, text):
    """Build the narration string for one page.

    Args:
        panels: Detected panel boxes; only their count is reported.
        characters: Detected character boxes; only their count is reported.
        text: OCR output for the page. Whitespace-only text is treated as
            "no text" and the summarizer is not called.

    Returns:
        The narration: panel and character counts followed by a summary of
        the page text. Empty string when nothing was detected.
    """
    parts = []
    if panels:
        parts.append(f"Detected {len(panels)} panels. ")
    if characters:
        parts.append(f"{len(characters)} characters were found in the scene. ")

    # Strip OCR padding so the model is not fed leading/trailing whitespace
    cleaned_text = text.strip()
    if cleaned_text:
        parts.append("Here's a summary of the text: ")
        # truncation=True clips text that exceeds the model's input window,
        # which would otherwise make the pipeline fail on a text-heavy page.
        summary = summarizer(
            cleaned_text,
            max_length=summarization_length,
            min_length=30,
            do_sample=False,
            truncation=True,
        )[0]['summary_text']
        parts.append(summary)

    return "".join(parts)

def process_images(uploaded_files):
    """Narrate each uploaded page and render the result as it is produced.

    For every file the page is opened, run through panel/character detection
    and OCR, turned into a narration, and shown together with that narration.
    The sidebar progress bar is advanced after each page.

    Args:
        uploaded_files: Sequence of file-like objects that PIL can open.

    Returns:
        A list with one narration string per uploaded file, in upload order.
    """
    narrations = []
    total_images = len(uploaded_files)

    for idx, file in enumerate(uploaded_files):
        # Load the image
        image = Image.open(file)

        # Grayscale, palette, RGBA and CMYK uploads would otherwise produce
        # 2-D, index-valued or 4-channel arrays; normalise to 3-channel RGB
        # so detection and OCR always receive the same layout.
        image_np = np.array(image.convert("RGB"))

        # Detect panels and characters
        panels, characters = detect_panels_and_characters(image_np)

        # Detect text
        text = detect_text(image_np)

        # Generate narration
        narration = generate_narration(panels, characters, text)
        narrations.append(narration)

        # Update progress bar (fraction of pages completed so far)
        progress_bar.progress((idx + 1) / total_images)

        # Display the original upload and its narration
        st.image(image, caption=f"Image {idx + 1}")
        st.write(narration)

    return narrations

if not uploaded_files:
    # Nothing uploaded yet: prompt the user
    st.write("Please upload manga images to get started.")
else:
    # Narrate every uploaded page (each page is rendered as it is processed)
    narrations = process_images(uploaded_files)

    # Then show all narrations together, separated by blank lines
    combined_narration = "\n\n".join(narrations)
    st.write("Narration Summary for All Images:")
    st.write(combined_narration)