import streamlit as st
import torch
from ultralytics import YOLO
import pytesseract
from PIL import Image
import numpy as np
from transformers import pipeline

# Set up the Tesseract command line path (optional, depending on your setup)
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

# Load the YOLOv8 model for panel and character detection
yolo_model = YOLO('yolov8n.pt')  # YOLOv8 nano model for lightweight processing

# Load the Hugging Face summarizer
summarizer = pipeline("summarization")

# App title
st.title("Manga Narration with Adjustable Hyperparameters")

# Sidebar to upload images
st.sidebar.title("Upload Manga Images")
uploaded_files = st.sidebar.file_uploader("Select up to 60 manga images", type=["png", "jpg", "jpeg"], accept_multiple_files=True)

# Progress bar
progress_bar = st.sidebar.progress(0)

# Hyperparameters for tuning detection
st.sidebar.title("Hyperparameters")
st.sidebar.subheader("Character & Panel Detection")
character_confidence = st.sidebar.slider("Character Detection Confidence", min_value=0.1, max_value=1.0, value=0.25)
panel_confidence = st.sidebar.slider("Panel Detection Confidence", min_value=0.1, max_value=1.0, value=0.25)
iou_threshold = st.sidebar.slider("IoU Threshold for YOLO", min_value=0.1, max_value=1.0, value=0.45)

st.sidebar.subheader("Text & Character Matching")
text_to_character_matching = st.sidebar.slider("Text-to-Character Matching Threshold", min_value=0.1, max_value=1.0, value=0.75)
character_to_character_matching = st.sidebar.slider("Character-to-Character Matching Threshold", min_value=0.1, max_value=1.0, value=0.5)

# Manga reading order (right-to-left for most manga)
reading_order = st.sidebar.radio("Manga Reading Order", options=["Right-to-Left", "Left-to-Right"], index=0)

# Summarization parameters
summarization_length = st.sidebar.slider("Summary Length (words)", min_value=50, max_value=300, value=100)


def detect_panels_and_characters(image):
    # Perform panel and character detection using YOLOv8 with adjustable thresholds
    results = yolo_model.predict(image, conf=max(character_confidence, panel_confidence), iou=iou_threshold)
    
    # Separate results into panels and characters
    panels = []
    characters = []
    for result in results[0].boxes:
        if result.conf >= panel_confidence and result.cls == 0:  # Assuming '0' is the class ID for panels
            panels.append(result.xyxy.cpu().numpy())  # Panel bounding box
        elif result.conf >= character_confidence and result.cls == 1:  # Assuming '1' is the class ID for characters
            characters.append(result.xyxy.cpu().numpy())  # Character bounding box
    
    return panels, characters


def detect_text(image):
    # Convert image to grayscale for better OCR accuracy
    gray_image = Image.fromarray(image).convert("L")
    text = pytesseract.image_to_string(gray_image)
    return text


def generate_narration(panels, characters, text):
    # Generate narrations based on detected panels, characters, and text
    narration = ""
    if panels:
        narration += f"Detected {len(panels)} panels. "
    if characters:
        narration += f"{len(characters)} characters were found. "
    
    # Add text and summarization for better clarity
    if text.strip():
        narration += "Here's a summary of the text: "
        summary = summarizer(text, max_length=summarization_length, min_length=30, do_sample=False)[0]['summary_text']
        narration += summary
    
    return narration


def match_text_to_characters(text, characters):
    # Match text to the closest detected characters based on proximity
    matched_characters = []
    
    # Simplified matching logic based on distance between text and characters' positions
    for char in characters:
        if np.random.random() <= text_to_character_matching:  # Simulated matching logic
            matched_characters.append(char)
    
    return matched_characters


def match_character_to_character(characters):
    # Match characters with one another based on proximity or other characteristics
    matched_pairs = []
    
    # Simplified matching logic for character-to-character interaction
    for i in range(len(characters)):
        for j in range(i + 1, len(characters)):
            if np.random.random() <= character_to_character_matching:  # Simulated proximity matching
                matched_pairs.append((characters[i], characters[j]))
    
    return matched_pairs


def process_images(uploaded_files):
    narrations = []
    total_images = len(uploaded_files)

    for idx, file in enumerate(uploaded_files):
        # Load the image
        image = Image.open(file)
        image_np = np.array(image)

        # Detect panels and characters
        panels, characters = detect_panels_and_characters(image_np)

        # Detect text
        text = detect_text(image_np)

        # Match text to characters and match characters to each other
        matched_characters = match_text_to_characters(text, characters)
        matched_pairs = match_character_to_character(characters)

        # Generate narration based on matches
        narration = generate_narration(panels, matched_characters, text)
        narrations.append(narration)

        # Adjust the reading order
        if reading_order == "Right-to-Left":
            narrations.reverse()

        # Update progress bar
        progress_bar.progress((idx + 1) / total_images)

        # Display the current image and its narration
        st.image(image, caption=f"Image {idx + 1}")
        st.write(narration)

    return narrations


if uploaded_files:
    # Process uploaded images
    narrations = process_images(uploaded_files)

    # Show final results after processing all images
    st.write("Narration Summary for All Images:")
    st.write("\n\n".join(narrations))
else:
    st.write("Please upload manga images to get started.")