import streamlit as st
import torch
from ultralytics import YOLO
import pytesseract
from PIL import Image
import numpy as np
from transformers import pipeline

# Set up the Tesseract command line path (optional, depending on your setup)
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

# Load the YOLOv8 model for panel and character detection.
# Note: the stock 'yolov8n.pt' checkpoint is trained on COCO and has no manga
# panel/character classes; substitute a manga-finetuned checkpoint for real use.
yolo_model = YOLO('yolov8n.pt')  # YOLOv8 nano model for lightweight processing

# Load the Hugging Face summarizer (no model pinned, so the pipeline's
# default summarization checkpoint is downloaded on first run)
summarizer = pipeline("summarization")

# App title
st.title("Manga Narration with Adjustable Hyperparameters")

# Sidebar to upload images
st.sidebar.title("Upload Manga Images")
uploaded_files = st.sidebar.file_uploader("Select up to 60 manga images", type=["png", "jpg", "jpeg"], accept_multiple_files=True)

# Progress bar
progress_bar = st.sidebar.progress(0)

# Hyperparameters for tuning detection
st.sidebar.title("Hyperparameters")
st.sidebar.subheader("Character & Panel Detection")
character_confidence = st.sidebar.slider("Character Detection Confidence", min_value=0.1, max_value=1.0, value=0.25)
panel_confidence = st.sidebar.slider("Panel Detection Confidence", min_value=0.1, max_value=1.0, value=0.25)
iou_threshold = st.sidebar.slider("IoU Threshold for YOLO", min_value=0.1, max_value=1.0, value=0.45)

st.sidebar.subheader("Text & Character Matching")
text_to_character_matching = st.sidebar.slider("Text-to-Character Matching Threshold", min_value=0.1, max_value=1.0, value=0.75)
character_to_character_matching = st.sidebar.slider("Character-to-Character Matching Threshold", min_value=0.1, max_value=1.0, value=0.5)

# Manga reading order (right-to-left for most manga)
reading_order = st.sidebar.radio("Manga Reading Order", options=["Right-to-Left", "Left-to-Right"], index=0)

# Summarization parameters (the max_length passed to the summarizer is measured in tokens, not words)
summarization_length = st.sidebar.slider("Summary Length (tokens)", min_value=50, max_value=300, value=100)


def detect_panels_and_characters(image):
    # Run YOLOv8 with the lower of the two confidence sliders so candidates for
    # both classes survive detection, then filter per class below (using the
    # higher value here would silently drop the lower-threshold class)
    results = yolo_model.predict(image, conf=min(character_confidence, panel_confidence), iou=iou_threshold)

    # Separate results into panels and characters
    panels = []
    characters = []
    for result in results[0].boxes:
        conf = float(result.conf)
        cls = int(result.cls)
        if conf >= panel_confidence and cls == 0:  # Assuming '0' is the class ID for panels
            panels.append(result.xyxy.cpu().numpy())  # Panel bounding box
        elif conf >= character_confidence and cls == 1:  # Assuming '1' is the class ID for characters
            characters.append(result.xyxy.cpu().numpy())  # Character bounding box

    return panels, characters
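

# Hedged sketch: the reading-order radio button above only reorders the final
# narration list; a fuller treatment would order panels within a page. This
# helper (our addition, not part of the original app) sorts panel boxes
# top-to-bottom, then right-to-left for manga, as a rough approximation.
def sort_panels_for_reading(panels, right_to_left=True):
    def key(panel):
        box = np.asarray(panel).reshape(-1)  # [x1, y1, x2, y2]
        return (box[1], -box[0]) if right_to_left else (box[1], box[0])
    return sorted(panels, key=key)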


def detect_text(image):
    # Convert image to grayscale for better OCR accuracy
    gray_image = Image.fromarray(image).convert("L")
    text = pytesseract.image_to_string(gray_image)
    return text
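

# Hedged sketch: a variant of detect_text that also returns word-level boxes
# via pytesseract.image_to_data, which the proximity matcher sketched further
# below can consume. The function name is ours, not part of the original app.
def detect_text_with_boxes(image):
    gray_image = Image.fromarray(image).convert("L")
    data = pytesseract.image_to_data(gray_image, output_type=pytesseract.Output.DICT)
    words = []
    for i, word in enumerate(data["text"]):
        if word.strip():  # skip empty OCR slots
            x, y = data["left"][i], data["top"][i]
            w, h = data["width"][i], data["height"][i]
            words.append({"text": word, "box": (x, y, x + w, y + h)})
    return words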


def generate_narration(panels, characters, text):
    # Generate narration based on detected panels, characters, and text
    narration = ""
    if panels:
        narration += f"Detected {len(panels)} panels. "
    if characters:
        narration += f"{len(characters)} characters were found. "

    # Summarize the OCR text for clarity; truncation guards against inputs
    # longer than the model's context window, and min_length is capped so it
    # never exceeds the requested max_length
    if text.strip():
        narration += "Here's a summary of the text: "
        summary = summarizer(text, max_length=summarization_length,
                             min_length=min(30, summarization_length),
                             do_sample=False, truncation=True)[0]['summary_text']
        narration += summary

    return narration
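

# Hedged sketch: the pipeline's default summarization checkpoint accepts on
# the order of 1,024 input tokens, so very long OCR output can be summarized
# chunk by chunk and stitched together. Chunking by character count is a
# rough assumption, not the original app's behavior.
def summarize_long_text(text, chunk_chars=2000):
    chunks = [text[i:i + chunk_chars] for i in range(0, len(text), chunk_chars)]
    partials = [summarizer(chunk, max_length=summarization_length,
                           min_length=10, do_sample=False,
                           truncation=True)[0]["summary_text"]
                for chunk in chunks]
    return " ".join(partials)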


def match_text_to_characters(text, characters):
    # Placeholder matcher: pytesseract above returns one page-level string with
    # no positions, so there is nothing to measure proximity against. Each
    # character is kept with probability equal to the matching threshold; see
    # the proximity-based sketch below for what a real matcher could look like.
    matched_characters = []

    for char in characters:
        if np.random.random() <= text_to_character_matching:  # Simulated matching logic
            matched_characters.append(char)

    return matched_characters
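

# Hedged sketch: proximity-based matching, assuming word boxes from
# detect_text_with_boxes above. Each word is assigned to the nearest detected
# character whose centre lies within a distance budget derived from the
# matching slider; the budget formula is an assumption for illustration.
def match_words_to_characters(words, characters, image_shape):
    height, width = image_shape[:2]
    # higher slider values shrink the budget, i.e. stricter matching
    max_dist = (1.0 - text_to_character_matching + 0.05) * np.hypot(width, height)
    matches = []
    for word in words:
        x1, y1, x2, y2 = word["box"]
        wx, wy = (x1 + x2) / 2, (y1 + y2) / 2
        best, best_dist = None, max_dist
        for char in characters:
            box = np.asarray(char).reshape(-1)  # [x1, y1, x2, y2]
            cx, cy = (box[0] + box[2]) / 2, (box[1] + box[3]) / 2
            dist = np.hypot(wx - cx, wy - cy)
            if dist < best_dist:
                best, best_dist = char, dist
        if best is not None:
            matches.append((word["text"], best))
    return matches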


def match_character_to_character(characters):
    # Placeholder matcher: pairs characters at random with probability equal to
    # the matching threshold; a deterministic distance-based sketch follows below
    matched_pairs = []

    for i in range(len(characters)):
        for j in range(i + 1, len(characters)):
            if np.random.random() <= character_to_character_matching:  # Simulated proximity matching
                matched_pairs.append((characters[i], characters[j]))

    return matched_pairs
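

# Hedged sketch: a deterministic alternative to the random pairing above.
# Two characters are paired when their centres are closer than a multiple of
# the larger box diagonal; mapping the slider to that multiple is an
# assumption for illustration.
def match_characters_by_distance(characters):
    def centre_and_diag(box):
        box = np.asarray(box).reshape(-1)  # [x1, y1, x2, y2]
        cx, cy = (box[0] + box[2]) / 2, (box[1] + box[3]) / 2
        return cx, cy, np.hypot(box[2] - box[0], box[3] - box[1])

    pairs = []
    for i in range(len(characters)):
        for j in range(i + 1, len(characters)):
            xi, yi, di = centre_and_diag(characters[i])
            xj, yj, dj = centre_and_diag(characters[j])
            # higher slider values demand the characters be closer together
            if np.hypot(xi - xj, yi - yj) <= max(di, dj) / character_to_character_matching:
                pairs.append((characters[i], characters[j]))
    return pairs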


def process_images(uploaded_files):
    narrations = []
    total_images = len(uploaded_files)

    for idx, file in enumerate(uploaded_files):
        # Load the image; convert to RGB so RGBA PNGs don't trip up YOLO
        image = Image.open(file).convert("RGB")
        image_np = np.array(image)

        # Detect panels and characters
        panels, characters = detect_panels_and_characters(image_np)

        # Detect text
        text = detect_text(image_np)

        # Match text to characters, and characters to each other
        matched_characters = match_text_to_characters(text, characters)
        matched_pairs = match_character_to_character(characters)  # not yet used in the narration

        # Generate narration based on matches
        narration = generate_narration(panels, matched_characters, text)
        narrations.append(narration)

        # Update progress bar
        progress_bar.progress((idx + 1) / total_images)

        # Display the current image and its narration
        st.image(image, caption=f"Image {idx + 1}")
        st.write(narration)

    # Apply the reading order once, after the loop; reversing inside the loop
    # (as the original code did) re-scrambled the list on every iteration
    if reading_order == "Right-to-Left":
        narrations.reverse()

    return narrations


if uploaded_files:
    # Enforce the 60-image limit stated in the uploader prompt
    if len(uploaded_files) > 60:
        st.sidebar.warning("Only the first 60 images will be processed.")
        uploaded_files = uploaded_files[:60]

    # Process the uploaded images
    narrations = process_images(uploaded_files)

    # Show final results after processing all images
    st.write("Narration Summary for All Images:")
    st.write("\n\n".join(narrations))
else:
    st.write("Please upload manga images to get started.")