import streamlit as st
import numpy as np
import pytesseract
from PIL import Image
from ultralytics import YOLO
from transformers import pipeline

# Point pytesseract at the Tesseract binary (optional, depending on your setup)
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

# Load the YOLOv8 nano model for lightweight panel and character detection.
# Note: the stock yolov8n.pt weights are trained on COCO, so the panel/character
# class IDs assumed below require a checkpoint fine-tuned on manga data.
yolo_model = YOLO("yolov8n.pt")

# Load the Hugging Face summarization pipeline
summarizer = pipeline("summarization")

# App title
st.title("Manga Narration with Adjustable Hyperparameters")

# Sidebar to upload images
st.sidebar.title("Upload Manga Images")
uploaded_files = st.sidebar.file_uploader(
    "Select up to 60 manga images",
    type=["png", "jpg", "jpeg"],
    accept_multiple_files=True,
)

# Progress bar
progress_bar = st.sidebar.progress(0)

# Hyperparameters for tuning detection
st.sidebar.title("Hyperparameters")
st.sidebar.subheader("Character & Panel Detection")
character_confidence = st.sidebar.slider(
    "Character Detection Confidence", min_value=0.1, max_value=1.0, value=0.25
)
panel_confidence = st.sidebar.slider(
    "Panel Detection Confidence", min_value=0.1, max_value=1.0, value=0.25
)
iou_threshold = st.sidebar.slider(
    "IoU Threshold for YOLO", min_value=0.1, max_value=1.0, value=0.45
)

st.sidebar.subheader("Text & Character Matching")
text_to_character_matching = st.sidebar.slider(
    "Text-to-Character Matching Threshold", min_value=0.1, max_value=1.0, value=0.75
)
character_to_character_matching = st.sidebar.slider(
    "Character-to-Character Matching Threshold", min_value=0.1, max_value=1.0, value=0.5
)

# Manga reading order (right-to-left for most manga)
reading_order = st.sidebar.radio(
    "Manga Reading Order", options=["Right-to-Left", "Left-to-Right"], index=0
)

# Summarization parameters (the pipeline's max_length is measured in tokens)
summarization_length = st.sidebar.slider(
    "Summary Length (tokens)", min_value=50, max_value=300, value=100
)


def detect_panels_and_characters(image):
    """Detect panels and characters with YOLOv8 using the adjustable thresholds."""
    # Run inference once at the lower of the two confidence thresholds, then
    # apply each class's own threshold when filtering the boxes. (Using the
    # higher threshold here would discard detections the looser class should keep.)
    results = yolo_model.predict(
        image, conf=min(character_confidence, panel_confidence), iou=iou_threshold
    )

    panels = []
    characters = []
    for box in results[0].boxes:
        conf = float(box.conf)
        cls = int(box.cls)
        if cls == 0 and conf >= panel_confidence:  # assuming class 0 = panel
            panels.append(box.xyxy.cpu().numpy())  # panel bounding box
        elif cls == 1 and conf >= character_confidence:  # assuming class 1 = character
            characters.append(box.xyxy.cpu().numpy())  # character bounding box
    return panels, characters


def detect_text(image):
    """Run OCR on the page; grayscale usually improves Tesseract accuracy."""
    gray_image = Image.fromarray(image).convert("L")
    return pytesseract.image_to_string(gray_image)
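
# A minimal sketch of what non-simulated text-to-character matching could look
# like: pull per-word boxes from Tesseract and assign each word to the nearest
# character box by center distance. The helper name and the plain Euclidean
# center-distance criterion are illustrative assumptions, not part of the
# original app; match_text_to_characters below keeps its simulated logic.
def match_text_boxes_to_characters(image, characters):
    data = pytesseract.image_to_data(
        Image.fromarray(image).convert("L"), output_type=pytesseract.Output.DICT
    )
    assignments = []
    for i, word in enumerate(data["text"]):
        if not word.strip():
            continue
        # Center of the OCR word box
        wx = data["left"][i] + data["width"][i] / 2
        wy = data["top"][i] + data["height"][i] / 2
        # Find the character box whose center is closest to the word
        best_char, best_dist = None, float("inf")
        for char in characters:
            x1, y1, x2, y2 = np.asarray(char).reshape(-1)[:4]
            dist = np.hypot(wx - (x1 + x2) / 2, wy - (y1 + y2) / 2)
            if dist < best_dist:
                best_char, best_dist = char, dist
        if best_char is not None:
            assignments.append((word, best_char))
    return assignments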
" # Add text and summarization for better clarity if text.strip(): narration += "Here's a summary of the text: " summary = summarizer(text, max_length=summarization_length, min_length=30, do_sample=False)[0]['summary_text'] narration += summary return narration def match_text_to_characters(text, characters): # Match text to the closest detected characters based on proximity matched_characters = [] # Simplified matching logic based on distance between text and characters' positions for char in characters: if np.random.random() <= text_to_character_matching: # Simulated matching logic matched_characters.append(char) return matched_characters def match_character_to_character(characters): # Match characters with one another based on proximity or other characteristics matched_pairs = [] # Simplified matching logic for character-to-character interaction for i in range(len(characters)): for j in range(i + 1, len(characters)): if np.random.random() <= character_to_character_matching: # Simulated proximity matching matched_pairs.append((characters[i], characters[j])) return matched_pairs def process_images(uploaded_files): narrations = [] total_images = len(uploaded_files) for idx, file in enumerate(uploaded_files): # Load the image image = Image.open(file) image_np = np.array(image) # Detect panels and characters panels, characters = detect_panels_and_characters(image_np) # Detect text text = detect_text(image_np) # Match text to characters and match characters to each other matched_characters = match_text_to_characters(text, characters) matched_pairs = match_character_to_character(characters) # Generate narration based on matches narration = generate_narration(panels, matched_characters, text) narrations.append(narration) # Adjust the reading order if reading_order == "Right-to-Left": narrations.reverse() # Update progress bar progress_bar.progress((idx + 1) / total_images) # Display the current image and its narration st.image(image, caption=f"Image {idx + 1}") st.write(narration) return narrations if uploaded_files: # Process uploaded images narrations = process_images(uploaded_files) # Show final results after processing all images st.write("Narration Summary for All Images:") st.write("\n\n".join(narrations)) else: st.write("Please upload manga images to get started.")