"""Streamlit app that narrates uploaded manga pages.

For every uploaded image the app runs YOLOv8 object detection, OCRs the page
with Tesseract, summarizes the recognized text with a Hugging Face
summarization pipeline, and shows the resulting narration next to the image.
"""

import os
import time

import numpy as np
import pytesseract
import streamlit as st
import torch
from PIL import Image
from transformers import pipeline
from ultralytics import YOLO

# Set up the Tesseract command line path (optional, depending on your setup)
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

# Upper bound advertised in the uploader label; enforced before processing.
MAX_IMAGES = 60

# Class IDs this script assumes the detector emits.
# NOTE(review): the stock 'yolov8n.pt' checkpoint is presumably not trained on
# manga panels/characters, so these IDs likely map to unrelated classes --
# confirm, and swap in a manga-specific checkpoint if so.
PANEL_CLASS_ID = 0
CHARACTER_CLASS_ID = 1


@st.cache_resource
def _load_yolo_model():
    """Load the YOLOv8 nano model once and reuse it across Streamlit reruns."""
    return YOLO("yolov8n.pt")


@st.cache_resource
def _load_summarizer():
    """Load the Hugging Face summarization pipeline once across reruns."""
    return pipeline("summarization")


# Streamlit re-executes this script on every widget interaction; the cached
# loaders keep the heavyweight models from being reloaded each time.
yolo_model = _load_yolo_model()
summarizer = _load_summarizer()

# App title
st.title("Manga Narration for the Visually Impaired")

# Sidebar to upload images
st.sidebar.title("Upload Manga Images")
uploaded_files = st.sidebar.file_uploader(
    "Select up to 60 manga images",
    type=["png", "jpg", "jpeg"],
    accept_multiple_files=True,
)

# Progress bar
progress_bar = st.sidebar.progress(0)

# Hyperparameters for tuning
st.sidebar.title("Hyperparameters")
confidence_threshold = st.sidebar.slider(
    "YOLO Confidence Threshold", min_value=0.1, max_value=1.0, value=0.25
)
iou_threshold = st.sidebar.slider(
    "YOLO IoU Threshold", min_value=0.1, max_value=1.0, value=0.45
)
summarization_length = st.sidebar.slider(
    "Summary Length (words)", min_value=50, max_value=300, value=100
)


def detect_panels_and_characters(image):
    """Run YOLOv8 on *image* and split the detections by class ID.

    Args:
        image: Page image as a NumPy array in RGB channel order, as produced
            by ``np.array()`` on a PIL image.

    Returns:
        A ``(panels, characters)`` tuple of lists. Each entry is the ``xyxy``
        bounding-box array of one detection whose class ID equals
        ``PANEL_CLASS_ID`` or ``CHARACTER_CLASS_ID`` respectively.
    """
    # Ultralytics interprets ndarray sources as BGR, so flip the RGB channels
    # before inference; other layouts are passed through untouched.
    if image.ndim == 3 and image.shape[2] == 3:
        image = np.ascontiguousarray(image[:, :, ::-1])

    results = yolo_model.predict(
        image, conf=confidence_threshold, iou=iou_threshold
    )

    panels = []
    characters = []
    for box in results[0].boxes:
        class_id = int(box.cls)
        if class_id == PANEL_CLASS_ID:
            panels.append(box.xyxy.cpu().numpy())
        elif class_id == CHARACTER_CLASS_ID:
            characters.append(box.xyxy.cpu().numpy())
    return panels, characters


def detect_text(image):
    """OCR *image* (a NumPy array) with Tesseract and return the raw text."""
    # Convert image to grayscale for better OCR accuracy
    gray_image = Image.fromarray(image).convert("L")
    return pytesseract.image_to_string(gray_image)


def generate_narration(panels, characters, text):
    """Build the narration string for one page.

    Args:
        panels: Detected panel bounding boxes.
        characters: Detected character bounding boxes.
        text: OCR text of the page; summarized only if it is not blank.

    Returns:
        The narration, or an empty string when nothing was detected.
    """
    parts = []
    if panels:
        parts.append(f"Detected {len(panels)} panels. ")
    if characters:
        parts.append(f"{len(characters)} characters were found in the scene. ")

    # Add the summarization of the detected text as narration
    if text.strip():
        parts.append("Here's a summary of the text: ")
        # truncation=True keeps long OCR output within the model's input
        # limit instead of raising. NOTE: the pipeline counts max_length in
        # tokens, not words, so the slider value is only an approximation.
        outputs = summarizer(
            text,
            max_length=summarization_length,
            min_length=30,
            do_sample=False,
            truncation=True,
        )
        parts.append(outputs[0]["summary_text"])

    return "".join(parts)


def process_images(uploaded_files):
    """Narrate each uploaded file, rendering image and narration as it goes.

    Args:
        uploaded_files: Sequence of file-like objects readable by
            ``PIL.Image.open``.

    Returns:
        List of narration strings, one per file, in upload order.
    """
    narrations = []
    total_images = len(uploaded_files)

    for idx, file in enumerate(uploaded_files):
        image = Image.open(file)
        # Normalize to 3-channel RGB so RGBA / palette / grayscale uploads
        # all reach the detector and OCR in the same layout.
        image_np = np.array(image.convert("RGB"))

        panels, characters = detect_panels_and_characters(image_np)
        text = detect_text(image_np)

        narration = generate_narration(panels, characters, text)
        narrations.append(narration)

        # Update progress bar
        progress_bar.progress((idx + 1) / total_images)

        # Display the current image and its narration
        st.image(image, caption=f"Image {idx + 1}")
        st.write(narration)

    return narrations


if uploaded_files:
    # Enforce the limit promised by the uploader label.
    if len(uploaded_files) > MAX_IMAGES:
        st.sidebar.warning(
            f"Only the first {MAX_IMAGES} images will be processed."
        )
        uploaded_files = uploaded_files[:MAX_IMAGES]

    # Process uploaded images
    narrations = process_images(uploaded_files)

    # Show final results after processing all images
    st.write("Narration Summary for All Images:")
    st.write("\n\n".join(narrations))
else:
    st.write("Please upload manga images to get started.")