import streamlit as st
from ultralytics import YOLO
import pytesseract
from PIL import Image
import numpy as np
from transformers import pipeline

# Set the Tesseract binary path (optional, depending on your installation)
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

# Load the YOLOv8 model for panel and character detection
yolo_model = YOLO('yolov8n.pt')  # YOLOv8 nano weights, for lightweight processing
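# NOTE: the stock yolov8n.pt weights are trained on COCO (where class 0 is
# 'person'), so detecting manga panels and characters as assumed below would
# require a custom-trained YOLOv8 model; the class IDs used here stand in for one.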
# Load the Hugging Face summarization pipeline
summarizer = pipeline("summarization")
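# At the time of writing, pipeline("summarization") falls back to a default model
# (sshleifer/distilbart-cnn-12-6); pinning it explicitly keeps behavior reproducible:
#   summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")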
# App title
st.title("Manga Narration with Adjustable Hyperparameters")

# Sidebar to upload images
st.sidebar.title("Upload Manga Images")
uploaded_files = st.sidebar.file_uploader("Select up to 60 manga images", type=["png", "jpg", "jpeg"], accept_multiple_files=True)

# Progress bar
progress_bar = st.sidebar.progress(0)
# Hyperparameters for tuning detection
st.sidebar.title("Hyperparameters")
st.sidebar.subheader("Character & Panel Detection")
character_confidence = st.sidebar.slider("Character Detection Confidence", min_value=0.1, max_value=1.0, value=0.25)
panel_confidence = st.sidebar.slider("Panel Detection Confidence", min_value=0.1, max_value=1.0, value=0.25)
iou_threshold = st.sidebar.slider("IoU Threshold for YOLO", min_value=0.1, max_value=1.0, value=0.45)

st.sidebar.subheader("Text & Character Matching")
text_to_character_matching = st.sidebar.slider("Text-to-Character Matching Threshold", min_value=0.1, max_value=1.0, value=0.75)
character_to_character_matching = st.sidebar.slider("Character-to-Character Matching Threshold", min_value=0.1, max_value=1.0, value=0.5)

# Manga reading order (right-to-left for most manga)
reading_order = st.sidebar.radio("Manga Reading Order", options=["Right-to-Left", "Left-to-Right"], index=0)

# Summarization parameters (max_length for the summarization pipeline is measured in tokens, not words)
summarization_length = st.sidebar.slider("Summary Length (tokens)", min_value=50, max_value=300, value=100)
def detect_panels_and_characters(image):
    # Run YOLOv8 detection with the user-adjustable thresholds; pass the *lower*
    # of the two confidences so neither class is filtered out before the
    # per-class checks below can apply
    results = yolo_model.predict(image, conf=min(character_confidence, panel_confidence), iou=iou_threshold)
    # Split detections into panels and characters by class ID
    panels = []
    characters = []
    for box in results[0].boxes:
        conf = float(box.conf)
        cls_id = int(box.cls)
        if cls_id == 0 and conf >= panel_confidence:  # class 0 assumed to be 'panel' (custom model; see note above)
            panels.append(box.xyxy[0].cpu().numpy())  # panel bounding box (x1, y1, x2, y2)
        elif cls_id == 1 and conf >= character_confidence:  # class 1 assumed to be 'character'
            characters.append(box.xyxy[0].cpu().numpy())  # character bounding box
    return panels, characters
def detect_text(image):
    # Convert the image to grayscale for better OCR accuracy
    gray_image = Image.fromarray(image).convert("L")
    text = pytesseract.image_to_string(gray_image)
    return text
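# For Japanese manga, Tesseract's vertical-Japanese model may give better results
# than the default English one (assuming the jpn_vert traineddata is installed):
#   text = pytesseract.image_to_string(gray_image, lang="jpn_vert")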
def generate_narration(panels, characters, text):
    # Build a narration string from the detected panels, characters, and OCR text
    narration = ""
    if panels:
        narration += f"Detected {len(panels)} panels. "
    if characters:
        narration += f"{len(characters)} characters were found. "
    # Summarize the OCR text when there is any; truncate long inputs so they fit
    # the summarization model's context window
    if text.strip():
        narration += "Here's a summary of the text: "
        summary = summarizer(text, max_length=summarization_length, min_length=30,
                             do_sample=False, truncation=True)[0]['summary_text']
        narration += summary
    return narration
def match_text_to_characters(text, characters):
    # Placeholder matching: a real implementation would compare text-box and
    # character-box positions; here a random draw against the threshold stands in
    matched_characters = []
    for char in characters:
        if np.random.random() <= text_to_character_matching:  # simulated match
            matched_characters.append(char)
    return matched_characters
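# A rough sketch of what real text-to-character matching could look like:
# pytesseract.image_to_data provides per-word bounding boxes, and each word is
# assigned to the nearest character box by center distance. The helper name and
# the max_dist cutoff are illustrative assumptions, not tuned values.
def match_text_boxes_to_characters(image, characters, max_dist=200):
    data = pytesseract.image_to_data(Image.fromarray(image), output_type=pytesseract.Output.DICT)
    matches = []
    for i, word in enumerate(data["text"]):
        if not word.strip():
            continue
        # Center of the OCR word box
        cx = data["left"][i] + data["width"][i] / 2
        cy = data["top"][i] + data["height"][i] / 2
        # Find the nearest character box by Euclidean distance between centers
        best_char, best_dist = None, float("inf")
        for char_box in characters:
            x1, y1, x2, y2 = char_box
            dist = np.hypot(cx - (x1 + x2) / 2, cy - (y1 + y2) / 2)
            if dist < best_dist:
                best_char, best_dist = char_box, dist
        if best_char is not None and best_dist <= max_dist:
            matches.append((word, best_char))
    return matches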
def match_character_to_character(characters):
    # Placeholder pairing: a real implementation would use proximity or appearance
    # cues; here a random draw against the threshold stands in
    matched_pairs = []
    for i in range(len(characters)):
        for j in range(i + 1, len(characters)):
            if np.random.random() <= character_to_character_matching:  # simulated match
                matched_pairs.append((characters[i], characters[j]))
    return matched_pairs
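# Likewise, a sketch of proximity-based character pairing: two characters are
# paired when their box centers are closer than a fraction of the image diagonal.
# The helper name and the 0.25 fraction are illustrative assumptions.
def pair_characters_by_proximity(characters, image_shape, frac=0.25):
    h, w = image_shape[:2]
    cutoff = frac * np.hypot(w, h)
    centers = [((x1 + x2) / 2, (y1 + y2) / 2) for x1, y1, x2, y2 in characters]
    pairs = []
    for i in range(len(centers)):
        for j in range(i + 1, len(centers)):
            dist = np.hypot(centers[i][0] - centers[j][0], centers[i][1] - centers[j][1])
            if dist <= cutoff:
                pairs.append((characters[i], characters[j]))
    return pairs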
def process_images(uploaded_files):
    narrations = []
    total_images = len(uploaded_files)
    for idx, file in enumerate(uploaded_files):
        # Load the image and normalize its mode for the numpy conversion
        image = Image.open(file).convert("RGB")
        image_np = np.array(image)
        # Detect panels and characters
        panels, characters = detect_panels_and_characters(image_np)
        # Detect text via OCR
        text = detect_text(image_np)
        # Match text to characters and characters to each other
        matched_characters = match_text_to_characters(text, characters)
        matched_pairs = match_character_to_character(characters)  # not yet used in the narration
        # Generate a narration for this page
        narration = generate_narration(panels, matched_characters, text)
        narrations.append(narration)
        # Update the progress bar
        progress_bar.progress((idx + 1) / total_images)
        # Display the current image and its narration
        st.image(image, caption=f"Image {idx + 1}")
        st.write(narration)
    # Apply the page reading order once, after all pages have been processed
    if reading_order == "Right-to-Left":
        narrations.reverse()
    return narrations
if uploaded_files:
    # Process the uploaded images
    narrations = process_images(uploaded_files)
    # Show the combined narration after processing all images
    st.write("Narration Summary for All Images:")
    st.write("\n\n".join(narrations))
else:
    st.write("Please upload manga images to get started.")