Spaces:
Runtime error
Runtime error
File size: 5,892 Bytes
db28818 42eb874 db28818 42eb874 db28818 42eb874 db28818 42eb874 db28818 42eb874 db28818 42eb874 db28818 42eb874 161dbfb 42eb874 161dbfb 42eb874 161dbfb 42eb874 161dbfb 42eb874 161dbfb 42eb874 161dbfb 42eb874 161dbfb 42eb874 161dbfb 42eb874 db28818 161dbfb 42eb874 161dbfb 42eb874 161dbfb 42eb874 161dbfb 42eb874 161dbfb 42eb874 161dbfb 42eb874 db28818 161dbfb db28818 42eb874 161dbfb 42eb874 161dbfb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
import streamlit as st
import torch
from ultralytics import YOLO
import pytesseract
from PIL import Image
import numpy as np
from transformers import pipeline
# --- Module-level setup: models, Streamlit page chrome, and sidebar controls ---
# Set up the Tesseract command line path (optional, depending on your setup)
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
# Load the YOLOv8 model for panel and character detection.
# NOTE(review): 'yolov8n.pt' is the stock COCO-trained nano checkpoint; the
# class-ID assumptions below (0 = panel, 1 = character) presumably require a
# manga-finetuned model — confirm before relying on detections.
yolo_model = YOLO('yolov8n.pt') # YOLOv8 nano model for lightweight processing
# Load the Hugging Face summarizer (default summarization model; downloaded on
# first run, which can make cold starts slow).
summarizer = pipeline("summarization")
# App title
st.title("Manga Narration with Adjustable Hyperparameters")
# Sidebar to upload images
st.sidebar.title("Upload Manga Images")
uploaded_files = st.sidebar.file_uploader("Select up to 60 manga images", type=["png", "jpg", "jpeg"], accept_multiple_files=True)
# Progress bar shared by process_images() to report per-image progress
progress_bar = st.sidebar.progress(0)
# Hyperparameters for tuning detection — read as module globals by the
# detection/matching helpers below.
st.sidebar.title("Hyperparameters")
st.sidebar.subheader("Character & Panel Detection")
character_confidence = st.sidebar.slider("Character Detection Confidence", min_value=0.1, max_value=1.0, value=0.25)
panel_confidence = st.sidebar.slider("Panel Detection Confidence", min_value=0.1, max_value=1.0, value=0.25)
iou_threshold = st.sidebar.slider("IoU Threshold for YOLO", min_value=0.1, max_value=1.0, value=0.45)
st.sidebar.subheader("Text & Character Matching")
# Probabilistic thresholds used by the (currently simulated) matching helpers
text_to_character_matching = st.sidebar.slider("Text-to-Character Matching Threshold", min_value=0.1, max_value=1.0, value=0.75)
character_to_character_matching = st.sidebar.slider("Character-to-Character Matching Threshold", min_value=0.1, max_value=1.0, value=0.5)
# Manga reading order (right-to-left for most manga)
reading_order = st.sidebar.radio("Manga Reading Order", options=["Right-to-Left", "Left-to-Right"], index=0)
# Summarization parameters
# NOTE(review): the slider is labeled "words" but max_length is passed to the
# HF pipeline, which counts tokens — confirm the intended unit.
summarization_length = st.sidebar.slider("Summary Length (words)", min_value=50, max_value=300, value=100)
def detect_panels_and_characters(image):
    """Detect manga panels and characters in one image with YOLOv8.

    Args:
        image: Image as a numpy array (as produced by ``np.array(PIL.Image)``).

    Returns:
        tuple[list, list]: ``(panels, characters)`` — lists of bounding boxes
        in xyxy format as numpy arrays.
    """
    # BUG FIX: run YOLO with the *lower* of the two confidence thresholds.
    # The original used max(), which pre-filtered detections at the higher
    # slider value and silently dropped every box for the class with the
    # lower threshold before the per-class checks below could see it.
    results = yolo_model.predict(
        image,
        conf=min(character_confidence, panel_confidence),
        iou=iou_threshold,
    )
    panels = []
    characters = []
    for result in results[0].boxes:
        # Per-class confidence filtering; class IDs assumed: 0 = panel,
        # 1 = character (depends on the loaded model's training labels).
        if result.conf >= panel_confidence and result.cls == 0:
            panels.append(result.xyxy.cpu().numpy())  # panel bounding box
        elif result.conf >= character_confidence and result.cls == 1:
            characters.append(result.xyxy.cpu().numpy())  # character bounding box
    return panels, characters
def detect_text(image):
    """Run Tesseract OCR on an image array and return the recognized text.

    The image is converted to grayscale first, which tends to improve OCR
    accuracy on scanned manga pages.
    """
    grayscale = Image.fromarray(image).convert("L")
    return pytesseract.image_to_string(grayscale)
def generate_narration(panels, characters, text):
    """Compose a narration string from detections and OCR'd page text.

    Mentions panel and character counts when present; when ``text`` is
    non-blank, appends a summary produced by the module-level summarizer.
    """
    parts = []
    if panels:
        parts.append(f"Detected {len(panels)} panels. ")
    if characters:
        parts.append(f"{len(characters)} characters were found. ")
    # Only invoke the summarizer when there is actual text on the page.
    if text.strip():
        summary = summarizer(text, max_length=summarization_length, min_length=30, do_sample=False)[0]['summary_text']
        parts.append("Here's a summary of the text: ")
        parts.append(summary)
    return "".join(parts)
def match_text_to_characters(text, characters):
    """Select the characters that the OCR text plausibly belongs to.

    NOTE(review): the matching is currently simulated — each character is
    kept with probability ``text_to_character_matching`` and neither
    ``text`` nor box positions are actually inspected.
    """
    return [
        candidate
        for candidate in characters
        if np.random.random() <= text_to_character_matching
    ]
def match_character_to_character(characters):
    """Pair up detected characters via simulated proximity matching.

    NOTE(review): each unordered pair is kept with probability
    ``character_to_character_matching``; no real geometry is used yet.
    """
    pairs = []
    count = len(characters)
    for first in range(count):
        for second in range(first + 1, count):
            if np.random.random() <= character_to_character_matching:
                pairs.append((characters[first], characters[second]))
    return pairs
def process_images(uploaded_files):
    """Detect, OCR, and narrate each uploaded manga image.

    For every file: runs panel/character detection, OCR, the matching
    helpers, and narration generation; displays the image and its narration
    and advances the sidebar progress bar.

    Args:
        uploaded_files: Non-empty list of Streamlit UploadedFile objects.

    Returns:
        list[str]: One narration per image, reversed once at the end when
        the sidebar reading order is "Right-to-Left".
    """
    narrations = []
    total_images = len(uploaded_files)
    for idx, file in enumerate(uploaded_files):
        # Load the image both as PIL (for display) and numpy (for models)
        image = Image.open(file)
        image_np = np.array(image)
        # Detect panels/characters, then OCR the page text
        panels, characters = detect_panels_and_characters(image_np)
        text = detect_text(image_np)
        # Match text to characters and characters to each other
        matched_characters = match_text_to_characters(text, characters)
        # NOTE(review): matched_pairs is computed but not yet used by the
        # narration — kept for parity with the intended pipeline.
        matched_pairs = match_character_to_character(characters)
        # Generate narration based on matches
        narration = generate_narration(panels, matched_characters, text)
        narrations.append(narration)
        # Update progress bar
        progress_bar.progress((idx + 1) / total_images)
        # Display the current image and its narration
        st.image(image, caption=f"Image {idx + 1}")
        st.write(narration)
    # BUG FIX: the original called narrations.reverse() INSIDE the loop,
    # re-reversing the accumulated list on every iteration and scrambling
    # the final order. Reverse exactly once, after all pages are processed.
    if reading_order == "Right-to-Left":
        narrations.reverse()
    return narrations
# Script entry: runs on every Streamlit rerun once files are uploaded.
if uploaded_files:
    # Process uploaded images (also renders each image + narration inline)
    narrations = process_images(uploaded_files)
    # Show final results after processing all images
    st.write("Narration Summary for All Images:")
    st.write("\n\n".join(narrations))
else:
    # No files yet — prompt the user
    st.write("Please upload manga images to get started.")
|