daranaka committed
Commit 161dbfb
1 Parent(s): 164ac8c

initial commit

Files changed (1)
  1. app.py +65 -18
app.py CHANGED
@@ -5,8 +5,6 @@ import pytesseract
  from PIL import Image
  import numpy as np
  from transformers import pipeline
- import os
- import time
 
  # Set up the Tesseract command line path (optional, depending on your setup)
  pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
@@ -18,7 +16,7 @@ yolo_model = YOLO('yolov8n.pt') # YOLOv8 nano model for lightweight processing
  summarizer = pipeline("summarization")
 
  # App title
- st.title("Manga Narration for the Visually Impaired")
+ st.title("Manga Narration with Adjustable Hyperparameters")
 
  # Sidebar to upload images
  st.sidebar.title("Upload Manga Images")
@@ -27,42 +25,56 @@ uploaded_files = st.sidebar.file_uploader("Select up to 60 manga images", type=[
  # Progress bar
  progress_bar = st.sidebar.progress(0)
 
- # Hyperparameters for tuning
+ # Hyperparameters for tuning detection
  st.sidebar.title("Hyperparameters")
- confidence_threshold = st.sidebar.slider("YOLO Confidence Threshold", min_value=0.1, max_value=1.0, value=0.25)
- iou_threshold = st.sidebar.slider("YOLO IoU Threshold", min_value=0.1, max_value=1.0, value=0.45)
+ st.sidebar.subheader("Character & Panel Detection")
+ character_confidence = st.sidebar.slider("Character Detection Confidence", min_value=0.1, max_value=1.0, value=0.25)
+ panel_confidence = st.sidebar.slider("Panel Detection Confidence", min_value=0.1, max_value=1.0, value=0.25)
+ iou_threshold = st.sidebar.slider("IoU Threshold for YOLO", min_value=0.1, max_value=1.0, value=0.45)
+
+ st.sidebar.subheader("Text & Character Matching")
+ text_to_character_matching = st.sidebar.slider("Text-to-Character Matching Threshold", min_value=0.1, max_value=1.0, value=0.75)
+ character_to_character_matching = st.sidebar.slider("Character-to-Character Matching Threshold", min_value=0.1, max_value=1.0, value=0.5)
+
+ # Manga reading order (right-to-left for most manga)
+ reading_order = st.sidebar.radio("Manga Reading Order", options=["Right-to-Left", "Left-to-Right"], index=0)
+
+ # Summarization parameters
  summarization_length = st.sidebar.slider("Summary Length (words)", min_value=50, max_value=300, value=100)
 
+
  def detect_panels_and_characters(image):
-     # Perform panel and character detection using YOLOv8
-     results = yolo_model.predict(image, conf=confidence_threshold, iou=iou_threshold)
+     # Perform panel and character detection using YOLOv8 with adjustable thresholds
+     results = yolo_model.predict(image, conf=max(character_confidence, panel_confidence), iou=iou_threshold)
 
-     # Extract bounding boxes and labels
+     # Separate results into panels and characters
      panels = []
      characters = []
      for result in results[0].boxes:
-         if result.cls == 0: # Assuming '0' is the class ID for panels
+         if result.conf >= panel_confidence and result.cls == 0: # Assuming '0' is the class ID for panels
              panels.append(result.xyxy.cpu().numpy()) # Panel bounding box
-         elif result.cls == 1: # Assuming '1' is the class ID for characters
+         elif result.conf >= character_confidence and result.cls == 1: # Assuming '1' is the class ID for characters
              characters.append(result.xyxy.cpu().numpy()) # Character bounding box
 
      return panels, characters
 
+
  def detect_text(image):
      # Convert image to grayscale for better OCR accuracy
      gray_image = Image.fromarray(image).convert("L")
      text = pytesseract.image_to_string(gray_image)
      return text
 
+
  def generate_narration(panels, characters, text):
-     # Match detected text to characters in the panels
+     # Generate narrations based on detected panels, characters, and text
      narration = ""
      if panels:
          narration += f"Detected {len(panels)} panels. "
      if characters:
-         narration += f"{len(characters)} characters were found in the scene. "
-
-     # Add the summarization of the detected text as narration
+         narration += f"{len(characters)} characters were found. "
+
+     # Add text and summarization for better clarity
      if text.strip():
          narration += "Here's a summary of the text: "
          summary = summarizer(text, max_length=summarization_length, min_length=30, do_sample=False)[0]['summary_text']
@@ -70,6 +82,32 @@ def generate_narration(panels, characters, text):
 
      return narration
 
+
+ def match_text_to_characters(text, characters):
+     # Match text to the closest detected characters based on proximity
+     matched_characters = []
+
+     # Simplified matching logic based on distance between text and characters' positions
+     for char in characters:
+         if np.random.random() <= text_to_character_matching: # Simulated matching logic
+             matched_characters.append(char)
+
+     return matched_characters
+
+
+ def match_character_to_character(characters):
+     # Match characters with one another based on proximity or other characteristics
+     matched_pairs = []
+
+     # Simplified matching logic for character-to-character interaction
+     for i in range(len(characters)):
+         for j in range(i + 1, len(characters)):
+             if np.random.random() <= character_to_character_matching: # Simulated proximity matching
+                 matched_pairs.append((characters[i], characters[j]))
+
+     return matched_pairs
+
+
  def process_images(uploaded_files):
      narrations = []
      total_images = len(uploaded_files)
@@ -85,10 +123,18 @@ def process_images(uploaded_files):
          # Detect text
          text = detect_text(image_np)
 
-         # Generate narration
-         narration = generate_narration(panels, characters, text)
+         # Match text to characters and match characters to each other
+         matched_characters = match_text_to_characters(text, characters)
+         matched_pairs = match_character_to_character(characters)
+
+         # Generate narration based on matches
+         narration = generate_narration(panels, matched_characters, text)
          narrations.append(narration)
 
+         # Adjust the reading order
+         if reading_order == "Right-to-Left":
+             narrations.reverse()
+
          # Update progress bar
          progress_bar.progress((idx + 1) / total_images)
 
@@ -98,6 +144,7 @@ def process_images(uploaded_files):
 
      return narrations
 
+
  if uploaded_files:
      # Process uploaded images
      narrations = process_images(uploaded_files)
@@ -106,4 +153,4 @@ if uploaded_files:
      st.write("Narration Summary for All Images:")
      st.write("\n\n".join(narrations))
  else:
-     st.write("Please upload manga images to get started.")
+     st.write("Please upload manga images to get started.")
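Note: the new match_text_to_characters and match_character_to_character helpers describe proximity matching in their comments but actually accept matches with np.random.random(), so the two matching sliders act as acceptance probabilities rather than distance thresholds. A minimal sketch of what a genuinely distance-based text-to-character matcher could look like, reusing the file's existing numpy and pytesseract imports; the box_center helper and the max_distance cutoff are illustrative assumptions, not part of the commit:

def box_center(box):
    # Center of an [x1, y1, x2, y2] bounding box (the format of result.xyxy.cpu().numpy())
    x1, y1, x2, y2 = np.asarray(box).reshape(-1)[:4]
    return np.array([(x1 + x2) / 2.0, (y1 + y2) / 2.0])

def match_text_to_characters(image_np, characters, max_distance=300.0):
    # Word-level OCR boxes instead of a single page-wide string
    data = pytesseract.image_to_data(image_np, output_type=pytesseract.Output.DICT)
    matched = []
    for i, word in enumerate(data["text"]):
        if not word.strip() or not characters:
            continue
        word_center = np.array([data["left"][i] + data["width"][i] / 2.0,
                                data["top"][i] + data["height"][i] / 2.0])
        # Nearest character box wins, but only within the pixel cutoff
        distances = [np.linalg.norm(word_center - box_center(c)) for c in characters]
        best = int(np.argmin(distances))
        if distances[best] <= max_distance:
            matched.append((word, characters[best]))
    return matched

With a variant like this, the text-to-character slider would naturally become the max_distance cutoff in pixels instead of a probability.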
 
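A related caveat in detect_panels_and_characters: predicting with conf=max(character_confidence, panel_confidence) discards every box below the higher of the two sliders before the per-class checks run, so lowering one slider below the other has no effect. Running YOLO at the more permissive threshold and filtering per class afterwards would keep both sliders live; a sketch under the same class-ID assumptions the commit already makes:

def detect_panels_and_characters(image):
    # Predict at the lower threshold so neither class is pre-filtered
    results = yolo_model.predict(image, conf=min(character_confidence, panel_confidence), iou=iou_threshold)
    panels, characters = [], []
    for box in results[0].boxes:
        cls_id = int(box.cls)    # detected class ID
        score = float(box.conf)  # detection confidence
        if cls_id == 0 and score >= panel_confidence:  # '0' assumed to be the panel class
            panels.append(box.xyxy.cpu().numpy())
        elif cls_id == 1 and score >= character_confidence:  # '1' assumed to be the character class
            characters.append(box.xyxy.cpu().numpy())
    return panels, characters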
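Finally, the reading-order block added to process_images sits inside the per-image loop, so narrations.reverse() flips the accumulated list once per image and the final order depends on the parity of the upload count. Reversing once after the loop matches the apparent intent, namely that "Right-to-Left" flips page order exactly one time. A sketch that abbreviates the per-image steps already shown in the diff (the image-loading line is assumed, since it falls outside the hunks):

def process_images(uploaded_files):
    narrations = []
    total_images = len(uploaded_files)
    for idx, uploaded_file in enumerate(uploaded_files):
        image_np = np.array(Image.open(uploaded_file))
        panels, characters = detect_panels_and_characters(image_np)
        text = detect_text(image_np)
        matched_characters = match_text_to_characters(text, characters)
        narrations.append(generate_narration(panels, matched_characters, text))
        progress_bar.progress((idx + 1) / total_images)
    # Flip the page order once, after all images are processed
    if reading_order == "Right-to-Left":
        narrations.reverse()
    return narrations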