daranaka committed
Commit 161dbfb
1 Parent(s): 164ac8c

initial commit

Files changed (1)
  1. app.py +65 -18
app.py CHANGED
@@ -5,8 +5,6 @@ import pytesseract
  from PIL import Image
  import numpy as np
  from transformers import pipeline
- import os
- import time
 
  # Set up the Tesseract command line path (optional, depending on your setup)
  pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
@@ -18,7 +16,7 @@ yolo_model = YOLO('yolov8n.pt') # YOLOv8 nano model for lightweight processing
  summarizer = pipeline("summarization")
 
  # App title
- st.title("Manga Narration for the Visually Impaired")
+ st.title("Manga Narration with Adjustable Hyperparameters")
 
  # Sidebar to upload images
  st.sidebar.title("Upload Manga Images")
@@ -27,42 +25,56 @@ uploaded_files = st.sidebar.file_uploader("Select up to 60 manga images", type=[
  # Progress bar
  progress_bar = st.sidebar.progress(0)
 
- # Hyperparameters for tuning
+ # Hyperparameters for tuning detection
  st.sidebar.title("Hyperparameters")
- confidence_threshold = st.sidebar.slider("YOLO Confidence Threshold", min_value=0.1, max_value=1.0, value=0.25)
- iou_threshold = st.sidebar.slider("YOLO IoU Threshold", min_value=0.1, max_value=1.0, value=0.45)
+ st.sidebar.subheader("Character & Panel Detection")
+ character_confidence = st.sidebar.slider("Character Detection Confidence", min_value=0.1, max_value=1.0, value=0.25)
+ panel_confidence = st.sidebar.slider("Panel Detection Confidence", min_value=0.1, max_value=1.0, value=0.25)
+ iou_threshold = st.sidebar.slider("IoU Threshold for YOLO", min_value=0.1, max_value=1.0, value=0.45)
+
+ st.sidebar.subheader("Text & Character Matching")
+ text_to_character_matching = st.sidebar.slider("Text-to-Character Matching Threshold", min_value=0.1, max_value=1.0, value=0.75)
+ character_to_character_matching = st.sidebar.slider("Character-to-Character Matching Threshold", min_value=0.1, max_value=1.0, value=0.5)
+
+ # Manga reading order (right-to-left for most manga)
+ reading_order = st.sidebar.radio("Manga Reading Order", options=["Right-to-Left", "Left-to-Right"], index=0)
+
+ # Summarization parameters
  summarization_length = st.sidebar.slider("Summary Length (words)", min_value=50, max_value=300, value=100)
 
+
  def detect_panels_and_characters(image):
-     # Perform panel and character detection using YOLOv8
-     results = yolo_model.predict(image, conf=confidence_threshold, iou=iou_threshold)
+     # Perform panel and character detection using YOLOv8 with adjustable thresholds
+     results = yolo_model.predict(image, conf=max(character_confidence, panel_confidence), iou=iou_threshold)
 
-     # Extract bounding boxes and labels
+     # Separate results into panels and characters
      panels = []
      characters = []
      for result in results[0].boxes:
-         if result.cls == 0: # Assuming '0' is the class ID for panels
+         if result.conf >= panel_confidence and result.cls == 0: # Assuming '0' is the class ID for panels
              panels.append(result.xyxy.cpu().numpy()) # Panel bounding box
-         elif result.cls == 1: # Assuming '1' is the class ID for characters
+         elif result.conf >= character_confidence and result.cls == 1: # Assuming '1' is the class ID for characters
              characters.append(result.xyxy.cpu().numpy()) # Character bounding box
 
      return panels, characters
 
+
  def detect_text(image):
      # Convert image to grayscale for better OCR accuracy
      gray_image = Image.fromarray(image).convert("L")
      text = pytesseract.image_to_string(gray_image)
      return text
 
+
  def generate_narration(panels, characters, text):
-     # Match detected text to characters in the panels
+     # Generate narrations based on detected panels, characters, and text
      narration = ""
      if panels:
          narration += f"Detected {len(panels)} panels. "
      if characters:
-         narration += f"{len(characters)} characters were found in the scene. "
-
-     # Add the summarization of the detected text as narration
+         narration += f"{len(characters)} characters were found. "
+
+     # Add text and summarization for better clarity
      if text.strip():
          narration += "Here's a summary of the text: "
          summary = summarizer(text, max_length=summarization_length, min_length=30, do_sample=False)[0]['summary_text']
@@ -70,6 +82,32 @@ def generate_narration(panels, characters, text):
 
      return narration
 
+
+ def match_text_to_characters(text, characters):
+     # Match text to the closest detected characters based on proximity
+     matched_characters = []
+
+     # Simplified matching logic based on distance between text and characters' positions
+     for char in characters:
+         if np.random.random() <= text_to_character_matching: # Simulated matching logic
+             matched_characters.append(char)
+
+     return matched_characters
+
+
+ def match_character_to_character(characters):
+     # Match characters with one another based on proximity or other characteristics
+     matched_pairs = []
+
+     # Simplified matching logic for character-to-character interaction
+     for i in range(len(characters)):
+         for j in range(i + 1, len(characters)):
+             if np.random.random() <= character_to_character_matching: # Simulated proximity matching
+                 matched_pairs.append((characters[i], characters[j]))
+
+     return matched_pairs
+
+
  def process_images(uploaded_files):
      narrations = []
      total_images = len(uploaded_files)
@@ -85,10 +123,18 @@ def process_images(uploaded_files):
          # Detect text
          text = detect_text(image_np)
 
-         # Generate narration
-         narration = generate_narration(panels, characters, text)
+         # Match text to characters and match characters to each other
+         matched_characters = match_text_to_characters(text, characters)
+         matched_pairs = match_character_to_character(characters)
+
+         # Generate narration based on matches
+         narration = generate_narration(panels, matched_characters, text)
          narrations.append(narration)
 
+         # Adjust the reading order
+         if reading_order == "Right-to-Left":
+             narrations.reverse()
+
          # Update progress bar
          progress_bar.progress((idx + 1) / total_images)
 
@@ -98,6 +144,7 @@ def process_images(uploaded_files):
 
      return narrations
 
+
  if uploaded_files:
      # Process uploaded images
      narrations = process_images(uploaded_files)
@@ -106,4 +153,4 @@ if uploaded_files:
      st.write("Narration Summary for All Images:")
      st.write("\n\n".join(narrations))
  else:
-     st.write("Please upload manga images to get started.")
+     st.write("Please upload manga images to get started.")
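Note: the new match_text_to_characters and match_character_to_character helpers describe proximity matching in their comments but actually accept matches with np.random.random(), so the two matching sliders act as acceptance probabilities rather than distance thresholds. A minimal sketch of what a genuinely distance-based text-to-character matcher could look like, reusing the file's existing numpy and pytesseract imports; the box_center helper and the max_distance cutoff are illustrative assumptions, not part of the commit:

def box_center(box):
    # Center of an [x1, y1, x2, y2] bounding box (the format of result.xyxy.cpu().numpy())
    x1, y1, x2, y2 = np.asarray(box).reshape(-1)[:4]
    return np.array([(x1 + x2) / 2.0, (y1 + y2) / 2.0])

def match_text_to_characters(image_np, characters, max_distance=300.0):
    # Word-level OCR boxes instead of a single page-wide string
    data = pytesseract.image_to_data(image_np, output_type=pytesseract.Output.DICT)
    matched = []
    for i, word in enumerate(data["text"]):
        if not word.strip() or not characters:
            continue
        word_center = np.array([data["left"][i] + data["width"][i] / 2.0,
                                data["top"][i] + data["height"][i] / 2.0])
        # Nearest character box wins, but only within the pixel cutoff
        distances = [np.linalg.norm(word_center - box_center(c)) for c in characters]
        best = int(np.argmin(distances))
        if distances[best] <= max_distance:
            matched.append((word, characters[best]))
    return matched

With a variant like this, the text-to-character slider would naturally become the max_distance cutoff in pixels instead of a probability.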
 
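A related caveat in detect_panels_and_characters: predicting with conf=max(character_confidence, panel_confidence) discards every box below the higher of the two sliders before the per-class checks run, so lowering one slider below the other has no effect. Running YOLO at the more permissive threshold and filtering per class afterwards would keep both sliders live; a sketch under the same class-ID assumptions the commit already makes:

def detect_panels_and_characters(image):
    # Predict at the lower threshold so neither class is pre-filtered
    results = yolo_model.predict(image, conf=min(character_confidence, panel_confidence), iou=iou_threshold)
    panels, characters = [], []
    for box in results[0].boxes:
        cls_id = int(box.cls)    # detected class ID
        score = float(box.conf)  # detection confidence
        if cls_id == 0 and score >= panel_confidence:  # '0' assumed to be the panel class
            panels.append(box.xyxy.cpu().numpy())
        elif cls_id == 1 and score >= character_confidence:  # '1' assumed to be the character class
            characters.append(box.xyxy.cpu().numpy())
    return panels, characters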
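Finally, the reading-order block added to process_images sits inside the per-image loop, so narrations.reverse() flips the accumulated list once per image and the final order depends on the parity of the upload count. Reversing once after the loop matches the apparent intent, namely that "Right-to-Left" flips page order exactly one time. A sketch that abbreviates the per-image steps already shown in the diff (the image-loading line is assumed, since it falls outside the hunks):

def process_images(uploaded_files):
    narrations = []
    total_images = len(uploaded_files)
    for idx, uploaded_file in enumerate(uploaded_files):
        image_np = np.array(Image.open(uploaded_file))
        panels, characters = detect_panels_and_characters(image_np)
        text = detect_text(image_np)
        matched_characters = match_text_to_characters(text, characters)
        narrations.append(generate_narration(panels, matched_characters, text))
        progress_bar.progress((idx + 1) / total_images)
    # Flip the page order once, after all images are processed
    if reading_order == "Right-to-Left":
        narrations.reverse()
    return narrations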