daranaka committed on
Commit
f31366c
·
verified ·
1 Parent(s): 5be894e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -98
app.py CHANGED
@@ -4,8 +4,9 @@ from PIL import Image
4
  import torch
5
  import numpy as np
6
  import urllib.request
 
7
 
8
- # Load the model
9
  @st.cache_resource
10
  def load_model():
11
  model = AutoModel.from_pretrained("ragavsachdeva/magi", trust_remote_code=True)
@@ -13,7 +14,7 @@ def load_model():
13
  model.to(device)
14
  return model
15
 
16
- # Read image
17
  @st.cache_data
18
  def read_image_as_np_array(image_path):
19
  if "http" in image_path:
@@ -45,113 +46,100 @@ def predict_detections_and_associations(
45
  )[0]
46
  return result
47
 
48
- # Generate coherent narrative
49
- def generate_story_from_manga(image_path, result):
50
- # Extract characters, dialogues, and panels
51
- characters = result["characters"]
52
- dialogues = result["texts"]
53
-
54
- # Start building the story
55
- story = []
56
- for i, panel in enumerate(result["panels"]):
57
- story.append(f"Panel {i+1}:")
58
- for char_id, character in enumerate(characters[i]):
59
- dialogue = dialogues[char_id] if char_id < len(dialogues) else "..."
60
- story.append(f"{character}: {dialogue}")
61
- story.append("") # Separate panels
62
- return "\n".join(story)
 
 
 
 
 
 
 
 
63
 
64
- # App Layout
 
 
 
 
 
 
 
 
 
65
  model = load_model()
66
 
67
- st.markdown("""
68
- <style>
69
- .title-container {
70
- background-color: #0d1117;
71
- padding: 20px;
72
- border-radius: 10px;
73
- margin: 20px;
74
- }
75
- .title {
76
- font-size: 2em;
77
- text-align: center;
78
- color: #fff;
79
- font-family: 'Comic Sans MS', cursive;
80
- text-transform: uppercase;
81
- letter-spacing: 0.1em;
82
- padding: 0.5em 0 0.2em;
83
- background: 0 0;
84
- }
85
- .title span {
86
- background: -webkit-linear-gradient(45deg, #6495ed, #4169e1);
87
- -webkit-background-clip: text;
88
- -webkit-text-fill-color: transparent;
89
- }
90
- .subheading {
91
- font-size: 1.5em;
92
- text-align: center;
93
- color: #ddd;
94
- font-family: 'Comic Sans MS', cursive;
95
- }
96
- .affil, .authors {
97
- font-size: 1em;
98
- text-align: center;
99
- color: #ddd;
100
- font-family: 'Comic Sans MS', cursive;
101
- }
102
- .authors {
103
- padding-top: 1em;
104
- }
105
- </style>
106
- <div class='title-container'>
107
- <div class='title'>
108
- The <span>Ma</span>n<span>g</span>a Wh<span>i</span>sperer
109
- </div>
110
- <div class='subheading'>
111
- Automatically Generating Transcriptions for Comics
112
- </div>
113
- <div class='authors'>
114
- Ragav Sachdeva and Andrew Zisserman
115
- </div>
116
- <div class='affil'>
117
- University of Oxford
118
- </div>
119
- </div>
120
- """, unsafe_allow_html=True)
121
 
122
- # File uploader for images
123
  path_to_image = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])
124
 
125
- # Sidebar Hyperparameters
126
- st.sidebar.markdown("**Mode**")
127
- generate_detections_and_associations = st.sidebar.checkbox("Generate detections and associations", True)
128
  st.sidebar.markdown("**Hyperparameters**")
129
- input_character_detection_threshold = st.sidebar.slider('Character detection threshold', 0.0, 1.0, 0.30, step=0.01)
130
- input_panel_detection_threshold = st.sidebar.slider('Panel detection threshold', 0.0, 1.0, 0.2, step=0.01)
131
- input_text_detection_threshold = st.sidebar.slider('Text detection threshold', 0.0, 1.0, 0.25, step=0.01)
132
- input_character_character_matching_threshold = st.sidebar.slider('Character-character matching threshold', 0.0, 1.0, 0.7, step=0.01)
133
- input_text_character_matching_threshold = st.sidebar.slider('Text-character matching threshold', 0.0, 1.0, 0.4, step=0.01)
134
 
135
- # If an image is uploaded, process it
136
  if path_to_image is not None:
137
- image = read_image_as_np_array(path_to_image)
138
-
139
  st.markdown("**Prediction**")
140
- if generate_detections_and_associations:
 
 
 
141
  result = predict_detections_and_associations(
142
  path_to_image,
143
- input_character_detection_threshold,
144
- input_panel_detection_threshold,
145
- input_text_detection_threshold,
146
- input_character_character_matching_threshold,
147
- input_text_character_matching_threshold,
148
  )
149
-
150
- # Generate and display the story
151
- story = generate_story_from_manga(path_to_image, result)
152
- st.markdown("### Generated Story:")
153
- st.text(story)
154
 
155
- # Display detection visualization
156
- output = model.visualise_single_image_prediction(image, result)
157
- st.image(output)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import torch
5
  import numpy as np
6
  import urllib.request
7
+ import subprocess
8
 
9
+ # Load model
10
  @st.cache_resource
11
  def load_model():
12
  model = AutoModel.from_pretrained("ragavsachdeva/magi", trust_remote_code=True)
 
14
  model.to(device)
15
  return model
16
 
17
+ # Read image as numpy array
18
  @st.cache_data
19
  def read_image_as_np_array(image_path):
20
  if "http" in image_path:
 
46
  )[0]
47
  return result
48
 
49
# OCR prediction for transcript
@st.cache_data
def predict_ocr(
    image_path,
    character_detection_threshold,
    panel_detection_threshold,
    text_detection_threshold,
    character_character_matching_threshold,
    text_character_matching_threshold,
):
    """Run OCR over the text regions detected in ``image_path``.

    Loads the image, runs detection with the given thresholds to obtain
    the text bounding boxes, then feeds the image plus those boxes to
    the model's OCR head as a single-image batch.

    Returns:
        Whatever ``model.predict_ocr`` returns for one image —
        presumably the recognised text per box (TODO: confirm against
        the magi model's API).

    NOTE(review): reads the module-level ``model``, which is assigned
    after this definition; safe because calls only happen later at
    runtime, once the script body has executed.
    """
    image = read_image_as_np_array(image_path)
    result = predict_detections_and_associations(
        image_path,
        character_detection_threshold,
        panel_detection_threshold,
        text_detection_threshold,
        character_character_matching_threshold,
        text_character_matching_threshold,
    )
    # Wrap in a one-element list: the model API takes per-image lists.
    text_bboxes_for_all_images = [result["texts"]]
    with torch.no_grad():  # inference only — no gradients needed
        ocr_results = model.predict_ocr([image], text_bboxes_for_all_images)
    return ocr_results
72
 
73
# Run a shell command and capture everything it prints.
def run_command(command):
    """Execute ``command`` through the shell and return its combined
    stdout + stderr as a single string; if launching fails, return the
    exception text instead of raising.

    SECURITY NOTE(review): ``shell=True`` on user-supplied input allows
    arbitrary command execution. That is the point of this "terminal"
    feature, but this app must never be exposed to untrusted users.
    """
    try:
        completed = subprocess.run(
            command, shell=True, text=True, capture_output=True
        )
    except Exception as exc:  # e.g. OS-level failure spawning the shell
        return str(exc)
    return completed.stdout + completed.stderr
81
+
82
# Load the model once (cached by @st.cache_resource in load_model).
model = load_model()

# UI Design
st.markdown("""<style>
.title-container { background-color: #0d1117; padding: 20px; border-radius: 10px; margin: 20px; }
.title { font-size: 2em; text-align: center; color: #fff; font-family: 'Comic Sans MS', cursive; text-transform: uppercase; letter-spacing: 0.1em; padding: 0.5em 0 0.2em; background: 0 0; }
.title span { background: -webkit-linear-gradient(45deg, #6495ed, #4169e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; }
.subheading { font-size: 1.5em; text-align: center; color: #ddd; font-family: 'Comic Sans MS', cursive; }
</style>""", unsafe_allow_html=True)

st.title("Manga Narrator and Terminal App")

# File uploader for image
path_to_image = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])

# Sidebar with hyperparameters
st.sidebar.markdown("**Hyperparameters**")
character_detection_threshold = st.sidebar.slider('Character detection threshold', 0.0, 1.0, 0.30, step=0.01)
panel_detection_threshold = st.sidebar.slider('Panel detection threshold', 0.0, 1.0, 0.2, step=0.01)
text_detection_threshold = st.sidebar.slider('Text detection threshold', 0.0, 1.0, 0.25, step=0.01)
character_character_matching_threshold = st.sidebar.slider('Character-character matching threshold', 0.0, 1.0, 0.7, step=0.01)
text_character_matching_threshold = st.sidebar.slider('Text-character matching threshold', 0.0, 1.0, 0.4, step=0.01)

# Generate Narration button
if path_to_image is not None:
    st.markdown("**Prediction**")

    # Button to generate narration
    if st.button("Generate Narration"):
        # Detections and associations (characters, panels, text boxes).
        result = predict_detections_and_associations(
            path_to_image,
            character_detection_threshold,
            panel_detection_threshold,
            text_detection_threshold,
            character_character_matching_threshold,
            text_character_matching_threshold,
        )

        # OCR result
        ocr_results = predict_ocr(
            path_to_image,
            character_detection_threshold,
            panel_detection_threshold,
            text_detection_threshold,
            character_character_matching_threshold,
            text_character_matching_threshold,
        )

        # FIX: the detection result carries no 'image' or 'narration'
        # keys, so the previous `st.image(result['image'])` /
        # `result.get("narration", ...)` calls could never show anything
        # useful (and `ocr_results` went unused). Render the model's
        # annotated visualisation and the OCR transcript instead.
        image = read_image_as_np_array(path_to_image)
        output = model.visualise_single_image_prediction(image, result)
        st.image(output, caption="Detected Panels and Characters")
        # NOTE(review): assumes predict_ocr returns a per-image list of
        # recognised strings — handle both flat and nested shapes.
        if ocr_results and isinstance(ocr_results[0], (list, tuple)):
            texts = ocr_results[0]
        else:
            texts = ocr_results or []
        narration = "\n".join(str(t) for t in texts)
        st.text_area("Narration", narration or "Narration not available.")

# Terminal command input
st.markdown("**Terminal**")
command_input = st.text_input("Enter a command", key='input')
if st.button("Run Command"):
    if command_input:
        # Execute command
        output = run_command(command_input)
        # Display output
        st.text_area("Terminal Output", value=output, height=300)
145
+