daranaka committed on
Commit 580bfba · verified · 1 Parent(s): f31366c

Update app.py

Files changed (1)
  1. app.py +47 -67
app.py CHANGED
@@ -4,9 +4,7 @@ from PIL import Image
  import torch
  import numpy as np
  import urllib.request
- import subprocess

- # Load model
  @st.cache_resource
  def load_model():
      model = AutoModel.from_pretrained("ragavsachdeva/magi", trust_remote_code=True)
@@ -14,7 +12,6 @@ def load_model():
      model.to(device)
      return model

- # Read image as numpy array
  @st.cache_data
  def read_image_as_np_array(image_path):
      if "http" in image_path:
@@ -24,7 +21,6 @@ def read_image_as_np_array(image_path):
      image = np.array(image)
      return image

- # Predict detections and associations
  @st.cache_data
  def predict_detections_and_associations(
      image_path,
@@ -46,7 +42,6 @@ def predict_detections_and_associations(
          )[0]
      return result

- # OCR prediction for transcript
  @st.cache_data
  def predict_ocr(
      image_path,
@@ -56,9 +51,11 @@ def predict_ocr(
      character_character_matching_threshold,
      text_character_matching_threshold,
  ):
+     if not generate_transcript:
+         return
      image = read_image_as_np_array(image_path)
      result = predict_detections_and_associations(
-         image_path,
+         path_to_image,
          character_detection_threshold,
          panel_detection_threshold,
          text_detection_threshold,
@@ -70,76 +67,59 @@ def predict_ocr(
          ocr_results = model.predict_ocr([image], text_bboxes_for_all_images)
      return ocr_results

- # Terminal command function
- def run_command(command):
-     try:
-         result = subprocess.run(command, shell=True, text=True, capture_output=True)
-         output = result.stdout + result.stderr
-         return output
-     except Exception as e:
-         return str(e)
-
- # Load the model
  model = load_model()

- # UI Design
- st.markdown("""<style>
-     .title-container { background-color: #0d1117; padding: 20px; border-radius: 10px; margin: 20px; }
-     .title { font-size: 2em; text-align: center; color: #fff; font-family: 'Comic Sans MS', cursive; text-transform: uppercase; letter-spacing: 0.1em; padding: 0.5em 0 0.2em; background: 0 0; }
-     .title span { background: -webkit-linear-gradient(45deg, #6495ed, #4169e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; }
-     .subheading { font-size: 1.5em; text-align: center; color: #ddd; font-family: 'Comic Sans MS', cursive; }
- </style>""", unsafe_allow_html=True)
-
- st.title("Manga Narrator and Terminal App")
-
- # File uploader for image
+ st.markdown(""" <style> .title-container { background-color: #0d1117; padding: 20px; border-radius: 10px; margin: 20px; } .title { font-size: 2em; text-align: center; color: #fff; font-family: 'Comic Sans MS', cursive; text-transform: uppercase; letter-spacing: 0.1em; padding: 0.5em 0 0.2em; background: 0 0; } .title span { background: -webkit-linear-gradient(45deg, #6495ed, #4169e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; } .subheading { font-size: 1.5em; text-align: center; color: #ddd; font-family: 'Comic Sans MS', cursive; } .affil, .authors { font-size: 1em; text-align: center; color: #ddd; font-family: 'Comic Sans MS', cursive; } .authors { padding-top: 1em; } </style> <div class='title-container'> <div class='title'> The <span>Ma</span>n<span>g</span>a Wh<span>i</span>sperer </div> <div class='subheading'> Automatically Generating Transcriptions for Comics </div> <div class='authors'> Ragav Sachdeva and Andrew Zisserman </div> <div class='affil'> University of Oxford </div> </div>""", unsafe_allow_html=True)
  path_to_image = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])

- # Sidebar with hyperparameters
+ st.sidebar.markdown("**Mode**")
+ generate_detections_and_associations = st.sidebar.toggle("Generate detections and associations", True)
+ generate_transcript = st.sidebar.toggle("Generate transcript (slower)", False)
  st.sidebar.markdown("**Hyperparameters**")
- character_detection_threshold = st.sidebar.slider('Character detection threshold', 0.0, 1.0, 0.30, step=0.01)
- panel_detection_threshold = st.sidebar.slider('Panel detection threshold', 0.0, 1.0, 0.2, step=0.01)
- text_detection_threshold = st.sidebar.slider('Text detection threshold', 0.0, 1.0, 0.25, step=0.01)
- character_character_matching_threshold = st.sidebar.slider('Character-character matching threshold', 0.0, 1.0, 0.7, step=0.01)
- text_character_matching_threshold = st.sidebar.slider('Text-character matching threshold', 0.0, 1.0, 0.4, step=0.01)
+ input_character_detection_threshold = st.sidebar.slider('Character detection threshold', 0.0, 1.0, 0.30, step=0.01)
+ input_panel_detection_threshold = st.sidebar.slider('Panel detection threshold', 0.0, 1.0, 0.2, step=0.01)
+ input_text_detection_threshold = st.sidebar.slider('Text detection threshold', 0.0, 1.0, 0.25, step=0.01)
+ input_character_character_matching_threshold = st.sidebar.slider('Character-character matching threshold', 0.0, 1.0, 0.7, step=0.01)
+ input_text_character_matching_threshold = st.sidebar.slider('Text-character matching threshold', 0.0, 1.0, 0.4, step=0.01)
+

- # Generate Narration button
  if path_to_image is not None:
-     st.markdown("**Prediction**")
+     image = read_image_as_np_array(path_to_image)

-     # Button to generate narration
-     if st.button("Generate Narration"):
-         # Generate detections and associations
+     st.markdown("**Prediction**")
+     if generate_detections_and_associations or generate_transcript:
          result = predict_detections_and_associations(
-             path_to_image,
-             character_detection_threshold,
-             panel_detection_threshold,
-             text_detection_threshold,
-             character_character_matching_threshold,
-             text_character_matching_threshold,
-         )
-
-         # OCR result
+             path_to_image,
+             input_character_detection_threshold,
+             input_panel_detection_threshold,
+             input_text_detection_threshold,
+             input_character_character_matching_threshold,
+             input_text_character_matching_threshold,
+         )
+
+     if generate_transcript:
          ocr_results = predict_ocr(
              path_to_image,
-             character_detection_threshold,
-             panel_detection_threshold,
-             text_detection_threshold,
-             character_character_matching_threshold,
-             text_character_matching_threshold,
+             input_character_detection_threshold,
+             input_panel_detection_threshold,
+             input_text_detection_threshold,
+             input_character_character_matching_threshold,
+             input_text_character_matching_threshold,
          )
-
-         # Display results
-         st.image(result['image'], caption="Detected Panels and Characters")
-         st.text_area("Narration", result.get("narration", "Narration not available."))
-
- # Terminal command input
- st.markdown("**Terminal**")
- command_input = st.text_input("Enter a command", key='input')
- if st.button("Run Command"):
-     if command_input:
-         # Execute command
-         output = run_command(command_input)
-         # Display output
-         st.text_area("Terminal Output", value=output, height=300)
-
+
+     if generate_detections_and_associations and generate_transcript:
+         col1, col2 = st.columns(2)
+         output = model.visualise_single_image_prediction(image, result)
+         col1.image(output)
+         text_bboxes_for_all_images = [result["texts"]]
+         ocr_results = model.predict_ocr([image], text_bboxes_for_all_images)
+         transcript = model.generate_transcript_for_single_image(result, ocr_results[0])
+         col2.text(transcript)
+
+     elif generate_detections_and_associations:
+         output = model.visualise_single_image_prediction(image, result)
+         st.image(output)
+
+     elif generate_transcript:
+         transcript = model.generate_transcript_for_single_image(result, ocr_results[0])
+         st.text(transcript)
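
For reference, the detection, OCR, and transcript flow that the updated app.py wires into Streamlit can also be exercised directly. The sketch below is a minimal, non-authoritative example: the batched predict_detections_and_associations call and its default thresholds are assumptions (that call site is elided in the hunks above), while predict_ocr, generate_transcript_for_single_image, result["texts"], and the "ragavsachdeva/magi" checkpoint are taken from the code shown; "page.png" is a placeholder path.

# Minimal standalone sketch (not part of the commit); assumes the Magi model's
# batched predict_detections_and_associations API with default thresholds.
from transformers import AutoModel
from PIL import Image
import numpy as np
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained("ragavsachdeva/magi", trust_remote_code=True).to(device)
model.eval()

# "page.png" is a hypothetical local manga page; the app reads uploads or URLs instead.
image = np.array(Image.open("page.png").convert("RGB"))

with torch.no_grad():
    # Panels, characters, text boxes and their associations for one page.
    result = model.predict_detections_and_associations([image])[0]
    # OCR over the detected text boxes (same call as in app.py above).
    ocr_results = model.predict_ocr([image], [result["texts"]])

# Per-page transcript, as in the "Generate transcript (slower)" branch of the app.
transcript = model.generate_transcript_for_single_image(result, ocr_results[0])
print(transcript)

In the Streamlit app these steps are wrapped in @st.cache_resource and @st.cache_data so the sidebar sliders and toggles can be adjusted without reloading the model or recomputing unchanged predictions.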