daranaka committed on
Commit
f31366c
·
verified ·
1 Parent(s): 5be894e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -98
app.py CHANGED
@@ -4,8 +4,9 @@ from PIL import Image
4
  import torch
5
  import numpy as np
6
  import urllib.request
 
7
 
8
- # Load the model
9
  @st.cache_resource
10
  def load_model():
11
  model = AutoModel.from_pretrained("ragavsachdeva/magi", trust_remote_code=True)
@@ -13,7 +14,7 @@ def load_model():
13
  model.to(device)
14
  return model
15
 
16
- # Read image
17
  @st.cache_data
18
  def read_image_as_np_array(image_path):
19
  if "http" in image_path:
@@ -45,113 +46,100 @@ def predict_detections_and_associations(
45
  )[0]
46
  return result
47
 
48
- # Generate coherent narrative
49
- def generate_story_from_manga(image_path, result):
50
- # Extract characters, dialogues, and panels
51
- characters = result["characters"]
52
- dialogues = result["texts"]
53
-
54
- # Start building the story
55
- story = []
56
- for i, panel in enumerate(result["panels"]):
57
- story.append(f"Panel {i+1}:")
58
- for char_id, character in enumerate(characters[i]):
59
- dialogue = dialogues[char_id] if char_id < len(dialogues) else "..."
60
- story.append(f"{character}: {dialogue}")
61
- story.append("") # Separate panels
62
- return "\n".join(story)
 
 
 
 
 
 
 
 
63
 
64
- # App Layout
 
 
 
 
 
 
 
 
 
65
  model = load_model()
66
 
67
- st.markdown("""
68
- <style>
69
- .title-container {
70
- background-color: #0d1117;
71
- padding: 20px;
72
- border-radius: 10px;
73
- margin: 20px;
74
- }
75
- .title {
76
- font-size: 2em;
77
- text-align: center;
78
- color: #fff;
79
- font-family: 'Comic Sans MS', cursive;
80
- text-transform: uppercase;
81
- letter-spacing: 0.1em;
82
- padding: 0.5em 0 0.2em;
83
- background: 0 0;
84
- }
85
- .title span {
86
- background: -webkit-linear-gradient(45deg, #6495ed, #4169e1);
87
- -webkit-background-clip: text;
88
- -webkit-text-fill-color: transparent;
89
- }
90
- .subheading {
91
- font-size: 1.5em;
92
- text-align: center;
93
- color: #ddd;
94
- font-family: 'Comic Sans MS', cursive;
95
- }
96
- .affil, .authors {
97
- font-size: 1em;
98
- text-align: center;
99
- color: #ddd;
100
- font-family: 'Comic Sans MS', cursive;
101
- }
102
- .authors {
103
- padding-top: 1em;
104
- }
105
- </style>
106
- <div class='title-container'>
107
- <div class='title'>
108
- The <span>Ma</span>n<span>g</span>a Wh<span>i</span>sperer
109
- </div>
110
- <div class='subheading'>
111
- Automatically Generating Transcriptions for Comics
112
- </div>
113
- <div class='authors'>
114
- Ragav Sachdeva and Andrew Zisserman
115
- </div>
116
- <div class='affil'>
117
- University of Oxford
118
- </div>
119
- </div>
120
- """, unsafe_allow_html=True)
121
 
122
- # File uploader for images
123
  path_to_image = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])
124
 
125
- # Sidebar Hyperparameters
126
- st.sidebar.markdown("**Mode**")
127
- generate_detections_and_associations = st.sidebar.checkbox("Generate detections and associations", True)
128
  st.sidebar.markdown("**Hyperparameters**")
129
- input_character_detection_threshold = st.sidebar.slider('Character detection threshold', 0.0, 1.0, 0.30, step=0.01)
130
- input_panel_detection_threshold = st.sidebar.slider('Panel detection threshold', 0.0, 1.0, 0.2, step=0.01)
131
- input_text_detection_threshold = st.sidebar.slider('Text detection threshold', 0.0, 1.0, 0.25, step=0.01)
132
- input_character_character_matching_threshold = st.sidebar.slider('Character-character matching threshold', 0.0, 1.0, 0.7, step=0.01)
133
- input_text_character_matching_threshold = st.sidebar.slider('Text-character matching threshold', 0.0, 1.0, 0.4, step=0.01)
134
 
135
- # If an image is uploaded, process it
136
  if path_to_image is not None:
137
- image = read_image_as_np_array(path_to_image)
138
-
139
  st.markdown("**Prediction**")
140
- if generate_detections_and_associations:
 
 
 
141
  result = predict_detections_and_associations(
142
  path_to_image,
143
- input_character_detection_threshold,
144
- input_panel_detection_threshold,
145
- input_text_detection_threshold,
146
- input_character_character_matching_threshold,
147
- input_text_character_matching_threshold,
148
  )
149
-
150
- # Generate and display the story
151
- story = generate_story_from_manga(path_to_image, result)
152
- st.markdown("### Generated Story:")
153
- st.text(story)
154
 
155
- # Display detection visualization
156
- output = model.visualise_single_image_prediction(image, result)
157
- st.image(output)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import torch
5
  import numpy as np
6
  import urllib.request
7
+ import subprocess
8
 
9
+ # Load model
10
  @st.cache_resource
11
  def load_model():
12
  model = AutoModel.from_pretrained("ragavsachdeva/magi", trust_remote_code=True)
 
14
  model.to(device)
15
  return model
16
 
17
+ # Read image as numpy array
18
  @st.cache_data
19
  def read_image_as_np_array(image_path):
20
  if "http" in image_path:
 
46
  )[0]
47
  return result
48
 
49
# OCR prediction for transcript
@st.cache_data
def predict_ocr(
    image_path,
    character_detection_threshold,
    panel_detection_threshold,
    text_detection_threshold,
    character_character_matching_threshold,
    text_character_matching_threshold,
):
    """Run OCR over the text regions detected in ``image_path``.

    Loads the image, runs detection with the given thresholds to obtain
    the text bounding boxes, then feeds the image plus those boxes to
    the model's OCR head as a single-image batch.

    Returns:
        Whatever ``model.predict_ocr`` returns for one image —
        presumably the recognised text per box (TODO: confirm against
        the magi model's API).

    NOTE(review): reads the module-level ``model``, which is assigned
    after this definition; safe because calls only happen later at
    runtime, once the script body has executed.
    """
    image = read_image_as_np_array(image_path)
    result = predict_detections_and_associations(
        image_path,
        character_detection_threshold,
        panel_detection_threshold,
        text_detection_threshold,
        character_character_matching_threshold,
        text_character_matching_threshold,
    )
    # Wrap in a one-element list: the model API takes per-image lists.
    text_bboxes_for_all_images = [result["texts"]]
    with torch.no_grad():  # inference only — no gradients needed
        ocr_results = model.predict_ocr([image], text_bboxes_for_all_images)
    return ocr_results
72
 
73
# Run a shell command and capture everything it prints.
def run_command(command):
    """Execute ``command`` through the shell and return its combined
    stdout + stderr as a single string; if launching fails, return the
    exception text instead of raising.

    SECURITY NOTE(review): ``shell=True`` on user-supplied input allows
    arbitrary command execution. That is the point of this "terminal"
    feature, but this app must never be exposed to untrusted users.
    """
    try:
        completed = subprocess.run(
            command, shell=True, text=True, capture_output=True
        )
    except Exception as exc:  # e.g. OS-level failure spawning the shell
        return str(exc)
    return completed.stdout + completed.stderr
81
+
82
# Load the model once (cached by @st.cache_resource in load_model).
model = load_model()

# UI Design
st.markdown("""<style>
.title-container { background-color: #0d1117; padding: 20px; border-radius: 10px; margin: 20px; }
.title { font-size: 2em; text-align: center; color: #fff; font-family: 'Comic Sans MS', cursive; text-transform: uppercase; letter-spacing: 0.1em; padding: 0.5em 0 0.2em; background: 0 0; }
.title span { background: -webkit-linear-gradient(45deg, #6495ed, #4169e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; }
.subheading { font-size: 1.5em; text-align: center; color: #ddd; font-family: 'Comic Sans MS', cursive; }
</style>""", unsafe_allow_html=True)

st.title("Manga Narrator and Terminal App")

# File uploader for image
path_to_image = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])

# Sidebar with hyperparameters
st.sidebar.markdown("**Hyperparameters**")
character_detection_threshold = st.sidebar.slider('Character detection threshold', 0.0, 1.0, 0.30, step=0.01)
panel_detection_threshold = st.sidebar.slider('Panel detection threshold', 0.0, 1.0, 0.2, step=0.01)
text_detection_threshold = st.sidebar.slider('Text detection threshold', 0.0, 1.0, 0.25, step=0.01)
character_character_matching_threshold = st.sidebar.slider('Character-character matching threshold', 0.0, 1.0, 0.7, step=0.01)
text_character_matching_threshold = st.sidebar.slider('Text-character matching threshold', 0.0, 1.0, 0.4, step=0.01)

# Generate Narration button
if path_to_image is not None:
    st.markdown("**Prediction**")

    # Button to generate narration
    if st.button("Generate Narration"):
        # Detections and associations (characters, panels, text boxes).
        result = predict_detections_and_associations(
            path_to_image,
            character_detection_threshold,
            panel_detection_threshold,
            text_detection_threshold,
            character_character_matching_threshold,
            text_character_matching_threshold,
        )

        # OCR result
        ocr_results = predict_ocr(
            path_to_image,
            character_detection_threshold,
            panel_detection_threshold,
            text_detection_threshold,
            character_character_matching_threshold,
            text_character_matching_threshold,
        )

        # FIX: the detection result carries no 'image' or 'narration'
        # keys, so the previous `st.image(result['image'])` /
        # `result.get("narration", ...)` calls could never show anything
        # useful (and `ocr_results` went unused). Render the model's
        # annotated visualisation and the OCR transcript instead.
        image = read_image_as_np_array(path_to_image)
        output = model.visualise_single_image_prediction(image, result)
        st.image(output, caption="Detected Panels and Characters")
        # NOTE(review): assumes predict_ocr returns a per-image list of
        # recognised strings — handle both flat and nested shapes.
        if ocr_results and isinstance(ocr_results[0], (list, tuple)):
            texts = ocr_results[0]
        else:
            texts = ocr_results or []
        narration = "\n".join(str(t) for t in texts)
        st.text_area("Narration", narration or "Narration not available.")

# Terminal command input
st.markdown("**Terminal**")
command_input = st.text_input("Enter a command", key='input')
if st.button("Run Command"):
    if command_input:
        # Execute command
        output = run_command(command_input)
        # Display output
        st.text_area("Terminal Output", value=output, height=300)
145
+