daranaka committed
Commit 585854e
1 Parent(s): 6552ee7

Update app.py

Files changed (1):
  1. app.py +140 -81
app.py CHANGED
@@ -1,84 +1,143 @@
 import streamlit as st
+from transformers import AutoModel
 from PIL import Image
-import cv2
-import numpy as np
-import pytesseract
 import torch
-from torchvision import models, transforms
-from transformers import DetrImageProcessor, DetrForObjectDetection
-
-# Load a pre-trained DETR model for object detection
-processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
-model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
-
-# Image transformations
-transform = transforms.Compose([
-    transforms.ToTensor()
-])
-
-def detect_panels(image, threshold):
-    # Convert image to grayscale
-    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-    edges = cv2.Canny(gray, 100, 200)
-    contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-
-    panels = []
-    for cnt in contours:
-        x, y, w, h = cv2.boundingRect(cnt)
-        if w > threshold and h > threshold:
-            panels.append({"coords": (x, y, w, h)})
-    return panels
-
-def detect_characters(image, threshold):
-    # Apply DETR model to detect characters
-    inputs = processor(images=image, return_tensors="pt")
-    outputs = model(**inputs)
-    logits = outputs.logits
-    bboxes = outputs.pred_boxes
-
-    # Filter results
-    characters = []
-    for logit, box in zip(logits[0], bboxes[0]):
-        if logit.argmax() == 0:  # Assuming '0' corresponds to 'character'
-            x, y, w, h = box * torch.tensor([image.width, image.height, image.width, image.height])
-            if w > threshold and h > threshold:
-                characters.append({"coords": (x.item(), y.item(), w.item(), h.item())})
-    return characters
-
-def match_text_to_characters(image, panels):
-    text_matches = []
-    for panel in panels:
-        x, y, w, h = map(int, panel['coords'])
-        panel_img = image.crop((x, y, x+w, y+h))
-        text = pytesseract.image_to_string(panel_img)
-        text_matches.append({"panel": panel, "dialog": text})
-    return text_matches
-
-def match_characters(characters):
-    coords = np.array([((c['coords'][0] + c['coords'][2]) / 2, (c['coords'][1] + c['coords'][3]) / 2) for c in characters])
-    clustering = DBSCAN(eps=20, min_samples=1).fit(coords)
-    character_matches = [{"character": c, "cluster": cluster} for c, cluster in zip(characters, clustering.labels_)]
-    return character_matches
-
-# Streamlit UI
-st.title("Advanced Manga Reader")
-
-uploaded_file = st.file_uploader("Upload a manga page", type=["jpg", "png"])
-
-if uploaded_file is not None:
-    image = Image.open(uploaded_file).convert('RGB')
-    st.image(image, caption='Uploaded Manga Page', use_column_width=True)
-
-    panel_threshold = st.slider("Panel Detection Threshold", 0, 500, 100)
-    character_threshold = st.slider("Character Detection Threshold", 0.0, 50.0, 10.0)
-
-    panels = detect_panels(np.array(image), panel_threshold)
-    characters = detect_characters(image, character_threshold)
-    dialogues = match_text_to_characters(image, panels)
-
-    st.write("Detected Panels:", panels)
-    st.write("Detected Characters:", characters)
-    st.write("Dialogues:", dialogues)
-
-    for dialogue in dialogues:
-        st.write(f"Panel: {dialogue['dialog']}")
+import numpy as np
+import urllib.request
+
+# Initialize session state for memory if not already
+if "memory" not in st.session_state:
+    st.session_state.memory = {"characters": {}, "transcript": ""}
+
+@st.cache_resource
+def load_model():
+    model = AutoModel.from_pretrained("ragavsachdeva/magi", trust_remote_code=True)
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model.to(device)
+    return model
+
+@st.cache_data
+def read_image_as_np_array(image_path):
+    if "http" in image_path:
+        image = Image.open(urllib.request.urlopen(image_path)).convert("L").convert("RGB")
+    else:
+        image = Image.open(image_path).convert("L").convert("RGB")
+    image = np.array(image)
+    return image
+
+@st.cache_data
+def predict_detections_and_associations(
+    image_path,
+    character_detection_threshold,
+    panel_detection_threshold,
+    text_detection_threshold,
+    character_character_matching_threshold,
+    text_character_matching_threshold,
+):
+    image = read_image_as_np_array(image_path)
+    with torch.no_grad():
+        result = model.predict_detections_and_associations(
+            [image],
+            character_detection_threshold=character_detection_threshold,
+            panel_detection_threshold=panel_detection_threshold,
+            text_detection_threshold=text_detection_threshold,
+            character_character_matching_threshold=character_character_matching_threshold,
+            text_character_matching_threshold=text_character_matching_threshold,
+        )[0]
+    return result
+
+@st.cache_data
+def predict_ocr(
+    image_path,
+    character_detection_threshold,
+    panel_detection_threshold,
+    text_detection_threshold,
+    character_character_matching_threshold,
+    text_character_matching_threshold,
+):
+    if not generate_transcript:
+        return
+    image = read_image_as_np_array(image_path)
+    result = predict_detections_and_associations(
+        image_path,
+        character_detection_threshold,
+        panel_detection_threshold,
+        text_detection_threshold,
+        character_character_matching_threshold,
+        text_character_matching_threshold,
+    )
+    text_bboxes_for_all_images = [result["texts"]]
+    with torch.no_grad():
+        ocr_results = model.predict_ocr([image], text_bboxes_for_all_images)
+    return ocr_results
+
+def clear_memory():
+    st.session_state.memory = {"characters": {}, "transcript": ""}
+    st.write("Memory cleared.")
+
+model = load_model()
+
+# Display header and UI components
+st.markdown(""" <style> ... styles here ... </style> """, unsafe_allow_html=True)
+path_to_image = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])
+
+# Memory control button
+st.button("Clear Memory", on_click=clear_memory)
+
+st.sidebar.markdown("**Mode**")
+generate_detections_and_associations = st.sidebar.toggle("Generate detections and associations", True)
+generate_transcript = st.sidebar.toggle("Generate transcript (slower)", False)
+
+st.sidebar.markdown("**Hyperparameters**")
+input_character_detection_threshold = st.sidebar.slider('Character detection threshold', 0.0, 1.0, 0.30, step=0.01)
+input_panel_detection_threshold = st.sidebar.slider('Panel detection threshold', 0.0, 1.0, 0.2, step=0.01)
+input_text_detection_threshold = st.sidebar.slider('Text detection threshold', 0.0, 1.0, 0.25, step=0.01)
+input_character_character_matching_threshold = st.sidebar.slider('Character-character matching threshold', 0.0, 1.0, 0.7, step=0.01)
+input_text_character_matching_threshold = st.sidebar.slider('Text-character matching threshold', 0.0, 1.0, 0.4, step=0.01)
+
+if path_to_image is not None:
+    image = read_image_as_np_array(path_to_image)
+    st.markdown("**Prediction**")
+
+    if generate_detections_and_associations or generate_transcript:
+        result = predict_detections_and_associations(
+            path_to_image,
+            input_character_detection_threshold,
+            input_panel_detection_threshold,
+            input_text_detection_threshold,
+            input_character_character_matching_threshold,
+            input_text_character_matching_threshold,
+        )
+
+    if generate_transcript:
+        ocr_results = predict_ocr(
+            path_to_image,
+            input_character_detection_threshold,
+            input_panel_detection_threshold,
+            input_text_detection_threshold,
+            input_character_character_matching_threshold,
+            input_text_character_matching_threshold,
+        )
+
+    # Append new characters and transcript to memory
+    if generate_detections_and_associations and generate_transcript:
+        output = model.visualise_single_image_prediction(image, result)
+        st.image(output)
+        # Update character memory based on detected characters
+        detected_characters = result.get("characters", {})
+        st.session_state.memory["characters"].update(detected_characters)
+
+        # Append the current transcript to the ongoing transcript in memory
+        transcript = model.generate_transcript_for_single_image(result, ocr_results[0])
+        st.session_state.memory["transcript"] += transcript + "\n"
+
+        # Display the cumulative transcript from memory
+        st.text(st.session_state.memory["transcript"])
+
+    elif generate_detections_and_associations:
+        output = model.visualise_single_image_prediction(image, result)
+        st.image(output)
+
+    elif generate_transcript:
+        # Display the cumulative transcript
+        st.text(st.session_state.memory["transcript"])
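
For reference, the magi API that this commit switches to can be exercised outside Streamlit. A minimal sketch, assuming the same methods and default thresholds used in the diff above; "page.png" is a placeholder path:

import numpy as np
import torch
from PIL import Image
from transformers import AutoModel

# Load the magi model once, on GPU when available (mirrors load_model above)
model = AutoModel.from_pretrained("ragavsachdeva/magi", trust_remote_code=True)
model.to("cuda" if torch.cuda.is_available() else "cpu")

# Grayscale-then-RGB conversion, matching read_image_as_np_array in the app
image = np.array(Image.open("page.png").convert("L").convert("RGB"))

with torch.no_grad():
    # Detections and associations for a single page, using the app's defaults
    result = model.predict_detections_and_associations(
        [image],
        character_detection_threshold=0.30,
        panel_detection_threshold=0.2,
        text_detection_threshold=0.25,
        character_character_matching_threshold=0.7,
        text_character_matching_threshold=0.4,
    )[0]
    # OCR over the detected text boxes, then a per-page transcript
    ocr_results = model.predict_ocr([image], [result["texts"]])

print(model.generate_transcript_for_single_image(result, ocr_results[0]))

The app itself is launched with streamlit run app.py. The detections-and-associations pass runs on every rerun, while the transcript pass sits behind its own toggle because, as the sidebar label notes, it is slower.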