Spaces:
Runtime error
Update app.py
app.py
CHANGED
@@ -1,125 +1,84 @@
 import streamlit as st
-from transformers import AutoModel
 from PIL import Image
-import …
 import numpy as np
-import …
 
[old lines 8-19 not recovered in this diff view]
-    image = Image.open(image_path).convert("L").convert("RGB")
-    image = np.array(image)
-    return image
 
-
-def predict_detections_and_associations(
-    image_path,
-    character_detection_threshold,
-    panel_detection_threshold,
-    text_detection_threshold,
-    character_character_matching_threshold,
-    text_character_matching_threshold,
-):
-    image = read_image_as_np_array(image_path)
-    with torch.no_grad():
-        result = model.predict_detections_and_associations(
-            [image],
-            character_detection_threshold=character_detection_threshold,
-            panel_detection_threshold=panel_detection_threshold,
-            text_detection_threshold=text_detection_threshold,
-            character_character_matching_threshold=character_character_matching_threshold,
-            text_character_matching_threshold=text_character_matching_threshold,
-        )[0]
-    return result
 
-
-def predict_ocr(
-    image_path,
-    character_detection_threshold,
-    panel_detection_threshold,
-    text_detection_threshold,
-    character_character_matching_threshold,
-    text_character_matching_threshold,
-):
-    if not generate_transcript:
-        return
-    image = read_image_as_np_array(image_path)
-    result = predict_detections_and_associations(
-        path_to_image,
-        character_detection_threshold,
-        panel_detection_threshold,
-        text_detection_threshold,
-        character_character_matching_threshold,
-        text_character_matching_threshold,
-    )
-    text_bboxes_for_all_images = [result["texts"]]
-    with torch.no_grad():
-        ocr_results = model.predict_ocr([image], text_bboxes_for_all_images)
-    return ocr_results
 
-
 
-st. …
-
 
[old lines 75-77 not recovered in this diff view]
-st.sidebar.markdown("**Hyperparameters**")
-input_character_detection_threshold = st.sidebar.slider('Character detection threshold', 0.0, 1.0, 0.30, step=0.01)
-input_panel_detection_threshold = st.sidebar.slider('Panel detection threshold', 0.0, 1.0, 0.2, step=0.01)
-input_text_detection_threshold = st.sidebar.slider('Text detection threshold', 0.0, 1.0, 0.25, step=0.01)
-input_character_character_matching_threshold = st.sidebar.slider('Character-character matching threshold', 0.0, 1.0, 0.7, step=0.01)
-input_text_character_matching_threshold = st.sidebar.slider('Text-character matching threshold', 0.0, 1.0, 0.4, step=0.01)
 
 
[old lines 86-88 not recovered in this diff view]
-st.markdown("**Prediction**")
-if generate_detections_and_associations or generate_transcript:
-    result = predict_detections_and_associations(
-        path_to_image,
-        input_character_detection_threshold,
-        input_panel_detection_threshold,
-        input_text_detection_threshold,
-        input_character_character_matching_threshold,
-        input_text_character_matching_threshold,
-    )
-
-if generate_transcript:
-    ocr_results = predict_ocr(
-        path_to_image,
-        input_character_detection_threshold,
-        input_panel_detection_threshold,
-        input_text_detection_threshold,
-        input_character_character_matching_threshold,
-        input_text_character_matching_threshold,
-    )
-
-if generate_detections_and_associations and generate_transcript:
-    col1, col2 = st.columns(2)
-    output = model.visualise_single_image_prediction(image, result)
-    col1.image(output)
-    text_bboxes_for_all_images = [result["texts"]]
-    ocr_results = model.predict_ocr([image], text_bboxes_for_all_images)
-    transcript = model.generate_transcript_for_single_image(result, ocr_results[0])
-    col2.text(transcript)
-
-elif generate_detections_and_associations:
-    output = model.visualise_single_image_prediction(image, result)
-    st.image(output)
-
-elif generate_transcript:
-    transcript = model.generate_transcript_for_single_image(result, ocr_results[0])
-    st.text(transcript)
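Note on the removed version: the old app drove everything through a single transformers AutoModel with custom methods (predict_detections_and_associations, predict_ocr, generate_transcript_for_single_image, visualise_single_image_prediction). The loading call and checkpoint id fall in the lines that were not recovered above; as a rough sketch, a model exposing custom methods like these is typically loaded with remote code enabled (the checkpoint id below is a placeholder, not taken from the diff):

    from transformers import AutoModel
    # Placeholder id: the real checkpoint is in the unrecovered lines above.
    model = AutoModel.from_pretrained("<detection-and-ocr-checkpoint>", trust_remote_code=True)

The replacement below swaps this single model for an OpenCV + DETR + pytesseract pipeline.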
 import streamlit as st
 from PIL import Image
+import cv2
 import numpy as np
+import pytesseract
+import torch
+from torchvision import models, transforms
+from transformers import DetrImageProcessor, DetrForObjectDetection
+
+# Load a pre-trained DETR model for object detection
+processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
+model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
+
+# Image transformations
+transform = transforms.Compose([
+    transforms.ToTensor()
+])
+
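The new imports (cv2, pytesseract, torch, torchvision, transformers, plus the DBSCAN used further down) only resolve if the Space's dependency files list them, and an unresolved import at startup is one plausible cause of the "Runtime error" badge at the top of this page. A sketch of what the dependency files would need for this version, assuming a standard Streamlit Space; exact contents are an assumption, not part of this commit:

    # requirements.txt
    streamlit
    Pillow
    numpy
    opencv-python-headless
    pytesseract
    torch
    torchvision
    transformers
    scikit-learn

    # packages.txt (apt packages; pytesseract needs the tesseract binary)
    tesseract-ocr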
+def detect_panels(image, threshold):
+    # Convert image to grayscale
+    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    edges = cv2.Canny(gray, 100, 200)
+    contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+
+    panels = []
+    for cnt in contours:
+        x, y, w, h = cv2.boundingRect(cnt)
+        if w > threshold and h > threshold:
+            panels.append({"coords": (x, y, w, h)})
+    return panels
+
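detect_panels is called below with np.array(image), where image is a PIL image and therefore in RGB channel order, while cv2.COLOR_BGR2GRAY assumes OpenCV's BGR order. For a grayscale conversion this only changes the channel weighting, but matching the constant to the actual data is safer (a one-line sketch, otherwise identical behaviour):

    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)  # the array comes from PIL, so it is RGB

Note that threshold here is compared against panel width and height in pixels, which is why the panel slider below runs 0-500 rather than 0.0-1.0.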
+def detect_characters(image, threshold):
+    # Apply DETR model to detect characters
+    inputs = processor(images=image, return_tensors="pt")
+    outputs = model(**inputs)
+    logits = outputs.logits
+    bboxes = outputs.pred_boxes
+
+    # Filter results
+    characters = []
+    for logit, box in zip(logits[0], bboxes[0]):
+        if logit.argmax() == 0:  # Assuming '0' corresponds to 'character'
+            x, y, w, h = box * torch.tensor([image.width, image.height, image.width, image.height])
+            if w > threshold and h > threshold:
+                characters.append({"coords": (x.item(), y.item(), w.item(), h.item())})
+    return characters
 
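The raw-logits loop above rests on two assumptions that do not hold for facebook/detr-resnet-50: class index 0 is not a "character" class (the checkpoint is a COCO detector whose logits also include a trailing "no object" class), and pred_boxes are normalized (center_x, center_y, width, height), so multiplying them by (width, height, width, height) does not give pixel corner coordinates. A hedged alternative sketch that reuses the processor and model loaded above and relies on the processor's built-in post-processing; treating COCO's "person" label as a stand-in for a manga character is an assumption, not something this commit does:

    def detect_characters(image, score_threshold=0.7):
        inputs = processor(images=image, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)
        # post_process_object_detection converts normalized (cx, cy, w, h) boxes to
        # absolute (x0, y0, x1, y1) and drops low-score / "no object" predictions.
        target_sizes = torch.tensor([image.size[::-1]])  # PIL size is (w, h); DETR wants (h, w)
        results = processor.post_process_object_detection(
            outputs, threshold=score_threshold, target_sizes=target_sizes
        )[0]
        characters = []
        for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
            if model.config.id2label[label.item()] == "person":  # rough proxy for "character"
                characters.append({"score": score.item(), "coords": box.tolist()})
        return characters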
+def match_text_to_characters(image, panels):
+    text_matches = []
+    for panel in panels:
+        x, y, w, h = map(int, panel['coords'])
+        panel_img = image.crop((x, y, x+w, y+h))
+        text = pytesseract.image_to_string(panel_img)
+        text_matches.append({"panel": panel, "dialog": text})
+    return text_matches
 
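pytesseract.image_to_string defaults to the English eng model and horizontal text segmentation; manga dialogue is usually Japanese and set vertically, which that default will not read. If the corresponding traineddata is installed next to the tesseract binary, the language can be passed explicitly (a sketch; having jpn_vert available on the Space is an assumption):

    text = pytesseract.image_to_string(panel_img, lang="jpn_vert")

Also, despite its name, match_text_to_characters never consults the detected characters: it OCRs each panel crop as a whole and attaches the text to the panel.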
+def match_characters(characters):
+    coords = np.array([((c['coords'][0] + c['coords'][2]) / 2, (c['coords'][1] + c['coords'][3]) / 2) for c in characters])
+    clustering = DBSCAN(eps=20, min_samples=1).fit(coords)
+    character_matches = [{"character": c, "cluster": cluster} for c, cluster in zip(characters, clustering.labels_)]
+    return character_matches
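DBSCAN is referenced here but never imported, so calling match_characters would raise a NameError; as committed, the function is also never invoked from the UI below. If it is meant to be used, the import comes from scikit-learn, and note that for a box stored as (x, y, w, h) the centre is (x + w/2, y + h/2), not ((x + w)/2, (y + h)/2). A sketch under the assumption that coords follows the (x, y, w, h) convention used by detect_panels:

    from sklearn.cluster import DBSCAN

    def match_characters(characters):
        if not characters:
            return []
        # Cluster character boxes by their centre points; eps is in pixels.
        centres = np.array([(c["coords"][0] + c["coords"][2] / 2,
                             c["coords"][1] + c["coords"][3] / 2) for c in characters])
        clustering = DBSCAN(eps=20, min_samples=1).fit(centres)
        return [{"character": c, "cluster": int(label)}
                for c, label in zip(characters, clustering.labels_)]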
 
+# Streamlit UI
+st.title("Advanced Manga Reader")
 
+uploaded_file = st.file_uploader("Upload a manga page", type=["jpg", "png"])
 
+if uploaded_file is not None:
+    image = Image.open(uploaded_file).convert('RGB')
+    st.image(image, caption='Uploaded Manga Page', use_column_width=True)
 
+    panel_threshold = st.slider("Panel Detection Threshold", 0, 500, 100)
+    character_threshold = st.slider("Character Detection Threshold", 0.0, 50.0, 10.0)
 
+    panels = detect_panels(np.array(image), panel_threshold)
+    characters = detect_characters(image, character_threshold)
+    dialogues = match_text_to_characters(image, panels)
 
+    st.write("Detected Panels:", panels)
+    st.write("Detected Characters:", characters)
+    st.write("Dialogues:", dialogues)
 
+    for dialogue in dialogues:
+        st.write(f"Panel: {dialogue['dialog']}")
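One last practical note: Streamlit re-executes the whole script on every widget interaction, so the module-level DetrImageProcessor and DetrForObjectDetection loading above is re-run each time. Wrapping the load in st.cache_resource keeps a single copy per process (a sketch, not something this commit does):

    @st.cache_resource
    def load_detr():
        processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
        model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
        return processor, model

    processor, model = load_detr()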