Spaces:

daranaka
/

manga-narrator-ai

Runtime error

App Files Files Community

manga-narrator-ai / app.py

daranaka

Update app.py

06d0589 verified 22 days ago

raw

history blame

5.55 kB

	import streamlit as st
	from transformers import AutoModel
	from PIL import Image
	import torch
	import numpy as np
	import urllib.request

	# Load the model once per server process. st.cache_resource keeps the live
	# object in memory without pickling it, so it is not affected by the
	# PretrainedConfig serialization issue that rules out st.cache_data here.
	@st.cache_resource
	def load_model():
	    """Load the "ragavsachdeva/magi" model and move it to the best device.

	    Streamlit re-executes the whole script on every widget interaction;
	    caching the result as a resource prevents the checkpoint from being
	    reloaded on each rerun.

	    Returns:
	        The model returned by ``AutoModel.from_pretrained``, placed on
	        "cuda" when available and on "cpu" otherwise.
	    """
	    model = AutoModel.from_pretrained("ragavsachdeva/magi", trust_remote_code=True)
	    device = "cuda" if torch.cuda.is_available() else "cpu"
	    model.to(device)
	    return model

	@st.cache_data
	def read_image_as_np_array(image_path):
	    """Read an image and return it as an RGB numpy array.

	    The image is first converted to grayscale ("L") and then back to "RGB",
	    so the returned array has three identical channels.

	    Args:
	        image_path: Either an ``http://`` / ``https://`` URL string, or
	            anything ``PIL.Image.open`` accepts (a filesystem path or a
	            file-like object such as a Streamlit upload).

	    Returns:
	        ``numpy.ndarray`` built from the converted PIL image.
	    """
	    # Only real URL strings go through urllib. A substring test such as
	    # `"http" in image_path` would also match local names containing "http"
	    # and would iterate over (and consume) file-like uploads.
	    is_url = isinstance(image_path, str) and image_path.lower().startswith(
	        ("http://", "https://")
	    )
	    if is_url:
	        # convert() forces the pixel data to be read before the response closes.
	        with urllib.request.urlopen(image_path) as response:
	            image = Image.open(response).convert("L").convert("RGB")
	    else:
	        image = Image.open(image_path).convert("L").convert("RGB")
	    return np.array(image)

	# Instantiate the model at module level; the prediction helpers defined
	# below read it through the global name `model`.
	model = load_model()

	@st.cache_data
	def predict_detections_and_associations(
	    image_path,
	    character_detection_threshold,
	    panel_detection_threshold,
	    text_detection_threshold,
	    character_character_matching_threshold,
	    text_character_matching_threshold,
	):
	    """Run the model's detection/association pass on a single image.

	    The image is loaded with ``read_image_as_np_array`` and handed to the
	    global ``model`` as a one-element batch, with gradients disabled.
	    Results are memoised by ``st.cache_data`` on the arguments.

	    Args:
	        image_path: Image source forwarded to ``read_image_as_np_array``.
	        character_detection_threshold: Forwarded to the model unchanged.
	        panel_detection_threshold: Forwarded to the model unchanged.
	        text_detection_threshold: Forwarded to the model unchanged.
	        character_character_matching_threshold: Forwarded to the model unchanged.
	        text_character_matching_threshold: Forwarded to the model unchanged.

	    Returns:
	        The first (and only) entry of the model's per-image results.
	    """
	    thresholds = {
	        "character_detection_threshold": character_detection_threshold,
	        "panel_detection_threshold": panel_detection_threshold,
	        "text_detection_threshold": text_detection_threshold,
	        "character_character_matching_threshold": character_character_matching_threshold,
	        "text_character_matching_threshold": text_character_matching_threshold,
	    }
	    page = read_image_as_np_array(image_path)
	    with torch.no_grad():
	        per_image_results = model.predict_detections_and_associations([page], **thresholds)
	    return per_image_results[0]

	@st.cache_data
	def predict_ocr(
	    image_path,
	    character_detection_threshold,
	    panel_detection_threshold,
	    text_detection_threshold,
	    character_character_matching_threshold,
	    text_character_matching_threshold,
	):
	    """Run OCR on the text boxes detected in a single image.

	    Detection is delegated to ``predict_detections_and_associations`` with
	    the same arguments; the ``"texts"`` entry of its result is then passed
	    to ``model.predict_ocr`` together with the image, with gradients
	    disabled.

	    The function depends only on its arguments. It must not consult UI
	    state such as the "generate transcript" toggle: ``st.cache_data`` keys
	    on the arguments alone, so an early ``return`` driven by a global would
	    be cached and replayed after the toggle changes. Callers decide whether
	    to invoke it.

	    Args:
	        image_path: Image source forwarded to ``read_image_as_np_array``.
	        character_detection_threshold: Forwarded to the detection pass.
	        panel_detection_threshold: Forwarded to the detection pass.
	        text_detection_threshold: Forwarded to the detection pass.
	        character_character_matching_threshold: Forwarded to the detection pass.
	        text_character_matching_threshold: Forwarded to the detection pass.

	    Returns:
	        Whatever ``model.predict_ocr`` returns for the one-image batch
	        (callers index it with ``[0]`` to get this image's OCR output).
	    """
	    image = read_image_as_np_array(image_path)
	    result = predict_detections_and_associations(
	        image_path,
	        character_detection_threshold,
	        panel_detection_threshold,
	        text_detection_threshold,
	        character_character_matching_threshold,
	        text_character_matching_threshold,
	    )
	    # The model API is batched: one list of text boxes per input image.
	    text_bboxes_for_all_images = [result["texts"]]
	    with torch.no_grad():
	        ocr_results = model.predict_ocr([image], text_bboxes_for_all_images)
	    return ocr_results

	def clear_memory():
	    """Reset the session memory to an empty state and tell the user.

	    Replaces ``st.session_state.memory`` with a fresh dict holding an empty
	    ``"characters"`` mapping and an empty ``"transcript"`` string.
	    """
	    empty_memory = {"characters": {}, "transcript": ""}
	    st.session_state["memory"] = empty_memory
	    st.write("Memory cleared.")

	# NOTE: `model` is already instantiated right after the image-loading helper
	# above; loading it a second time here only repeated the expensive load.

	# Display header and UI components
	st.markdown(""" <style> ... styles here ... </style> """, unsafe_allow_html=True)
	path_to_image = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])

	# Memory control button: resets the accumulated characters/transcript
	st.button("Clear Memory", on_click=clear_memory)

	# Sidebar: which outputs to produce for the uploaded page
	st.sidebar.markdown("Mode")
	generate_detections_and_associations = st.sidebar.toggle("Generate detections and associations", True)
	generate_transcript = st.sidebar.toggle("Generate transcript (slower)", False)

	# Sidebar: thresholds forwarded unchanged to the prediction helpers
	st.sidebar.markdown("Hyperparameters")
	input_character_detection_threshold = st.sidebar.slider('Character detection threshold', 0.0, 1.0, 0.30, step=0.01)
	input_panel_detection_threshold = st.sidebar.slider('Panel detection threshold', 0.0, 1.0, 0.2, step=0.01)
	input_text_detection_threshold = st.sidebar.slider('Text detection threshold', 0.0, 1.0, 0.25, step=0.01)
	input_character_character_matching_threshold = st.sidebar.slider('Character-character matching threshold', 0.0, 1.0, 0.7, step=0.01)
	input_text_character_matching_threshold = st.sidebar.slider('Text-character matching threshold', 0.0, 1.0, 0.4, step=0.01)

	# The memory dict is otherwise only created by the "Clear Memory" callback,
	# so make sure it exists before anything below reads or appends to it.
	if "memory" not in st.session_state:
	    st.session_state.memory = {"characters": {}, "transcript": ""}

	if path_to_image is not None:
	    image = read_image_as_np_array(path_to_image)
	    st.markdown("Prediction")

	    if generate_detections_and_associations or generate_transcript:
	        # Both modes need the detection result (OCR uses its text boxes).
	        result = predict_detections_and_associations(
	            path_to_image,
	            input_character_detection_threshold,
	            input_panel_detection_threshold,
	            input_text_detection_threshold,
	            input_character_character_matching_threshold,
	            input_text_character_matching_threshold,
	        )

	        if generate_detections_and_associations:
	            output = model.visualise_single_image_prediction(image, result)
	            st.image(output)

	            # Update character memory based on detected characters.
	            # NOTE(review): dict.update needs a mapping (or key/value pairs);
	            # the shape of result["characters"] is not visible here, so only
	            # merge when it is mapping-like instead of crashing - confirm
	            # against the model's output format.
	            detected_characters = result.get("characters", {})
	            if hasattr(detected_characters, "keys"):
	                st.session_state.memory["characters"].update(detected_characters)

	        if generate_transcript:
	            # OCR is only computed (and only referenced) in transcript mode,
	            # so `ocr_results` is always defined where it is used.
	            ocr_results = predict_ocr(
	                path_to_image,
	                input_character_detection_threshold,
	                input_panel_detection_threshold,
	                input_text_detection_threshold,
	                input_character_character_matching_threshold,
	                input_text_character_matching_threshold,
	            )

	            # Append the current transcript to the ongoing transcript in memory.
	            # NOTE(review): Streamlit reruns the script on every interaction,
	            # so the same page may be appended more than once - confirm
	            # whether de-duplication is wanted.
	            transcript = model.generate_transcript_for_single_image(result, ocr_results[0])
	            st.session_state.memory["transcript"] += transcript + "\n"

	            # Display the cumulative transcript from memory
	            st.text(st.session_state.memory["transcript"])