Spaces:

daranaka
/

manga-narrator-ai

Runtime error

App Files Files Community

manga-narrator-ai / app.py

daranaka

Update app.py

78e178a verified 22 days ago

raw

history blame

5.93 kB

	import streamlit as st
	from transformers import AutoModel
	from PIL import Image
	import torch
	import numpy as np
	import urllib.request

	# Create the per-session memory store the first time the script runs;
	# later reruns find the key already present and leave it untouched.
	if "memory" not in st.session_state:
	    st.session_state["memory"] = dict(characters={}, transcript="")

	@st.cache_resource
	def _load_model_or_raise():
	    """Load the Magi model, move it to GPU when available, and cache it.

	    Errors propagate to the caller on purpose: a call that raises is not
	    stored by ``st.cache_resource``, so a transient failure (for example a
	    network error during download) is retried on the next rerun instead of
	    being cached as ``None``.
	    """
	    model = AutoModel.from_pretrained("ragavsachdeva/magi", trust_remote_code=True)
	    device = "cuda" if torch.cuda.is_available() else "cpu"
	    model.to(device)
	    return model


	def load_model():
	    """Return the cached Magi model, or ``None`` if it cannot be loaded.

	    On failure the error is shown in the app via ``st.error`` and ``None``
	    is returned, so callers must check the result before using it.
	    """
	    try:
	        return _load_model_or_raise()
	    except Exception as e:
	        st.error(f"Error loading model: {e}")
	        return None


	# Keep the cache-clearing hook that the decorated function used to expose.
	load_model.clear = _load_model_or_raise.clear

	@st.cache_data
	def read_image_as_np_array(image_path):
	    """Read an image and return it as a grayscale-derived RGB numpy array.

	    Args:
	        image_path: An ``http://``/``https://`` URL string, a local path, or
	            a file-like object (such as a Streamlit upload) that
	            ``PIL.Image.open`` accepts.

	    Returns:
	        The image converted to "L" and then to "RGB" as a numpy array, or
	        ``None`` if reading failed (the error is shown via ``st.error``).
	    """
	    try:
	        # Only strings can be URLs. Uploaded files are file-like objects, and
	        # a substring test on them would iterate the stream rather than
	        # inspect a name, so check the type and the scheme prefix explicitly.
	        is_url = isinstance(image_path, str) and image_path.lower().startswith(
	            ("http://", "https://")
	        )
	        if is_url:
	            # convert() forces the pixel data to be read, so the response
	            # can be closed safely when the with-block exits.
	            with urllib.request.urlopen(image_path) as response:
	                image = Image.open(response).convert("L").convert("RGB")
	        else:
	            image = Image.open(image_path).convert("L").convert("RGB")
	        return np.array(image)
	    except Exception as e:
	        st.error(f"Error reading image: {e}")
	        return None

	@st.cache_data
	def predict_detections_and_associations(
	    image_path,
	    char_detect_thresh,
	    panel_detect_thresh,
	    text_detect_thresh,
	    char_char_match_thresh,
	    text_char_match_thresh,
	):
	    """Run the model's detection/association step on a single image.

	    The image is read with ``read_image_as_np_array``; the five thresholds
	    are forwarded to ``model.predict_detections_and_associations`` under
	    their long keyword names.

	    Returns:
	        The first (only) entry of the model's per-image results, or ``None``
	        when the image could not be read or the prediction raised (the
	        error is shown via ``st.error``).
	    """
	    page = read_image_as_np_array(image_path)
	    if page is None:
	        return None

	    thresholds = {
	        "character_detection_threshold": char_detect_thresh,
	        "panel_detection_threshold": panel_detect_thresh,
	        "text_detection_threshold": text_detect_thresh,
	        "character_character_matching_threshold": char_char_match_thresh,
	        "text_character_matching_threshold": text_char_match_thresh,
	    }
	    try:
	        # Inference only: no gradients are needed.
	        with torch.no_grad():
	            per_image = model.predict_detections_and_associations([page], **thresholds)
	            return per_image[0]
	    except Exception as e:
	        st.error(f"Error during prediction: {e}")
	        return None

	@st.cache_data
	def predict_ocr(
	    image_path,
	    character_detection_threshold,
	    panel_detection_threshold,
	    text_detection_threshold,
	    character_character_matching_threshold,
	    text_character_matching_threshold,
	):
	    """Run OCR over the text boxes detected in a single image.

	    Detection is delegated to ``predict_detections_and_associations`` with
	    the same thresholds; its ``"texts"`` entry is passed to
	    ``model.predict_ocr`` as the text boxes for the image.

	    Returns:
	        The value returned by ``model.predict_ocr`` for the one-image batch,
	        or ``None`` when transcripts are disabled, the image could not be
	        read, or detection failed.
	    """
	    # NOTE(review): this reads the module-level toggle, which is not part of
	    # the cache key. Callers should only invoke this when the toggle is on.
	    if not generate_transcript:
	        return None

	    image = read_image_as_np_array(image_path)
	    if image is None:
	        # The reader has already reported the error.
	        return None

	    result = predict_detections_and_associations(
	        image_path,
	        character_detection_threshold,
	        panel_detection_threshold,
	        text_detection_threshold,
	        character_character_matching_threshold,
	        text_character_matching_threshold,
	    )
	    if result is None:
	        # Detection failed (and reported it); there are no boxes to OCR.
	        return None

	    text_bboxes_for_all_images = [result["texts"]]
	    # Inference only: no gradients are needed.
	    with torch.no_grad():
	        ocr_results = model.predict_ocr([image], text_bboxes_for_all_images)
	    return ocr_results

	def clear_memory():
	    """Reset the session memory to its empty state and confirm it in the app."""
	    st.session_state["memory"] = dict(characters={}, transcript="")
	    st.write("Memory cleared.")

	# Module-level model handle used by the prediction helpers.
	model = load_model()

	# Display header and UI components
	st.markdown(""" <style> ... styles here ... </style> """, unsafe_allow_html=True)
	path_to_image = st.file_uploader(label="Upload an image", type=["png", "jpg", "jpeg"])

	# Memory control button
	st.button(label="Clear Memory", on_click=clear_memory)

	# Sidebar: which outputs to produce.
	st.sidebar.markdown("Mode")
	generate_detections_and_associations = st.sidebar.toggle(
	    "Generate detections and associations", value=True
	)
	generate_transcript = st.sidebar.toggle("Generate transcript (slower)", value=False)

	# Sidebar: thresholds forwarded to the model, each in [0, 1] in 0.01 steps.
	st.sidebar.markdown("Hyperparameters")
	input_character_detection_threshold = st.sidebar.slider(
	    "Character detection threshold", min_value=0.0, max_value=1.0, value=0.30, step=0.01
	)
	input_panel_detection_threshold = st.sidebar.slider(
	    "Panel detection threshold", min_value=0.0, max_value=1.0, value=0.2, step=0.01
	)
	input_text_detection_threshold = st.sidebar.slider(
	    "Text detection threshold", min_value=0.0, max_value=1.0, value=0.25, step=0.01
	)
	input_character_character_matching_threshold = st.sidebar.slider(
	    "Character-character matching threshold", min_value=0.0, max_value=1.0, value=0.7, step=0.01
	)
	input_text_character_matching_threshold = st.sidebar.slider(
	    "Text-character matching threshold", min_value=0.0, max_value=1.0, value=0.4, step=0.01
	)

	# Main flow: runs once per script rerun when a file has been uploaded.
	if path_to_image is not None:
	    image = read_image_as_np_array(path_to_image)
	    st.markdown("Prediction")

	    # Bind both names up front so later steps can test them instead of
	    # hitting a NameError when a toggle is off or an earlier step failed.
	    result = None
	    ocr_results = None

	    # The helpers report their own errors via st.error, so a missing model
	    # or unreadable image just skips the dependent steps here.
	    inputs_ready = model is not None and image is not None

	    if inputs_ready and (generate_detections_and_associations or generate_transcript):
	        result = predict_detections_and_associations(
	            path_to_image,
	            input_character_detection_threshold,
	            input_panel_detection_threshold,
	            input_text_detection_threshold,
	            input_character_character_matching_threshold,
	            input_text_character_matching_threshold,
	        )

	    if result is not None and generate_transcript:
	        ocr_results = predict_ocr(
	            path_to_image,
	            input_character_detection_threshold,
	            input_panel_detection_threshold,
	            input_text_detection_threshold,
	            input_character_character_matching_threshold,
	            input_text_character_matching_threshold,
	        )

	    if result is not None and generate_detections_and_associations:
	        output = model.visualise_single_image_prediction(image, result)
	        st.image(output)

	        # Update character memory based on detected characters.
	        # NOTE(review): the schema of result["characters"] is not visible
	        # here. dict.update() raises on anything that is not a mapping or
	        # an iterable of pairs, so only mappings are merged. Confirm the
	        # intended storage format if the model returns something else.
	        detected_characters = result.get("characters", {})
	        if isinstance(detected_characters, dict):
	            st.session_state.memory["characters"].update(detected_characters)

	    if result is not None and ocr_results:
	        # Append the current transcript to the ongoing transcript in memory.
	        # NOTE(review): Streamlit reruns the script on every widget change,
	        # so the same page may be appended more than once. Confirm whether
	        # de-duplication is wanted.
	        transcript = model.generate_transcript_for_single_image(result, ocr_results[0])
	        st.session_state.memory["transcript"] += transcript + "\n"

	    if generate_transcript:
	        # Display the cumulative transcript from memory.
	        st.text(st.session_state.memory["transcript"])