Spaces:

K00B404
/

Image_to_role

Runtime error

App Files Files Community

Image_to_role / app.py

K00B404

Update app.py

2d1cfc3 verified 4 months ago

raw

history blame contribute delete

3.8 kB

	import gradio as gr
	import transformers
	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer
	from PIL import Image
	import warnings

	# Keep the console quiet: ignore Python warnings, switch off the
	# transformers progress bars, and restrict transformers logging to errors.
	warnings.filterwarnings('ignore')
	transformers.logging.disable_progress_bar()
	transformers.logging.set_verbosity_error()

	# Initialize model and tokenizer
	def load_model(device='cpu'):
	    """Load the 'qnguyen3/nanoLLaVA' model together with its tokenizer.

	    Both objects are fetched with ``trust_remote_code=True``. The model is
	    requested in ``torch.float16`` with ``device_map='auto'``.

	    Args:
	        device: Currently unused. NOTE(review): the body never reads this
	            argument; placement is decided by ``device_map='auto'``.

	    Returns:
	        A ``(model, tokenizer)`` tuple.
	    """
	    checkpoint = 'qnguyen3/nanoLLaVA'

	    llava = AutoModelForCausalLM.from_pretrained(
	        checkpoint,
	        torch_dtype=torch.float16,
	        device_map='auto',
	        trust_remote_code=True,
	    )
	    llava_tokenizer = AutoTokenizer.from_pretrained(
	        checkpoint,
	        trust_remote_code=True,
	    )
	    return llava, llava_tokenizer

	def generate_caption(image, model, tokenizer):
	    """Ask the vision-language model for a detailed description of *image*.

	    Args:
	        image: The picture to describe; it is handed to
	            ``model.process_images`` as a one-element list.
	        model: Model object exposing ``process_images``, ``config``,
	            ``dtype``, ``device`` and ``generate``.
	        tokenizer: Tokenizer exposing ``apply_chat_template``, ``decode``
	            and ``__call__`` (returning an object with ``input_ids``).

	    Returns:
	        The decoded text generated after the prompt, with special tokens
	        skipped and surrounding whitespace stripped.
	    """
	    # Id spliced in where the '<image>' marker was in the prompt text.
	    # NOTE(review): presumably the image-token index the model's remote code
	    # swaps for image features -- confirm against the model card.
	    image_token_index = -200

	    prompt = "Describe this image in detail"
	    messages = [
	        {"role": "system", "content": "Answer the question"},
	        {"role": "user", "content": f'<image>\n{prompt}'},
	    ]

	    # Render the conversation to plain text; tokenization happens below so
	    # the image placeholder can be inserted between the two halves.
	    text = tokenizer.apply_chat_template(
	        messages,
	        tokenize=False,
	        add_generation_prompt=True,
	    )

	    # Tokenize the text on either side of '<image>' separately.
	    text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]

	    # The model is loaded with device_map='auto', so it may not live on the
	    # CPU. Put the inputs on the model's device to avoid a device mismatch
	    # inside generate(); on a CPU-only host this is a no-op.
	    target_device = model.device
	    input_ids = torch.tensor(
	        text_chunks[0] + [image_token_index] + text_chunks[1],
	        dtype=torch.long,
	    ).unsqueeze(0).to(target_device)
	    image_tensor = model.process_images([image], model.config).to(
	        dtype=model.dtype,
	        device=target_device,
	    )

	    output_ids = model.generate(
	        input_ids,
	        images=image_tensor,
	        max_new_tokens=2048,
	        use_cache=True,
	    )[0]

	    # Drop the first input_ids.shape[1] positions (the prompt length) and
	    # decode only what follows.
	    caption = tokenizer.decode(
	        output_ids[input_ids.shape[1]:],
	        skip_special_tokens=True,
	    ).strip()
	    return caption

	def create_persona(caption):
	    """Wrap an image caption in a ChatML-style system message for a persona.

	    Args:
	        caption: Text description of the image; interpolated verbatim into
	            the prompt.

	    Returns:
	        A string that starts with ``<|im_start|>system`` and ends with
	        ``<|im_end|>``, instructing a chatbot to role-play the described
	        entity.
	    """
	    # The delimiters are written without backslashes: a backslash before
	    # '|' is not a recognised escape, so Python would keep the backslash in
	    # the emitted prompt and the delimiters would be malformed.
	    prompt_lines = [
	        "<|im_start|>system",
	        f"You are a character based on this description: {caption}",
	        "",
	        "Role: An entity exactly as described in the image",
	        "Background: Your appearance and characteristics match the image description",
	        "Personality: Reflect the mood, style, and elements captured in the image",
	        "Goal: Interact authentically based on your visual characteristics",
	        "",
	        "Please stay in character and respond as this entity would, "
	        "incorporating visual elements from your description into your "
	        "responses.<|im_end|>",
	    ]
	    return "\n".join(prompt_lines)

	def process_image_to_persona(image, model, tokenizer):
	    """Turn an uploaded image into a caption and a chatbot persona prompt.

	    Args:
	        image: A PIL image, something ``Image.fromarray`` accepts, or
	            ``None`` when nothing was uploaded.
	        model: Forwarded unchanged to ``generate_caption``.
	        tokenizer: Forwarded unchanged to ``generate_caption``.

	    Returns:
	        A ``(caption, persona)`` tuple. When *image* is ``None`` the tuple is
	        ``("Please upload an image.", "")``.
	    """
	    # Guard clause: nothing to process without an image.
	    if image is None:
	        return "Please upload an image.", ""

	    # Anything that is not already a PIL image goes through Image.fromarray.
	    pil_image = image if isinstance(image, Image.Image) else Image.fromarray(image)

	    description = generate_caption(pil_image, model, tokenizer)
	    return description, create_persona(description)

	# Create Gradio interface
	def create_interface():
	    """Build the Gradio Blocks UI for the image-to-persona generator.

	    The model and tokenizer are loaded once via ``load_model()`` and captured
	    by the button's click handler.

	    Returns:
	        The assembled ``gr.Blocks`` object (not yet launched).
	    """
	    vlm, vlm_tokenizer = load_model()

	    with gr.Blocks() as demo:
	        gr.Markdown("# Image to Chatbot Persona Generator")
	        gr.Markdown("Upload an image of a character to generate a persona for a chatbot based on the image.")

	        # Row 1: image upload, delivered to the handler as a PIL image.
	        with gr.Row():
	            upload_box = gr.Image(type="pil", label="Upload Character Image")

	        # Row 2: trigger button.
	        with gr.Row():
	            run_button = gr.Button("Generate Persona")

	        # Row 3: the two text outputs side by side.
	        with gr.Row():
	            caption_box = gr.Textbox(label="Generated Caption", lines=3)
	            persona_box = gr.Textbox(label="Chatbot Persona", lines=10)

	        # The lambda closes over the loaded model and tokenizer, so the
	        # handler takes only the image as input.
	        run_button.click(
	            fn=lambda img: process_image_to_persona(img, vlm, vlm_tokenizer),
	            inputs=[upload_box],
	            outputs=[caption_box, persona_box],
	        )

	    return demo

	# Script entry point: build the interface and launch it with share=True.
	if __name__ == "__main__":
	    create_interface().launch(share=True)