import streamlit as st
from PIL import Image
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
import torch
import cProfile
import pstats

torch_dtype = torch.float32
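
# Profile the whole script run; the collected stats are printed to the terminal
# once the block below exits.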
with cProfile.Profile() as pr:
    st.title("Image-to-Audio Description Generator")

    # Load the LLaVA-NeXT (Mistral 7B) processor and model. Half precision is
    # generally slow or unsupported on CPU, so the weights are kept in float32.
    processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
    model = LlavaNextForConditionalGeneration.from_pretrained(
        "llava-hf/llava-v1.6-mistral-7b-hf",
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
    ).to("cpu")
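
    # Note: Streamlit re-executes this script on every user interaction, so the
    # 7B model above is reloaded each time; moving the load into a function
    # decorated with @st.cache_resource would keep it in memory across reruns.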

    uploaded_image = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])
    if uploaded_image:
        image = Image.open(uploaded_image).convert("RGB")
        # Downscale to a single 336x336 input to keep the vision encoder's work manageable on CPU.
        image = image.resize((336, 336))
        st.image(image, caption="Uploaded Image", use_container_width=True)
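
        # Build the chat-style prompt expected by the LLaVA-NeXT processor; the
        # {"type": "image"} placeholder is paired with the uploaded image.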
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is shown in this image?"},
                    {"type": "image"},
                ],
            },
        ]
        prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
        inputs = processor(images=image, text=prompt, return_tensors="pt").to("cpu")
        output = model.generate(
            **inputs,
            max_new_tokens=100,
            pad_token_id=processor.tokenizer.eos_token_id,
        )
        # Decode only the newly generated tokens so the prompt is not echoed in the output.
        description = processor.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
        st.write(f"Generated Description: {description}")
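
        # One possible way to produce the audio promised by the title (gTTS is an
        # illustrative, assumed dependency; it is not used elsewhere in this script):
        #
        #     from gtts import gTTS
        #     gTTS(description).save("description.mp3")
        #     st.audio("description.mp3")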
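
# Print the collected profile to the terminal, slowest functions first.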
stats = pstats.Stats(pr)
stats.sort_stats(pstats.SortKey.TIME)
stats.print_stats()
|
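# Usage (the file name here is an assumption):
#   streamlit run app.py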