import cProfile
import pstats

import streamlit as st
import torch
from PIL import Image
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
# from gtts import gTTS

# Run in full precision: float16 generation is not reliably supported on CPU.
torch_dtype = torch.float32

# Profile the app
with cProfile.Profile() as pr:
    st.title("Image-to-Audio Description Generator")

    # Load the processor and model
    processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
    model = LlavaNextForConditionalGeneration.from_pretrained(
        "llava-hf/llava-v1.6-mistral-7b-hf",
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
    ).to("cpu")  # Use "cpu" instead of "cuda:0"

    # File uploader
    uploaded_image = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])

    if uploaded_image:
        image = Image.open(uploaded_image).convert("RGB")
        image = image.resize((336, 336))  # Downscale to the model's base resolution
        st.image(image, caption="Uploaded Image", use_container_width=True)

        # Build a chat-style prompt containing an image placeholder
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is shown in this image?"},
                    {"type": "image"},
                ],
            },
        ]
        prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

        # Preprocess the image and tokenize the prompt
        inputs = processor(images=image, text=prompt, return_tensors="pt").to("cpu")

        # Generate the description
        output = model.generate(
            **inputs,
            max_new_tokens=100,
            pad_token_id=processor.tokenizer.eos_token_id,
        )
        description = processor.decode(output[0], skip_special_tokens=True)
        st.write(f"Generated Description: {description}")

        # Convert description to audio
        # tts = gTTS(description)
        # audio_path = "output.mp3"
        # tts.save(audio_path)

        # Play audio
        # st.audio(audio_path, format="audio/mp3")

# Print profiling stats
stats = pstats.Stats(pr)
stats.sort_stats(pstats.SortKey.TIME)
stats.print_stats()
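
# Usage sketch (assumption: the script is saved as app.py; any filename works):
#   streamlit run app.py
# Streamlit re-executes the script on each interaction, so the pstats output above
# reflects the most recent rerun. print_stats() writes to stdout, i.e. the terminal
# running Streamlit, not the web page.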