import cProfile
import pstats

import streamlit as st
import torch
from PIL import Image
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
# from gtts import gTTS

# Run in full precision: float16 generation is not reliably supported on CPU.
torch_dtype = torch.float32

# Profile the app
with cProfile.Profile() as pr:
    st.title("Image-to-Audio Description Generator")

    # Load the processor and model
    processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
    model = LlavaNextForConditionalGeneration.from_pretrained(
        "llava-hf/llava-v1.6-mistral-7b-hf",
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
    ).to("cpu")  # Use "cpu" instead of "cuda:0"

    # File uploader
    uploaded_image = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])

    if uploaded_image:
        image = Image.open(uploaded_image).convert("RGB")
        image = image.resize((336, 336))  # Downscale to the model's base resolution
        st.image(image, caption="Uploaded Image", use_container_width=True)

        # Build a chat-style prompt containing an image placeholder
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is shown in this image?"},
                    {"type": "image"},
                ],
            },
        ]
        prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

        # Preprocess the image and tokenize the prompt
        inputs = processor(images=image, text=prompt, return_tensors="pt").to("cpu")

        # Generate the description
        output = model.generate(
            **inputs,
            max_new_tokens=100,
            pad_token_id=processor.tokenizer.eos_token_id,
        )
        description = processor.decode(output[0], skip_special_tokens=True)
        st.write(f"Generated Description: {description}")

        # Convert description to audio
        # tts = gTTS(description)
        # audio_path = "output.mp3"
        # tts.save(audio_path)

        # Play audio
        # st.audio(audio_path, format="audio/mp3")

# Print profiling stats
stats = pstats.Stats(pr)
stats.sort_stats(pstats.SortKey.TIME)
stats.print_stats()
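
# Usage sketch (assumption: the script is saved as app.py; any filename works):
#   streamlit run app.py
# Streamlit re-executes the script on each interaction, so the pstats output above
# reflects the most recent rerun. print_stats() writes to stdout, i.e. the terminal
# running Streamlit, not the web page.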