import streamlit as st
from PIL import Image
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
import torch
import cProfile
import pstats

torch_dtype = torch.float32
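
# Profile the whole script run; the collected stats are printed to the terminal
# once the block below exits.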
with cProfile.Profile() as pr:
    st.title("Image-to-Audio Description Generator")

    # Load the LLaVA-NeXT (Mistral 7B) processor and model. Half precision is
    # generally slow or unsupported on CPU, so the weights are kept in float32.
    processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
    model = LlavaNextForConditionalGeneration.from_pretrained(
        "llava-hf/llava-v1.6-mistral-7b-hf",
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
    ).to("cpu")
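
    # Note: Streamlit re-executes this script on every user interaction, so the
    # 7B model above is reloaded each time; moving the load into a function
    # decorated with @st.cache_resource would keep it in memory across reruns.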

    uploaded_image = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])
    if uploaded_image:
        image = Image.open(uploaded_image).convert("RGB")
        # Downscale to a single 336x336 input to keep the vision encoder's work manageable on CPU.
        image = image.resize((336, 336))
        st.image(image, caption="Uploaded Image", use_container_width=True)
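
        # Build the chat-style prompt expected by the LLaVA-NeXT processor; the
        # {"type": "image"} placeholder is paired with the uploaded image.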
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is shown in this image?"},
                    {"type": "image"},
                ],
            },
        ]
        prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
        inputs = processor(images=image, text=prompt, return_tensors="pt").to("cpu")
        output = model.generate(
            **inputs,
            max_new_tokens=100,
            pad_token_id=processor.tokenizer.eos_token_id,
        )
        # Decode only the newly generated tokens so the prompt is not echoed in the output.
        description = processor.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
        st.write(f"Generated Description: {description}")
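
        # One possible way to produce the audio promised by the title (gTTS is an
        # illustrative, assumed dependency; it is not used elsewhere in this script):
        #
        #     from gtts import gTTS
        #     gTTS(description).save("description.mp3")
        #     st.audio("description.mp3")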
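
# Print the collected profile to the terminal, slowest functions first.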
stats = pstats.Stats(pr)
stats.sort_stats(pstats.SortKey.TIME)
stats.print_stats()
|
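# Usage (the file name here is an assumption):
#   streamlit run app.py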