Spaces:

HazlamiMalek
/

Image_Audio_Description

Sleeping

App Files Files Community

HazlamiMalek committed on Jan 3

Commit

d3a3de7

verified ·

1 Parent(s): 46eb893

Upload app.py

Browse files

Files changed (1) hide show

app.py +59 -0

app.py ADDED Viewed

	@@ -0,0 +1,59 @@

+# Streamlit app: upload an image, describe it with LLaVA-NeXT, and play the
+# description back as speech generated by gTTS.
+import io
+import os
+
+import streamlit as st
+import torch
+from gtts import gTTS
+from PIL import Image
+from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor
+
+MODEL_ID = "llava-hf/llava-v1.6-mistral-7b-hf"
+DEVICE = "cuda:0"
+
+
+@st.cache_resource
+def load_model():
+    """Load the LLaVA-NeXT processor and model.
+
+    Streamlit re-executes this script on every widget interaction; caching
+    the result keeps the weights from being reloaded on each rerun.
+
+    Returns:
+        A ``(processor, model)`` tuple, with the model moved to ``DEVICE``.
+    """
+    processor = LlavaNextProcessor.from_pretrained(MODEL_ID)
+    model = LlavaNextForConditionalGeneration.from_pretrained(
+        MODEL_ID,
+        torch_dtype=torch.float16,
+        low_cpu_mem_usage=True,
+    ).to(DEVICE)
+    return processor, model
+
+
+processor, model = load_model()
+
+# Streamlit Interface
+st.title("Image-to-Audio Description Generator")
+
+# Upload an image
+uploaded_image = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])
+if uploaded_image is not None:
+    # Load the upload and normalise it to RGB before handing it to the processor
+    image = Image.open(uploaded_image).convert("RGB")
+    st.image(image, caption="Uploaded Image", use_column_width=True)
+
+    # Single-turn conversation: one text question plus one image placeholder
+    conversation = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "What is shown in this image?"},
+                {"type": "image"},
+            ],
+        },
+    ]
+
+    # Prepare inputs
+    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+    inputs = processor(images=image, text=prompt, return_tensors="pt").to(DEVICE)
+
+    # Generate the description
+    output = model.generate(
+        **inputs,
+        max_new_tokens=100,
+        pad_token_id=processor.tokenizer.eos_token_id,
+    )
+    # generate() returns the prompt tokens followed by the new tokens; drop the
+    # prompt so the instruction text is neither displayed nor read aloud.
+    prompt_length = inputs["input_ids"].shape[1]
+    description = processor.decode(
+        output[0][prompt_length:], skip_special_tokens=True
+    ).strip()
+
+    # Display the description
+    st.write(f"Generated Description: {description}")
+
+    if description:
+        # Synthesise into memory instead of a fixed "output.mp3" on disk, so
+        # concurrent sessions cannot overwrite each other's audio and no file
+        # handle is left open.
+        audio_buffer = io.BytesIO()
+        gTTS(description).write_to_fp(audio_buffer)
+        st.audio(audio_buffer.getvalue(), format="audio/mp3")
+    else:
+        # gTTS refuses empty text, so skip audio rather than crash the app
+        st.warning("The model returned an empty description, so no audio was generated.")