Spaces:

HazlamiMalek
/

Image_Audio_Description

Sleeping

App Files Files Community

HazlamiMalek committed on Jan 3

Commit

de5c1c1

verified ·

1 Parent(s): d3a3de7

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -55

app.py CHANGED Viewed

@@ -1,59 +1,57 @@
 import streamlit as st
 from PIL import Image
-from gtts import gTTS
-import os
-# Load your LLaVA model and processor here
 from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
 import torch
-# Load the processor and model
-processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
-model = LlavaNextForConditionalGeneration.from_pretrained(
-    "llava-hf/llava-v1.6-mistral-7b-hf",
-    torch_dtype=torch.float16,
-    low_cpu_mem_usage=True
-).to("cuda:0")
-# Streamlit Interface
-st.title("Image-to-Audio Description Generator")
-# Upload an image
-uploaded_image = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])
-if uploaded_image:
-    # Load and preprocess the image
-    image = Image.open(uploaded_image).convert("RGB")
-    st.image(image, caption="Uploaded Image", use_column_width=True)
-    # Define the conversation template
-    conversation = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "text", "text": "What is shown in this image?"},
-                {"type": "image"},
-            ],
-        },
-    ]
-    # Prepare inputs
-    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
-    inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda:0")
-    # Generate the description
-    output = model.generate(**inputs, max_new_tokens=100, pad_token_id=processor.tokenizer.eos_token_id)
-    description = processor.decode(output[0], skip_special_tokens=True)
-    # Display the description
-    st.write(f"Generated Description: {description}")
-    # Convert description to audio
-    tts = gTTS(description)
-    audio_path = "output.mp3"
-    tts.save(audio_path)
-    # Play the audio
-    audio_file = open(audio_path, "rb")
-    audio_bytes = audio_file.read()
-    st.audio(audio_bytes, format="audio/mp3")

 import streamlit as st
 from PIL import Image
 from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
+#from gtts import gTTS
 import torch
+import cProfile
+import pstats
+# Checkpoint shared by the processor and the model.
+MODEL_ID = "llava-hf/llava-v1.6-mistral-7b-hf"
+# Inference device; this revision runs on CPU instead of "cuda:0".
+DEVICE = "cpu"
+
+
+@st.cache_resource
+def load_model():
+    """Load the LLaVA-NeXT processor and model once per server process.
+
+    Streamlit re-executes the whole script on every widget interaction
+    (including each upload), so loading inline would reload the checkpoint
+    on every rerun. ``st.cache_resource`` keeps a single shared instance.
+
+    Returns:
+        A ``(processor, model)`` tuple with the model moved to ``DEVICE``.
+    """
+    processor = LlavaNextProcessor.from_pretrained(MODEL_ID)
+    model = LlavaNextForConditionalGeneration.from_pretrained(
+        MODEL_ID,
+        # NOTE(review): an unused module-level ``torch_dtype = torch.float32``
+        # used to sit next to this call and never reached it; it was removed
+        # as dead code. float16 is what was actually in effect and is kept.
+        # If float32 was intended for CPU, change it here (roughly doubles
+        # the memory needed for the weights) -- TODO confirm.
+        torch_dtype=torch.float16,
+        low_cpu_mem_usage=True,
+    ).to(DEVICE)
+    return processor, model
+
+
+# Profile one full script run (one Streamlit rerun).
+with cProfile.Profile() as pr:
+    st.title("Image-to-Audio Description Generator")
+    processor, model = load_model()
+
+    # File uploader
+    uploaded_image = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])
+    if uploaded_image:
+        image = Image.open(uploaded_image).convert("RGB")
+        # Fixed 336x336 input; note this ignores the original aspect ratio.
+        # NOTE(review): presumably done to bound CPU inference cost -- confirm.
+        image = image.resize((336, 336))
+        st.image(image, caption="Uploaded Image", use_container_width=True)
+
+        # Single-turn chat: one text question plus one image placeholder.
+        conversation = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What is shown in this image?"},
+                    {"type": "image"},
+                ],
+            },
+        ]
+        prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+        inputs = processor(images=image, text=prompt, return_tensors="pt").to(DEVICE)
+        output = model.generate(
+            **inputs,
+            max_new_tokens=100,
+            pad_token_id=processor.tokenizer.eos_token_id,
+        )
+
+        # generate() returns the prompt tokens followed by the new tokens.
+        # Decode only the new ones so the description does not echo the
+        # question and chat-template text back to the user.
+        prompt_length = inputs["input_ids"].shape[-1]
+        description = processor.decode(
+            output[0][prompt_length:], skip_special_tokens=True
+        ).strip()
+        st.write(f"Generated Description: {description}")
+
+        # Text-to-speech is disabled in this revision (the gTTS import is
+        # commented out at the top of the file).
+        # Convert description to audio
+        #tts = gTTS(description)
+        #audio_path = "output.mp3"
+        #tts.save(audio_path)
+        # Play audio
+        #st.audio(audio_path, format="audio/mp3")
+
+# Print profiling stats to stdout, slowest functions (own time) first.
+stats = pstats.Stats(pr)
+stats.sort_stats(pstats.SortKey.TIME)
+stats.print_stats()