Spaces:

HazlamiMalek
/

Image_Audio_Description

Sleeping

App Files Community

HazlamiMalek committed on Dec 30, 2024

Commit

77025c3

verified ·

1 Parent(s): b1e008f

Full app.py with debug and pipeline fixes.

Browse files

Files changed (1) hide show

app.py +71 -7

app.py CHANGED Viewed

@@ -1,9 +1,73 @@
-import torch
 from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
-# Test model loading
-processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
-model = LlavaNextForConditionalGeneration.from_pretrained(
-    "llava-hf/llava-v1.6-mistral-7b-hf"
-)
-print("Transformers and model loaded successfully!")

+import io
+import streamlit as st
+from PIL import Image
 from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
+from gtts import gTTS
+import torch
+# Image-to-audio Streamlit app: upload an image, describe it with LLaVA-NeXT,
+# then convert the description to speech with gTTS.
+#
+# Streamlit re-executes this whole script on every user interaction, so the
+# heavy model objects are cached and every failing step halts the run instead
+# of letting later steps trip over names that were never assigned.
+st.title("Image-to-Audio Description Generator")
+MODEL_ID = "llava-hf/llava-v1.6-mistral-7b-hf"
+# Use the GPU when one is present; otherwise fall back to CPU with float32,
+# because half precision is not reliably supported by CPU kernels.
+DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
+DTYPE = torch.float16 if DEVICE.startswith("cuda") else torch.float32
+@st.cache_resource
+def load_processor():
+    """Load the LLaVA-NeXT processor once and reuse it across script reruns."""
+    return LlavaNextProcessor.from_pretrained(MODEL_ID)
+@st.cache_resource
+def load_model():
+    """Load the LLaVA-NeXT model onto DEVICE once and reuse it across reruns."""
+    return LlavaNextForConditionalGeneration.from_pretrained(
+        MODEL_ID,
+        torch_dtype=DTYPE,
+        low_cpu_mem_usage=True,
+    ).to(DEVICE)
+# Step 1: Load LLaVA Processor and Model
+st.write("Loading processor and model...")
+try:
+    processor = load_processor()
+except Exception as e:
+    st.write(f"Error loading processor: {e}")
+    # Nothing below can work without the processor; end this run cleanly.
+    st.stop()
+st.write("Processor loaded successfully!")
+try:
+    model = load_model()
+except Exception as e:
+    st.write(f"Error loading model: {e}")
+    st.stop()
+st.write("Model loaded successfully!")
+# Step 2: Upload Image
+uploaded_image = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])
+if uploaded_image is not None:
+    st.write("Processing uploaded image...")
+    # Load and preprocess image
+    try:
+        image = Image.open(uploaded_image).convert("RGB")
+        image = image.resize((336, 336))
+        st.image(image, caption="Uploaded Image", use_column_width=True)
+    except Exception as e:
+        st.write(f"Error loading image: {e}")
+        # Without a decoded image there is nothing to describe.
+        st.stop()
+    # Step 3: Generate Description
+    st.write("Generating description...")
+    try:
+        conversation = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What is shown in this image?"},
+                    {"type": "image"},
+                ],
+            },
+        ]
+        prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+        inputs = processor(images=image, text=prompt, return_tensors="pt").to(DEVICE)
+        output = model.generate(
+            **inputs, max_new_tokens=100, pad_token_id=processor.tokenizer.eos_token_id
+        )
+        # generate() returns prompt tokens followed by the new tokens; drop the
+        # prompt so the instruction text is neither displayed nor read aloud.
+        prompt_length = inputs["input_ids"].shape[-1]
+        description = processor.decode(
+            output[0][prompt_length:], skip_special_tokens=True
+        ).strip()
+    except Exception as e:
+        st.write(f"Error generating description: {e}")
+        st.stop()
+    if not description:
+        # gTTS rejects empty text, so report the real cause here instead.
+        st.write("Error generating description: the model returned no text.")
+        st.stop()
+    st.write(f"Generated Description: {description}")
+    # Step 4: Text-to-Speech Conversion
+    st.write("Converting description to audio...")
+    try:
+        # Keep the MP3 in memory: a fixed "output.mp3" on disk would be shared
+        # (and overwritten) by every concurrent session of the app.
+        audio_buffer = io.BytesIO()
+        gTTS(description).write_to_fp(audio_buffer)
+        # Step 5: Play Audio
+        st.audio(audio_buffer.getvalue(), format="audio/mp3")
+        st.write("Audio generated successfully!")
+    except Exception as e:
+        st.write(f"Error converting text to audio: {e}")