HazlamiMalek commited on
Commit
77025c3
·
verified ·
1 Parent(s): b1e008f

Full app.py with debug and pipeline fixes.

Browse files
Files changed (1) hide show
  1. app.py +71 -7
app.py CHANGED
@@ -1,9 +1,73 @@
1
- import torch
 
2
  from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
- # Test model loading
5
- processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
6
- model = LlavaNextForConditionalGeneration.from_pretrained(
7
- "llava-hf/llava-v1.6-mistral-7b-hf"
8
- )
9
- print("Transformers and model loaded successfully!")
 
import streamlit as st
from PIL import Image
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
from gtts import gTTS
import torch

MODEL_ID = "llava-hf/llava-v1.6-mistral-7b-hf"

st.title("Image-to-Audio Description Generator")


@st.cache_resource(show_spinner="Loading processor and model...")
def load_model():
    """Load the LLaVA-NeXT processor and model exactly once per process.

    Streamlit reruns the whole script on every widget interaction; without
    caching, the multi-GB checkpoint was reloaded on each rerun.

    Returns:
        (processor, model, device): model is placed on CUDA in float16 when a
        GPU is available, otherwise on CPU in float32 (the original hard-coded
        ``.to("cuda:0")`` crashed on CPU-only hosts).
    """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    dtype = torch.float16 if device.startswith("cuda") else torch.float32
    processor = LlavaNextProcessor.from_pretrained(MODEL_ID)
    model = LlavaNextForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=dtype,
        low_cpu_mem_usage=True,
    ).to(device)
    return processor, model, device


# Step 1: load processor and model (cached across reruns).
try:
    processor, model, device = load_model()
    st.write("Model loaded successfully!")
except Exception as e:
    st.error(f"Error loading model: {str(e)}")
    st.stop()  # nothing below can work without the model

# Step 2: upload image.
uploaded_image = st.file_uploader("Upload an Image", type=["jpg", "jpeg", "png"])
if uploaded_image:
    st.write("Processing uploaded image...")
    try:
        # NOTE: the manual resize((336, 336)) was dropped — it distorted the
        # aspect ratio, and LlavaNextProcessor does its own preprocessing.
        image = Image.open(uploaded_image).convert("RGB")
        st.image(image, caption="Uploaded Image", use_column_width=True)
    except Exception as e:
        st.error(f"Error loading image: {str(e)}")
        st.stop()  # the steps below require a valid image

    # Step 3: generate description.
    st.write("Generating description...")
    description = None  # sentinel: Step 4 only runs if generation succeeded
    try:
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is shown in this image?"},
                    {"type": "image"},
                ],
            },
        ]
        prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
        inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)
        output = model.generate(
            **inputs, max_new_tokens=100, pad_token_id=processor.tokenizer.eos_token_id
        )
        description = processor.decode(output[0], skip_special_tokens=True)
        st.write(f"Generated Description: {description}")
    except Exception as e:
        st.error(f"Error generating description: {str(e)}")

    # Step 4: text-to-speech. Guarded on `description` — the original
    # referenced it unconditionally and raised NameError when Step 3 failed.
    if description:
        st.write("Converting description to audio...")
        try:
            tts = gTTS(description)
            audio_path = "output.mp3"
            tts.save(audio_path)

            # Step 5: play audio.
            st.audio(audio_path, format="audio/mp3")
            st.write("Audio generated successfully!")
        except Exception as e:
            st.error(f"Error converting text to audio: {str(e)}")