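"""Multimodal AI Assistant: a Streamlit app that answers text queries,
transcribes and replies to uploaded audio, and captions uploaded images.

Run with (assuming this file is saved as app.py):
    streamlit run app.py

Python dependencies: streamlit, transformers, torch, Pillow, openai-whisper,
gTTS. Whisper additionally requires the ffmpeg binary on the system PATH.
"""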
import os
import tempfile

import streamlit as st
from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import whisper
from gtts import gTTS

# Initialize Models
# Cached with st.cache_resource so they load once instead of on every rerun
# (Streamlit re-executes the whole script after each widget interaction).
@st.cache_resource
def load_models():
    # 1. Text Model: small distilled BlenderBot chatbot
    chat = pipeline("text2text-generation", model="facebook/blenderbot-400M-distill")
    # 2. Image Model: BLIP image captioning
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    # 3. Voice Model: Whisper "base" checkpoint (larger ones trade speed for accuracy)
    voice = whisper.load_model("base")
    return chat, processor, model, voice

chat_pipeline, image_processor, image_model, voice_model = load_models()

# Streamlit App
st.title("Multimodal AI Assistant")
st.write("Interact with AI via text, voice, and images!")

# Text Input Section
st.header("Text Interaction")
user_text = st.text_input("Enter your query:")
if st.button("Submit Text"):
    if user_text:
        response = chat_pipeline(user_text)
        st.success(f"Assistant: {response[0]['generated_text']}")

# Voice Input Section
st.header("Voice Interaction")
uploaded_audio = st.file_uploader("Upload an audio file:", type=["wav", "mp3"])
if st.button("Submit Audio"):
    if uploaded_audio:
        # Save the uploaded audio to a temporary file
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
            temp_audio_file.write(uploaded_audio.read())
            temp_audio_path = temp_audio_file.name
        
        # Transcribe audio to text
        transcribed_text = voice_model.transcribe(temp_audio_path)['text']
        st.write(f"Transcribed Text: {transcribed_text}")
        
        # Generate AI response
        audio_response = chat_pipeline(transcribed_text)
        st.success(f"Assistant: {audio_response[0]['generated_text']}")
        
        # Convert response to speech
        tts = gTTS(audio_response[0]['generated_text'])
        tts_output_path = "response_audio.mp3"
        tts.save(tts_output_path)
        st.audio(tts_output_path)

# Image Input Section
st.header("Image Interaction")
uploaded_image = st.file_uploader("Upload an image:", type=["jpg", "png", "jpeg"])
if st.button("Submit Image"):
    if uploaded_image:
        # Display uploaded image
        image = Image.open(uploaded_image)
        st.image(image, caption="Uploaded Image")
        
        # Generate caption
        inputs = image_processor(image, return_tensors="pt")
        outputs = image_model.generate(**inputs)
        caption = image_processor.decode(outputs[0], skip_special_tokens=True)
        st.success(f"Generated Caption: {caption}")