# app.py — Multimodal AI Assistant (Streamlit)
# (Hugging Face page residue removed: "ANASAKHTAR's picture / Update app.py / ab1bf33 verified")
import os
import tempfile

import streamlit as st
from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import whisper
from gtts import gTTS
# ---------------------------------------------------------------------------
# Model initialization
#
# Loading is wrapped in a cached factory so Streamlit does not reload the
# (large) models on every script rerun — @st.cache_resource keeps one shared
# instance per process for the lifetime of the app.
# ---------------------------------------------------------------------------
@st.cache_resource
def _load_models():
    """Load and return (chat_pipeline, image_processor, image_model, voice_model)."""
    # 1. Text model: distilled BlenderBot for conversational replies.
    chat = pipeline("text2text-generation", model="facebook/blenderbot-400M-distill")
    # 2. Image model: BLIP base checkpoint for image captioning.
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    captioner = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    # 3. Voice model: Whisper "base" for speech-to-text.
    speech = whisper.load_model("base")
    return chat, processor, captioner, speech


chat_pipeline, image_processor, image_model, voice_model = _load_models()
# Streamlit App
st.title("Multimodal AI Assistant")
st.write("Interact with AI via text, voice, and images!")

# Text Input Section
st.header("Text Interaction")
user_text = st.text_input("Enter your query:")
if st.button("Submit Text"):
    if user_text:
        # Generate a conversational reply with the text2text pipeline.
        response = chat_pipeline(user_text)
        st.success(f"Assistant: {response[0]['generated_text']}")
    else:
        # Guard against an empty submission instead of silently doing nothing.
        st.warning("Please enter a query before submitting.")
# Voice Input Section
st.header("Voice Interaction")
uploaded_audio = st.file_uploader("Upload an audio file:", type=["wav", "mp3"])
if st.button("Submit Audio"):
    if uploaded_audio:
        # Persist the upload to disk because whisper's transcribe() expects a
        # file path. Keep the upload's own extension so the decoder sees the
        # right container (the old code forced ".wav" even for mp3 uploads).
        suffix = os.path.splitext(uploaded_audio.name)[1] or ".wav"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_audio_file:
            temp_audio_file.write(uploaded_audio.read())
            temp_audio_path = temp_audio_file.name
        try:
            # Transcribe audio to text.
            transcribed_text = voice_model.transcribe(temp_audio_path)['text']
        finally:
            # delete=False means nobody else will clean this up — remove it
            # even if transcription raises.
            os.remove(temp_audio_path)
        st.write(f"Transcribed Text: {transcribed_text}")
        # Generate AI response from the transcript.
        audio_response = chat_pipeline(transcribed_text)
        st.success(f"Assistant: {audio_response[0]['generated_text']}")
        # Convert the reply to speech and play it back.
        tts = gTTS(audio_response[0]['generated_text'])
        tts_output_path = "response_audio.mp3"
        tts.save(tts_output_path)
        st.audio(tts_output_path)
    else:
        st.warning("Please upload an audio file before submitting.")
# Image Input Section
st.header("Image Interaction")
uploaded_image = st.file_uploader("Upload an image:", type=["jpg", "png", "jpeg"])
if st.button("Submit Image"):
    if uploaded_image:
        # Display the uploaded image. Convert to RGB because BLIP expects
        # 3-channel input and PNG uploads may carry an alpha channel (RGBA).
        image = Image.open(uploaded_image).convert("RGB")
        st.image(image, caption="Uploaded Image")
        # Generate a caption with BLIP.
        inputs = image_processor(image, return_tensors="pt")
        outputs = image_model.generate(**inputs)
        caption = image_processor.decode(outputs[0], skip_special_tokens=True)
        st.success(f"Generated Caption: {caption}")
    else:
        st.warning("Please upload an image before submitting.")