# app.py — Multimodal AI Assistant (Hugging Face Space by ANASAKHTAR, commit ab1bf33, 2.53 kB)
import os
import tempfile

import streamlit as st
import whisper
from gtts import gTTS
from PIL import Image
from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration
# Initialize Models
# 1. Text Model
chat_pipeline = pipeline("text2text-generation", model="facebook/blenderbot-400M-distill")
# 2. Image Model
image_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
image_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
# 3. Voice Model
voice_model = whisper.load_model("base")
# Streamlit App
st.title("Multimodal AI Assistant")
st.write("Interact with AI via text, voice, and images!")

# Text Input Section
# Reads a free-form query, runs it through the chat pipeline, and shows the
# first generated reply. (Indentation reconstructed: the guard and the
# response handling must nest under the button click.)
st.header("Text Interaction")
user_text = st.text_input("Enter your query:")
if st.button("Submit Text"):
    if user_text:
        response = chat_pipeline(user_text)
        st.success(f"Assistant: {response[0]['generated_text']}")
    else:
        # Give explicit feedback instead of silently doing nothing.
        st.warning("Please enter a query first.")
# Voice Input Section
# Pipeline: uploaded audio -> temp file -> Whisper transcription -> chat
# pipeline -> gTTS speech playback. (Indentation reconstructed: everything
# below nests under the button click / upload guard.)
st.header("Voice Interaction")
uploaded_audio = st.file_uploader("Upload an audio file:", type=["wav", "mp3"])
if st.button("Submit Audio"):
    if uploaded_audio:
        # Preserve the real extension (the original hard-coded ".wav" even for
        # mp3 uploads); ffmpeg sniffs content, but a truthful suffix is safer.
        suffix = os.path.splitext(uploaded_audio.name)[1] or ".wav"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_audio_file:
            temp_audio_file.write(uploaded_audio.read())
            temp_audio_path = temp_audio_file.name
        try:
            # Transcribe audio to text
            transcribed_text = voice_model.transcribe(temp_audio_path)['text']
        finally:
            # The original leaked this file (delete=False, never removed).
            os.remove(temp_audio_path)
        st.write(f"Transcribed Text: {transcribed_text}")
        # Generate AI response
        audio_response = chat_pipeline(transcribed_text)
        st.success(f"Assistant: {audio_response[0]['generated_text']}")
        # Convert response to speech. Write to a temp file rather than a fixed
        # name in the CWD, which collided across concurrent sessions.
        tts = gTTS(audio_response[0]['generated_text'])
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tts_file:
            tts_output_path = tts_file.name
        tts.save(tts_output_path)
        st.audio(tts_output_path)
    else:
        st.warning("Please upload an audio file first.")
# Image Input Section
# Displays the uploaded image and generates a BLIP caption for it.
# (Indentation reconstructed: everything below nests under the button
# click / upload guard.)
st.header("Image Interaction")
uploaded_image = st.file_uploader("Upload an image:", type=["jpg", "png", "jpeg"])
if st.button("Submit Image"):
    if uploaded_image:
        # BLIP expects 3-channel RGB input; PNG uploads may be RGBA or
        # palette-mode, which crashed the processor. Convert defensively.
        image = Image.open(uploaded_image).convert("RGB")
        st.image(image, caption="Uploaded Image")
        # Generate caption: preprocess -> generate token ids -> decode to text.
        inputs = image_processor(image, return_tensors="pt")
        outputs = image_model.generate(**inputs)
        caption = image_processor.decode(outputs[0], skip_special_tokens=True)
        st.success(f"Generated Caption: {caption}")
    else:
        st.warning("Please upload an image first.")