import os
import tempfile

import streamlit as st
import whisper
from gtts import gTTS
from PIL import Image
from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration


# Initialize Models
# Load once and cache across Streamlit reruns; without caching, every
# widget interaction would reload all three models from scratch.
@st.cache_resource
def load_models():
    # 1. Text Model: small seq2seq chat model via the text2text pipeline
    chat = pipeline("text2text-generation", model="facebook/blenderbot-400M-distill")
    # 2. Image Model: BLIP for image captioning
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    captioner = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    # 3. Voice Model: Whisper "base" for speech-to-text
    stt = whisper.load_model("base")
    return chat, processor, captioner, stt


chat_pipeline, image_processor, image_model, voice_model = load_models()

# Streamlit App
st.title("Multimodal AI Assistant")
st.write("Interact with AI via text, voice, and images!")

# Text Input Section
st.header("Text Interaction")
user_text = st.text_input("Enter your query:")
if st.button("Submit Text"):
    if user_text:
        response = chat_pipeline(user_text)
        st.success(f"Assistant: {response[0]['generated_text']}")

# Voice Input Section
st.header("Voice Interaction")
uploaded_audio = st.file_uploader("Upload an audio file:", type=["wav", "mp3"])
if st.button("Submit Audio"):
    if uploaded_audio:
        # Save the upload to a temporary file, keeping its real extension
        # so ffmpeg (used internally by Whisper) decodes it correctly
        suffix = os.path.splitext(uploaded_audio.name)[1] or ".wav"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_audio_file:
            temp_audio_file.write(uploaded_audio.read())
            temp_audio_path = temp_audio_file.name

        # Transcribe audio to text
        transcribed_text = voice_model.transcribe(temp_audio_path)["text"]
        os.remove(temp_audio_path)  # clean up the temporary file
        st.write(f"Transcribed Text: {transcribed_text}")

        # Generate AI response
        audio_response = chat_pipeline(transcribed_text)
        reply_text = audio_response[0]["generated_text"]
        st.success(f"Assistant: {reply_text}")

        # Convert response to speech and play it back
        tts = gTTS(reply_text)
        tts_output_path = "response_audio.mp3"
        tts.save(tts_output_path)
        st.audio(tts_output_path)

# Image Input Section
st.header("Image Interaction")
uploaded_image = st.file_uploader("Upload an image:", type=["jpg", "png", "jpeg"])
if st.button("Submit Image"):
    if uploaded_image:
        # Display uploaded image; convert to RGB since BLIP expects
        # 3-channel input (PNG uploads may be RGBA or grayscale)
        image = Image.open(uploaded_image).convert("RGB")
        st.image(image, caption="Uploaded Image")

        # Generate caption with BLIP
        inputs = image_processor(image, return_tensors="pt")
        outputs = image_model.generate(**inputs)
        caption = image_processor.decode(outputs[0], skip_special_tokens=True)
        st.success(f"Generated Caption: {caption}")
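
# Setup note: a sketch of the assumed environment, not verified against any
# particular machine. Assuming this file is saved as app.py, it can be run with:
#   pip install streamlit transformers torch pillow openai-whisper gTTS
#   streamlit run app.py
# Whisper decodes audio through ffmpeg, so ffmpeg must also be installed and on
# the system PATH; gTTS synthesizes speech via an online Google endpoint, so
# the voice section needs network access.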