Create app.py
app.py
ADDED
@@ -0,0 +1,68 @@
import streamlit as st
from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import whisper
from gtts import gTTS
import tempfile

# Initialize Models
# 1. Text Model
# "text2text-generation" suits the seq2seq BlenderBot checkpoint and returns
# [{"generated_text": ...}], which matches the indexing used below; the older
# "conversational" task expects Conversation objects and is deprecated.
chat_pipeline = pipeline("text2text-generation", model="facebook/blenderbot-400M-distill")

# 2. Image Model
image_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
image_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# 3. Voice Model
voice_model = whisper.load_model("base")

# Streamlit App
st.title("Multimodal AI Assistant")
st.write("Interact with AI via text, voice, and images!")

# Text Input Section
st.header("Text Interaction")
user_text = st.text_input("Enter your query:")
if st.button("Submit Text"):
    if user_text:
        response = chat_pipeline(user_text)
        st.success(f"Assistant: {response[0]['generated_text']}")

# Voice Input Section
st.header("Voice Interaction")
uploaded_audio = st.file_uploader("Upload an audio file:", type=["wav", "mp3"])
if st.button("Submit Audio"):
    if uploaded_audio:
        # Save the uploaded audio to a temporary file, keeping its original
        # extension so Whisper/ffmpeg can decode wav and mp3 uploads alike
        suffix = "." + uploaded_audio.name.split(".")[-1]
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_audio_file:
            temp_audio_file.write(uploaded_audio.read())
            temp_audio_path = temp_audio_file.name

        # Transcribe audio to text
        transcribed_text = voice_model.transcribe(temp_audio_path)['text']
        st.write(f"Transcribed Text: {transcribed_text}")

        # Generate AI response
        audio_response = chat_pipeline(transcribed_text)
        st.success(f"Assistant: {audio_response[0]['generated_text']}")

        # Convert response to speech (gTTS calls Google's online TTS service)
        tts = gTTS(audio_response[0]['generated_text'])
        tts_output_path = "response_audio.mp3"
        tts.save(tts_output_path)
        st.audio(tts_output_path)

# Image Input Section
st.header("Image Interaction")
uploaded_image = st.file_uploader("Upload an image:", type=["jpg", "png", "jpeg"])
if st.button("Submit Image"):
    if uploaded_image:
        # Display uploaded image
        image = Image.open(uploaded_image)
        st.image(image, caption="Uploaded Image")

        # Generate caption
        inputs = image_processor(image, return_tensors="pt")
        outputs = image_model.generate(**inputs)
        caption = image_processor.decode(outputs[0], skip_special_tokens=True)
        st.success(f"Generated Caption: {caption}")
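One optional refinement that is not part of the original file: Streamlit re-runs the whole script on every widget interaction, so the three models above are reloaded on each button click. A sketch of loading them once with st.cache_resource, reusing the imports already at the top of app.py and assuming a recent Streamlit release that provides this decorator:

@st.cache_resource
def load_models():
    # Runs once per process; Streamlit caches and reuses the returned objects
    chat = pipeline("text2text-generation", model="facebook/blenderbot-400M-distill")
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    blip = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    asr = whisper.load_model("base")
    return chat, processor, blip, asr

chat_pipeline, image_processor, image_model, voice_model = load_models()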