ANASAKHTAR committed (verified)
Commit fe0c7a1 · 1 Parent(s): 1bf8b95

Create app.py

Files changed (1)
app.py +68 -0
app.py ADDED
@@ -0,0 +1,68 @@
+ import streamlit as st
+ from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration
+ from PIL import Image
+ import whisper
+ from gtts import gTTS
+ import tempfile
+
+ # Initialize models
+ # 1. Text model: text2text-generation ("conversational" expects Conversation objects, not plain strings)
+ chat_pipeline = pipeline("text2text-generation", model="facebook/blenderbot-400M-distill")
+
+ # 2. Image captioning model
+ image_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+ image_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+
+ # 3. Speech-to-text model
+ voice_model = whisper.load_model("base")
+
+ # Streamlit app
+ st.title("Multimodal AI Assistant")
+ st.write("Interact with AI via text, voice, and images!")
+
+ # Text input section
+ st.header("Text Interaction")
+ user_text = st.text_input("Enter your query:")
+ if st.button("Submit Text"):
+     if user_text:
+         response = chat_pipeline(user_text)
+         st.success(f"Assistant: {response[0]['generated_text']}")
+
+ # Voice input section
+ st.header("Voice Interaction")
+ uploaded_audio = st.file_uploader("Upload an audio file:", type=["wav", "mp3"])
+ if st.button("Submit Audio"):
+     if uploaded_audio:
+         # Save the upload to a temporary file so whisper can read it from a path
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
+             temp_audio_file.write(uploaded_audio.read())
+             temp_audio_path = temp_audio_file.name
+
+         # Transcribe audio to text
+         transcribed_text = voice_model.transcribe(temp_audio_path)['text']
+         st.write(f"Transcribed Text: {transcribed_text}")
+
+         # Generate the assistant's reply
+         audio_response = chat_pipeline(transcribed_text)
+         st.success(f"Assistant: {audio_response[0]['generated_text']}")
+
+         # Convert the reply to speech
+         tts = gTTS(audio_response[0]['generated_text'])
+         tts_output_path = "response_audio.mp3"
+         tts.save(tts_output_path)
+         st.audio(tts_output_path)
+
+ # Image input section
+ st.header("Image Interaction")
+ uploaded_image = st.file_uploader("Upload an image:", type=["jpg", "png", "jpeg"])
+ if st.button("Submit Image"):
+     if uploaded_image:
+         # Display the uploaded image
+         image = Image.open(uploaded_image)
+         st.image(image, caption="Uploaded Image")
+
+         # Generate a caption with BLIP
+         inputs = image_processor(image, return_tensors="pt")
+         outputs = image_model.generate(**inputs)
+         caption = image_processor.decode(outputs[0], skip_special_tokens=True)
+         st.success(f"Generated Caption: {caption}")
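
One caveat this commit does not address: Streamlit re-runs the whole script on every widget interaction, so all three models above are reloaded on every button press. A minimal sketch of the usual fix, caching the loads with st.cache_resource (requires Streamlit >= 1.18; the loader function names are illustrative, not from app.py):

import streamlit as st
import whisper
from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration

@st.cache_resource
def load_chat_pipeline():
    # Runs once per process; later reruns reuse the cached object
    return pipeline("text2text-generation", model="facebook/blenderbot-400M-distill")

@st.cache_resource
def load_image_model():
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    return processor, model

@st.cache_resource
def load_voice_model():
    return whisper.load_model("base")

chat_pipeline = load_chat_pipeline()
image_processor, image_model = load_image_model()
voice_model = load_voice_model()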
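
Relatedly, the temporary audio file is never removed and the gTTS output lands in the working directory. A sketch of a tidier variant (again an assumption, not part of the commit), reusing the same voice_model and chat_pipeline objects:

import os
import tempfile
from gtts import gTTS

def transcribe_and_reply(uploaded_audio, voice_model, chat_pipeline):
    # whisper reads from a file path, so persist the upload briefly, then delete it
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        f.write(uploaded_audio.read())
        audio_path = f.name
    try:
        text = voice_model.transcribe(audio_path)['text']
    finally:
        os.remove(audio_path)
    reply = chat_pipeline(text)[0]['generated_text']
    # Write the spoken reply to a temp file instead of the working directory
    tts_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    gTTS(reply).save(tts_file.name)
    return text, reply, tts_file.name

In the app, st.audio(tts_file.name) would then play the reply, and the caller could remove that file once it has been served.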