File size: 1,676 Bytes
f38a4cc
 
00138cd
e9ac35c
6f47d31
 
 
 
7b95213
6f47d31
 
d62fbdd
 
6f47d31
fc99228
 
d4d2e09
fc99228
 
 
d62fbdd
00138cd
d62fbdd
 
00138cd
d62fbdd
 
d4d2e09
00138cd
 
 
 
29c2e62
00138cd
7b95213
 
 
 
d62fbdd
00138cd
6f47d31
 
00138cd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import os

# NOTE(review): installing dependencies at runtime via os.system is an
# anti-pattern — no version pinning, shell invocation, failures are silent.
# Prefer declaring scipy in requirements.txt. Kept so existing deployments
# that depend on this side effect continue to work.
os.system("pip install scipy")
# os.system('pip install -r requirements.txt')

import io

from PIL import Image
import streamlit as st
from transformers import pipeline
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.io.wavfile` (used below to write the TTS output) is available
# as an attribute on all SciPy versions.
import scipy.io.wavfile

# Build the visual-question-answering and text-to-speech pipelines at module
# load. Both download/load large models, so the first run is slow.
vqa_pipeline = pipeline("visual-question-answering", model="microsoft/git-base-vqav2")
tts_pipeline = pipeline("text-to-speech", "suno/bark")

def main():
    """Streamlit app: answer a question about an uploaded image, then speak the answer.

    Flow: the user uploads a JPG/PNG image and types a question; on button
    press the VQA pipeline produces an answer, which is displayed and then
    synthesized to audio with the Bark TTS pipeline and played back.
    """
    st.title("Visual Question Answering & Text-to-Audio App")

    uploaded = st.file_uploader("Upload an image", type=["jpg", "png"])
    question = st.text_input("Enter your question")

    if st.button('Run Visual Question Answering'):
        if uploaded and question:
            # Convert the uploaded file's raw bytes into a PIL image.
            image = Image.open(io.BytesIO(uploaded.getvalue()))

            # Run visual question answering on the image/question pair.
            vqa_result = vqa_pipeline({"image": image, "question": question})

            # The pipeline returns a list of dicts, each containing an
            # 'answer' key; take the top-ranked answer.
            if vqa_result:
                answer = vqa_result[0]['answer']
                st.write(f"Answer: {answer}")

                # Synthesize the answer to a WAV file and play it back.
                speech = tts_pipeline(answer, forward_params={"do_sample": True})
                scipy.io.wavfile.write("bark_out.wav", rate=speech["sampling_rate"], data=speech["audio"])

                st.audio("bark_out.wav", format="audio/wav")
            else:
                # Fix: the original silently displayed nothing when the
                # pipeline returned an empty result — tell the user instead.
                st.write("Sorry, no answer could be generated for this image and question.")
        else:
            st.write("Please input an image and a question first.")

# Standard script entry guard: launch the Streamlit app only when this file
# is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()