import io

from PIL import Image
import scipy.io.wavfile  # explicit submodule import: bare `import scipy` may not expose scipy.io.wavfile
import streamlit as st
from transformers import pipeline

# NOTE(review): the original script ran `os.system("pip install scipy")` at
# import time. Installing packages at runtime is slow, non-reproducible, and a
# security risk -- dependencies belong in requirements.txt.


@st.cache_resource
def _load_pipelines():
    """Load the VQA and TTS pipelines once per server process.

    ``st.cache_resource`` keeps the (large) models in memory across Streamlit
    script reruns instead of reloading them on every interaction.

    Returns:
        tuple: (visual-question-answering pipeline, text-to-speech pipeline).
    """
    vqa = pipeline("visual-question-answering", model="microsoft/git-base-vqav2")
    tts = pipeline("text-to-speech", "suno/bark")
    return vqa, tts


def main():
    """Streamlit app: answer a question about an uploaded image, then speak the answer."""
    st.title("Visual Question Answering & Text-to-Audio App")

    # Load (or fetch from cache) the models up front so the first click is not
    # also paying the model-download cost mid-interaction.
    vqa_pipeline, tts_pipeline = _load_pipelines()

    image = st.file_uploader("Upload an image", type=["jpg", "png"])
    question = st.text_input("Enter your question")

    if st.button('Run Visual Question Answering'):
        if image and question:
            # Convert the uploaded file to a PIL image.
            pil_image = Image.open(io.BytesIO(image.getvalue()))

            # Run visual question answering on the image/question pair.
            vqa_result = vqa_pipeline({"image": pil_image, "question": question})

            # The pipeline returns a list of dicts, each carrying an 'answer' key.
            if vqa_result:
                answer = vqa_result[0]['answer']
                st.write(f"Answer: {answer}")

                # Synthesize the answer as audio and play it back. An in-memory
                # buffer avoids a fixed on-disk filename ("bark_out.wav") that
                # would be shared -- and clobbered -- across concurrent sessions.
                speech = tts_pipeline(answer, forward_params={"do_sample": True})
                wav_buffer = io.BytesIO()
                scipy.io.wavfile.write(
                    wav_buffer, rate=speech["sampling_rate"], data=speech["audio"]
                )
                st.audio(wav_buffer.getvalue(), format="audio/wav")
            else:
                # Original code silently did nothing here; surface the outcome.
                st.write("No answer was produced for this image and question.")
        else:
            st.write("Please input an image and a question first.")


if __name__ == "__main__":
    main()