Spaces:
Sleeping
Sleeping
File size: 1,676 Bytes
f38a4cc 00138cd e9ac35c 6f47d31 7b95213 6f47d31 d62fbdd 6f47d31 fc99228 d4d2e09 fc99228 d62fbdd 00138cd d62fbdd 00138cd d62fbdd d4d2e09 00138cd 29c2e62 00138cd 7b95213 d62fbdd 00138cd 6f47d31 00138cd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
import os
# NOTE(review): installing dependencies at import time via os.system is fragile
# (no error checking, shell injection surface, slows every cold start) —
# scipy belongs in the Space's requirements.txt instead.
os.system("pip install scipy")
from PIL import Image
import io
import streamlit as st
from transformers import pipeline
import scipy
# Initialize the visual-question-answering and text-to-speech pipelines.
# Both download/load their models here, at import time, so module import is slow.
vqa_pipeline = pipeline("visual-question-answering", model="microsoft/git-base-vqav2")
tts_pipeline = pipeline("text-to-speech", "suno/bark")
def main():
    """Streamlit app: answer a question about an uploaded image, then speak the answer.

    Reads an image and a question from the UI, runs the module-level
    ``vqa_pipeline`` on them, shows the top answer, synthesizes it to speech
    with ``tts_pipeline``, writes it to ``bark_out.wav``, and plays it.
    """
    st.title("Visual Question Answering & Text-to-Audio App")

    image = st.file_uploader("Upload an image", type=["jpg", "png"])
    question = st.text_input("Enter your question")

    if st.button('Run Visual Question Answering'):
        if image and question:
            # BUG FIX: a bare `import scipy` does not load the scipy.io.wavfile
            # submodule; importing it explicitly guarantees the write() call below
            # works regardless of what other libraries happened to import.
            import scipy.io.wavfile

            # Convert the uploaded file's raw bytes into a PIL image
            # (use a new name so the uploader handle isn't shadowed).
            pil_image = Image.open(io.BytesIO(image.getvalue()))

            # Run visual question answering on the uploaded image and question.
            vqa_result = vqa_pipeline({"image": pil_image, "question": question})

            # The pipeline is expected to return a list of dicts with an 'answer' key.
            if vqa_result:
                answer = vqa_result[0]['answer']
                st.write(f"Answer: {answer}")

                # Synthesize the answer to audio and play it back.
                speech = tts_pipeline(answer, forward_params={"do_sample": True})
                scipy.io.wavfile.write("bark_out.wav", rate=speech["sampling_rate"], data=speech["audio"])
                st.audio("bark_out.wav", format="audio/wav")
            else:
                # Previously an empty result failed silently; tell the user instead.
                st.write("Sorry, no answer could be generated for this input.")
        else:
            st.write("Please input an image and a question first.")


if __name__ == "__main__":
    main()
|