import os

# Hugging Face Spaces workaround: install scipy at startup since it is not preinstalled
os.system("pip install scipy")
# os.system('pip install -r requirements.txt')

import io

import scipy.io.wavfile
import streamlit as st
from PIL import Image
from transformers import pipeline

# Initialize the visual question answering and text-to-speech pipelines
vqa_pipeline = pipeline("visual-question-answering", model="microsoft/git-base-vqav2")
tts_pipeline = pipeline("text-to-speech", "suno/bark")


def main():
    st.title("Visual Question Answering & Text-to-Audio App")

    image = st.file_uploader("Upload an image", type=["jpg", "png"])
    question = st.text_input("Enter your question")

    if st.button("Run Visual Question Answering"):
        if image and question:
            # Convert the uploaded file into a PIL image
            image = Image.open(io.BytesIO(image.getvalue()))

            # Run visual question answering on the uploaded image and question
            vqa_result = vqa_pipeline({"image": image, "question": question})

            # vqa_result is expected to be a list of dicts, each containing an 'answer' key
            if vqa_result:  # make sure the result is not empty
                answer = vqa_result[0]["answer"]  # extract the answer
                st.write(f"Answer: {answer}")  # display the answer

                # Convert the answer to speech and play it back
                speech = tts_pipeline(answer, forward_params={"do_sample": True})
                scipy.io.wavfile.write("bark_out.wav", rate=speech["sampling_rate"], data=speech["audio"])
                st.audio("bark_out.wav", format="audio/wav")
        else:
            st.write("Please upload an image and enter a question first.")


if __name__ == "__main__":
    main()
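
A requirements.txt is referenced by the commented-out install line above but is not shown here. A minimal sketch, assuming only the packages this script imports plus torch (which the transformers pipelines need as a backend), might look like:

streamlit
transformers
torch
scipy
Pillow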