Spaces:

MonkeyDLLLLLLuffy
/

classagm

Sleeping

App Files Files Community

Leo Liu committed on Mar 4

Commit

38459f0

verified ·

1 Parent(s): 2d5d1a8

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -57

app.py CHANGED Viewed

@@ -1,58 +1,50 @@
-# import part
-from transformers import pipeline
 import streamlit as st
-# function part
-# img2text
-def img2text(url):
-    image_to_text_model = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
-    text = image_to_text_model(url)[0]["generated_text"]
-    return text
-# text2story
-def text2story(text):
-    story_text = ""   # to be completed
-    return story_text
-# text2audio
-def text2audio(story_text):
-    audio_data = ""     # to be completed
-    return audio_data
-# main part
-st.set_page_config(page_title="Your Image to Audio Story",
-                   page_icon="🦜")
-st.header("Turn Your Image to Audio Story")
-# Upload image here
-uploaded_file = st.file_uploader("Select an Image...")
-if uploaded_file is not None:
-    print(uploaded_file)
-    bytes_data = uploaded_file.getvalue()
-    with open(uploaded_file.name, "wb") as file:
-        file.write(bytes_data)
-    st.image(uploaded_file, caption="Uploaded Image",
-             use_column_width=True)
-    #Stage 1: Image to Text
-    st.text('Processing img2text...')
-    scenario = img2text(uploaded_file.name)
-    st.write(scenario)
-    #Stage 2: Text to Story
-    st.text('Generating a story...')
-    #story = text2story(scenario)
-    #st.write(story)
-    #Stage 3: Story to Audio data
-    #st.text('Generating audio data...')
-    #audio_data =text2audio(story)
-    # Play button
-    if st.button("Play Audio"):
-        #st.audio(audio_data['audio'],
-        #            format="audio/wav",
-        #            start_time=0,
-        #            sample_rate = audio_data['sampling_rate'])
-        st.audio("kids_playing_audio.wav")

 import streamlit as st
+from transformers import pipeline
+from PIL import Image
+import soundfile as sf
+import io
+# 1. 加载Pipeline
+#   - 图像→文本：使用 nlpconnect/vit-gpt2-image-captioning
+#   - 文本→语音：使用 facebook/mms-tts 或其它 TTS 模型
+img_to_text = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
+text_to_speech = pipeline("text-to-speech", model="facebook/mms-tts")
+st.title("Image-to-Text and Text-to-Speech App (WAV output)")
+# 2. 上传图片
+uploaded_image = st.file_uploader("Upload an image", type=["jpg", "png", "jpeg"])
+if uploaded_image:
+    # 显示图片
+    img = Image.open(uploaded_image)
+    st.image(img, caption="Uploaded Image", use_container_width=True)
+    # 3. 图像 → 文本
+    text_output = img_to_text(img)[0]["generated_text"]
+    st.write("### Extracted Text")
+    st.write(text_output)
+    # 4. 文本 → 语音 (TTS)
+    #    text_to_speech(...) 返回一个 dict，包含 "audio" (numpy数组) 和 "sampling_rate"
+    st.write("### Listen to Speech Output")
+    speech_output = text_to_speech(text_output)
+    # 5. 将返回的音频数组写到内存中的 WAV 文件
+    audio_array = speech_output["audio"]           # numpy array
+    sample_rate = speech_output["sampling_rate"]   # 采样率
+    wav_io = io.BytesIO()
+    # 利用 soundfile 将音频数组写入内存，并指定格式为 WAV
+    sf.write(wav_io, audio_array, sample_rate, format="WAV")
+    wav_io.seek(0)  # 将指针重置到开头，方便后续读取
+    # 6. 使用 st.audio 播放内存中的 WAV
+    st.audio(wav_io, format="audio/wav")
+    # 7. (可选) 提供下载按钮，下载 WAV 文件
+    st.download_button(
+        label="Download WAV",
+        data=wav_io,
+        file_name="speech.wav",
+        mime="audio/wav"
+    )