CR7CAD committed
Commit 7df9b81 · verified · 1 Parent(s): c74b36f

Update app.py

Files changed (1):
  1. app.py +70 -14
app.py CHANGED
@@ -2,6 +2,9 @@
 import streamlit as st
 from transformers import pipeline
 import os
+import numpy as np
+import io
+import scipy.io.wavfile as wavfile
 
 # function part
 # img2text
@@ -41,15 +44,65 @@ def text2story(text):
 
     return story_text
 
-# text2audio
+# text2audio - REVISED to use facebook/mms-tts-eng model
 def text2audio(story_text):
-    tts = pipeline("text-to-speech", model="microsoft/speecht5_tts")
-    audio_output = tts(story_text)
-
-    return {
-        "audio": audio_output["audio"],
-        "sampling_rate": audio_output["sampling_rate"]
-    }
+    try:
+        # Use a smaller and more reliable TTS model
+        synthesizer = pipeline("text-to-speech", model="facebook/mms-tts-eng")
+
+        # Break the text into smaller chunks if needed (prevent timeout)
+        max_chunk_size = 200  # characters
+        chunks = []
+
+        for i in range(0, len(story_text), max_chunk_size):
+            chunk = story_text[i:i+max_chunk_size]
+            # Make sure we break at word boundaries
+            if i+max_chunk_size < len(story_text) and story_text[i+max_chunk_size] != ' ':
+                # Find the last space in this chunk
+                last_space = chunk.rfind(' ')
+                if last_space != -1:
+                    chunk = chunk[:last_space]
+
+            chunks.append(chunk)
+
+        # Process each chunk
+        audio_arrays = []
+        sampling_rate = None
+
+        for chunk in chunks:
+            if not chunk.strip():  # Skip empty chunks
+                continue
+
+            speech = synthesizer(chunk)
+            if sampling_rate is None:
+                sampling_rate = speech["sampling_rate"]
+
+            audio_arrays.append(speech["audio"])
+
+        # Combine all audio chunks
+        combined_audio = np.concatenate(audio_arrays)
+
+        # Create a BytesIO object to store the wave file
+        wav_buffer = io.BytesIO()
+        wavfile.write(wav_buffer, sampling_rate, combined_audio)
+        wav_buffer.seek(0)  # Rewind the buffer
+
+        return {
+            "audio": wav_buffer.getvalue(),
+            "sampling_rate": sampling_rate
+        }
+
+    except Exception as e:
+        st.error(f"Error generating audio: {str(e)}")
+        # Fallback to a pre-recorded audio file if available
+        try:
+            with open("fallback_audio.wav", "rb") as f:
+                return {
+                    "audio": f.read(),
+                    "sampling_rate": 22050  # Common sample rate
+                }
+        except:
+            return None
 
 # Function to save temporary image file
 def save_uploaded_image(uploaded_file):
@@ -91,12 +144,15 @@ if uploaded_file is not None:
 
     # Play button
    if st.button("Play Audio"):
-        st.audio(
-            audio_data["audio"],
-            format="audio/wav",
-            start_time=0,
-            sample_rate=audio_data["sampling_rate"]
-        )
+        if audio_data:
+            st.audio(
+                audio_data["audio"],
+                format="audio/wav",
+                start_time=0,
+                sample_rate=audio_data["sampling_rate"]
+            )
+        else:
+            st.error("Failed to generate audio. Please try again.")
 
     # Clean up the temporary file
    try:
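
For context, here is a minimal, self-contained sketch of the chunk-and-synthesize pattern this commit introduces. The helper name synthesize_long_text, the greedy word-boundary splitter, and the np.ravel() flattening are illustrative assumptions (the pipeline can return a 2-D waveform), not the committed implementation.

# Hedged sketch (not the committed code): synthesize long text in chunks,
# concatenate the waveforms, and return in-memory WAV bytes.
import io

import numpy as np
import scipy.io.wavfile as wavfile
from transformers import pipeline

def synthesize_long_text(text, max_chunk_size=200):
    # Assumption: the "text-to-speech" pipeline with facebook/mms-tts-eng
    # returns {"audio": waveform, "sampling_rate": int} for each call.
    synthesizer = pipeline("text-to-speech", model="facebook/mms-tts-eng")

    # Greedy word-boundary splitter: no chunk exceeds max_chunk_size characters
    # (except a single word longer than the limit, which becomes its own chunk).
    chunks, current = [], ""
    for word in text.split():
        candidate = f"{current} {word}".strip()
        if len(candidate) > max_chunk_size and current:
            chunks.append(current)
            current = word
        else:
            current = candidate
    if current:
        chunks.append(current)

    audio_parts, sampling_rate = [], None
    for chunk in chunks:
        speech = synthesizer(chunk)
        sampling_rate = sampling_rate or speech["sampling_rate"]
        # np.ravel() guards against a (1, n)-shaped waveform from the pipeline.
        audio_parts.append(np.ravel(speech["audio"]))
    if not audio_parts:
        return None, None

    # Serialize the combined waveform to an in-memory WAV file.
    buffer = io.BytesIO()
    wavfile.write(buffer, sampling_rate, np.concatenate(audio_parts))
    return buffer.getvalue(), sampling_rate

On the Streamlit side, encoded WAV bytes can be passed straight to st.audio; the sample rate is read from the WAV header, so the sample_rate argument is typically only needed for raw NumPy arrays:

wav_bytes, sr = synthesize_long_text("Once upon a time, a small robot learned to paint.")
if wav_bytes:
    st.audio(wav_bytes, format="audio/wav")
else:
    st.error("Failed to generate audio. Please try again.")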