CR7CAD committed
Commit 7df9b81 · verified · 1 Parent(s): c74b36f

Update app.py

Files changed (1):
  1. app.py +70 -14
app.py CHANGED
@@ -2,6 +2,9 @@
 import streamlit as st
 from transformers import pipeline
 import os
+import numpy as np
+import io
+import scipy.io.wavfile as wavfile
 
 # function part
 # img2text
@@ -41,15 +44,65 @@ def text2story(text):
 
     return story_text
 
-# text2audio
+# text2audio - REVISED to use facebook/mms-tts-eng model
 def text2audio(story_text):
-    tts = pipeline("text-to-speech", model="microsoft/speecht5_tts")
-    audio_output = tts(story_text)
-
-    return {
-        "audio": audio_output["audio"],
-        "sampling_rate": audio_output["sampling_rate"]
-    }
+    try:
+        # Use a smaller and more reliable TTS model
+        synthesizer = pipeline("text-to-speech", model="facebook/mms-tts-eng")
+
+        # Break the text into smaller chunks if needed (prevent timeout)
+        max_chunk_size = 200  # characters
+        chunks = []
+
+        for i in range(0, len(story_text), max_chunk_size):
+            chunk = story_text[i:i+max_chunk_size]
+            # Make sure we break at word boundaries
+            if i+max_chunk_size < len(story_text) and story_text[i+max_chunk_size] != ' ':
+                # Find the last space in this chunk
+                last_space = chunk.rfind(' ')
+                if last_space != -1:
+                    chunk = chunk[:last_space]
+
+            chunks.append(chunk)
+
+        # Process each chunk
+        audio_arrays = []
+        sampling_rate = None
+
+        for chunk in chunks:
+            if not chunk.strip():  # Skip empty chunks
+                continue
+
+            speech = synthesizer(chunk)
+            if sampling_rate is None:
+                sampling_rate = speech["sampling_rate"]
+
+            audio_arrays.append(speech["audio"])
+
+        # Combine all audio chunks
+        combined_audio = np.concatenate(audio_arrays)
+
+        # Create a BytesIO object to store the wave file
+        wav_buffer = io.BytesIO()
+        wavfile.write(wav_buffer, sampling_rate, combined_audio)
+        wav_buffer.seek(0)  # Rewind the buffer
+
+        return {
+            "audio": wav_buffer.getvalue(),
+            "sampling_rate": sampling_rate
+        }
+
+    except Exception as e:
+        st.error(f"Error generating audio: {str(e)}")
+        # Fallback to a pre-recorded audio file if available
+        try:
+            with open("fallback_audio.wav", "rb") as f:
+                return {
+                    "audio": f.read(),
+                    "sampling_rate": 22050  # Common sample rate
+                }
+        except:
+            return None
 
 # Function to save temporary image file
 def save_uploaded_image(uploaded_file):
@@ -91,12 +144,15 @@ if uploaded_file is not None:
 
     # Play button
    if st.button("Play Audio"):
-        st.audio(
-            audio_data["audio"],
-            format="audio/wav",
-            start_time=0,
-            sample_rate=audio_data["sampling_rate"]
-        )
+        if audio_data:
+            st.audio(
+                audio_data["audio"],
+                format="audio/wav",
+                start_time=0,
+                sample_rate=audio_data["sampling_rate"]
+            )
+        else:
+            st.error("Failed to generate audio. Please try again.")
 
     # Clean up the temporary file
    try:
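
For context, here is a minimal, self-contained sketch of the chunk-and-synthesize pattern this commit introduces. The helper name synthesize_long_text, the greedy word-boundary splitter, and the np.ravel() flattening are illustrative assumptions (the pipeline can return a 2-D waveform), not the committed implementation.

# Hedged sketch (not the committed code): synthesize long text in chunks,
# concatenate the waveforms, and return in-memory WAV bytes.
import io

import numpy as np
import scipy.io.wavfile as wavfile
from transformers import pipeline

def synthesize_long_text(text, max_chunk_size=200):
    # Assumption: the "text-to-speech" pipeline with facebook/mms-tts-eng
    # returns {"audio": waveform, "sampling_rate": int} for each call.
    synthesizer = pipeline("text-to-speech", model="facebook/mms-tts-eng")

    # Greedy word-boundary splitter: no chunk exceeds max_chunk_size characters
    # (except a single word longer than the limit, which becomes its own chunk).
    chunks, current = [], ""
    for word in text.split():
        candidate = f"{current} {word}".strip()
        if len(candidate) > max_chunk_size and current:
            chunks.append(current)
            current = word
        else:
            current = candidate
    if current:
        chunks.append(current)

    audio_parts, sampling_rate = [], None
    for chunk in chunks:
        speech = synthesizer(chunk)
        sampling_rate = sampling_rate or speech["sampling_rate"]
        # np.ravel() guards against a (1, n)-shaped waveform from the pipeline.
        audio_parts.append(np.ravel(speech["audio"]))
    if not audio_parts:
        return None, None

    # Serialize the combined waveform to an in-memory WAV file.
    buffer = io.BytesIO()
    wavfile.write(buffer, sampling_rate, np.concatenate(audio_parts))
    return buffer.getvalue(), sampling_rate

On the Streamlit side, encoded WAV bytes can be passed straight to st.audio; the sample rate is read from the WAV header, so the sample_rate argument is typically only needed for raw NumPy arrays:

wav_bytes, sr = synthesize_long_text("Once upon a time, a small robot learned to paint.")
if wav_bytes:
    st.audio(wav_bytes, format="audio/wav")
else:
    st.error("Failed to generate audio. Please try again.")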