Spaces:
Running
on
T4
Running
on
T4
Update app.py
Browse files
app.py
CHANGED
@@ -21,7 +21,7 @@ You can also use 🌬️💬📝WhisperSpeech by cloning this space. 🧬🔬
|
|
21 |
We're **celebrating the release of the whisperspeech** at [the LAION community, if you love open source ai learn more here : https://laion.ai/](https://laion.ai/) big thanks to the folks at huggingface for the community grant 🤗
|
22 |
|
23 |
### How to Use
|
24 |
-
Input text with
|
25 |
This space runs on ZeroGPU, so **you need to be patient** while you acquire the GPU and load the model the first time you make a request !
|
26 |
"""
|
27 |
|
@@ -45,23 +45,37 @@ def generate_segment_audio(text, lang, speaker_url, pipe):
|
|
45 |
resample_audio = resampler(newsr=24000)
|
46 |
audio_data_resampled = next(resample_audio([{'sample_rate': 24000, 'samples': audio_data.cpu()}]))['samples_24k']
|
47 |
audio_np = audio_data_resampled.cpu().numpy()
|
|
|
48 |
return audio_np
|
49 |
|
|
|
50 |
def concatenate_audio_segments(segments):
|
51 |
-
|
52 |
-
for
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
|
|
|
|
|
|
57 |
else:
|
58 |
-
stereo_segment =
|
59 |
|
60 |
-
|
61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
concatenated_audio = concatenated_audio / np.max(np.abs(concatenated_audio))
|
63 |
return concatenated_audio
|
64 |
|
|
|
|
|
65 |
@spaces.GPU
|
66 |
def whisper_speech_demo(multilingual_text, speaker_audio):
|
67 |
segments = parse_multilingual_text(multilingual_text)
|
@@ -75,9 +89,10 @@ def whisper_speech_demo(multilingual_text, speaker_audio):
|
|
75 |
for lang, text in segments:
|
76 |
text_str = text if isinstance(text, str) else str(text)
|
77 |
audio_np = generate_segment_audio(text_str, lang, speaker_url, pipe)
|
|
|
78 |
audio_segments.append(audio_np)
|
79 |
-
|
80 |
concatenated_audio = concatenate_audio_segments(audio_segments)
|
|
|
81 |
audio_stereo = np.stack((concatenated_audio, concatenated_audio), axis=-1)
|
82 |
audio_stereo = audio_stereo.reshape(-1, 2)
|
83 |
|
|
|
21 |
We're **celebrating the release of the whisperspeech** at [the LAION community, if you love open source ai learn more here : https://laion.ai/](https://laion.ai/) big thanks to the folks at huggingface for the community grant 🤗
|
22 |
|
23 |
### How to Use
|
24 |
+
Input text with the language identifiers provided to create a multilingual speech. Optionally you can add an audio sample to make a voice print. Scroll down and try the API <3 Gradio.
|
25 |
This space runs on ZeroGPU, so **you need to be patient** while you acquire the GPU and load the model the first time you make a request !
|
26 |
"""
|
27 |
|
|
|
45 |
resample_audio = resampler(newsr=24000)
|
46 |
audio_data_resampled = next(resample_audio([{'sample_rate': 24000, 'samples': audio_data.cpu()}]))['samples_24k']
|
47 |
audio_np = audio_data_resampled.cpu().numpy()
|
48 |
+
print("Shape after resampling:", audio_np.shape) # Debug statement
|
49 |
return audio_np
|
50 |
|
51 |
+
# Function to append and concatenate audio segments with padding
def concatenate_audio_segments(segments):
    """Concatenate audio segments into one peak-normalized stereo array.

    Each segment is converted to stereo (duplicating the mono channel when
    needed), zero-padded at the end to the length of the longest segment,
    and the padded segments are stacked back to back.

    Args:
        segments: non-empty list of numpy arrays, each mono ``(n,)`` /
            ``(n, 1)`` or stereo ``(n, 2)``.

    Returns:
        Float numpy array of shape ``(k, 2)`` scaled into ``[-1, 1]``
        (left unscaled only when the input is entirely silent).

    Raises:
        ValueError: if ``segments`` is empty.
    """
    if not segments:
        # max() below would raise a cryptic ValueError; fail with a clear message.
        raise ValueError("concatenate_audio_segments() requires at least one segment")

    # Determine the length of the longest segment
    max_length = max(seg.shape[0] for seg in segments)
    print("Max length of segments:", max_length)  # Debug statement

    # Pad each segment to the length of the longest segment and stack them
    padded_segments = []
    for seg in segments:
        # Check if the segment is stereo; if not, convert it to stereo.
        # A (n, 1) mono segment must be flattened first: stacking the 2-D
        # array directly would yield (n, 1, 2) instead of (n, 2).
        if seg.ndim == 1 or seg.shape[1] == 1:
            mono = seg.reshape(-1)
            stereo_segment = np.stack((mono, mono), axis=-1)
        else:
            stereo_segment = seg

        # Pad the segment with trailing silence up to the max length
        padding_length = max_length - stereo_segment.shape[0]
        padded_segment = np.pad(stereo_segment, ((0, padding_length), (0, 0)), 'constant')
        print("Padded segment shape:", padded_segment.shape)  # Debug statement
        padded_segments.append(padded_segment)

    concatenated_audio = np.vstack(padded_segments)

    print("Concatenated audio shape:", concatenated_audio.shape)  # Debug statement
    # Peak-normalize; guard against division by zero on all-silent input
    # (the unguarded division produced NaNs).
    peak = np.max(np.abs(concatenated_audio))
    if peak > 0:
        concatenated_audio = concatenated_audio / peak
    return concatenated_audio
|
76 |
|
77 |
+
# The rest of the code in app.py remains the same
|
78 |
+
|
79 |
@spaces.GPU
|
80 |
def whisper_speech_demo(multilingual_text, speaker_audio):
|
81 |
segments = parse_multilingual_text(multilingual_text)
|
|
|
89 |
for lang, text in segments:
|
90 |
text_str = text if isinstance(text, str) else str(text)
|
91 |
audio_np = generate_segment_audio(text_str, lang, speaker_url, pipe)
|
92 |
+
print("Audio segment shape:", audio_np.shape) # Debug statement
|
93 |
audio_segments.append(audio_np)
|
|
|
94 |
concatenated_audio = concatenate_audio_segments(audio_segments)
|
95 |
+
print("Final concatenated audio shape:", concatenated_audio.shape) # Debug statement
|
96 |
audio_stereo = np.stack((concatenated_audio, concatenated_audio), axis=-1)
|
97 |
audio_stereo = audio_stereo.reshape(-1, 2)
|
98 |
|