Spaces:
Running
on
T4
Running
on
T4
Update app.py
Browse files
app.py
CHANGED
@@ -21,7 +21,7 @@ You can also use 🌬️💬📝WhisperSpeech by cloning this space. 🧬🔬
|
|
21 |
We're **celebrating the release of the whisperspeech** at [the LAION community, if you love open source ai learn more here : https://laion.ai/](https://laion.ai/) big thanks to the folks at huggingface for the community grant 🤗
|
22 |
|
23 |
### How to Use
|
24 |
-
Input text with
|
25 |
This space runs on ZeroGPU, so **you need to be patient** while you acquire the GPU and load the model the first time you make a request !
|
26 |
"""
|
27 |
|
@@ -45,23 +45,37 @@ def generate_segment_audio(text, lang, speaker_url, pipe):
|
|
45 |
resample_audio = resampler(newsr=24000)
|
46 |
audio_data_resampled = next(resample_audio([{'sample_rate': 24000, 'samples': audio_data.cpu()}]))['samples_24k']
|
47 |
audio_np = audio_data_resampled.cpu().numpy()
|
|
|
48 |
return audio_np
|
49 |
|
|
|
50 |
def concatenate_audio_segments(segments):
|
51 |
-
|
52 |
-
for
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
|
|
|
|
|
|
57 |
else:
|
58 |
-
stereo_segment =
|
59 |
|
60 |
-
|
61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
concatenated_audio = concatenated_audio / np.max(np.abs(concatenated_audio))
|
63 |
return concatenated_audio
|
64 |
|
|
|
|
|
65 |
@spaces.GPU
|
66 |
def whisper_speech_demo(multilingual_text, speaker_audio):
|
67 |
segments = parse_multilingual_text(multilingual_text)
|
@@ -75,9 +89,10 @@ def whisper_speech_demo(multilingual_text, speaker_audio):
|
|
75 |
for lang, text in segments:
|
76 |
text_str = text if isinstance(text, str) else str(text)
|
77 |
audio_np = generate_segment_audio(text_str, lang, speaker_url, pipe)
|
|
|
78 |
audio_segments.append(audio_np)
|
79 |
-
|
80 |
concatenated_audio = concatenate_audio_segments(audio_segments)
|
|
|
81 |
audio_stereo = np.stack((concatenated_audio, concatenated_audio), axis=-1)
|
82 |
audio_stereo = audio_stereo.reshape(-1, 2)
|
83 |
|
|
|
21 |
We're **celebrating the release of the whisperspeech** at [the LAION community, if you love open source ai learn more here : https://laion.ai/](https://laion.ai/) big thanks to the folks at huggingface for the community grant 🤗
|
22 |
|
23 |
### How to Use
|
24 |
+
Input text with the language identifiers provided to create a multilingual speech. Optionally you can add an audio sample to make a voice print. Scroll down and try the API <3 Gradio.
|
25 |
This space runs on ZeroGPU, so **you need to be patient** while you acquire the GPU and load the model the first time you make a request !
|
26 |
"""
|
27 |
|
|
|
45 |
resample_audio = resampler(newsr=24000)
|
46 |
audio_data_resampled = next(resample_audio([{'sample_rate': 24000, 'samples': audio_data.cpu()}]))['samples_24k']
|
47 |
audio_np = audio_data_resampled.cpu().numpy()
|
48 |
+
print("Shape after resampling:", audio_np.shape) # Debug statement
|
49 |
return audio_np
|
50 |
|
51 |
+
# Function to append and concatenate audio segments with padding
def concatenate_audio_segments(segments):
    """Concatenate audio segments into one peak-normalized stereo array.

    Each segment is converted to stereo (duplicating the mono channel when
    needed), zero-padded at the end to the length of the longest segment,
    and the padded segments are stacked back to back.

    Args:
        segments: non-empty list of numpy arrays, each mono ``(n,)`` /
            ``(n, 1)`` or stereo ``(n, 2)``.

    Returns:
        Float numpy array of shape ``(k, 2)`` scaled into ``[-1, 1]``
        (left unscaled only when the input is entirely silent).

    Raises:
        ValueError: if ``segments`` is empty.
    """
    if not segments:
        # max() below would raise a cryptic ValueError; fail with a clear message.
        raise ValueError("concatenate_audio_segments() requires at least one segment")

    # Determine the length of the longest segment
    max_length = max(seg.shape[0] for seg in segments)
    print("Max length of segments:", max_length)  # Debug statement

    # Pad each segment to the length of the longest segment and stack them
    padded_segments = []
    for seg in segments:
        # Check if the segment is stereo; if not, convert it to stereo.
        # A (n, 1) mono segment must be flattened first: stacking the 2-D
        # array directly would yield (n, 1, 2) instead of (n, 2).
        if seg.ndim == 1 or seg.shape[1] == 1:
            mono = seg.reshape(-1)
            stereo_segment = np.stack((mono, mono), axis=-1)
        else:
            stereo_segment = seg

        # Pad the segment with trailing silence up to the max length
        padding_length = max_length - stereo_segment.shape[0]
        padded_segment = np.pad(stereo_segment, ((0, padding_length), (0, 0)), 'constant')
        print("Padded segment shape:", padded_segment.shape)  # Debug statement
        padded_segments.append(padded_segment)

    concatenated_audio = np.vstack(padded_segments)

    print("Concatenated audio shape:", concatenated_audio.shape)  # Debug statement
    # Peak-normalize; guard against division by zero on all-silent input
    # (the unguarded division produced NaNs).
    peak = np.max(np.abs(concatenated_audio))
    if peak > 0:
        concatenated_audio = concatenated_audio / peak
    return concatenated_audio
|
76 |
|
77 |
+
# The rest of the code in app.py remains the same
|
78 |
+
|
79 |
@spaces.GPU
|
80 |
def whisper_speech_demo(multilingual_text, speaker_audio):
|
81 |
segments = parse_multilingual_text(multilingual_text)
|
|
|
89 |
for lang, text in segments:
|
90 |
text_str = text if isinstance(text, str) else str(text)
|
91 |
audio_np = generate_segment_audio(text_str, lang, speaker_url, pipe)
|
92 |
+
print("Audio segment shape:", audio_np.shape) # Debug statement
|
93 |
audio_segments.append(audio_np)
|
|
|
94 |
concatenated_audio = concatenate_audio_segments(audio_segments)
|
95 |
+
print("Final concatenated audio shape:", concatenated_audio.shape) # Debug statement
|
96 |
audio_stereo = np.stack((concatenated_audio, concatenated_audio), axis=-1)
|
97 |
audio_stereo = audio_stereo.reshape(-1, 2)
|
98 |
|