Tonic committed
Commit 67dbfa2
1 Parent(s): aa10543

Update app.py

Files changed (1)
  app.py +26 -11
app.py CHANGED
@@ -21,7 +21,7 @@ You can also use 🌬️💬📝WhisperSpeech by cloning this space. 🧬🔬
 We're **celebrating the release of the whisperspeech** at [the LAION community, if you love open source ai learn more here : https://laion.ai/](https://laion.ai/) big thanks to the folks at huggingface for the community grant 🤗
 
 ### How to Use
-Input text with the language identifiers provided to create a multilingual speech. Optionally you can add an audiosample to make a voice print.Scroll down and try the api <3 Gradio.
+Input text with tahe language identifiers provided to create a multilingual speech. Optionally you can add an audiosample to make a voice print.Scroll down and try the api <3 Gradio.
 This space runs on ZeroGPU, so **you need to be patient** while you acquire the GPU and load the model the first time you make a request !
 """
 
@@ -45,23 +45,37 @@ def generate_segment_audio(text, lang, speaker_url, pipe):
     resample_audio = resampler(newsr=24000)
     audio_data_resampled = next(resample_audio([{'sample_rate': 24000, 'samples': audio_data.cpu()}]))['samples_24k']
     audio_np = audio_data_resampled.cpu().numpy()
+    print("Shape after resampling:", audio_np.shape)  # Debug statement
     return audio_np
 
+# Function to append and concatenate audio segments with padding
 def concatenate_audio_segments(segments):
-    concatenated_audio_data = []
-    for segment in segments:
-        if segment.ndim == 1:
-            stereo_segment = np.stack((segment, segment), axis=-1)
-        elif segment.shape[1] == 1:
-            stereo_segment = np.concatenate((segment, segment), axis=1)
+    # Determine the length of the longest segment
+    max_length = max(seg.shape[0] for seg in segments)
+    print("Max length of segments:", max_length)  # Debug statement
+    # Pad each segment to the length of the longest segment and stack them
+    padded_segments = []
+    for seg in segments:
+        # Check if the segment is stereo; if not, convert it to stereo
+        if seg.ndim == 1 or seg.shape[1] == 1:
+            stereo_segment = np.stack((seg, seg), axis=-1)
         else:
-            stereo_segment = segment
+            stereo_segment = seg
 
-        concatenated_audio_data.append(stereo_segment)
-    concatenated_audio = np.vstack(concatenated_audio_data)
+        # Pad the segment to the max length
+        padding_length = max_length - stereo_segment.shape[0]
+        padded_segment = np.pad(stereo_segment, ((0, padding_length), (0, 0)), 'constant')
+        print("Padded segment shape:", padded_segment.shape)  # Debug statement
+        padded_segments.append(padded_segment)
+
+    concatenated_audio = np.vstack(padded_segments)
+
+    print("Concatenated audio shape:", concatenated_audio.shape)  # Debug statement
     concatenated_audio = concatenated_audio / np.max(np.abs(concatenated_audio))
     return concatenated_audio
 
+# The rest of the code in app.py remains the same
+
 @spaces.GPU
 def whisper_speech_demo(multilingual_text, speaker_audio):
     segments = parse_multilingual_text(multilingual_text)
@@ -75,9 +89,10 @@ def whisper_speech_demo(multilingual_text, speaker_audio):
     for lang, text in segments:
         text_str = text if isinstance(text, str) else str(text)
         audio_np = generate_segment_audio(text_str, lang, speaker_url, pipe)
+        print("Audio segment shape:", audio_np.shape)  # Debug statement
         audio_segments.append(audio_np)
-
     concatenated_audio = concatenate_audio_segments(audio_segments)
+    print("Final concatenated audio shape:", concatenated_audio.shape)  # Debug statement
     audio_stereo = np.stack((concatenated_audio, concatenated_audio), axis=-1)
     audio_stereo = audio_stereo.reshape(-1, 2)
 
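For context, here is a minimal standalone sketch of what the new padding-and-concatenation logic does to two mono segments of unequal length. The dummy arrays and durations are assumptions for illustration; only the `np.stack`/`np.pad`/`np.vstack` pattern mirrors the committed `concatenate_audio_segments`:

```python
import numpy as np

# Hypothetical stand-ins for the arrays returned by generate_segment_audio
seg_a = np.random.rand(24000).astype(np.float32)  # ~1.0 s at 24 kHz
seg_b = np.random.rand(12000).astype(np.float32)  # ~0.5 s at 24 kHz

segments = [seg_a, seg_b]
max_length = max(seg.shape[0] for seg in segments)  # 24000

padded_segments = []
for seg in segments:
    stereo = np.stack((seg, seg), axis=-1)          # duplicate mono into (n, 2) stereo
    padding_length = max_length - stereo.shape[0]
    padded = np.pad(stereo, ((0, padding_length), (0, 0)), 'constant')  # zero-pad the tail
    padded_segments.append(padded)

concatenated = np.vstack(padded_segments)                    # (48000, 2)
concatenated = concatenated / np.max(np.abs(concatenated))   # peak-normalize, as in the commit
print(concatenated.shape)                                    # (48000, 2)
```

Because every segment is zero-padded up to `max_length` before stacking, a shorter segment contributes trailing silence before the next segment begins in the concatenated output.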
 
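A similarly small sketch of the stack-and-reshape idiom `whisper_speech_demo` uses to duplicate a signal across two channels, shown here on a short 1-D array (dummy values, illustration only):

```python
import numpy as np

mono = np.array([0.1, 0.2, 0.3], dtype=np.float32)

stereo = np.stack((mono, mono), axis=-1)  # (3, 2): one left/right pair per sample
stereo = stereo.reshape(-1, 2)            # still (3, 2) for 1-D input
print(stereo)
# [[0.1 0.1]
#  [0.2 0.2]
#  [0.3 0.3]]
```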