kevinwang676 committed on
Commit
cbe014c
·
verified ·
1 Parent(s): a35e94c

Update speech_edit.py

Browse files
Files changed (1) hide show
  1. speech_edit.py +41 -6
speech_edit.py CHANGED
@@ -8,7 +8,8 @@ CosyVoice gRPC back‑end – updated to mirror the FastAPI logic
8
  * inference_instruct2 ➜ new: prompt‑audio + speed (no speaker‑ID)
9
  """
10
 
11
- import io, os, tempfile, requests, soundfile as sf, torchaudio
 
12
  import sys
13
  from concurrent import futures
14
  import argparse
@@ -148,14 +149,48 @@ class CosyVoiceServiceImpl(cosyvoice_pb2_grpc.CosyVoiceServicer):
148
  if request.HasField("cross_lingual_request"):
149
  logging.info("Received cross‑lingual inference request")
150
  cr = request.cross_lingual_request
151
- prompt = _bytes_to_tensor(cr.prompt_audio)
152
- mo = self.cosyvoice.inference_cross_lingual(
153
- cr.tts_text,
154
- prompt
155
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  yield from _yield_audio(mo)
157
  return
158
 
 
159
  # 4. Instruction‑TTS (two flavours)
160
  if request.HasField("instruct_request"):
161
  ir = request.instruct_request
 
8
  * inference_instruct2 ➜ new: prompt‑audio + speed (no speaker‑ID)
9
  """
10
 
11
+ import io, tempfile, requests, soundfile as sf, torchaudio
12
+ import os
13
  import sys
14
  from concurrent import futures
15
  import argparse
 
149
  if request.HasField("cross_lingual_request"):
150
  logging.info("Received cross‑lingual inference request")
151
  cr = request.cross_lingual_request
152
+ tmp_path = None
153
+
154
+ try:
155
+ if cr.prompt_audio.startswith(b'http'): # S3 URL case
156
+ url = cr.prompt_audio.decode('utf‑8')
157
+ logging.info("Downloading cross‑lingual prompt from %s", url)
158
+ resp = requests.get(url, timeout=10)
159
+ resp.raise_for_status()
160
+
161
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
162
+ f.write(resp.content)
163
+ tmp_path = f.name
164
+
165
+ wav, sr = sf.read(tmp_path, dtype='float32')
166
+ if wav.ndim > 1:
167
+ wav = wav.mean(axis=1)
168
+ if sr != 16_000:
169
+ wav = torchaudio.functional.resample(
170
+ torch.from_numpy(wav).unsqueeze(0), sr, 16_000
171
+ )[0].numpy()
172
+ prompt = torch.from_numpy(wav).unsqueeze(0)
173
+
174
+ else: # legacy raw bytes
175
+ prompt = _bytes_to_tensor(cr.prompt_audio)
176
+
177
+ mo = self.cosyvoice.inference_cross_lingual(
178
+ cr.tts_text,
179
+ prompt
180
+ )
181
+
182
+ finally:
183
+ if tmp_path and os.path.exists(tmp_path):
184
+ try:
185
+ os.remove(tmp_path)
186
+ except Exception as e:
187
+ logging.warning("Could not remove temp file %s: %s",
188
+ tmp_path, e)
189
+
190
  yield from _yield_audio(mo)
191
  return
192
 
193
+
194
  # 4. Instruction‑TTS (two flavours)
195
  if request.HasField("instruct_request"):
196
  ir = request.instruct_request