Commit 5f8297f
StormblessedKal committed
Parent(s): a81bf6e

deliverable api
Files changed:
- src/__pycache__/predict.cpython-310.pyc +0 -0
- src/__pycache__/rp_schema.cpython-310.pyc +0 -0
- src/__pycache__/se_extractor.cpython-310.pyc +0 -0
- src/predict.py +100 -29
- src/processed/4d651a78-ccbd-4f66-96b1-0e0ede048d77/raw/631a27e2-8466-463e-a6ca-a2afd468c5a3.wav +3 -0
- src/processed/69b28271-7198-4307-8501-e3969bbebef4/raw/631a27e2-8466-463e-a6ca-a2afd468c5a3.wav +3 -0
- src/rp_handler.py +30 -7
- src/rp_schema.py +16 -1
- src/se_extractor.py +28 -2
src/__pycache__/predict.cpython-310.pyc
CHANGED
Binary files a/src/__pycache__/predict.cpython-310.pyc and b/src/__pycache__/predict.cpython-310.pyc differ

src/__pycache__/rp_schema.cpython-310.pyc
CHANGED
Binary files a/src/__pycache__/rp_schema.cpython-310.pyc and b/src/__pycache__/rp_schema.cpython-310.pyc differ

src/__pycache__/se_extractor.cpython-310.pyc
CHANGED
Binary files a/src/__pycache__/se_extractor.cpython-310.pyc and b/src/__pycache__/se_extractor.cpython-310.pyc differ
src/predict.py
CHANGED
@@ -6,6 +6,11 @@ repository, with some modifications to make it work with the RP platform.
 
 from concurrent.futures import ThreadPoolExecutor
 import numpy as np
+import base64
+from pydub.utils import mediainfo
+import tempfile
+
+
 
 from runpod.serverless.utils import rp_cuda
 import boto3
@@ -52,6 +57,9 @@ from text_utils import TextCleaner
 from pydantic import BaseModel, HttpUrl
 from api import BaseSpeakerTTS, ToneColorConverter
 
+from pydub import AudioSegment
+
+
 class Predictor:
     def __init__(self):
         self.model = None
@@ -135,7 +143,70 @@ class Predictor:
         self.tone_color_converter.load_ckpt(f'{self.ckpt_converter}/checkpoint.pth')
 
 
-    def predict(self,s3_url,passage,method_type='voice_clone'):
+    def createvoice(self,audio_base_64,cut_audio,process_audio):
+        file_bytes = base64.b64decode(audio_base_64)
+        file_buffer = io.BytesIO(file_bytes)
+
+        header = file_buffer.read(12)
+        print(header)
+        file_format = None
+        bucket_name = 'demovidelyuseruploads'
+        if b'WAVE' in header:
+            file_format = 'wav'
+        elif header.startswith((b'\xff\xfb', b'\xff\xf3', b'\xff\xe3', b'\xff\xfa')):
+            file_format = 'mp3'
+        else:
+            file_format = 'unknown'
+        if file_format == 'unknown':
+            return {'error':'unrecognized file format, encode audio file as base64 str'}
+
+        unique_filename = f"{uuid.uuid4()}"
+
+        local_filename = f"{unique_filename}.{file_format}"
+        with open(local_filename, 'wb') as file_out:
+            file_out.write(file_bytes)
+
+        wav_filename = local_filename
+        if file_format == "mp3":
+            wav_filename = f"{unique_filename}.wav"
+            subprocess.run(["ffmpeg", "-i", local_filename, wav_filename])
+            os.remove(local_filename)
+        print(wav_filename)
+
+        # if cut_audio > 0, means it was set
+        if cut_audio > 0:
+            #need to cut
+            se_extractor.extract_segments_to_cut_audio(cut_audio,wav_filename)
+
+        file_url = f"https://{bucket_name}.s3.amazonaws.com/{wav_filename}"
+
+        if process_audio:
+            (new_sr, wav1) = self._fn(wav_filename,"Midpoint",32,0.5)
+            print('Denoised')
+            buffer = io.BytesIO()
+            sf.write(buffer, wav1, new_sr, format='WAV')
+            print(new_sr)
+            buffer.seek(0)
+        else:
+            wav1, sr = librosa.load(wav_filename, sr=None)
+            buffer = io.BytesIO()
+            sf.write(buffer, wav1, sr, format='WAV')
+            buffer.seek(0)
+
+        print("uploading")
+        content_type = "audio/wav"
+        try:
+            self.s3_client.put_object(Bucket=bucket_name, Key=wav_filename, Body=buffer, ContentType=content_type)
+            print("uploaded")
+        except Exception as e:
+            print(f"Error uploading to S3: {e}")
+            return {"error": str(e)}
+
+        os.remove(wav_filename)
+        return {"url": file_url}
+
+
+    def predict(self,s3_url,passage,process_audio,method_type='voice_clone'):
         output_dir = 'processed'
         gen_id = str(uuid.uuid4())
         os.makedirs(output_dir,exist_ok=True)
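The new createvoice path identifies the upload by magic bytes rather than by filename: a RIFF/WAVE file carries the ASCII tag WAVE in its first 12 bytes, while a bare MPEG audio stream opens with an 0xFFEx/0xFFFx frame-sync word (MP3s that begin with an ID3 tag would fall through to 'unknown'). A minimal standalone sketch of the same check; detect_format is an illustrative name, not part of the commit:

import base64

def detect_format(audio_base_64: str) -> str:
    # Mirrors createvoice: decode the payload, then sniff the first 12 bytes.
    header = base64.b64decode(audio_base_64)[:12]
    if b'WAVE' in header:  # 'RIFF....WAVE' container
        return 'wav'
    if header.startswith((b'\xff\xfb', b'\xff\xf3', b'\xff\xe3', b'\xff\xfa')):
        return 'mp3'  # raw MPEG frame sync
    return 'unknown'  # createvoice returns an error dict for this case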
@@ -159,29 +230,32 @@ class Predictor:
             #voice_clone with styletts2
             model,sampler = self.model,self.sampler
             processed_seg_dir = os.path.join(segments_dir,s3_key.split('.')[0],'wavs')
-            result = self.process_audio_file(
+            result = self.process_audio_file(local_file_path,passage,model,sampler)
             final_output = os.path.join(results_dir,f"{gen_id}-voice-clone-1.wav")
+
             sf.write(final_output,result,24000)
+            if process_audio:
+                (new_sr, wav1) = self._fn(final_output,"Midpoint",32,0.5)
+                sf.write(final_output,wav1,new_sr)
 
             base_speaker_tts,tone_color_converter = self.base_speaker_tts,self.tone_color_converter
             reference_speaker = local_file_path
             target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir=openvoice_dir, vad=False)
             src_path = os.path.join(results_dir,f"{gen_id}-tmp.wav")
-            openvoice_output = os.path.join(results_dir,f"{gen_id}-2.wav")
+            openvoice_output = os.path.join(results_dir,f"{gen_id}-voice-clone-2.wav")
             base_speaker_tts.tts(passage,src_path,speaker='default',language='English',speed=1.0)
 
             source_se = torch.load(f'{self.ckpt_base}/en_default_se.pth').to(self.device)
             tone_color_converter.convert(audio_src_path=src_path,src_se=source_se,tgt_se=target_se,output_path=openvoice_output,message='')
-
-
-
-
-
+            if process_audio:
+                (new_sr, wav1) = self._fn(openvoice_output,"Midpoint",32,0.5)
+                sf.write(openvoice_output,wav1,new_sr)
+
 
             mp3_final_output_1 = str(final_output).replace('wav','mp3')
-            mp3_final_output_2 = str(
+            mp3_final_output_2 = str(openvoice_output).replace('wav','mp3')
             self.convert_wav_to_mp3(final_output,mp3_final_output_1)
-            self.convert_wav_to_mp3(
+            self.convert_wav_to_mp3(openvoice_output,mp3_final_output_2)
             print(mp3_final_output_1)
             print(mp3_final_output_2)
 
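Each synthesis branch now ends with the same optional post-processing step: when process_audio is set, the freshly written WAV is run through self._fn (solver "Midpoint", 32 function evaluations, with 0.5 as the final positional argument, presumably a tau setting given the solver/nfe handling further down) and overwritten in place. A sketch of the repeated pattern as a helper; maybe_enhance is a hypothetical name, not part of the commit:

import soundfile as sf

def maybe_enhance(predictor, wav_path: str, process_audio: bool) -> None:
    # _fn returns (sample_rate, samples); the file is overwritten in place.
    if process_audio:
        new_sr, wav1 = predictor._fn(wav_path, "Midpoint", 32, 0.5)
        sf.write(wav_path, wav1, new_sr)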
@@ -200,19 +274,19 @@ class Predictor:
             base_speaker_tts.tts(passage,src_path,speaker='default',language='English',speed=1.0,use_emotions=True)
             source_se = torch.load(f'{self.ckpt_base}/en_style_se.pth').to(self.device)
             tone_color_converter.convert(audio_src_path=src_path,src_se=source_se,tgt_se=target_se,output_path=openvoice_output,message='')
-
-
-
+            if process_audio:
+                (new_sr, wav1) = self._fn(openvoice_output,"Midpoint",32,0.5)
+                sf.write(openvoice_output,wav1,new_sr)
 
-            mp3_final_output_1 = str(
-            self.convert_wav_to_mp3(
+            mp3_final_output_1 = str(openvoice_output).replace('wav','mp3')
+            self.convert_wav_to_mp3(openvoice_output,mp3_final_output_1)
             print(mp3_final_output_1)
             self.upload_file_to_s3(mp3_final_output_1,'demovidelyusergenerations',f"{gen_id}-voice-with-emotions.mp3")
             shutil.rmtree(os.path.join(output_dir,gen_id))
             return {"voice_clone_with_emotions":f"https://demovidelyusergenerations.s3.amazonaws.com/{gen_id}-voice-with-emotions.mp3"
             }
 
-        if method_type == '
+        if method_type == 'voice_clone_with_multi_lang':
             #voice clone with multi-lingugal
             _,tone_color_converter = self.base_speaker_tts,self.tone_color_converter
             reference_speaker = local_file_path
@@ -236,9 +310,10 @@ class Predictor:
             self.tone_color_converter.convert(audio_src_path=openai_multi_lang_path, src_se=source_se, tgt_se=target_se, output_path=multi_lang_with_voice_clone_path,message='')
 
             mp3_final_output_1 = str(multi_lang_with_voice_clone_path).replace('wav','mp3')
-            convert_wav_to_mp3(multi_lang_with_voice_clone_path,mp3_final_output_1)
+            self.convert_wav_to_mp3(multi_lang_with_voice_clone_path,mp3_final_output_1)
             print(mp3_final_output_1)
-            upload_file_to_s3(mp3_final_output_1,'demovidelyusergenerations',f"{gen_id}-voice-clone-multi-lang.mp3")
+            self.upload_file_to_s3(mp3_final_output_1,'demovidelyusergenerations',f"{gen_id}-voice-clone-multi-lang.mp3")
+            shutil.rmtree(os.path.join(output_dir,gen_id))
             return {"voice_clone_with_emotions":f"https://demovidelyusergenerations.s3.amazonaws.com/{gen_id}-voice-clone-multi-lang.mp3"
             }
 
@@ -249,7 +324,7 @@ class Predictor:
 
         solver = solver.lower()
         nfe = int(nfe)
-        lambd = 0.
+        lambd = 0.1 # lets remove denoise
 
         dwav, sr = torchaudio.load(path)
         dwav = dwav.mean(dim=0)
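If _fn follows the resemble-enhance convention (an assumption, though the solver/nfe handling here matches its demo code), lambd is the denoise strength: the enhancer input is a blend of a separately denoised signal and the original, so 0.1 keeps the enhancement pass while only lightly denoising, consistent with the author's comment. A hypothetical illustration of such a blend, not code from this commit:

import torch

def denoise_blend(dwav: torch.Tensor, denoised: torch.Tensor, lambd: float = 0.1) -> torch.Tensor:
    # lambd = 1.0 would feed the enhancer a fully denoised signal, 0.0 the raw input.
    return lambd * denoised + (1.0 - lambd) * dwav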
@@ -380,14 +455,9 @@ class Predictor:
 
         return torch.cat([ref_s, ref_p], dim=1)
 
-    def process_audio_file(self,
-        print(
-
-        print(audio_segs)
-        if len(audio_segs) >= 1:
-            s_ref = self.compute_style(audio_segs[0], model)
-        else:
-            raise NotImplementedError('No audio segments found!')
+    def process_audio_file(self,local_file_path,passage,model,sampler):
+        print(local_file_path)
+        s_ref = self.compute_style(local_file_path, model)
         sentences = split_and_recombine_text(passage)
         wavs = []
         s_prev = None
@@ -398,7 +468,7 @@ class Predictor:
                 s_prev,
                 s_ref,
                 alpha = 0,
-                beta = 0.
+                beta = 0.3,
                 t = 0.7,
                 diffusion_steps=10, embedding_scale=1)
             wavs.append(wav)
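These arguments feed StyleTTS2's long-form inference. In the reference implementation, alpha and beta set how much of the diffusion-predicted style replaces the style computed from the reference audio (compute_style above returns torch.cat([ref_s, ref_p], dim=1), acoustic half then prosodic half), and t is the convex weight on the previous sentence's style for long-form continuity. So alpha = 0 keeps timbre entirely from the reference, while beta = 0.3 admits 30% text-predicted prosody. A sketch of the blend, assuming the demo's 256-dim style layout:

import torch

def blend_styles(s_pred: torch.Tensor, ref_s: torch.Tensor,
                 alpha: float = 0.0, beta: float = 0.3) -> torch.Tensor:
    # First 128 dims: acoustic style (timbre); last 128: prosody.
    # alpha/beta weight the predicted style; the remainder comes from the reference.
    acoustic = alpha * s_pred[:, :128] + (1 - alpha) * ref_s[:, :128]
    prosody = beta * s_pred[:, 128:] + (1 - beta) * ref_s[:, 128:]
    return torch.cat([acoustic, prosody], dim=1)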
@@ -437,4 +507,5 @@ class Predictor:
             return False
         except Exception as e:
             print(f"Error uploading file: {e}")
-            return False
+            return False
+
src/processed/4d651a78-ccbd-4f66-96b1-0e0ede048d77/raw/631a27e2-8466-463e-a6ca-a2afd468c5a3.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c25a5bee8b60933b09cc779d942fa5c219f437e455bf64b08c2623f1c833ccfe
+size 322856
src/processed/69b28271-7198-4307-8501-e3969bbebef4/raw/631a27e2-8466-463e-a6ca-a2afd468c5a3.wav
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c25a5bee8b60933b09cc779d942fa5c219f437e455bf64b08c2623f1c833ccfe
+size 322856
src/rp_handler.py
CHANGED
@@ -21,13 +21,36 @@ MODEL.setup()
 @rp_debugger.FunctionTimer
 def run_voice_clone_job(job):
     job_input = job['input']
-    method_type = job_input
-
-
-
-
-
-
+    method_type = job_input.get('method_type')
+
+    if method_type not in ["create_voice","voice_clone","voice_clone_with_emotions","voice_clone_with_multi_lang"]:
+        return {"error":"Please set method_type: available options, create_voice, voice_clone, voice_clone_with_emotions,voice_clone_with_multi_lang"}
+
+    if method_type == "create_voice":
+        audio_base64 = job_input.get('audio_base64')
+        if audio_base64 is None:
+            return {"error":"Needs audio file as base64"}
+        cut_audio = job_input.get('cut_audio')
+        process_audio = job_input.get('process_audio')
+        print(process_audio)
+        if process_audio is None:
+            process_audio = False
+        if cut_audio is None:
+            cut_audio = 0
+
+        processed_urls = MODEL.createvoice(audio_base64,cut_audio,process_audio)
+        return processed_urls
+    else:
+        s3_url = job_input.get('s3_url')
+        passage = job_input.get('passage')
+        process_audio = job_input.get('process_audio')
+        print(process_audio)
+        if process_audio is None:
+            process_audio = False
+
+        result = MODEL.predict(s3_url,passage,process_audio,method_type)
+
+        return result
 
 
 runpod.serverless.start({"handler": run_voice_clone_job})
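With this dispatch in place, a create_voice job body looks like the sketch below; the file name is a placeholder and the field names follow rp_schema.py further down. The other three method_type values route to MODEL.predict and expect s3_url and passage instead.

import base64
import json

with open("reference.wav", "rb") as f:  # any wav or mp3 recording
    job = {
        "input": {
            "method_type": "create_voice",
            "audio_base64": base64.b64encode(f.read()).decode(),
            "cut_audio": 30,        # trim to ~30 s on a whisper segment boundary
            "process_audio": True,  # denoise/enhance before the S3 upload
        }
    }
print(json.dumps(job)[:80])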
src/rp_schema.py
CHANGED
@@ -14,5 +14,20 @@ INPUT_VALIDATIONS = {
         'required': False,
         'default': 'None'
     },
-
+    'audio_base64': {
+        'type': str,
+        'required': False,
+        'default': 'None'
+    },
+    'cut_audio': {
+        'type': int,
+        'required': False,
+        'default': 0
+    },
+    'process_audio': {
+        'type': bool,
+        'required': False,
+        'default': False
+    }
+
 }
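The three new entries mirror the defaults the handler applies via job_input.get(...). If the worker were to run runpod's schema validator as the stock templates do (an assumption, since the handler above reads job_input directly), the check would look roughly like:

from runpod.serverless.utils.rp_validator import validate
from rp_schema import INPUT_VALIDATIONS

# Returns {'validated_input': {...}} on success or {'errors': [...]} on failure.
checked = validate({"audio_base64": "...", "cut_audio": 30, "process_audio": True},
                   INPUT_VALIDATIONS)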
src/se_extractor.py
CHANGED
@@ -10,7 +10,7 @@ from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
 model_size = "medium"
 # Run on GPU with FP16
 model = None
-def split_audio_whisper(audio_path, target_dir='processed'):
+def split_audio_whisper(audio_path, target_dir='processed',needs_offset=True):
     global model
     if model is None:
         model = WhisperModel(model_size, device="cuda", compute_type="float16")
@@ -62,13 +62,18 @@ def split_audio_whisper(audio_path, target_dir='processed'):
         output_file = os.path.join(wavs_folder, fname)
         audio_seg.export(output_file, format='wav')
 
+        offset = 0.0
+        if needs_offset:
+            offset = 0.08
         if k < len(segments) - 1:
-            start_time = max(0, segments[k+1].start -
+            start_time = max(0, segments[k+1].start - offset)
 
         s_ind = s_ind + 1
     return wavs_folder
 
 
+
+
 def split_audio_vad(audio_path, target_dir, split_seconds=10.0):
     SAMPLE_RATE = 16000
     audio_vad = get_audio_tensor(audio_path)
@@ -155,3 +160,24 @@ def generate_voice_segments(audio_path, target_dir='processed', vad=True):
 def load_model():
     model = WhisperModel(model_size, device="cpu", compute_type="int8")
 
+
+def extract_segments_to_cut_audio(max_duration,audio_path,target_dir='processed'):
+    global model
+    if model is None:
+        model = WhisperModel(model_size, device="cuda", compute_type="float16")
+    audio = AudioSegment.from_file(audio_path)
+    max_len = len(audio)
+
+    segments, info = model.transcribe(audio_path, beam_size=5, word_timestamps=True)
+    segments = list(segments)
+    start_time = 0.0
+    end_time = max_len
+    for segment in segments:
+        print(segment.end)
+        if segment.end > max_duration:
+            end_time = segment.end * 1000
+            break
+    max_duration_audio = audio[start_time:end_time]
+    max_duration_audio.export(audio_path,format='wav')
+
+
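extract_segments_to_cut_audio transcribes the file with faster-whisper and cuts at the end of the first segment whose end time passes max_duration, trimming on a phrase boundary rather than mid-word (note the seconds-to-milliseconds conversion for pydub slicing), then overwrites the input file. A usage sketch with a placeholder path:

from pydub import AudioSegment
import se_extractor

se_extractor.extract_segments_to_cut_audio(30, "reference.wav")  # keep roughly the first 30 s
print(len(AudioSegment.from_file("reference.wav")) / 1000.0, "seconds")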