Spaces:
Runtime error
Runtime error
Update audio_foundation_models.py
Browse files- audio_foundation_models.py +93 -0
audio_foundation_models.py
CHANGED
@@ -892,6 +892,99 @@ class Speech_Enh_SC:
|
|
892 |
return audio_filename
|
893 |
|
894 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
895 |
class Speech_SS:
|
896 |
def __init__(self, device="cuda", model_name="lichenda/wsj0_2mix_skim_noncausal"):
|
897 |
self.model_name = model_name
|
|
|
892 |
return audio_filename
|
893 |
|
894 |
|
class Speech_SS:
    """Two-speaker speech separation backed by an ESPnet model from the model zoo."""

    def __init__(self, device="cuda", model_name="lichenda/wsj0_2mix_skim_noncausal"):
        # Remember the configuration, then pull down and build the separator.
        self.model_name = model_name
        self.device = device
        print("Initializing ESPnet SS to %s" % device)
        self._initialize_model()

    def _initialize_model(self):
        # Imports are local so the heavy ESPnet stack is only loaded on demand.
        from espnet_model_zoo.downloader import ModelDownloader
        from espnet2.bin.enh_inference import SeparateSpeech

        downloader = ModelDownloader()

        paths = downloader.download_and_unpack(self.model_name)
        self.separate_speech = SeparateSpeech(
            train_config=paths["train_config"],
            model_file=paths["model_file"],
            # for segment-wise process on long speech
            segment_size=2.4,
            hop_size=0.8,
            normalize_segment_scale=False,
            show_progressbar=True,
            ref_channel=None,
            normalize_output_wav=True,
            device=self.device,
        )

    @prompts(name="Speech Separation",
             description="useful for when you want to separate each speech from the speech mixture, "
                         "receives audio_path as input."
                         "The input to this tool should be a string, "
                         "representing the audio_path. ")
    def inference(self, speech_path):
        """Separate a mixture file; return the path of the resulting wav.

        One detected source -> a single wav; two sources -> each written
        separately and merged via merge_audio.
        """
        mixture, sample_rate = soundfile.read(speech_path)
        # Model expects a leading batch axis.
        separated = self.separate_speech(mixture[None, ...], fs=sample_rate)
        out_path = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
        if len(separated) == 1:
            soundfile.write(out_path, separated[0].squeeze(), samplerate=sample_rate)
            return out_path
        # Two estimated sources: persist each, then combine into one file.
        first_path = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
        soundfile.write(first_path, separated[0].squeeze(), samplerate=sample_rate)
        second_path = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
        soundfile.write(second_path, separated[1].squeeze(), samplerate=sample_rate)
        return merge_audio(first_path, second_path)
class Speech_Enh_SC:
    """Speech Enhancement or Separation in single-channel

    Example usage:
        enh_model = Speech_Enh_SC("cuda")
        enh_wav = enh_model.inference("./test_chime4_audio_M05_440C0213_PED_REAL.wav")
    """

    def __init__(self, device="cuda", model_name="espnet/Wangyou_Zhang_chime4_enh_train_enh_conv_tasnet_raw"):
        self.model_name = model_name
        self.device = device
        print("Initializing ESPnet Enh to %s" % device)
        self._initialize_model()

    def _initialize_model(self):
        # Lazy imports keep the ESPnet dependency out of module import time.
        from espnet_model_zoo.downloader import ModelDownloader
        from espnet2.bin.enh_inference import SeparateSpeech

        d = ModelDownloader()

        cfg = d.download_and_unpack(self.model_name)
        self.separate_speech = SeparateSpeech(
            train_config=cfg["train_config"],
            model_file=cfg["model_file"],
            # for segment-wise process on long speech
            segment_size=2.4,
            hop_size=0.8,
            normalize_segment_scale=False,
            show_progressbar=True,
            ref_channel=None,
            normalize_output_wav=True,
            device=self.device,
        )

    @prompts(name="Speech Enhancement In Single-Channel",
             description="useful for when you want to enhance the quality of the speech signal by reducing background noise (single-channel), "
                         "receives audio_path as input."
                         "The input to this tool should be a string, "
                         "representing the audio_path. ")
    def inference(self, speech_path, ref_channel=0):
        """Enhance one channel of the input file and return the enhanced wav path.

        ref_channel selects which channel to enhance when the file is
        multi-channel; it is ignored for mono input.
        """
        speech, sr = soundfile.read(speech_path)
        # Bug fix: soundfile.read returns a 1-D array for mono files, so the
        # unconditional `speech[:, ref_channel]` raised IndexError on exactly
        # the single-channel input this class is meant to handle. Only pick a
        # channel when one actually exists.
        if speech.ndim > 1:
            speech = speech[:, ref_channel]
        # Model expects a leading batch axis.
        enh_speech = self.separate_speech(speech[None, ...], fs=sr)
        audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
        soundfile.write(audio_filename, enh_speech[0].squeeze(), samplerate=sr)
        return audio_filename
987 |
+
|
988 |
class Speech_SS:
|
989 |
def __init__(self, device="cuda", model_name="lichenda/wsj0_2mix_skim_noncausal"):
|
990 |
self.model_name = model_name
|