lmzjms committed on
Commit
a4325b0
·
1 Parent(s): 4d6d83d

Update audio_foundation_models.py

Browse files
Files changed (1) hide show
  1. audio_foundation_models.py +93 -0
audio_foundation_models.py CHANGED
@@ -892,6 +892,99 @@ class Speech_Enh_SC:
892
  return audio_filename
893
 
894
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
895
  class Speech_SS:
896
  def __init__(self, device="cuda", model_name="lichenda/wsj0_2mix_skim_noncausal"):
897
  self.model_name = model_name
 
892
  return audio_filename
893
 
894
 
895
+ class Speech_SS:
896
+ def __init__(self, device="cuda", model_name="lichenda/wsj0_2mix_skim_noncausal"):
897
+ self.model_name = model_name
898
+ self.device = device
899
+ print("Initializing ESPnet SS to %s" % device)
900
+ self._initialize_model()
901
+
902
+ def _initialize_model(self):
903
+ from espnet_model_zoo.downloader import ModelDownloader
904
+ from espnet2.bin.enh_inference import SeparateSpeech
905
+
906
+ d = ModelDownloader()
907
+
908
+ cfg = d.download_and_unpack(self.model_name)
909
+ self.separate_speech = SeparateSpeech(
910
+ train_config=cfg["train_config"],
911
+ model_file=cfg["model_file"],
912
+ # for segment-wise process on long speech
913
+ segment_size=2.4,
914
+ hop_size=0.8,
915
+ normalize_segment_scale=False,
916
+ show_progressbar=True,
917
+ ref_channel=None,
918
+ normalize_output_wav=True,
919
+ device=self.device,
920
+ )
921
+
922
+ @prompts(name="Speech Separation",
923
+ description="useful for when you want to separate each speech from the speech mixture, "
924
+ "receives audio_path as input."
925
+ "The input to this tool should be a string, "
926
+ "representing the audio_path. ")
927
+ def inference(self, speech_path):
928
+ speech, sr = soundfile.read(speech_path)
929
+ enh_speech = self.separate_speech(speech[None, ...], fs=sr)
930
+ audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
931
+ if len(enh_speech) == 1:
932
+ soundfile.write(audio_filename, enh_speech[0].squeeze(), samplerate=sr)
933
+ else:
934
+ audio_filename_1 = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
935
+ soundfile.write(audio_filename_1, enh_speech[0].squeeze(), samplerate=sr)
936
+ audio_filename_2 = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
937
+ soundfile.write(audio_filename_2, enh_speech[1].squeeze(), samplerate=sr)
938
+ audio_filename = merge_audio(audio_filename_1, audio_filename_2)
939
+ return audio_filename
940
+
941
+ class Speech_Enh_SC:
942
+ """Speech Enhancement or Separation in single-channel
943
+ Example usage:
944
+ enh_model = Speech_Enh_SS("cuda")
945
+ enh_wav = enh_model.inference("./test_chime4_audio_M05_440C0213_PED_REAL.wav")
946
+ """
947
+
948
+ def __init__(self, device="cuda", model_name="espnet/Wangyou_Zhang_chime4_enh_train_enh_conv_tasnet_raw"):
949
+ self.model_name = model_name
950
+ self.device = device
951
+ print("Initializing ESPnet Enh to %s" % device)
952
+ self._initialize_model()
953
+
954
+ def _initialize_model(self):
955
+ from espnet_model_zoo.downloader import ModelDownloader
956
+ from espnet2.bin.enh_inference import SeparateSpeech
957
+
958
+ d = ModelDownloader()
959
+
960
+ cfg = d.download_and_unpack(self.model_name)
961
+ self.separate_speech = SeparateSpeech(
962
+ train_config=cfg["train_config"],
963
+ model_file=cfg["model_file"],
964
+ # for segment-wise process on long speech
965
+ segment_size=2.4,
966
+ hop_size=0.8,
967
+ normalize_segment_scale=False,
968
+ show_progressbar=True,
969
+ ref_channel=None,
970
+ normalize_output_wav=True,
971
+ device=self.device,
972
+ )
973
+
974
+ @prompts(name="Speech Enhancement In Single-Channel",
975
+ description="useful for when you want to enhance the quality of the speech signal by reducing background noise (single-channel), "
976
+ "receives audio_path as input."
977
+ "The input to this tool should be a string, "
978
+ "representing the audio_path. ")
979
+ def inference(self, speech_path, ref_channel=0):
980
+ speech, sr = soundfile.read(speech_path)
981
+ speech = speech[:, ref_channel]
982
+ enh_speech = self.separate_speech(speech[None, ...], fs=sr)
983
+ audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
984
+ soundfile.write(audio_filename, enh_speech[0].squeeze(), samplerate=sr)
985
+ return audio_filename
986
+
987
+
988
  class Speech_SS:
989
  def __init__(self, device="cuda", model_name="lichenda/wsj0_2mix_skim_noncausal"):
990
  self.model_name = model_name