add model files

Browse files

Files changed (7) hide show

.gitattributes +1 -0
README.md +6 -14
config.json +3 -0
decoder.ckpt +3 -0
encoder.ckpt +0 -0
hyperparams.yaml +105 -0
masknet.ckpt +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.psd filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -85,13 +85,15 @@ Please notice that we encourage you to read our tutorials and learn more about
 ### Transcribing your own audio files (in German)
 ```python
 from speechbrain.pretrained import WhisperASR
-asr_model = WhisperASR.from_hparams(source="speechbrain/rescuespeech_whisper", savedir="pretrained_models/rescuespeech_whisper")
-asr_model.transcribe_file("speechbrain/rescuespeech_whisper/example_de.wav")
 ```
 ### Inference on GPU
 To perform inference on the GPU, add  `run_opts={"device":"cuda"}`  when calling the `from_hparams` method.
@@ -136,14 +138,4 @@ GitHub: https://github.com/speechbrain/speechbrain
-```bash
-from speechbrain.pretrained import SepformerSeparation as Separator
-from speechbrain.pretrained import WhisperASR
-enh_model = Separator.from_hparams(source="CKPT+2023-06-24+21-49-17+00", savedir='pretrained_models/sepformer_rescuespeech', hparams_file='hyperparams_asr.yaml')
-asr_model = WhisperASR.from_hparams(source="CKPT+2023-06-24+21-49-17+00", savedir="pretrained_models/whisper_rescuespeech", hparams_file='hyperparams_asr.yaml')
-# For custom file, change the path accordingly
-est_sources = enh_model.separate_file(path='example_rescuespeech16k.wav')
-print(asr_model(est_sources[:, :, 0]))
-```

 ### Transcribing your own audio files (in German)
 ```python
+from speechbrain.pretrained import SepformerSeparation as Separator
 from speechbrain.pretrained import WhisperASR
+enh_model = Separator.from_hparams(source="speechbrain/noisy-whisper-resucespeech", savedir='pretrained_models/noisy-whisper-resucespeech')
+asr_model = WhisperASR.from_hparams(source="speechbrain/noisy-whisper-resucespeech", savedir="pretrained_models/noisy-whisper-resucespeech")
+# For custom file, change the path accordingly
+est_sources = enh_model.separate_file(path='example_rescuespeech16k.wav')
+print(asr_model(est_sources[:, :, 0]))
 ```
 ### Inference on GPU
 To perform inference on the GPU, add  `run_opts={"device":"cuda"}`  when calling the `from_hparams` method.

config.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+    "speechbrain_interface": "SepformerSeparation"
+}

decoder.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:00d272f965100f627a4a43d45dd919a7caf867372035139a91b8ece174c8b5f1
+size 17195

encoder.ckpt CHANGED Viewed

Binary files a/encoder.ckpt and b/encoder.ckpt differ

hyperparams.yaml ADDED Viewed

	@@ -0,0 +1,105 @@

+# Generated 2023-06-24 from:
+# /netscratch/sagar/thesis/speechbrain/recipes/RescueSpeech/Enhancement/joint-training/transformers/hparams/robust_asr_16k.yaml
+# yamllint disable
+# Model: SepFormer (speech enhancement) + Whisper (ASR)
+# Augmentation: SpecAugment
+# Authors: Sangeet Sagar 2023
+# ################################
+# HuggingFace hub identifier of the biggest Whisper model (large-v2).
+whisper_hub: openai/whisper-large-v2
+language: german
+## Model parameters
+sample_rate: 16000
+freeze_whisper: false
+freeze_encoder_only: false
+freeze_encoder: true
+# These values are only used for the searchers.
+# They need to be hardcoded and should not be changed with Whisper.
+# They are used as part of the searching process.
+# The bos token of the searcher will be timestamp_index
+# and will be concatenated with the bos, language and task tokens.
+timestamp_index: 50363
+eos_index: 50257
+bos_index: 50258
+# Decoding parameters
+min_decode_ratio: 0.0
+max_decode_ratio: 1.0
+test_beam_size: 8
+num_spks: 1
+# Enhancement model
+Encoder: &id004 !new:speechbrain.lobes.models.dual_path.Encoder
+  kernel_size: 16
+  out_channels: 256
+SBtfintra: &id001 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
+  num_layers: 8
+  d_model: 256
+  nhead: 8
+  d_ffn: 1024
+  dropout: 0
+  use_positional_encoding: true
+  norm_before: true
+SBtfinter: &id002 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
+  num_layers: 8
+  d_model: 256
+  nhead: 8
+  d_ffn: 1024
+  dropout: 0
+  use_positional_encoding: true
+  norm_before: true
+MaskNet: &id005 !new:speechbrain.lobes.models.dual_path.Dual_Path_Model
+  num_spks: 1
+  in_channels: 256
+  out_channels: 256
+  num_layers: 2
+  K: 250
+  intra_model: *id001
+  inter_model: *id002
+  norm: ln
+  linear_layer_after_inter_intra: false
+  skip_around_intra: true
+# Enhancement decoder, followed by Whisper ASR and its search decoder
+Decoder: &id006 !new:speechbrain.lobes.models.dual_path.Decoder
+  in_channels: 256
+  out_channels: 1
+  kernel_size: 16
+  stride: 8
+  bias: false
+whisper: &id003 !new:speechbrain.lobes.models.huggingface_whisper.HuggingFaceWhisper
+    source: !ref <whisper_hub>
+    freeze: !ref <freeze_whisper>
+    freeze_encoder: !ref <freeze_encoder>
+    save_path: whisper_checkpoints
+    encoder_only:  False
+decoder: !new:speechbrain.decoders.seq2seq.S2SWhisperGreedySearch
+  model: *id003
+  bos_index: 50363
+  eos_index: 50257
+  min_decode_ratio: 0.0
+  max_decode_ratio: 1.0
+# Change the path to use a local model instead of the remote one
+pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+  loadables:
+    encoder: !ref <Encoder>
+    masknet: !ref <MaskNet>
+    decoder: !ref <Decoder>
+    whisper: !ref <whisper>
+modules:
+  encoder: *id004
+  masknet: *id005
+  decoder: *id006
+  whisper: *id003

masknet.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:79872065eba79aa6b2b51ea21b918491e0f9e7a7f87eea8bd2d6fe9aa434c9d7
+size 112839555