m-a-p
/

MERT-v1-330M

@@ -55,7 +55,8 @@ More details will be written in our coming-soon paper.
 # Model Usage
 ```python
-from transformers import Wav2Vec2Processor
 from transformers import AutoModel
 import torch
 from torch import nn
@@ -63,28 +64,32 @@ import torchaudio.transforms as T
 from datasets import load_dataset
 # load demo audio and set processor
 dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
 dataset = dataset.sort("id")
 sampling_rate = dataset.features["audio"].sampling_rate
-processor = Wav2Vec2Processor.from_pretrained("m-a-p/MERT-v1-330M")
-resample_rate = processor.feature_extractor.sampling_rate
 # make sure the sampling rates are aligned
 if resample_rate != sampling_rate:
-  resampler = T.Resample(sample_rate, resample_rate)
 else:
-  resampler = None
-# loading our model weights
-commit_hash='7bab7bb5d8b52448eff4873a980dc17f0015a09c'# this is recommended for security reason, the hash might be updated
-model = AutoModel.from_pretrained("m-a-p/MERT-v1-330M", trust_remote_code=True, revision=commit_hash)
 # audio file is decoded on the fly
 if resampler is None:
 	input_audio = dataset[0]["audio"]["array"]
 else:
-  input_audio = resampler(dataset[0]["audio"]["array"])
 inputs = processor(input_audio, sampling_rate=resample_rate, return_tensors="pt")
 with torch.no_grad():
@@ -100,7 +105,7 @@ time_reduced_hidden_states = all_layer_hidden_states.mean(-2)
 print(time_reduced_hidden_states.shape) # [25, 1024]
 # you can even use a learnable weighted average representation
-aggregator = nn.Conv1d(in_channels=13, out_channels=1, kernel_size=1)
 weighted_avg_hidden_states = aggregator(time_reduced_hidden_states.unsqueeze(0)).squeeze()
 print(weighted_avg_hidden_states.shape) # [1024]
 ```

 # Model Usage
 ```python
+# from transformers import Wav2Vec2Processor
+from transformers import Wav2Vec2FeatureExtractor
 from transformers import AutoModel
 import torch
 from torch import nn
 from datasets import load_dataset
+commit_hash='b74e8bdecaa1aa58bbd1fd752a7db0695194d9bb'# this is recommended for security reason, the hash might be updated
+# loading our model weights
+model = AutoModel.from_pretrained("m-a-p/MERT-v1-330M", trust_remote_code=True, revision=commit_hash)
+# loading the corresponding preprocessor config
+processor = Wav2Vec2FeatureExtractor.from_pretrained("m-a-p/MERT-v1-330M",trust_remote_code=True, revision=commit_hash)
 # load demo audio and set processor
 dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
 dataset = dataset.sort("id")
 sampling_rate = dataset.features["audio"].sampling_rate
+resample_rate = processor.sampling_rate
 # make sure the sampling rates are aligned
 if resample_rate != sampling_rate:
+    print(f'setting rate from {sampling_rate} to {resample_rate}')
+    resampler = T.Resample(sampling_rate, resample_rate)
 else:
+    resampler = None
 # audio file is decoded on the fly
 if resampler is None:
 	input_audio = dataset[0]["audio"]["array"]
 else:
+  input_audio = resampler(torch.from_numpy(dataset[0]["audio"]["array"]))
 inputs = processor(input_audio, sampling_rate=resample_rate, return_tensors="pt")
 with torch.no_grad():
 print(time_reduced_hidden_states.shape) # [25, 1024]
 # you can even use a learnable weighted average representation
+aggregator = nn.Conv1d(in_channels=25, out_channels=1, kernel_size=1)
 weighted_avg_hidden_states = aggregator(time_reduced_hidden_states.unsqueeze(0)).squeeze()
 print(weighted_avg_hidden_states.shape) # [1024]
 ```