abdouaziiz
committed on
Commit
•
c542d41
1
Parent(s):
3f7fb84
Update README.md
Browse files
README.md
CHANGED
@@ -76,7 +76,67 @@ The following hyperparameters were used during training:
|
|
76 |
| 27000 | 0.084400 | 0.367826 | 0.212565 |
|
77 |
|
78 |
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
| 27000 | 0.084400 | 0.367826 | 0.212565 |
|
77 |
|
78 |
|
79 |
+
## Usage
|
80 |
+
The model can be used directly (without a language model) as follows:
|
81 |
+
```python
|
82 |
+
import librosa
|
83 |
+
import warnings
|
84 |
+
from transformers import AutoProcessor, AutoModelForCTC
|
85 |
+
from datasets import Dataset, DatasetDict
|
86 |
+
from datasets import load_metric
|
87 |
+
|
88 |
+
wer_metric = load_metric("wer")
|
89 |
+
|
90 |
+
wolof = pd.read_csv('Test.csv') # wolof contains the columns of file , and transcription
|
91 |
+
wolof = DatasetDict({'test': Dataset.from_pandas(wolof)})
|
92 |
+
|
93 |
+
chars_to_ignore_regex = '[\"\?\.\!\-\;\:\(\)\,]'
|
94 |
+
|
95 |
+
def remove_special_characters(batch):
|
96 |
+
batch["transcription"] = re.sub(chars_to_ignore_regex, '', batch["transcription"]).lower() + " "
|
97 |
+
return batch
|
98 |
+
|
99 |
+
|
100 |
+
wolof = wolof.map(remove_special_characters)
|
101 |
+
|
102 |
+
processor = AutoProcessor.from_pretrained("abdouaziiz/wav2vec2-xls-r-300m-wolof")
|
103 |
+
model = AutoModelForCTC.from_pretrained("abdouaziiz/wav2vec2-xls-r-300m-wolof")
|
104 |
+
|
105 |
+
warnings.filterwarnings("ignore")
|
106 |
+
def speech_file_to_array_fn(batch):
|
107 |
+
speech_array, sampling_rate = librosa.load(batch["file"], sr = 16000)
|
108 |
+
batch["speech"] = speech_array.astype('float16')
|
109 |
+
batch["sampling_rate"] = sampling_rate
|
110 |
+
batch["target_text"] = batch["transcription"]
|
111 |
+
return batch
|
112 |
+
|
113 |
+
wolof = wolof.map(speech_file_to_array_fn, remove_columns=wolof.column_names["test"], num_proc=1)
|
114 |
+
|
115 |
+
def map_to_result(batch):
|
116 |
+
model.to("cuda")
|
117 |
+
input_values = processor(
|
118 |
+
batch["speech"],
|
119 |
+
sampling_rate=batch["sampling_rate"],
|
120 |
+
return_tensors="pt"
|
121 |
+
).input_values.to("cuda")
|
122 |
+
|
123 |
+
with torch.no_grad():
|
124 |
+
logits = model(input_values).logits
|
125 |
+
pred_ids = torch.argmax(logits, dim=-1)
|
126 |
+
batch["pred_str"] = processor.batch_decode(pred_ids)[0]
|
127 |
+
|
128 |
+
return batch
|
129 |
+
|
130 |
+
results = wolof["test"].map(map_to_result)
|
131 |
+
|
132 |
+
print("Test WER: {:.3f}".format(wer_metric.compute(predictions=results["pred_str"], references=results["transcription"])))
|
133 |
+
|
134 |
+
```
|
135 |
+
|
136 |
+
## PS:
|
137 |
+
|
138 |
+
The results obtained can be improved by using:
|
139 |
+
|
140 |
+
- Wav2vec2 + language model.
|
141 |
+
- Build a spellchecker from the text of the data
|
142 |
+
- Sentence Edit Distance
|