vineelpratap
committed on
Update asr.py
Browse files
asr.py
CHANGED
@@ -1,7 +1,6 @@
|
|
1 |
import librosa
|
2 |
from transformers import Wav2Vec2ForCTC, AutoProcessor
|
3 |
import torch
|
4 |
-
import json
|
5 |
import numpy as np
|
6 |
|
7 |
from huggingface_hub import hf_hub_download
|
@@ -52,7 +51,7 @@ model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
|
|
52 |
# filename=decoding_config["lexiconfile"].rsplit("/", 1)[1],
|
53 |
# subfolder=decoding_config["lexiconfile"].rsplit("/", 1)[0],
|
54 |
# )
|
55 |
-
|
56 |
# beam_search_decoder = ctc_decoder(
|
57 |
# lexicon=lexicon_file,
|
58 |
# tokens=token_file,
|
@@ -67,20 +66,17 @@ model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
|
|
67 |
# )
|
68 |
|
69 |
|
70 |
-
def transcribe(audio_data, lang="eng (English)"):
|
71 |
|
72 |
if isinstance(audio_data, tuple):
|
73 |
# microphone
|
74 |
sr, audio_samples = audio_data
|
75 |
-
audio_samples = (audio_samples/32768.0).astype(np.float)
|
76 |
-
print("case1", audio_samples[:5])
|
77 |
assert sr == ASR_SAMPLING_RATE, "Invalid sampling rate"
|
78 |
else:
|
79 |
# file upload
|
80 |
isinstance(audio_data, str)
|
81 |
-
|
82 |
-
audio_samples = librosa.load(audio_fp, sr=ASR_SAMPLING_RATE, mono=True)[0]
|
83 |
-
print("case2", audio_samples[:5])
|
84 |
|
85 |
lang_code = lang.split()[0]
|
86 |
processor.tokenizer.set_target_lang(lang_code)
|
@@ -112,7 +108,7 @@ def transcribe(audio_data, lang="eng (English)"):
|
|
112 |
ids = torch.argmax(outputs, dim=-1)[0]
|
113 |
transcription = processor.decode(ids)
|
114 |
else:
|
115 |
-
assert False
|
116 |
# beam_search_result = beam_search_decoder(outputs.to("cpu"))
|
117 |
# transcription = " ".join(beam_search_result[0][0].words).strip()
|
118 |
|
@@ -128,4 +124,4 @@ ASR_EXAMPLES = [
|
|
128 |
ASR_NOTE = """
|
129 |
The above demo doesn't use beam-search decoding using a language model.
|
130 |
Checkout the instructions [here](https://huggingface.co/facebook/mms-1b-all) on how to run LM decoding for better accuracy.
|
131 |
-
"""
|
|
|
1 |
import librosa
|
2 |
from transformers import Wav2Vec2ForCTC, AutoProcessor
|
3 |
import torch
|
|
|
4 |
import numpy as np
|
5 |
|
6 |
from huggingface_hub import hf_hub_download
|
|
|
51 |
# filename=decoding_config["lexiconfile"].rsplit("/", 1)[1],
|
52 |
# subfolder=decoding_config["lexiconfile"].rsplit("/", 1)[0],
|
53 |
# )
|
54 |
+
|
55 |
# beam_search_decoder = ctc_decoder(
|
56 |
# lexicon=lexicon_file,
|
57 |
# tokens=token_file,
|
|
|
66 |
# )
|
67 |
|
68 |
|
69 |
+
def transcribe(audio_data, lang="eng (English)"):
|
70 |
|
71 |
if isinstance(audio_data, tuple):
|
72 |
# microphone
|
73 |
sr, audio_samples = audio_data
|
74 |
+
audio_samples = (audio_samples / 32768.0).astype(np.float)
|
|
|
75 |
assert sr == ASR_SAMPLING_RATE, "Invalid sampling rate"
|
76 |
else:
|
77 |
# file upload
|
78 |
isinstance(audio_data, str)
|
79 |
+
audio_samples = librosa.load(audio_data, sr=ASR_SAMPLING_RATE, mono=True)[0]
|
|
|
|
|
80 |
|
81 |
lang_code = lang.split()[0]
|
82 |
processor.tokenizer.set_target_lang(lang_code)
|
|
|
108 |
ids = torch.argmax(outputs, dim=-1)[0]
|
109 |
transcription = processor.decode(ids)
|
110 |
else:
|
111 |
+
assert False
|
112 |
# beam_search_result = beam_search_decoder(outputs.to("cpu"))
|
113 |
# transcription = " ".join(beam_search_result[0][0].words).strip()
|
114 |
|
|
|
124 |
ASR_NOTE = """
|
125 |
The above demo doesn't use beam-search decoding using a language model.
|
126 |
Checkout the instructions [here](https://huggingface.co/facebook/mms-1b-all) on how to run LM decoding for better accuracy.
|
127 |
+
"""
|