unijoh committed on
Commit
413f61e
1 Parent(s): 0fef023

Update asr.py

Browse files
Files changed (1) hide show
  1. asr.py +20 -10
asr.py CHANGED
@@ -1,17 +1,20 @@
1
  import librosa
2
- from transformers import pipeline
 
3
  import logging
4
 
5
  # Set up logging
6
  logging.basicConfig(level=logging.DEBUG)
7
 
8
  ASR_SAMPLING_RATE = 16_000
 
9
 
10
  try:
11
- pipe = pipeline("automatic-speech-recognition", model="facebook/mms-1b-all")
12
- logging.info("ASR pipeline loaded successfully.")
 
13
  except Exception as e:
14
- logging.error(f"Error loading ASR pipeline: {e}")
15
 
16
  def transcribe(audio):
17
  try:
@@ -31,12 +34,19 @@ def transcribe(audio):
31
  logging.error(f"Error loading audio file with librosa: {e}")
32
  return f"ERROR: Unable to load audio file - {e}"
33
 
34
- # Process the audio with the pipeline
35
- try:
36
- transcription = pipe(audio_samples)["text"]
37
- except Exception as e:
38
- logging.error(f"Error during transcription with pipeline: {e}")
39
- return f"ERROR: Transcription failed - {e}"
 
 
 
 
 
 
 
40
 
41
  logging.info("Transcription completed successfully.")
42
  return transcription
 
1
  import librosa
2
+ from transformers import AutoProcessor, Wav2Vec2ForCTC
3
+ import torch
4
  import logging
5
 
6
  # Set up logging
7
  logging.basicConfig(level=logging.DEBUG)
8
 
9
  ASR_SAMPLING_RATE = 16_000
10
+ MODEL_ID = "facebook/mms-1b-all"
11
 
12
  try:
13
+ processor = AutoProcessor.from_pretrained(MODEL_ID)
14
+ model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
15
+ logging.info("ASR model and processor loaded successfully.")
16
  except Exception as e:
17
+ logging.error(f"Error loading ASR model or processor: {e}")
18
 
19
  def transcribe(audio):
20
  try:
 
34
  logging.error(f"Error loading audio file with librosa: {e}")
35
  return f"ERROR: Unable to load audio file - {e}"
36
 
37
+ # Set the language for the processor to Faroese
38
+ lang_code = "fao"
39
+ processor.tokenizer.set_target_lang(lang_code)
40
+ model.load_adapter(lang_code)
41
+
42
+ # Process the audio with the processor
43
+ inputs = processor(audio_samples, sampling_rate=ASR_SAMPLING_RATE, return_tensors="pt")
44
+
45
+ with torch.no_grad():
46
+ outputs = model(**inputs).logits
47
+
48
+ ids = torch.argmax(outputs, dim=-1)[0]
49
+ transcription = processor.decode(ids)
50
 
51
  logging.info("Transcription completed successfully.")
52
  return transcription