Spaces:

unijoh
/

metaambod

Running

unijoh committed on Jun 15, 2024

Commit

413f61e

verified ·

1 Parent(s): 0fef023

Update asr.py

Files changed (1) hide show

asr.py CHANGED Viewed

@@ -1,17 +1,20 @@
 import librosa
-from transformers import pipeline
 import logging
 # Set up logging
 logging.basicConfig(level=logging.DEBUG)
 ASR_SAMPLING_RATE = 16_000
 try:
-    pipe = pipeline("automatic-speech-recognition", model="facebook/mms-1b-all")
-    logging.info("ASR pipeline loaded successfully.")
 except Exception as e:
-    logging.error(f"Error loading ASR pipeline: {e}")
 def transcribe(audio):
     try:
@@ -31,12 +34,19 @@ def transcribe(audio):
             logging.error(f"Error loading audio file with librosa: {e}")
             return f"ERROR: Unable to load audio file - {e}"
-        # Process the audio with the pipeline
-        try:
-            transcription = pipe(audio_samples)["text"]
-        except Exception as e:
-            logging.error(f"Error during transcription with pipeline: {e}")
-            return f"ERROR: Transcription failed - {e}"
         logging.info("Transcription completed successfully.")
         return transcription

 import librosa
+from transformers import AutoProcessor, Wav2Vec2ForCTC
+import torch
 import logging
 # Set up logging
 logging.basicConfig(level=logging.DEBUG)
 ASR_SAMPLING_RATE = 16_000
+MODEL_ID = "facebook/mms-1b-all"
 try:
+    processor = AutoProcessor.from_pretrained(MODEL_ID)
+    model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
+    logging.info("ASR model and processor loaded successfully.")
 except Exception as e:
+    logging.error(f"Error loading ASR model or processor: {e}")
 def transcribe(audio):
     try:
             logging.error(f"Error loading audio file with librosa: {e}")
             return f"ERROR: Unable to load audio file - {e}"
+        # Set the language for the processor to Faroese
+        lang_code = "fao"
+        processor.tokenizer.set_target_lang(lang_code)
+        model.load_adapter(lang_code)
+        # Process the audio with the processor
+        inputs = processor(audio_samples, sampling_rate=ASR_SAMPLING_RATE, return_tensors="pt")
+        with torch.no_grad():
+            outputs = model(**inputs).logits
+        ids = torch.argmax(outputs, dim=-1)[0]
+        transcription = processor.decode(ids)
         logging.info("Transcription completed successfully.")
         return transcription