Spaces:

Merlintxu
/

Wav2Txt

Sleeping

App Files Community

Merlintxu committed on Jul 7, 2024

Commit

2b71965

verified ·

1 Parent(s): 7fdbed5

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -28

app.py CHANGED Viewed

@@ -1,24 +1,30 @@
 import gradio as gr
-from transformers import pipeline
 import torch
 import librosa
 import subprocess
-from langdetect import detect
 import os
-# Models by language
 MODELS = {
     "es": [
-        "jonatasgrosman/wav2vec2-large-xlsr-53-spanish",
-        "openai/whisper-small",
-        "other/spanish-model"
     ],
     "en": [
         "facebook/wav2vec2-large-960h",
-        "openai/whisper-small",
-        "other/english-model"
     ],
-    # Add more models per language if needed
 }
 def convert_audio_to_wav(audio_path):
@@ -28,10 +34,36 @@ def convert_audio_to_wav(audio_path):
     return wav_path
 def detect_language(audio_path):
-    speech, _ = librosa.load(audio_path, sr=16000, duration=15)
-    transcriber = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-large-960h")
-    transcription = transcriber(speech)["text"]
-    return detect(transcription)
 def transcribe_audio(audio, model_name):
     wav_audio = convert_audio_to_wav(audio)
@@ -60,19 +92,21 @@ def detect_and_select_model(audio):
     return language, model_options
 def combined_interface(audio):
-    language, model_options = detect_and_select_model(audio)
-    selected_model = model_options[0]
-    transcription_file = transcribe_audio(audio, selected_model)
-    with open(transcription_file, "r", encoding="utf-8") as file:
-        transcription_text = file.read()
-    return {
-        "Detected Language": language,
-        "Available Models": model_options,
-        "Selected Model": selected_model,
-        "Transcription": transcription_text
-    }
 iface = gr.Interface(
     fn=combined_interface,
@@ -83,8 +117,8 @@ iface = gr.Interface(
         gr.Textbox(label="Selected Model"),
         gr.Textbox(label="Transcription", lines=10)
     ],
-    title="Multilingual Audio Transcriber",
-    description="Upload an audio file to detect the language, select the transcription model, and get the transcription."
 )
 if __name__ == "__main__":

 import gradio as gr
+from transformers import pipeline, AutoModelForCTC, AutoProcessor
 import torch
 import librosa
 import subprocess
+from langdetect import detect_langs
 import os
+import numpy as np
+# Updated models by language
 MODELS = {
     "es": [
+        "openai/whisper-large-v3",
+        "facebook/wav2vec2-large-xlsr-53-spanish",
+        "jonatasgrosman/wav2vec2-xls-r-1b-spanish"
     ],
     "en": [
+        "openai/whisper-large-v3",
         "facebook/wav2vec2-large-960h",
+        "microsoft/wav2vec2-base-960"
     ],
+    "pt": [
+        "facebook/wav2vec2-large-xlsr-53-portuguese",
+        "openai/whisper-medium",
+        "jonatasgrosman/wav2vec2-large-xlsr-53-portuguese"
+    ]
+    # Add more languages and models as needed
 }
 def convert_audio_to_wav(audio_path):
     return wav_path
 def detect_language(audio_path):
     """Guess the spoken language of an audio file.

     Loads at most the first 30 seconds of ``audio_path`` resampled to 16 kHz,
     transcribes that clip with three CTC models (Spanish, Portuguese and
     English wav2vec2 checkpoints), joins the transcripts and passes the
     combined text to ``langdetect.detect_langs``.

     Args:
         audio_path: Path to an audio file readable by ``librosa.load``.

     Returns:
         The language code chosen by ``select_language`` from the langdetect
         candidates (for example ``"es"``, ``"pt"`` or ``"en"``).
     """
     speech, _ = librosa.load(audio_path, sr=16000, duration=30)

     # NOTE: each checkpoint is loaded from scratch on every call; this is slow
     # but keeps only one model in memory at a time.
     model_names = (
         "facebook/wav2vec2-large-xlsr-53-spanish",
         "facebook/wav2vec2-large-xlsr-53-portuguese",
         "facebook/wav2vec2-large-960h",
     )
     transcriptions = []
     for model_name in model_names:
         processor = AutoProcessor.from_pretrained(model_name)
         model = AutoModelForCTC.from_pretrained(model_name)
         inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
         with torch.no_grad():
             logits = model(inputs.input_values).logits
         # Greedy CTC decoding: most likely token per frame.
         predicted_ids = torch.argmax(logits, dim=-1)
         transcriptions.append(processor.batch_decode(predicted_ids)[0])

     combined_text = " ".join(transcriptions)
     return select_language(detect_langs(combined_text))


 def select_language(langs, preferred="es", rival="pt", margin=0.2):
     """Choose a language code from langdetect-style candidates.

     The most probable candidate wins, except that when the winner is
     ``preferred`` or ``rival`` and their probabilities differ by less than
     ``margin``, ``preferred`` is returned (Spanish is favoured over
     Portuguese for Latin American content).

     The tie-break is applied only when one of the two languages is actually
     the top candidate. Comparing the two probabilities unconditionally would
     return ``preferred`` whenever neither language was detected at all
     (both probabilities default to 0), e.g. for clearly English text.

     Args:
         langs: Iterable of objects exposing ``lang`` and ``prob`` attributes.
         preferred: Code returned when the tie-break applies.
         rival: Code that competes with ``preferred`` in the tie-break.
         margin: Maximum probability gap that counts as a close call.

     Returns:
         The selected language code; ``preferred`` when ``langs`` is empty.
     """
     scores = {candidate.lang: candidate.prob for candidate in langs}
     if not scores:
         return preferred

     best = max(scores, key=scores.get)
     close_call = abs(scores.get(preferred, 0) - scores.get(rival, 0)) < margin
     if best in (preferred, rival) and close_call:
         return preferred
     return best
 def transcribe_audio(audio, model_name):
     wav_audio = convert_audio_to_wav(audio)
     return language, model_options
 def combined_interface(audio):
     """Detect the language of ``audio``, transcribe it and build the UI outputs.

     Uses the first model returned by ``detect_and_select_model`` for the
     transcription and reads the transcript back from the file produced by
     ``transcribe_audio``.

     Args:
         audio: The audio input handed over by the Gradio interface.

     Returns:
         A 4-tuple ``(language, dropdown_update, selected_model, transcription)``.
         On any failure the first element carries the error text, the dropdown
         is emptied, the selected model is ``""`` and the transcription slot
         holds a generic error message.
     """
     transcription_file = None
     try:
         language, model_options = detect_and_select_model(audio)
         selected_model = model_options[0]
         transcription_file = transcribe_audio(audio, selected_model)
         with open(transcription_file, "r", encoding="utf-8") as file:
             transcription_text = file.read()
         # gr.update works on both Gradio 3 and 4; the per-component
         # gr.Dropdown.update classmethod was removed in Gradio 4.
         dropdown = gr.update(choices=model_options, value=selected_model)
         return language, dropdown, selected_model, transcription_text
     except Exception as e:
         return str(e), gr.update(choices=[]), "", "An error occurred during processing."
     finally:
         # Clean up temporary files on success and failure alike. Removal is
         # best-effort: a file that was never created (or is already gone)
         # must not turn a successful transcription into an error.
         for leftover in (transcription_file, "converted_audio.wav"):
             if leftover is None:
                 continue
             try:
                 os.remove(leftover)
             except OSError:
                 pass
 iface = gr.Interface(
     fn=combined_interface,
         gr.Textbox(label="Selected Model"),
         gr.Textbox(label="Transcription", lines=10)
     ],
+    title="Multilingual Audio Transcriber (Latin American Spanish Optimized)",
+    description="Upload an audio file to detect the language, select the transcription model, and get the transcription. Optimized for Latin American Spanish detection."
 )
 if __name__ == "__main__":