Spaces:

Merlintxu
/

Wav2Txt

Sleeping

App Files Files Community

Merlintxu committed on Jul 7, 2024

Commit

7fdbed5

verified ·

1 Parent(s): 96f5004

Update app.py

Browse files

Files changed (1) hide show

app.py +59 -44

app.py CHANGED Viewed

@@ -1,15 +1,24 @@
 import gradio as gr
-from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 import torch
 import librosa
 import subprocess
 from langdetect import detect
-# Modelos por idioma
 MODELS = {
-    "es": "jonatasgrosman/wav2vec2-large-xlsr-53-spanish",
-    "en": "facebook/wav2vec2-large-960h",  # Puedes añadir más modelos aquí según sea necesario
-    # Añadir más modelos por idioma si es necesario
 }
 def convert_audio_to_wav(audio_path):
@@ -19,58 +28,64 @@ def convert_audio_to_wav(audio_path):
     return wav_path
 def detect_language(audio_path):
-    # Cargar los primeros 15 segundos del audio
     speech, _ = librosa.load(audio_path, sr=16000, duration=15)
-    # Convertir audio a texto usando el modelo inglés como predeterminado para detección
-    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
-    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
-    input_values = processor(speech, return_tensors="pt", sampling_rate=16000).input_values
-    with torch.no_grad():
-        logits = model(input_values).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-    transcription = processor.batch_decode(predicted_ids)[0]
     return detect(transcription)
-def transcribe_audio(audio):
-    # Convertir audio a formato WAV
     wav_audio = convert_audio_to_wav(audio)
-    # Detectar el idioma del audio
-    language = detect_language(wav_audio)
-    model_name = MODELS.get(language, "facebook/wav2vec2-large-960h")  # Modelo predeterminado en caso de que no se detecte el idioma
-    # Cargar el modelo y el procesador adecuados
-    processor = Wav2Vec2Processor.from_pretrained(model_name)
-    model = Wav2Vec2ForCTC.from_pretrained(model_name)
-    # Cargar el audio completo
     speech, rate = librosa.load(wav_audio, sr=16000)
-    # Procesar el audio
-    input_values = processor(speech, return_tensors="pt", sampling_rate=rate).input_values
-    # Generar las predicciones (logits)
-    with torch.no_grad():
-        logits = model(input_values).logits
-    # Obtener las predicciones (tokens) y convertirlas en texto
-    predicted_ids = torch.argmax(logits, dim=-1)
-    transcription = processor.batch_decode(predicted_ids)[0]
-    # Guardar la transcripción en un archivo de texto
-    with open("transcription.txt", "w") as file:
-        file.write(transcription)
-    return "transcription.txt"
-# Configurar la interfaz de Gradio
 iface = gr.Interface(
-    fn=transcribe_audio,
     inputs=gr.Audio(type="filepath"),
-    outputs=gr.File(),
-    title="Transcriptor de Audio Multilingüe",
-    description="Sube un archivo de audio y obtén la transcripción en un archivo de texto."
 )
-# Iniciar la interfaz
 if __name__ == "__main__":
-    iface.launch()

 import gradio as gr
+from transformers import pipeline
 import torch
 import librosa
 import subprocess
 from langdetect import detect
+import os
# Candidate speech-recognition checkpoints, keyed by language code.
# Each list is ordered; the first entry is the one picked by default.
# NOTE(review): the "other/..." entries look like placeholder IDs — confirm
# they resolve on the Hugging Face Hub before exposing them to users.
MODELS = dict(
    es=[
        "jonatasgrosman/wav2vec2-large-xlsr-53-spanish",
        "openai/whisper-small",
        "other/spanish-model",
    ],
    en=[
        "facebook/wav2vec2-large-960h",
        "openai/whisper-small",
        "other/english-model",
    ],
    # Add more models per language if needed
)
 def convert_audio_to_wav(audio_path):
     return wav_path
 def detect_language(audio_path):
     speech, _ = librosa.load(audio_path, sr=16000, duration=15)
+    transcriber = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-large-960h")
+    transcription = transcriber(speech)["text"]
     return detect(transcription)
def transcribe_audio(audio, model_name, chunk_duration=30):
    """Transcribe an audio file in fixed-length chunks and save the text.

    The audio is converted to WAV, loaded at 16 kHz, and split into
    consecutive chunks of ``chunk_duration`` seconds. Each chunk is run
    through an ``automatic-speech-recognition`` pipeline and the pieces are
    joined with single spaces.

    Args:
        audio: Path to the input audio file.
        model_name: Model identifier passed to ``transformers.pipeline``.
        chunk_duration: Chunk length in seconds (default 30).

    Returns:
        The path of the UTF-8 text file holding the transcription
        (``"transcription.txt"`` in the current working directory).

    Raises:
        ValueError: If ``chunk_duration`` is not positive.
    """
    if chunk_duration <= 0:
        raise ValueError("chunk_duration must be positive")

    wav_audio = convert_audio_to_wav(audio)
    transcriber = pipeline("automatic-speech-recognition", model=model_name)
    speech, rate = librosa.load(wav_audio, sr=16000)

    total_samples = len(speech)
    chunk_samples = max(1, int(chunk_duration * rate))

    # Walk the signal by sample offset so that no trailing audio is lost to
    # rounding the duration down to whole seconds.
    starts = list(range(0, total_samples, chunk_samples))
    # A tail shorter than one second is folded into the preceding chunk
    # rather than sent to the model on its own.
    if len(starts) > 1 and total_samples - starts[-1] < rate:
        starts.pop()
    ends = starts[1:] + [total_samples]

    pieces = [
        transcriber(speech[begin:end])["text"]
        for begin, end in zip(starts, ends)
    ]
    transcription = " ".join(pieces)

    output_file = "transcription.txt"
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(transcription.strip())
    return output_file
def detect_and_select_model(audio):
    """Detect the language of an audio file and look up its model list.

    Args:
        audio: Path to the input audio file.

    Returns:
        A ``(language, model_options)`` tuple, where ``model_options`` is
        the ``MODELS`` entry for the detected language, or the ``"en"``
        entry when the language has no entry of its own.
    """
    converted_path = convert_audio_to_wav(audio)
    detected = detect_language(converted_path)
    english_fallback = MODELS["en"]
    if detected in MODELS:
        candidates = MODELS[detected]
    else:
        candidates = english_fallback
    return detected, candidates
def combined_interface(audio):
    """Gradio callback: detect the language, transcribe, and report results.

    The first model listed for the detected language is used for the
    transcription.

    Args:
        audio: Path to the uploaded audio file.

    Returns:
        A 4-tuple with one value per output component, in the order the
        components are declared on the interface: detected language,
        an update for the "Available Models" dropdown, selected model name,
        and transcription text.
    """
    language, model_options = detect_and_select_model(audio)
    selected_model = model_options[0]
    transcription_file = transcribe_audio(audio, selected_model)
    with open(transcription_file, "r", encoding="utf-8") as file:
        transcription_text = file.read()

    # Gradio maps a tuple onto the output components positionally; a dict
    # keyed by label strings is not matched to them. The dropdown is declared
    # with no choices, so its choices and value are set through an update.
    return (
        language,
        gr.update(choices=model_options, value=selected_model),
        selected_model,
        transcription_text,
    )
 iface = gr.Interface(
+    fn=combined_interface,
     inputs=gr.Audio(type="filepath"),
+    outputs=[
+        gr.Textbox(label="Detected Language"),
+        gr.Dropdown(label="Available Models", choices=[]),
+        gr.Textbox(label="Selected Model"),
+        gr.Textbox(label="Transcription", lines=10)
+    ],
+    title="Multilingual Audio Transcriber",
+    description="Upload an audio file to detect the language, select the transcription model, and get the transcription."
 )
 if __name__ == "__main__":
+    iface.launch()