Merlintxu committed on
Commit
7fdbed5
·
verified ·
1 Parent(s): 96f5004

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -44
app.py CHANGED
@@ -1,15 +1,24 @@
1
  import gradio as gr
2
- from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
3
  import torch
4
  import librosa
5
  import subprocess
6
  from langdetect import detect
 
7
 
8
- # Modelos por idioma
9
  MODELS = {
10
- "es": "jonatasgrosman/wav2vec2-large-xlsr-53-spanish",
11
- "en": "facebook/wav2vec2-large-960h", # Puedes añadir más modelos aquí según sea necesario
12
- # Añadir más modelos por idioma si es necesario
 
 
 
 
 
 
 
 
13
  }
14
 
15
  def convert_audio_to_wav(audio_path):
@@ -19,58 +28,64 @@ def convert_audio_to_wav(audio_path):
19
  return wav_path
20
 
21
  def detect_language(audio_path):
22
- # Cargar los primeros 15 segundos del audio
23
  speech, _ = librosa.load(audio_path, sr=16000, duration=15)
24
- # Convertir audio a texto usando el modelo inglés como predeterminado para detección
25
- processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
26
- model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
27
- input_values = processor(speech, return_tensors="pt", sampling_rate=16000).input_values
28
- with torch.no_grad():
29
- logits = model(input_values).logits
30
- predicted_ids = torch.argmax(logits, dim=-1)
31
- transcription = processor.batch_decode(predicted_ids)[0]
32
  return detect(transcription)
33
 
34
- def transcribe_audio(audio):
35
- # Convertir audio a formato WAV
36
  wav_audio = convert_audio_to_wav(audio)
 
37
 
38
- # Detectar el idioma del audio
39
- language = detect_language(wav_audio)
40
- model_name = MODELS.get(language, "facebook/wav2vec2-large-960h") # Modelo predeterminado en caso de que no se detecte el idioma
41
-
42
- # Cargar el modelo y el procesador adecuados
43
- processor = Wav2Vec2Processor.from_pretrained(model_name)
44
- model = Wav2Vec2ForCTC.from_pretrained(model_name)
45
-
46
- # Cargar el audio completo
47
  speech, rate = librosa.load(wav_audio, sr=16000)
 
 
 
 
 
 
 
48
 
49
- # Procesar el audio
50
- input_values = processor(speech, return_tensors="pt", sampling_rate=rate).input_values
51
- # Generar las predicciones (logits)
52
- with torch.no_grad():
53
- logits = model(input_values).logits
54
 
55
- # Obtener las predicciones (tokens) y convertirlas en texto
56
- predicted_ids = torch.argmax(logits, dim=-1)
57
- transcription = processor.batch_decode(predicted_ids)[0]
 
 
 
 
 
 
 
 
 
58
 
59
- # Guardar la transcripción en un archivo de texto
60
- with open("transcription.txt", "w") as file:
61
- file.write(transcription)
62
 
63
- return "transcription.txt"
 
 
 
 
 
64
 
65
- # Configurar la interfaz de Gradio
66
  iface = gr.Interface(
67
- fn=transcribe_audio,
68
  inputs=gr.Audio(type="filepath"),
69
- outputs=gr.File(),
70
- title="Transcriptor de Audio Multilingüe",
71
- description="Sube un archivo de audio y obtén la transcripción en un archivo de texto."
 
 
 
 
 
72
  )
73
 
74
- # Iniciar la interfaz
75
  if __name__ == "__main__":
76
- iface.launch()
 
1
  import gradio as gr
2
+ from transformers import pipeline
3
  import torch
4
  import librosa
5
  import subprocess
6
  from langdetect import detect
7
+ import os
8
 
9
+ # Models by language
10
# Candidate speech-recognition checkpoints, keyed by detected language code.
# Lists are ordered by preference: index 0 is the model used by default.
# NOTE(review): the earlier "other/spanish-model" / "other/english-model"
# entries were placeholders rather than real Hugging Face Hub repositories
# and would fail to load if selected, so they are intentionally omitted.
MODELS = {
    "es": [
        "jonatasgrosman/wav2vec2-large-xlsr-53-spanish",
        "openai/whisper-small",
    ],
    "en": [
        "facebook/wav2vec2-large-960h",
        "openai/whisper-small",
    ],
    # Add more models per language if needed
}
23
 
24
  def convert_audio_to_wav(audio_path):
 
28
  return wav_path
29
 
30
def detect_language(audio_path):
    """Guess the spoken language of an audio file.

    The first 15 seconds of audio are transcribed with the English
    ``facebook/wav2vec2-large-960h`` checkpoint and the resulting text is
    handed to ``langdetect.detect``.

    Args:
        audio_path: Path to an audio file that ``librosa.load`` can read.

    Returns:
        The language code reported by ``langdetect.detect``, or the string
        ``"unknown"`` when the transcription is empty (for example silence,
        or speech the English model cannot decode into any text).
    """
    # Load only a short prefix, resampled to 16 kHz.
    speech, _ = librosa.load(audio_path, sr=16000, duration=15)
    transcriber = pipeline(
        "automatic-speech-recognition",
        model="facebook/wav2vec2-large-960h",
    )
    transcription = transcriber(speech)["text"]

    # langdetect raises when the text has no usable features; an empty
    # transcript is the common way to hit that, so report it explicitly
    # instead of letting the exception abort the request.
    if not transcription.strip():
        return "unknown"
    return detect(transcription)
35
 
36
def _chunk_bounds(total_samples, chunk_samples, min_tail_samples):
    """Return ``(start, end)`` sample ranges that cover the whole signal."""
    bounds = [
        (start, min(start + chunk_samples, total_samples))
        for start in range(0, total_samples, chunk_samples)
    ]
    # A very short trailing piece is merged into the previous chunk so that
    # no audio is dropped and no tiny fragment is sent to the model.
    if len(bounds) > 1 and bounds[-1][1] - bounds[-1][0] < min_tail_samples:
        _, tail_end = bounds.pop()
        prev_start, _ = bounds.pop()
        bounds.append((prev_start, tail_end))
    return bounds


def transcribe_audio(audio, model_name, chunk_duration=30):
    """Transcribe an audio file chunk by chunk and save the text to disk.

    Args:
        audio: Path to the input audio file.
        model_name: Model identifier passed to the
            ``automatic-speech-recognition`` pipeline.
        chunk_duration: Length of each chunk in seconds. Defaults to 30.

    Returns:
        The path of the UTF-8 text file (``"transcription.txt"``) that
        holds the transcript.

    Raises:
        ValueError: If ``chunk_duration`` is not positive.
    """
    if chunk_duration <= 0:
        raise ValueError("chunk_duration must be positive")

    wav_audio = convert_audio_to_wav(audio)
    transcriber = pipeline("automatic-speech-recognition", model=model_name)

    speech, rate = librosa.load(wav_audio, sr=16000)

    # Chunk by sample index rather than by whole seconds: stepping over
    # int(duration) silently dropped clips shorter than one second and the
    # fractional tail of recordings whose length in whole seconds was a
    # multiple of the chunk size.
    chunk_samples = max(1, int(chunk_duration * rate))
    pieces = [
        transcriber(speech[start:end])["text"]
        for start, end in _chunk_bounds(len(speech), chunk_samples, int(rate))
    ]
    transcription = " ".join(pieces)

    output_file = "transcription.txt"
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(transcription.strip())

    return output_file
55
+
56
def detect_and_select_model(audio):
    """Detect the language of ``audio`` and look up its candidate models.

    Args:
        audio: Path to the input audio file.

    Returns:
        A ``(language, model_options)`` pair, where ``model_options`` is the
        ``MODELS`` entry for the detected language, or the English entry
        when that language has none.
    """
    # Resolve the fallback first; it is used for any language not in MODELS.
    english_models = MODELS["en"]
    detected = detect_language(convert_audio_to_wav(audio))
    candidates = MODELS.get(detected, english_models)
    return detected, candidates
61
+
62
def combined_interface(audio):
    """Gradio callback: detect the language, pick a model and transcribe.

    Args:
        audio: Path to the uploaded audio file.

    Returns:
        A 4-tuple with one value per output component of the interface, in
        order: the detected language, an update for the model dropdown
        (its choices plus the selected value), the selected model name, and
        the transcription text.
    """
    language, model_options = detect_and_select_model(audio)
    # The first entry for a language is used as the default model.
    selected_model = model_options[0]
    transcription_file = transcribe_audio(audio, selected_model)

    with open(transcription_file, "r", encoding="utf-8") as file:
        transcription_text = file.read()

    # gr.Interface maps return values to its output components by position.
    # A dict keyed by label strings is not accepted, and the dropdown needs
    # its choices populated, not just a value.
    return (
        language,
        gr.update(choices=model_options, value=selected_model),
        selected_model,
        transcription_text,
    )
76
 
 
77
# Gradio UI: a single audio upload as input and four result components
# (language, model dropdown, chosen model, transcription) as outputs.
iface = gr.Interface(
    title="Multilingual Audio Transcriber",
    description="Upload an audio file to detect the language, select the transcription model, and get the transcription.",
    fn=combined_interface,
    inputs=gr.Audio(type="filepath"),
    outputs=[
        gr.Textbox(label="Detected Language"),
        gr.Dropdown(label="Available Models", choices=[]),
        gr.Textbox(label="Selected Model"),
        gr.Textbox(label="Transcription", lines=10),
    ],
)


# Launch the interface only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()