Merlintxu committed
Commit 2b71965 · verified · 1 Parent(s): 7fdbed5

Update app.py

Files changed (1):
  1. app.py +62 -28

app.py CHANGED
@@ -1,24 +1,30 @@
 import gradio as gr
-from transformers import pipeline
+from transformers import pipeline, AutoModelForCTC, AutoProcessor
 import torch
 import librosa
 import subprocess
-from langdetect import detect
+from langdetect import detect_langs
 import os
+import numpy as np

-# Models by language
+# Updated models by language
 MODELS = {
     "es": [
-        "jonatasgrosman/wav2vec2-large-xlsr-53-spanish",
-        "openai/whisper-small",
-        "other/spanish-model"
+        "openai/whisper-large-v3",
+        "facebook/wav2vec2-large-xlsr-53-spanish",
+        "jonatasgrosman/wav2vec2-xls-r-1b-spanish"
     ],
     "en": [
+        "openai/whisper-large-v3",
         "facebook/wav2vec2-large-960h",
-        "openai/whisper-small",
-        "other/english-model"
+        "microsoft/wav2vec2-base-960"
     ],
-    # Add more models per language if needed
+    "pt": [
+        "facebook/wav2vec2-large-xlsr-53-portuguese",
+        "openai/whisper-medium",
+        "jonatasgrosman/wav2vec2-large-xlsr-53-portuguese"
+    ]
+    # Add more languages and models as needed
 }

 def convert_audio_to_wav(audio_path):
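The revised MODELS table adds openai/whisper-large-v3 and a Portuguese section; one entry, "microsoft/wav2vec2-base-960", does not match a checkpoint I can place on the Hub (the usual ID is "facebook/wav2vec2-base-960h"), so the configured IDs are worth verifying at startup. A minimal sketch, assuming huggingface_hub (already a transformers dependency); missing_models is a hypothetical helper, not part of the commit:

import huggingface_hub
from huggingface_hub.utils import RepositoryNotFoundError

def missing_models(models_by_lang):
    # Collect every configured checkpoint ID that does not exist on the Hub.
    missing = []
    for lang, model_ids in models_by_lang.items():
        for model_id in model_ids:
            try:
                huggingface_hub.model_info(model_id)  # raises if the repo is absent
            except RepositoryNotFoundError:
                missing.append((lang, model_id))
    return missing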
@@ -28,10 +34,36 @@ def convert_audio_to_wav(audio_path):
     return wav_path

 def detect_language(audio_path):
-    speech, _ = librosa.load(audio_path, sr=16000, duration=15)
-    transcriber = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-large-960h")
-    transcription = transcriber(speech)["text"]
-    return detect(transcription)
+    speech, _ = librosa.load(audio_path, sr=16000, duration=30)  # Increased duration for better detection
+
+    # Use multiple models for transcription to improve accuracy
+    transcriptions = []
+    models = ["facebook/wav2vec2-large-xlsr-53-spanish", "facebook/wav2vec2-large-xlsr-53-portuguese", "facebook/wav2vec2-large-960h"]
+
+    for model_name in models:
+        processor = AutoProcessor.from_pretrained(model_name)
+        model = AutoModelForCTC.from_pretrained(model_name)
+
+        inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
+        with torch.no_grad():
+            logits = model(inputs.input_values).logits
+        predicted_ids = torch.argmax(logits, dim=-1)
+        transcription = processor.batch_decode(predicted_ids)[0]
+        transcriptions.append(transcription)
+
+    # Combine transcriptions and detect language
+    combined_text = " ".join(transcriptions)
+    langs = detect_langs(combined_text)
+
+    # Check confidence levels
+    es_confidence = next((lang.prob for lang in langs if lang.lang == 'es'), 0)
+    pt_confidence = next((lang.prob for lang in langs if lang.lang == 'pt'), 0)
+
+    # If Spanish and Portuguese are close, prefer Spanish for Latin American content
+    if abs(es_confidence - pt_confidence) < 0.2:
+        return 'es'
+
+    return max(langs, key=lambda x: x.prob).lang

 def transcribe_audio(audio, model_name):
     wav_audio = convert_audio_to_wav(audio)
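The reworked detect_language transcribes the clip with three wav2vec2 checkpoints, joins the text, and compares langdetect's per-language probabilities, preferring Spanish whenever es and pt land within 0.2 of each other. A standalone illustration of that tie-break (the sample sentence is hypothetical); note that langdetect is non-deterministic unless DetectorFactory.seed is pinned:

from langdetect import DetectorFactory, detect_langs

DetectorFactory.seed = 0  # make detect_langs reproducible across runs

langs = detect_langs("hola como estas muito obrigado")  # mixed es/pt sample
es_prob = next((l.prob for l in langs if l.lang == "es"), 0)
pt_prob = next((l.prob for l in langs if l.lang == "pt"), 0)
choice = "es" if abs(es_prob - pt_prob) < 0.2 else max(langs, key=lambda l: l.prob).lang
print(langs, "->", choice)

Since AutoProcessor.from_pretrained and AutoModelForCTC.from_pretrained run inside the loop, every detect_language call reloads three large checkpoints; caching the loaded processor/model pairs (for example behind functools.lru_cache) would likely pay off on a Space.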
@@ -60,19 +92,21 @@ def detect_and_select_model(audio):
     return language, model_options

 def combined_interface(audio):
-    language, model_options = detect_and_select_model(audio)
-    selected_model = model_options[0]
-    transcription_file = transcribe_audio(audio, selected_model)
-
-    with open(transcription_file, "r", encoding="utf-8") as file:
-        transcription_text = file.read()
-
-    return {
-        "Detected Language": language,
-        "Available Models": model_options,
-        "Selected Model": selected_model,
-        "Transcription": transcription_text
-    }
+    try:
+        language, model_options = detect_and_select_model(audio)
+        selected_model = model_options[0]
+        transcription_file = transcribe_audio(audio, selected_model)
+
+        with open(transcription_file, "r", encoding="utf-8") as file:
+            transcription_text = file.read()
+
+        # Clean up temporary files
+        os.remove(transcription_file)
+        os.remove("converted_audio.wav")
+
+        return language, gr.Dropdown.update(choices=model_options, value=selected_model), selected_model, transcription_text
+    except Exception as e:
+        return str(e), gr.Dropdown.update(choices=[]), "", "An error occurred during processing."

 iface = gr.Interface(
     fn=combined_interface,
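gr.Dropdown.update(...) in combined_interface is the Gradio 3.x idiom; Gradio 4.x removed the per-component update class methods in favor of gr.update(...) or returning a fresh component. A minimal 4.x-compatible sketch, with success_outputs as a hypothetical wrapper rather than anything in this commit:

import gradio as gr

def success_outputs(language, model_options, selected_model, transcription_text):
    # 4.x-safe equivalent of gr.Dropdown.update(choices=..., value=...)
    return language, gr.update(choices=model_options, value=selected_model), selected_model, transcription_text

Separately, the cleanup step deletes the shared, hard-coded "converted_audio.wav", so two concurrent requests would race on that path; per-request temporary files (e.g. via tempfile.mkstemp) may be safer.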
@@ -83,8 +117,8 @@ iface = gr.Interface(
         gr.Textbox(label="Selected Model"),
         gr.Textbox(label="Transcription", lines=10)
     ],
-    title="Multilingual Audio Transcriber",
-    description="Upload an audio file to detect the language, select the transcription model, and get the transcription."
+    title="Multilingual Audio Transcriber (Latin American Spanish Optimized)",
+    description="Upload an audio file to detect the language, select the transcription model, and get the transcription. Optimized for Latin American Spanish detection."
 )

 if __name__ == "__main__":
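The hunk headers reference convert_audio_to_wav, but the diff only shows its signature and return. From the subprocess import, the fixed "converted_audio.wav" cleanup path, and the 16 kHz loads elsewhere, a plausible reconstruction looks like the sketch below; this is an assumption about the elided body, not the file's actual code:

import subprocess

def convert_audio_to_wav(audio_path):
    # Assumed shape: ffmpeg to 16 kHz mono WAV, matching the sr=16000
    # used by librosa.load and the cleanup path in combined_interface.
    wav_path = "converted_audio.wav"
    subprocess.run(
        ["ffmpeg", "-y", "-i", audio_path, "-ar", "16000", "-ac", "1", wav_path],
        check=True,
    )
    return wav_path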
 