Merlintxu committed
Commit df609a3 · verified · 1 Parent(s): 2b71965

Update app.py

Files changed (1):
  1. app.py +56 -47
app.py CHANGED
@@ -1,11 +1,17 @@
 import gradio as gr
-from transformers import pipeline, AutoModelForCTC, AutoProcessor
+from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
 import torch
 import librosa
 import subprocess
 from langdetect import detect_langs
 import os
-import numpy as np
+import warnings
+from transformers import logging
+
+# Suppress warnings
+warnings.filterwarnings("ignore")
+logging.set_verbosity_error()
+
 
 # Updated models by language
 MODELS = {
@@ -34,56 +40,57 @@ def convert_audio_to_wav(audio_path):
     return wav_path
 
 def detect_language(audio_path):
-    speech, _ = librosa.load(audio_path, sr=16000, duration=30)  # Increased duration for better detection
+    speech, _ = librosa.load(audio_path, sr=16000, duration=30)
 
-    # Use multiple models for transcription to improve accuracy
-    transcriptions = []
-    models = ["facebook/wav2vec2-large-xlsr-53-spanish", "facebook/wav2vec2-large-xlsr-53-portuguese", "facebook/wav2vec2-large-960h"]
+    processor = WhisperProcessor.from_pretrained("openai/whisper-base")
+    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
 
-    for model_name in models:
-        processor = AutoProcessor.from_pretrained(model_name)
-        model = AutoModelForCTC.from_pretrained(model_name)
-
-        inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
-        with torch.no_grad():
-            logits = model(inputs.input_values).logits
-        predicted_ids = torch.argmax(logits, dim=-1)
-        transcription = processor.batch_decode(predicted_ids)[0]
-        transcriptions.append(transcription)
+    input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
+    predicted_ids = model.generate(input_features)
+    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
 
-    # Combine transcriptions and detect language
-    combined_text = " ".join(transcriptions)
-    langs = detect_langs(combined_text)
+    langs = detect_langs(transcription)
 
-    # Check confidence levels
     es_confidence = next((lang.prob for lang in langs if lang.lang == 'es'), 0)
     pt_confidence = next((lang.prob for lang in langs if lang.lang == 'pt'), 0)
 
-    # If Spanish and Portuguese are close, prefer Spanish for Latin American content
     if abs(es_confidence - pt_confidence) < 0.2:
         return 'es'
 
     return max(langs, key=lambda x: x.prob).lang
 
-def transcribe_audio(audio, model_name):
+def transcribe_audio_stream(audio, model_name):
     wav_audio = convert_audio_to_wav(audio)
-    transcriber = pipeline("automatic-speech-recognition", model=model_name)
-
-    chunk_duration = 30  # seconds
-    speech, rate = librosa.load(wav_audio, sr=16000)
-    duration = len(speech) / rate
-
-    transcription = ""
-    for i in range(0, int(duration), chunk_duration):
-        end = min(i + chunk_duration, duration)
-        chunk = speech[int(i * rate):int(end * rate)]
-        transcription += transcriber(chunk)["text"] + " "
-
-    output_file = "transcription.txt"
-    with open(output_file, "w", encoding="utf-8") as file:
-        file.write(transcription.strip())
 
-    return output_file
+    if "whisper" in model_name:
+        processor = WhisperProcessor.from_pretrained(model_name)
+        model = WhisperForConditionalGeneration.from_pretrained(model_name)
+
+        chunk_duration = 30  # seconds
+        speech, rate = librosa.load(wav_audio, sr=16000)
+        duration = len(speech) / rate
+
+        for i in range(0, int(duration), chunk_duration):
+            end = min(i + chunk_duration, duration)
+            chunk = speech[int(i * rate):int(end * rate)]
+
+            input_features = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features
+            predicted_ids = model.generate(input_features)
+            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+
+            yield transcription
+    else:
+        transcriber = pipeline("automatic-speech-recognition", model=model_name)
+
+        chunk_duration = 10  # seconds
+        speech, rate = librosa.load(wav_audio, sr=16000)
+        duration = len(speech) / rate
+
+        for i in range(0, int(duration), chunk_duration):
+            end = min(i + chunk_duration, duration)
+            chunk = speech[int(i * rate):int(end * rate)]
+            result = transcriber(chunk)
+            yield result["text"]
 
 def detect_and_select_model(audio):
     wav_audio = convert_audio_to_wav(audio)
@@ -95,18 +102,19 @@ def combined_interface(audio):
     try:
         language, model_options = detect_and_select_model(audio)
         selected_model = model_options[0]
-        transcription_file = transcribe_audio(audio, selected_model)
 
-        with open(transcription_file, "r", encoding="utf-8") as file:
-            transcription_text = file.read()
+        yield language, gr.Dropdown.update(choices=model_options, value=selected_model), selected_model, ""
+
+        full_transcription = ""
+        for partial_transcription in transcribe_audio_stream(audio, selected_model):
+            full_transcription += partial_transcription + " "
+            yield language, gr.Dropdown.update(choices=model_options, value=selected_model), selected_model, full_transcription.strip()
 
         # Clean up temporary files
-        os.remove(transcription_file)
         os.remove("converted_audio.wav")
 
-        return language, gr.Dropdown.update(choices=model_options, value=selected_model), selected_model, transcription_text
     except Exception as e:
-        return str(e), gr.Dropdown.update(choices=[]), "", "An error occurred during processing."
+        yield str(e), gr.Dropdown.update(choices=[]), "", "An error occurred during processing."
 
 iface = gr.Interface(
     fn=combined_interface,
@@ -117,9 +125,10 @@ iface = gr.Interface(
         gr.Textbox(label="Selected Model"),
         gr.Textbox(label="Transcription", lines=10)
     ],
-    title="Multilingual Audio Transcriber (Latin American Spanish Optimized)",
-    description="Upload an audio file to detect the language, select the transcription model, and get the transcription. Optimized for Latin American Spanish detection."
+    title="Multilingual Audio Transcriber with Real-time Display",
+    description="Upload an audio file to detect the language, select the transcription model, and get the transcription in real-time. Optimized for Spanish and English.",
+    live=True
 )
 
 if __name__ == "__main__":
-    iface.launch()
+    iface.queue().launch()
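The switch from return to yield in combined_interface relies on Gradio's support for generator functions, which only stream intermediate updates to the UI when the queue is enabled; hence the new iface.queue().launch(). A minimal sketch of the same pattern, independent of this app (count_up and its parameters are illustrative, not part of the commit):

import time
import gradio as gr

def count_up(n):
    # Each yield pushes an intermediate value to the output Textbox,
    # just as combined_interface yields partial transcriptions.
    text = ""
    for i in range(int(n)):
        time.sleep(0.5)
        text += f"{i} "
        yield text.strip()

demo = gr.Interface(fn=count_up, inputs=gr.Number(value=5), outputs=gr.Textbox())

if __name__ == "__main__":
    demo.queue().launch()  # queue() is required for generator-based outputs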
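convert_audio_to_wav is untouched by this commit, so the diff only shows its tail (return wav_path). Given the subprocess import and the cleanup of "converted_audio.wav" in combined_interface, it plausibly wraps ffmpeg along these lines (a hypothetical reconstruction, not the committed code):

import subprocess

def convert_audio_to_wav(audio_path):
    wav_path = "converted_audio.wav"
    # -y: overwrite an existing file; -ar 16000 -ac 1: resample to the
    # 16 kHz mono input that the ASR models above expect
    subprocess.run(
        ["ffmpeg", "-y", "-i", audio_path, "-ar", "16000", "-ac", "1", wav_path],
        check=True,
        capture_output=True,
    )
    return wav_path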
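A quick way to exercise the new streaming path outside the UI is to iterate the generator directly. A hypothetical smoke test, assuming app.py is importable and sample.mp3 is a stand-in for any local audio file:

from app import transcribe_audio_stream

# "openai/whisper-base" contains "whisper", so this exercises the Whisper branch.
for i, partial in enumerate(transcribe_audio_stream("sample.mp3", "openai/whisper-base")):
    print(f"chunk {i}: {partial}")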