Merlintxu committed (verified)
Commit d2e9f55 · Parent(s): c55c408

Update app.py

Files changed (1): app.py (+56 -13)
app.py CHANGED
@@ -8,6 +8,8 @@ import os
 import warnings
 from transformers import logging
 import math
+import json
+from pyannote.audio import Pipeline
 
 # Suppress warnings
 warnings.filterwarnings("ignore")
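Note: the "pyannote/speaker-diarization" pipeline pulled in here is gated on the Hugging Face Hub, so `Pipeline.from_pretrained` normally needs an access token. A minimal sketch (the `HF_TOKEN` variable is illustrative, not part of this commit):

    import os
    from pyannote.audio import Pipeline

    # Hypothetical: token stored in the Space's environment/secrets.
    HF_TOKEN = os.environ.get("HF_TOKEN")

    # Without a valid token, downloading the gated pipeline fails.
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization",
        use_auth_token=HF_TOKEN,
    )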
@@ -58,7 +60,12 @@ def detect_language(audio_path):
 
     return max(langs, key=lambda x: x.prob).lang
 
-def transcribe_audio_stream(audio, model_name):
+def diarize_audio(wav_audio):
+    pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization")
+    diarization = pipeline(wav_audio)
+    return diarization
+
+def transcribe_audio_stream(audio, model_name, diarization):
     wav_audio = convert_audio_to_wav(audio)
     speech, rate = librosa.load(wav_audio, sr=16000)
     duration = len(speech) / rate
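Note: as written, `diarize_audio` re-instantiates the pretrained pipeline on every request, which dominates latency. A sketch of one way to load it once per process (not part of this commit):

    from functools import lru_cache
    from pyannote.audio import Pipeline

    @lru_cache(maxsize=1)
    def get_diarization_pipeline():
        # Loaded on first use, then reused for every subsequent call.
        return Pipeline.from_pretrained("pyannote/speaker-diarization")

    def diarize_audio(wav_audio):
        return get_diarization_pipeline()(wav_audio)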
@@ -69,6 +76,7 @@ def transcribe_audio_stream(audio, model_name):
 
         chunk_duration = 30 # seconds
 
+        transcriptions = []
         for i in range(0, int(duration), chunk_duration):
             end = min(i + chunk_duration, duration)
             chunk = speech[int(i * rate):int(end * rate)]
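For reference, the slicing here works in samples, not seconds. A self-contained example of the arithmetic (synthetic signal, values chosen for illustration):

    import numpy as np

    rate = 16000
    duration = 45.0  # a hypothetical 45-second file
    speech = np.zeros(int(duration * rate))

    chunk_duration = 30  # seconds
    for i in range(0, int(duration), chunk_duration):
        end = min(i + chunk_duration, duration)
        chunk = speech[int(i * rate):int(end * rate)]
        print(i, end, len(chunk))  # 0 30 480000, then 30 45.0 240000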
@@ -78,19 +86,38 @@ def transcribe_audio_stream(audio, model_name):
             transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
 
             progress = min(100, (end / duration) * 100)
-            yield transcription, progress
+            timestamp = i
+            transcriptions.append((timestamp, transcription))
+            yield transcriptions, progress
     else:
         transcriber = pipeline("automatic-speech-recognition", model=model_name)
 
         chunk_duration = 10 # seconds
 
+        transcriptions = []
         for i in range(0, int(duration), chunk_duration):
             end = min(i + chunk_duration, duration)
             chunk = speech[int(i * rate):int(end * rate)]
             result = transcriber(chunk)
 
             progress = min(100, (end / duration) * 100)
-            yield result["text"], progress
+            timestamp = i
+            transcriptions.append((timestamp, result["text"]))
+            yield transcriptions, progress
+
+    # Merge diarization results with transcription
+    speaker_transcriptions = []
+    for segment in diarization.itertracks(yield_label=True):
+        start, end, speaker = segment
+        start_time = start / rate
+        end_time = end / rate
+        text_segment = ""
+        for ts, text in transcriptions:
+            if start_time <= ts <= end_time:
+                text_segment += text + " "
+        speaker_transcriptions.append((start_time, end_time, speaker, text_segment.strip()))
+
+    return speaker_transcriptions
 
 def detect_and_select_model(audio):
     wav_audio = convert_audio_to_wav(audio)
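Review note: `itertracks(yield_label=True)` yields `(segment, track, label)` triples, and `segment.start`/`segment.end` are already expressed in seconds, so the two-element unpack above assigns the wrong values and the division by `rate` fails at runtime. A corrected sketch of the merge loop (same variable names as the commit):

    # Merge diarization turns with the (timestamp, text) chunk list.
    speaker_transcriptions = []
    for segment, _track, speaker in diarization.itertracks(yield_label=True):
        start_time = segment.start  # seconds
        end_time = segment.end      # seconds
        text_segment = " ".join(
            text for ts, text in transcriptions if start_time <= ts <= end_time
        )
        speaker_transcriptions.append((start_time, end_time, speaker, text_segment))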
@@ -98,24 +125,38 @@ def detect_and_select_model(audio):
     model_options = MODELS.get(language, MODELS["en"])
     return language, model_options
 
+def save_transcription(transcriptions, file_format):
+    if file_format == "txt":
+        with open("transcription.txt", "w") as f:
+            for start, end, speaker, text in transcriptions:
+                f.write(f"[{start}-{end}] {speaker}: {text}\n")
+        return "transcription.txt"
+    elif file_format == "json":
+        with open("transcription.json", "w") as f:
+            json.dump(transcriptions, f)
+        return "transcription.json"
+
 def combined_interface(audio):
     try:
         language, model_options = detect_and_select_model(audio)
         selected_model = model_options[0]
 
-        yield language, model_options, selected_model, "", 0, "Initializing..."
+        yield language, model_options, selected_model, [], 0, "Initializing..."
 
-        full_transcription = ""
-        for partial_transcription, progress in transcribe_audio_stream(audio, selected_model):
-            full_transcription += partial_transcription + " "
+        wav_audio = convert_audio_to_wav(audio)
+        diarization = diarize_audio(wav_audio)
+        transcriptions = []
+        for partial_transcriptions, progress in transcribe_audio_stream(audio, selected_model, diarization):
+            transcriptions = partial_transcriptions
+            transcriptions_text = "\n".join([f"[{start}-{end}] {speaker}: {text}" for start, end, speaker, text in transcriptions])
             progress_int = math.floor(progress)
             status = f"Transcribing... {progress_int}% complete"
-            yield language, model_options, selected_model, full_transcription.strip(), progress_int, status
+            yield language, model_options, selected_model, transcriptions_text, progress_int, status
 
         # Clean up temporary files
         os.remove("converted_audio.wav")
 
-        yield language, model_options, selected_model, full_transcription.strip(), 100, "Transcription complete!"
+        yield language, model_options, selected_model, transcriptions_text, 100, "Transcription complete!"
 
     except Exception as e:
         yield str(e), [], "", "An error occurred during processing.", 0, "Error"
@@ -129,12 +170,14 @@ iface = gr.Interface(
         gr.Textbox(label="Selected Model"),
         gr.Textbox(label="Transcription", lines=10),
         gr.Slider(minimum=0, maximum=100, label="Progress", interactive=False),
-        gr.Textbox(label="Status")
+        gr.Textbox(label="Status"),
+        gr.File(label="Download Transcription (TXT)", type="file", interactive=True, value="transcription.txt"),
+        gr.File(label="Download Transcription (JSON)", type="file", interactive=True, value="transcription.json")
     ],
-    title="Multilingual Audio Transcriber with Real-time Display and Progress Indicator",
-    description="Upload an audio file to detect the language, select the transcription model, and get the transcription in real-time. Optimized for Spanish, English, and Portuguese.",
+    title="Multilingual Audio Transcriber with Real-time Display, Timestamps, and Speaker Diarization",
+    description="Upload an audio file to detect the language, select the transcription model, and get the transcription with timestamps and speaker labels in real-time. Download the transcription as TXT or JSON. Optimized for Spanish, English, and Portuguese.",
     live=True
 )
 
 if __name__ == "__main__":
-    iface.queue().launch()
+    iface.queue().launch()
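Review note: the interface now declares eight output components, but every `yield` in `combined_interface` (including the error path) still produces six values, and both `gr.File` components point at files that only exist after `save_transcription` has run; depending on the Gradio version, `type="file"` may also need to be `type="filepath"`. A sketch of yields that match the component list (`txt_path`/`json_path` assumed to come from `save_transcription`):

    # While transcribing: no files yet, so fill the gr.File slots with None.
    yield (language, model_options, selected_model, transcriptions_text,
           progress_int, status, None, None)

    # On completion: include the paths written by save_transcription.
    yield (language, model_options, selected_model, transcriptions_text,
           100, "Transcription complete!", txt_path, json_path)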
 