Merlintxu committed on
Commit
fe4ae7f
·
verified ·
1 Parent(s): 5d729bc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -20
app.py CHANGED
@@ -65,18 +65,19 @@ def diarize_audio(wav_audio):
65
  diarization = pipeline(wav_audio)
66
  return diarization
67
 
68
- def transcribe_audio_stream(audio, model_name, diarization):
69
  wav_audio = convert_audio_to_wav(audio)
70
  speech, rate = librosa.load(wav_audio, sr=16000)
71
  duration = len(speech) / rate
72
 
 
 
73
  if "whisper" in model_name:
74
  processor = WhisperProcessor.from_pretrained(model_name)
75
  model = WhisperForConditionalGeneration.from_pretrained(model_name)
76
 
77
  chunk_duration = 30 # seconds
78
 
79
- transcriptions = []
80
  for i in range(0, int(duration), chunk_duration):
81
  end = min(i + chunk_duration, duration)
82
  chunk = speech[int(i * rate):int(end * rate)]
@@ -94,7 +95,6 @@ def transcribe_audio_stream(audio, model_name, diarization):
94
 
95
  chunk_duration = 10 # seconds
96
 
97
- transcriptions = []
98
  for i in range(0, int(duration), chunk_duration):
99
  end = min(i + chunk_duration, duration)
100
  chunk = speech[int(i * rate):int(end * rate)]
@@ -105,7 +105,7 @@ def transcribe_audio_stream(audio, model_name, diarization):
105
  transcriptions.append((timestamp, result["text"]))
106
  yield transcriptions, progress
107
 
108
- # Merge diarization results with transcription
109
  speaker_transcriptions = []
110
  for segment in diarization.itertracks(yield_label=True):
111
  start, end, speaker = segment
@@ -116,7 +116,6 @@ def transcribe_audio_stream(audio, model_name, diarization):
116
  if start_time <= ts <= end_time:
117
  text_segment += text + " "
118
  speaker_transcriptions.append((start_time, end_time, speaker, text_segment.strip()))
119
-
120
  return speaker_transcriptions
121
 
122
  def detect_and_select_model(audio):
@@ -127,39 +126,48 @@ def detect_and_select_model(audio):
127
 
128
  def save_transcription(transcriptions, file_format):
129
  if file_format == "txt":
130
- with open("transcription.txt", "w") as f:
 
131
  for start, end, speaker, text in transcriptions:
132
- f.write(f"[{start}-{end}] {speaker}: {text}\n")
133
- return "transcription.txt"
134
  elif file_format == "json":
135
- with open("transcription.json", "w") as f:
 
136
  json.dump(transcriptions, f)
137
- return "transcription.json"
138
 
139
  def combined_interface(audio):
140
  try:
141
  language, model_options = detect_and_select_model(audio)
142
  selected_model = model_options[0]
143
 
144
- yield language, model_options, selected_model, [], 0, "Initializing..."
145
 
146
  wav_audio = convert_audio_to_wav(audio)
147
  diarization = diarize_audio(wav_audio)
148
  transcriptions = []
149
- for partial_transcriptions, progress in transcribe_audio_stream(audio, selected_model, diarization):
 
150
  transcriptions = partial_transcriptions
151
- transcriptions_text = "\n".join([f"[{start}-{end}] {speaker}: {text}" for start, end, speaker, text in transcriptions])
152
  progress_int = math.floor(progress)
153
  status = f"Transcribing... {progress_int}% complete"
154
- yield language, model_options, selected_model, transcriptions_text, progress_int, status
 
 
 
 
155
 
156
- # Clean up temporary files
157
- os.remove("converted_audio.wav")
 
 
158
 
159
- yield language, model_options, selected_model, transcriptions_text, 100, "Transcription complete!"
160
 
161
  except Exception as e:
162
- yield str(e), [], "", "An error occurred during processing.", 0, "Error"
163
 
164
  iface = gr.Interface(
165
  fn=combined_interface,
@@ -171,8 +179,8 @@ iface = gr.Interface(
171
  gr.Textbox(label="Transcription", lines=10),
172
  gr.Slider(minimum=0, maximum=100, label="Progress", interactive=False),
173
  gr.Textbox(label="Status"),
174
- gr.File(label="Download Transcription (TXT)", type="filepath", interactive=True, value="transcription.txt"),
175
- gr.File(label="Download Transcription (JSON)", type="filepath", interactive=True, value="transcription.json")
176
  ],
177
  title="Multilingual Audio Transcriber with Real-time Display, Timestamps, and Speaker Diarization",
178
  description="Upload an audio file to detect the language, select the transcription model, and get the transcription with timestamps and speaker labels in real-time. Download the transcription as TXT or JSON. Optimized for Spanish, English, and Portuguese.",
 
65
  diarization = pipeline(wav_audio)
66
  return diarization
67
 
68
+ def transcribe_audio_stream(audio, model_name):
69
  wav_audio = convert_audio_to_wav(audio)
70
  speech, rate = librosa.load(wav_audio, sr=16000)
71
  duration = len(speech) / rate
72
 
73
+ transcriptions = []
74
+
75
  if "whisper" in model_name:
76
  processor = WhisperProcessor.from_pretrained(model_name)
77
  model = WhisperForConditionalGeneration.from_pretrained(model_name)
78
 
79
  chunk_duration = 30 # seconds
80
 
 
81
  for i in range(0, int(duration), chunk_duration):
82
  end = min(i + chunk_duration, duration)
83
  chunk = speech[int(i * rate):int(end * rate)]
 
95
 
96
  chunk_duration = 10 # seconds
97
 
 
98
  for i in range(0, int(duration), chunk_duration):
99
  end = min(i + chunk_duration, duration)
100
  chunk = speech[int(i * rate):int(end * rate)]
 
105
  transcriptions.append((timestamp, result["text"]))
106
  yield transcriptions, progress
107
 
108
+ def merge_diarization_with_transcription(transcriptions, diarization, rate):
109
  speaker_transcriptions = []
110
  for segment in diarization.itertracks(yield_label=True):
111
  start, end, speaker = segment
 
116
  if start_time <= ts <= end_time:
117
  text_segment += text + " "
118
  speaker_transcriptions.append((start_time, end_time, speaker, text_segment.strip()))
 
119
  return speaker_transcriptions
120
 
121
  def detect_and_select_model(audio):
 
126
 
127
  def save_transcription(transcriptions, file_format):
128
  if file_format == "txt":
129
+ file_path = "/tmp/transcription.txt"
130
+ with open(file_path, "w") as f:
131
  for start, end, speaker, text in transcriptions:
132
+ f.write(f"[{start:.2f}-{end:.2f}] {speaker}: {text}\n")
133
+ return file_path
134
  elif file_format == "json":
135
+ file_path = "/tmp/transcription.json"
136
+ with open(file_path, "w") as f:
137
  json.dump(transcriptions, f)
138
+ return file_path
139
 
140
  def combined_interface(audio):
141
  try:
142
  language, model_options = detect_and_select_model(audio)
143
  selected_model = model_options[0]
144
 
145
+ yield language, model_options, selected_model, "", 0, "Initializing...", None, None
146
 
147
  wav_audio = convert_audio_to_wav(audio)
148
  diarization = diarize_audio(wav_audio)
149
  transcriptions = []
150
+
151
+ for partial_transcriptions, progress in transcribe_audio_stream(audio, selected_model):
152
  transcriptions = partial_transcriptions
153
+ transcriptions_text = "\n".join([f"[{start}-{end}] {text}" for start, end, text in transcriptions])
154
  progress_int = math.floor(progress)
155
  status = f"Transcribing... {progress_int}% complete"
156
+ yield language, model_options, selected_model, transcriptions_text, progress_int, status, None, None
157
+
158
+ rate = librosa.get_samplerate(wav_audio)
159
+ speaker_transcriptions = merge_diarization_with_transcription(transcriptions, diarization, rate)
160
+ transcriptions_text = "\n".join([f"[{start:.2f}-{end:.2f}] {speaker}: {text}" for start, end, speaker, text in speaker_transcriptions])
161
 
162
+ txt_file_path = save_transcription(speaker_transcriptions, "txt")
163
+ json_file_path = save_transcription(speaker_transcriptions, "json")
164
+
165
+ os.remove(wav_audio)
166
 
167
+ yield language, model_options, selected_model, transcriptions_text, 100, "Transcription complete!", txt_file_path, json_file_path
168
 
169
  except Exception as e:
170
+ yield str(e), [], "", "An error occurred during processing.", 0, "Error", None, None
171
 
172
  iface = gr.Interface(
173
  fn=combined_interface,
 
179
  gr.Textbox(label="Transcription", lines=10),
180
  gr.Slider(minimum=0, maximum=100, label="Progress", interactive=False),
181
  gr.Textbox(label="Status"),
182
+ gr.File(label="Download Transcription (TXT)", type="filepath"),
183
+ gr.File(label="Download Transcription (JSON)", type="filepath")
184
  ],
185
  title="Multilingual Audio Transcriber with Real-time Display, Timestamps, and Speaker Diarization",
186
  description="Upload an audio file to detect the language, select the transcription model, and get the transcription with timestamps and speaker labels in real-time. Download the transcription as TXT or JSON. Optimized for Spanish, English, and Portuguese.",