import json
from functools import lru_cache

import gradio as gr
from faster_whisper import WhisperModel  # Assuming you have installed this library


def _make_subtitle(words):
    """Build one subtitle-line dict from a non-empty list of word dicts."""
    return {
        "word": " ".join(item["word"] for item in words),
        "start": words[0]["start"],
        "end": words[-1]["end"],
        "textcontents": words,
    }


def split_text_into_lines(data, max_chars, max_duration, max_gap):
    """Group word-level timestamps into subtitle lines.

    Args:
        data: Sequence of dicts, each with the keys ``"word"``, ``"start"``
            and ``"end"``.
        max_chars: A line is closed once its space-joined text is longer
            than this many characters.
        max_duration: A line is closed once the summed duration
            (``end - start``) of its words exceeds this value. Silences
            between words are not counted.
        max_gap: If the silence between the previous word's ``end`` and the
            current word's ``start`` exceeds this value, the current word
            starts a new line.

    Returns:
        A list of dicts with the keys ``"word"`` (space-joined text of the
        line), ``"start"``, ``"end"`` and ``"textcontents"`` (the word dicts
        that make up the line).

    Note:
        ``max_chars`` and ``max_duration`` are soft limits: the word that
        pushes a line over the limit stays on that line, and the line is
        closed right after it.
    """
    subtitles = []
    line = []
    line_duration = 0

    for idx, word_data in enumerate(data):
        # Check the gap *before* appending: a long silence must end the
        # current line so that this word opens the next one instead of being
        # glued to the end of the line that precedes the silence.
        # `line` being non-empty implies idx > 0.
        if line and word_data["start"] - data[idx - 1]["end"] > max_gap:
            subtitles.append(_make_subtitle(line))
            line = []
            line_duration = 0

        line.append(word_data)
        line_duration += word_data["end"] - word_data["start"]

        text = " ".join(item["word"] for item in line)
        if line_duration > max_duration or len(text) > max_chars:
            subtitles.append(_make_subtitle(line))
            line = []
            line_duration = 0

    # Flush whatever is left after the last word.
    if line:
        subtitles.append(_make_subtitle(line))

    return subtitles


@lru_cache(maxsize=1)
def _load_model(model_size):
    """Create the WhisperModel once and reuse it across requests."""
    return WhisperModel(model_size)


def transcribe_audio(audiofilename, max_chars, max_duration, max_gap):
    """Transcribe an audio file and return its subtitle lines.

    Args:
        audiofilename: Audio input handed to ``WhisperModel.transcribe``.
        max_chars: Forwarded to ``split_text_into_lines``.
        max_duration: Forwarded to ``split_text_into_lines``.
        max_gap: Forwarded to ``split_text_into_lines``.

    Returns:
        The list of subtitle-line dicts produced by
        ``split_text_into_lines``.
    """
    model = _load_model("medium")
    segments, _info = model.transcribe(audiofilename, word_timestamps=True)

    # The transcription actually runs here, while `segments` is iterated.
    wordlevel_info = [
        {"word": word.word, "start": word.start, "end": word.end}
        for segment in segments
        for word in segment.words
    ]

    return split_text_into_lines(wordlevel_info, max_chars, max_duration, max_gap)


def audio_transcription(audiofile, max_chars, max_duration, max_gap):
    """Gradio handler: transcribe the uploaded file and return JSON text.

    Args:
        audiofile: Value of the audio input; ``None`` is rejected.
        max_chars: Value of the "MaxChars" field.
        max_duration: Value of the "MaxDuration" field.
        max_gap: Value of the "MaxGap" field.

    Returns:
        The subtitle lines serialised with ``json.dumps(..., indent=4)``.

    Raises:
        gr.Error: If no audio file was given or any numeric field is empty.
    """
    if audiofile is None:
        raise gr.Error("Please upload an audio file.")
    # Reject empty fields up front instead of failing later with a TypeError
    # when the limits are compared against numbers.
    if any(value is None for value in (max_chars, max_duration, max_gap)):
        raise gr.Error("MaxChars, MaxDuration and MaxGap must all be set.")

    transcription = transcribe_audio(audiofile, max_chars, max_duration, max_gap)
    return json.dumps(transcription, indent=4)


iface = gr.Interface(
    audio_transcription,
    [
        gr.Audio(sources="upload", type="filepath"),
        gr.Number(label="MaxChars"),
        gr.Number(label="MaxDuration"),
        gr.Number(label="MaxGap"),
    ],
    "text",
    description="Upload an audio file and get its transcription in JSON format.",
)

# Only start the server when run as a script, so importing this module
# (e.g. to reuse split_text_into_lines) does not launch the UI.
if __name__ == "__main__":
    iface.launch()