wisper / app.py
lelafav502's picture
Update app.py
42e8713 verified
import gradio as gr
import json
from faster_whisper import WhisperModel # Assuming you have installed this library
def _make_subtitle_line(words):
    """Bundle a non-empty run of word entries into one subtitle record."""
    return {
        "word": " ".join(item["word"] for item in words),
        "start": words[0]["start"],
        "end": words[-1]["end"],
        "textcontents": words,
    }


def split_text_into_lines(data, max_chars, max_duration, max_gap):
    """Group word-level timestamps into subtitle lines.

    Args:
        data: Sequence of dicts with "word", "start" and "end" keys, in
            playback order.
        max_chars: A line is closed once its space-joined text is longer
            than this many characters. ``None`` disables the limit.
        max_duration: A line is closed once the summed durations
            (``end - start``) of its words exceed this value; pauses
            between words are not counted. ``None`` disables the limit.
        max_gap: A new line is started when the pause between the previous
            word's end and the current word's start is longer than this
            value. ``None`` disables the limit.

    Returns:
        A list of dicts, one per line, with keys "word" (joined text),
        "start" (start of the first word), "end" (end of the last word)
        and "textcontents" (the list of word dicts making up the line).

    Note:
        For ``max_chars`` and ``max_duration`` the word that pushes the line
        over the limit stays in that line, so a line may overshoot by one
        word.
    """
    subtitles = []
    line = []
    line_duration = 0

    for word_data in data:
        start = word_data["start"]
        end = word_data["end"]

        # A long pause belongs *between* two lines: close the running line
        # before adding the word that follows the pause, otherwise that word
        # is glued to the previous line and the line's "end" spans the pause.
        if line and max_gap is not None and start - line[-1]["end"] > max_gap:
            subtitles.append(_make_subtitle_line(line))
            line = []
            line_duration = 0

        line.append(word_data)
        line_duration += end - start

        text = " ".join(item["word"] for item in line)
        duration_exceeded = max_duration is not None and line_duration > max_duration
        chars_exceeded = max_chars is not None and len(text) > max_chars
        if duration_exceeded or chars_exceeded:
            subtitles.append(_make_subtitle_line(line))
            line = []
            line_duration = 0

    # Flush whatever is left after the last word.
    if line:
        subtitles.append(_make_subtitle_line(line))
    return subtitles
# Loaded models keyed by size, so the model is constructed once per process
# instead of once per transcription request.
_WHISPER_MODELS = {}


def _get_whisper_model(model_size):
    """Return the WhisperModel for *model_size*, constructing it on first use."""
    model = _WHISPER_MODELS.get(model_size)
    if model is None:
        model = WhisperModel(model_size)
        _WHISPER_MODELS[model_size] = model
    return model


def transcribe_audio(audiofilename, max_chars, max_duration, max_gap):
    """Transcribe an audio file and return line-level subtitles.

    Args:
        audiofilename: Audio input handed straight to
            ``WhisperModel.transcribe``.
        max_chars: Forwarded to ``split_text_into_lines``.
        max_duration: Forwarded to ``split_text_into_lines``.
        max_gap: Forwarded to ``split_text_into_lines``.

    Returns:
        Whatever ``split_text_into_lines`` returns for the collected
        word-level entries (dicts with "word", "start" and "end").
    """
    model = _get_whisper_model("medium")
    segments, _info = model.transcribe(audiofilename, word_timestamps=True)

    # The transcription work actually happens while `segments` is consumed,
    # so iterate it directly rather than materialising a throwaway list.
    wordlevel_info = [
        {"word": word.word, "start": word.start, "end": word.end}
        for segment in segments
        for word in segment.words
    ]
    return split_text_into_lines(wordlevel_info, max_chars, max_duration, max_gap)
def audio_transcription(audiofile, max_chars, max_duration, max_gap):
    """Transcribe *audiofile* and return the subtitle lines as a JSON string.

    Args:
        audiofile: Audio input forwarded to ``transcribe_audio``.
        max_chars: Forwarded to ``transcribe_audio``.
        max_duration: Forwarded to ``transcribe_audio``.
        max_gap: Forwarded to ``transcribe_audio``.

    Returns:
        The result of ``transcribe_audio`` serialised as JSON with a
        4-space indent.
    """
    transcription = transcribe_audio(audiofile, max_chars, max_duration, max_gap)
    # ensure_ascii=False keeps non-ASCII transcript text readable instead of
    # emitting \uXXXX escapes; the output is still valid JSON.
    return json.dumps(transcription, indent=4, ensure_ascii=False)
# Web UI: one audio upload plus the three line-splitting limits, wired to
# `audio_transcription`; the returned JSON string is shown as plain text.
iface = gr.Interface(
    fn=audio_transcription,
    inputs=[
        gr.Audio(sources="upload", type="filepath"),
        gr.Number(label="MaxChars"),
        gr.Number(label="MaxDuration"),
        gr.Number(label="MaxGap"),
    ],
    outputs="text",
    description="Upload an audio file and get its transcription in JSON format.",
)

# Start serving the interface as soon as the module is executed.
iface.launch()