File size: 2,618 Bytes
e7a6721
f0b42bb
fee32f6
 
edc4e08
fee32f6
 
 
 
 
 
 
 
 
 
 
 
 
f0b42bb
edc4e08
 
 
fee32f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
edc4e08
f0b42bb
 
 
fee32f6
f0b42bb
 
fee32f6
f0b42bb
 
fee32f6
 
edc4e08
fee32f6
f0b42bb
edc4e08
 
e7a6721
 
 
edc4e08
42e8713
 
 
e7a6721
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import gradio as gr
import json
from faster_whisper import WhisperModel  # Assuming you have installed this library

def split_text_into_lines(data, max_chars, max_duration, max_gap):
    """Group word-level timestamps into subtitle lines.

    A line is closed when its accumulated spoken duration exceeds
    ``max_duration``, its joined text exceeds ``max_chars``, or the silence
    between two consecutive words exceeds ``max_gap`` (in which case the word
    after the silence starts the NEXT line).

    Parameters
    ----------
    data : list[dict]
        Word entries with ``"word"``, ``"start"`` and ``"end"`` keys,
        in chronological order.
    max_chars : int
        Maximum character length of a line's joined text.
    max_duration : float
        Maximum summed per-word duration (seconds) of a line.
    max_gap : float
        Maximum allowed silence (seconds) between consecutive words.

    Returns
    -------
    list[dict]
        One dict per line with ``"word"`` (joined text), ``"start"``,
        ``"end"``, and ``"textcontents"`` (the original word entries).
    """
    subtitles = []
    line = []
    line_duration = 0.0

    def _flush():
        # Emit the current line (if any) as a subtitle entry and reset state.
        nonlocal line, line_duration
        if line:
            subtitles.append({
                "word": " ".join(item["word"] for item in line),
                "start": line[0]["start"],
                "end": line[-1]["end"],
                "textcontents": line,
            })
            line = []
            line_duration = 0.0

    for idx, word_data in enumerate(data):
        # A word that follows a long silence must OPEN a new line, so the
        # gap check happens before the word is appended.
        if idx > 0 and (word_data["start"] - data[idx - 1]["end"]) > max_gap:
            _flush()

        line.append(word_data)
        line_duration += word_data["end"] - word_data["start"]

        # Char/duration limits are checked after appending, so the word that
        # crosses the limit stays in the line being flushed.
        joined = " ".join(item["word"] for item in line)
        if line_duration > max_duration or len(joined) > max_chars:
            _flush()

    # Flush any trailing words that never hit a limit.
    _flush()
    return subtitles

def transcribe_audio(audiofilename, max_chars, max_duration, max_gap):
    """Transcribe an audio file and return line-level subtitle entries.

    Parameters
    ----------
    audiofilename : str
        Path to the audio file to transcribe.
    max_chars, max_duration, max_gap :
        Line-splitting limits, forwarded to :func:`split_text_into_lines`.

    Returns
    -------
    list[dict]
        Subtitle lines as produced by :func:`split_text_into_lines`.
    """
    # Load the Whisper model once and reuse it across calls: constructing
    # WhisperModel is expensive (weights are loaded from disk/downloaded),
    # and this function is invoked per Gradio request.
    model = getattr(transcribe_audio, "_model", None)
    if model is None:
        model = WhisperModel("medium")
        transcribe_audio._model = model

    # word_timestamps=True makes faster-whisper attach per-word start/end times.
    segments, info = model.transcribe(audiofilename, word_timestamps=True)
    segments = list(segments)  # The generator is lazy; listing it runs the transcription.

    wordlevel_info = [
        {'word': word.word, 'start': word.start, 'end': word.end}
        for segment in segments
        for word in segment.words
    ]

    return split_text_into_lines(wordlevel_info, max_chars, max_duration, max_gap)

def audio_transcription(audiofile, max_chars, max_duration, max_gap):
    """Gradio callback: transcribe *audiofile* and return the line-level
    subtitles serialized as pretty-printed JSON text."""
    return json.dumps(
        transcribe_audio(audiofile, max_chars, max_duration, max_gap),
        indent=4,
    )

# Wire the transcription callback into a simple Gradio UI: one audio upload
# plus the three line-splitting limits in, JSON text out.
iface = gr.Interface(
    fn=audio_transcription,
    inputs=[
        gr.Audio(sources="upload", type="filepath"),
        gr.Number(label="MaxChars"),
        gr.Number(label="MaxDuration"),
        gr.Number(label="MaxGap"),
    ],
    outputs="text",
    description="Upload an audio file and get its transcription in JSON format.",
)
iface.launch()