Spaces:

space-sue
/

hf-speech-eval

Sleeping

File size: 2,653 Bytes


import gradio as gr
import torch.cuda
import whisper
from whisper.tokenizer import LANGUAGES
from vid_to_wav import extract_audio
# True when CUDA is available; transcribe() passes this to Whisper as `fp16`.
gpu = torch.cuda.is_available()
# Whisper model shared by the request handler: assigned by get_interface()
# and read by transcribe(). Stays None until get_interface() has run.
model = None


def analyze_transcription(text, duration, *, slow_wpm=130, fast_wpm=150):
    """Summarise the length and speaking rate of a transcription.

    Args:
        text: Transcribed speech; words are counted by splitting on whitespace.
        duration: Length of the recording in seconds.
        slow_wpm: Rates below this many words per minute are reported as
            slower than average.
        fast_wpm: Rates above this many words per minute are reported as
            faster than average.

    Returns:
        A human-readable summary made of space-separated sentences: the
        duration and word count, the words-per-minute rate, and a verdict on
        the pace. When ``duration`` is not positive the rate and verdict are
        replaced by a note that the speed cannot be computed.
    """
    word_count = len(text.split())
    sentences = [
        f"The video is {duration} sec. long and the speaker speaks {word_count} words."
    ]

    # A zero-length recording would divide by zero below, so report what we
    # know and stop instead of crashing the request.
    if duration <= 0:
        sentences.append(
            "The speech speed cannot be computed because the video has no duration."
        )
        return " ".join(sentences)

    duration_in_min = duration / 60
    words_per_min = round(word_count / duration_in_min)
    sentences.append(f"The speech speed is {words_per_min} words-per-minute.")

    if words_per_min < slow_wpm:
        sentences.append("The speaker has spoken slower than average speakers.")
    elif words_per_min > fast_wpm:
        sentences.append("The speaker has spoken faster than average speakers.")
    else:
        sentences.append(
            "The speaker maintains normal speed during speech making the "
            "speech comprehensible to most audiences!"
        )
    return " ".join(sentences)


def transcribe(filepath, language, task):
    """Transcribe (or translate) the speech in a video and analyse its pace.

    Args:
        filepath: Value of the video input; forwarded to ``extract_audio``.
        language: Language name selected in the UI, or ``"Detect"`` to pass
            ``language=None`` to the model.
        task: ``"Transcribe"`` or ``"Translate"``; lower-cased before being
            handed to the model.

    Returns:
        A ``(text, analysis)`` tuple: the stripped transcription text and the
        summary produced by ``analyze_transcription`` for it.
    """
    print(filepath)
    # The first value returned by extract_audio is not needed here; only the
    # audio file and its duration are used.
    _audio, audio_file, duration = extract_audio(filepath)
    # "Detect" is a UI-only sentinel; the model receives None in that case.
    whisper_language = None if language == "Detect" else language
    result = model.transcribe(
        audio_file,
        task=task.lower(),
        language=whisper_language,
        fp16=gpu,
    )
    text = result["text"].strip()
    return text, analyze_transcription(text, duration)


def get_interface(model_name="medium"):
    """Load a Whisper model and build the Gradio interface around it.

    The loaded model is stored in the module-level ``model`` so that
    ``transcribe`` can use it when the interface handles a request.

    Args:
        model_name: Name passed to ``whisper.load_model``.

    Returns:
        A ``gr.Interface`` wired to ``transcribe`` with a video upload, a
        language dropdown and a task dropdown as inputs, and two text boxes
        (transcription and speech analysis) as outputs.
    """
    global model
    model = whisper.load_model(model_name)

    # Inputs, in the positional order expected by transcribe().
    video_input = gr.Video(label="Upload", source="upload", type="filepath")
    language_names = sorted([name.title() for name in LANGUAGES.values()])
    language_input = gr.Dropdown(
        label="Language",
        choices=["Detect"] + language_names,
        value="Detect",
    )
    task_input = gr.Dropdown(
        label="Task",
        choices=["Transcribe", "Translate"],
        value="Transcribe",
        info="Whether to perform X->X speech recognition or X->English translation",
    )

    # Outputs, in the order of the tuple returned by transcribe().
    transcription_box = gr.Textbox(label="Transcription", lines=26)
    analysis_box = gr.Textbox(label="Speech Analysis", lines=4)

    glass_theme = gr.themes.Glass(
        primary_hue=gr.themes.colors.orange,
        secondary_hue=gr.themes.colors.purple,
    )

    return gr.Interface(
        fn=transcribe,
        inputs=[video_input, language_input, task_input],
        outputs=[transcription_box, analysis_box],
        theme=glass_theme,
        title="Analysis of Speech from Video",
        allow_flagging="never",
    )


# Build the interface (this also loads the default Whisper model), enable
# request queuing, then start the app with debug output turned on.
demo = get_interface()
_queued_demo = demo.queue()
_queued_demo.launch(debug=True)