Spaces:
Sleeping
Sleeping
File size: 2,653 Bytes
9310327 826f90c 9310327 826f90c 9310327 8f0642b 9310327 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
import gradio as gr
import torch.cuda
import whisper
from whisper.tokenizer import LANGUAGES
from vid_to_wav import extract_audio
# Whether torch reports an available CUDA device; transcribe() passes this
# flag as the fp16= argument of model.transcribe.
gpu = torch.cuda.is_available()
# Module-wide Whisper model. Starts unset and is assigned by get_interface(),
# which must therefore run before transcribe() is called.
model = None
def analyze_transcription(text, duration):
    """Describe how fast the speaker talks in a transcribed recording.

    Args:
        text: Transcribed speech. Words are counted by splitting on
            whitespace.
        duration: Length of the recording in seconds.

    Returns:
        A human-readable summary made of space-separated sentences: the
        duration and word count, the rounded words-per-minute rate, and a
        verdict (slower than average below 130 wpm, faster than average
        above 150 wpm, normal otherwise). When ``duration`` is missing,
        zero or negative, the rate and verdict are replaced by a single
        sentence saying the speed could not be computed.
    """
    word_count = len(text.split())
    sentences = [
        "The video is {} sec. long and the speaker speaks {} words.".format(
            duration, word_count
        )
    ]

    # Without a positive duration the rate below would divide by zero (or
    # produce a meaningless negative speed), so report that instead.
    if not duration or duration <= 0:
        sentences.append(
            "The speech speed could not be computed because the duration "
            "is not positive."
        )
        return " ".join(sentences)

    duration_in_min = duration / 60
    words_per_min = round(word_count / duration_in_min)
    sentences.append(
        "The speech speed is {} words-per-minute.".format(words_per_min)
    )

    if words_per_min < 130:
        verdict = "The speaker has spoken slower than average speakers."
    elif words_per_min > 150:
        verdict = "The speaker has spoken faster than average speakers."
    else:
        verdict = (
            "The speaker maintains normal speed during speech making the "
            "speech comprehensible to most audiences!"
        )
    sentences.append(verdict)

    # Joining with a space keeps the sentences from running into each other.
    return " ".join(sentences)
def transcribe(filepath, language, task):
    """Transcribe or translate the speech of a video and analyse its pace.

    Args:
        filepath: Path of the video to process; forwarded to
            ``extract_audio``.
        language: Spoken language to pass to the model, or ``"Detect"`` to
            pass ``None`` instead (presumably letting the model detect the
            language itself - confirm against the Whisper documentation).
        task: Task name such as ``"Transcribe"`` or ``"Translate"``; it is
            lower-cased before being handed to the model.

    Returns:
        A ``(text, analysis)`` tuple: the stripped transcription text and the
        summary produced by ``analyze_transcription`` for that text and the
        duration reported by ``extract_audio``.
    """
    print(filepath)
    # extract_audio returns three values; only the audio file and the
    # duration are needed here, so the first one is discarded.
    _, audio_file, duration = extract_audio(filepath)

    language = None if language == "Detect" else language

    # `model` is the module-level Whisper model assigned by get_interface();
    # `gpu` is passed as fp16 so the flag follows CUDA availability.
    result = model.transcribe(
        audio_file,
        task=task.lower(),
        language=language,
        fp16=gpu,
    )
    text = result["text"].strip()
    return text, analyze_transcription(text, duration)
def get_interface(model_name="medium"):
    """Load the Whisper model and build the Gradio interface around it.

    Args:
        model_name: Name handed to ``whisper.load_model``.

    Returns:
        A ``gr.Interface`` that calls ``transcribe`` with a video, a language
        choice and a task choice, and shows the transcription and the speech
        analysis in two text boxes.

    Side effects:
        Rebinds the module-level ``model`` to the freshly loaded model.
    """
    global model
    model = whisper.load_model(model_name)

    # Inputs, built in the order transcribe() receives them.
    video_input = gr.Video(label="Upload", source="upload", type="filepath")

    language_names = sorted(name.title() for name in LANGUAGES.values())
    language_input = gr.Dropdown(
        label="Language",
        choices=["Detect"] + language_names,
        value="Detect",
    )

    task_input = gr.Dropdown(
        label="Task",
        choices=["Transcribe", "Translate"],
        value="Transcribe",
        info="Whether to perform X->X speech recognition or X->English translation",
    )

    # Outputs, matching the (text, analysis) tuple returned by transcribe().
    transcription_box = gr.Textbox(label="Transcription", lines=26)
    analysis_box = gr.Textbox(label="Speech Analysis", lines=4)

    glass_theme = gr.themes.Glass(
        primary_hue=gr.themes.colors.orange,
        secondary_hue=gr.themes.colors.purple,
    )

    interface = gr.Interface(
        fn=transcribe,
        inputs=[video_input, language_input, task_input],
        outputs=[transcription_box, analysis_box],
        theme=glass_theme,
        title="Analysis of Speech from Video",
        allow_flagging="never",
    )
    return interface
# Build the interface with the default model name; get_interface() also loads
# the Whisper model into the module-level `model` as a side effect.
demo = get_interface()
# Enable the interface's queue, then launch the app with debug=True.
demo.queue().launch(debug=True)
|