"""Gradio app that transcribes the speech in an uploaded video with Whisper
and reports a simple speaking-rate analysis."""

import gradio as gr
import torch.cuda
import whisper
from whisper.tokenizer import LANGUAGES

from vid_to_wav import extract_audio

# Half-precision inference is only requested when a CUDA device is present.
gpu = torch.cuda.is_available()

# Whisper model shared by all requests; populated by get_interface().
model = None

# Words-per-minute bounds used to label the speaking rate.
_SLOW_WPM_THRESHOLD = 130
_FAST_WPM_THRESHOLD = 150


def analyze_transcription(text, duration):
    """Return a short human-readable speaking-rate report.

    Args:
        text: Transcribed speech; words are counted by whitespace splitting.
        duration: Length of the recording. The report labels it as seconds
            and the words-per-minute figure divides it by 60.

    Returns:
        A string with the duration, the word count, the words-per-minute
        rate and a slow/normal/fast verdict. When ``duration`` is missing,
        zero or negative the rate cannot be computed, so the report says so
        instead of raising ``ZeroDivisionError``.
    """
    word_count = len(text.split())
    sentences = [
        f"The video is {duration} sec. long and the speaker speaks "
        f"{word_count} words."
    ]

    # Guard the division below: an empty or unreadable clip may report 0.
    if not duration or duration <= 0:
        sentences.append(
            "The speech speed could not be computed because the video "
            "duration is zero."
        )
        return " ".join(sentences)

    words_per_min = round(word_count / (duration / 60))
    sentences.append(f"The speech speed is {words_per_min} words-per-minute.")

    if words_per_min < _SLOW_WPM_THRESHOLD:
        sentences.append("The speaker has spoken slower than average speakers.")
    elif words_per_min > _FAST_WPM_THRESHOLD:
        sentences.append("The speaker has spoken faster than average speakers.")
    else:
        sentences.append(
            "The speaker maintains normal speed during speech making the "
            "speech comprehensible to most audiences!"
        )

    # Join with spaces so consecutive sentences do not run together.
    return " ".join(sentences)


def transcribe(filepath, language, task):
    """Transcribe (or translate) the speech of an uploaded video.

    Args:
        filepath: Path of the uploaded video as handed over by Gradio.
        language: Language name chosen in the UI, or ``"Detect"`` to let
            Whisper detect the language itself.
        task: ``"Transcribe"`` or ``"Translate"``; lower-cased before being
            passed to Whisper.

    Returns:
        A ``(text, analysis)`` tuple: the stripped transcription and the
        report produced by :func:`analyze_transcription`.

    Raises:
        gr.Error: If no video was uploaded.
    """
    if not filepath:
        # Submitting without a file yields None; fail with a clear UI message
        # rather than an opaque error from the audio extraction step.
        raise gr.Error("Please upload a video before submitting.")

    print(filepath)
    # extract_audio returns three values; only the audio file and the
    # duration are needed here.
    _audio, audio_file, duration = extract_audio(filepath)

    # Whisper auto-detects the language when it receives None.
    language = None if language == "Detect" else language
    result = model.transcribe(
        audio_file,
        task=task.lower(),
        language=language,
        fp16=gpu,
    )
    text = result["text"].strip()
    return text, analyze_transcription(text, duration)


def get_interface(model_name="medium"):
    """Load the Whisper model and build the Gradio interface.

    The loaded model is stored in the module-level ``model`` so that
    :func:`transcribe` can use it.

    Args:
        model_name: Name of the Whisper model to load.

    Returns:
        The configured ``gr.Interface``.
    """
    global model
    model = whisper.load_model(model_name)

    language_choices = ["Detect"] + sorted(
        name.title() for name in LANGUAGES.values()
    )

    return gr.Interface(
        fn=transcribe,
        inputs=[
            # gr.Audio(label="Record", source="microphone", type="filepath"),
            gr.Video(label="Upload", source="upload", type="filepath"),
            gr.Dropdown(
                label="Language",
                choices=language_choices,
                value="Detect",
            ),
            gr.Dropdown(
                label="Task",
                choices=["Transcribe", "Translate"],
                value="Transcribe",
                info="Whether to perform X->X speech recognition or X->English translation",
            ),
        ],
        outputs=[
            gr.Textbox(label="Transcription", lines=26),
            gr.Textbox(label="Speech Analysis", lines=4),
        ],
        # theme=gr.themes.Default(),
        theme=gr.themes.Glass(
            primary_hue=gr.themes.colors.orange,
            secondary_hue=gr.themes.colors.purple,
        ),
        title="Analysis of Speech from Video",
        # description=DESCRIPTION,
        allow_flagging="never",
    )


demo = get_interface()
demo.queue().launch(debug=True)