# import gradio as gr # gr.Interface.load("models/rohitp1/kkkh_whisper_small_distillation_att_loss_libri360_epochs_100_batch_4_concat_dataset").launch() import gradio as gr import os import transformers from transformers import pipeline, WhisperForConditionalGeneration, WhisperTokenizer, WhisperFeatureExtractor import time import torch # def greet_from_secret(ignored_param): # name = os.environ.get('TOKEN') # return auth_token = os.environ.get('TOKEN') M1 = "rohitp1/subh_whisper_small_distil_att_loss_mozilla_epochs_50_batch_8" M2 = "rohitp1/dgx1_whisper_small_finetune_teacher_babble_noise_mozilla_40_epochs_batch_8" M3 = "rohitp1/dgx1_whisper_small_finetune_teacher_no_noise_mozilla_40_epochs_batch_8" model1 = WhisperForConditionalGeneration.from_pretrained(M1, use_auth_token=auth_token) tokenizer1 = WhisperTokenizer.from_pretrained(M1, use_auth_token=auth_token) feat_ext1 = WhisperFeatureExtractor.from_pretrained(M1, use_auth_token=auth_token) model2 = WhisperForConditionalGeneration.from_pretrained(M2, use_auth_token=auth_token) tokenizer2 = WhisperTokenizer.from_pretrained(M2, use_auth_token=auth_token) feat_ext2 = WhisperFeatureExtractor.from_pretrained(M2, use_auth_token=auth_token) model3 = WhisperForConditionalGeneration.from_pretrained(M3, use_auth_token=auth_token) tokenizer3 = WhisperTokenizer.from_pretrained(M3, use_auth_token=auth_token) feat_ext3 = WhisperFeatureExtractor.from_pretrained(M3, use_auth_token=auth_token) # make quantized model # quantized_model1 = torch.quantization.quantize_dynamic( # model1, {torch.nn.Linear}, dtype=torch.qint8 # ) p1 = pipeline('automatic-speech-recognition', model=model1, tokenizer=tokenizer1, feature_extractor=feat_ext1) p2 = pipeline('automatic-speech-recognition', model=model2, tokenizer=tokenizer2, feature_extractor=feat_ext2) p3 = pipeline('automatic-speech-recognition', model=model3, tokenizer=tokenizer3, feature_extractor=feat_ext3) # p1_quant = pipeline('automatic-speech-recognition', model=quantized_model1, tokenizer=tokenizer1, feature_extractor=feat_ext1) def transcribe(mic_input, upl_input, model_type): if mic_input: audio = mic_input else: audio = upl_input time.sleep(3) st_time = time.time() if model_type == 'NoisyFinetuned': text = p2(audio)["text"] elif model_type == 'CleanFinetuned': text = p3(audio)["text"] # elif model_type == 'DistilledAndQuantised': # text = p1_quant(audio)['text'] else: text = p1(audio)["text"] end_time = time.time() # state = text + " " time_taken = round((end_time - st_time) / 60 , 4) return text, time_taken # gr.Interface( # fn=transcribe, # inputs=[ # gr.inputs.Audio(source="microphone", type="filepath"), # 'state' # ], # outputs=[ # "textbox", # "state" # ], # live=False).launch() # demo = gr.load( # "huggingface/rohitp1/kkkh_whisper_small_distillation_att_loss_libri360_epochs_100_batch_4_concat_dataset", # title="Speech-to-text", # inputs="mic", # description="Let me try to guess what you're saying!", # api_key="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" # ) # demo.launch() def clear_inputs_and_outputs(): return [None, None, "CleanFinetuned", None, None] # Main function if __name__ == "__main__": demo = gr.Blocks() with demo: gr.Markdown( """