from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import torchaudio
import gradio as gr

# Load the pretrained Arabic ASR model and its processor.
processor = Wav2Vec2Processor.from_pretrained("maher13/arabic-iti")
model = Wav2Vec2ForCTC.from_pretrained("maher13/arabic-iti").eval()


def map_to_array(file):
    """Read an audio file and return a 16 kHz mono numpy array."""
    speech, sr = torchaudio.load(file)
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
        speech = resampler(speech)
    # Keep only the first channel and convert to numpy for the processor.
    return speech[0].numpy()


def transcribe(path):
    """Run CTC inference on a single audio file and return the decoded text."""
    input_values = processor(
        map_to_array(path),
        sampling_rate=16000,
        return_tensors="pt",
        padding="longest",
    ).input_values  # batch size 1
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]


def inference(audio_file, audio_file2):
    # Each input is optional; transcribe whichever file(s) were provided.
    transcription1 = transcribe(audio_file.name) if audio_file else "N/A"
    transcription2 = transcribe(audio_file2.name) if audio_file2 else "N/A"
    return transcription1, transcription2


gradio_ui = gr.Interface(
    fn=inference,
    title="Speech to Text Graduation project\nsponsored by TensorGraph",
    inputs=[
        gr.inputs.Audio(source="microphone", type="file", optional=True),
        gr.inputs.Audio(source="upload", type="file", optional=True),
    ],
    outputs=[
        gr.outputs.Textbox(label="Auto-Transcript (microphone)"),
        gr.outputs.Textbox(label="Auto-Transcript (upload)"),
    ],
)

gradio_ui.launch(share=True)
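
# Optional local smoke test (a minimal sketch, not part of the original app):
# "sample.wav" below is a hypothetical placeholder path to any audio file.
# Uncomment to print a transcript directly, without starting the Gradio UI.
# print(transcribe("sample.wav"))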