from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import gradio as gr
import torchaudio

# load model and processor
processor = Wav2Vec2Processor.from_pretrained("maher13/arabic-iti")
model = Wav2Vec2ForCTC.from_pretrained("maher13/arabic-iti").eval()
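# .eval() switches the model to inference mode (e.g. disables dropout);
# gradient tracking is turned off separately with torch.no_grad() below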
# read an audio file and return a 1-D 16 kHz numpy array for the processor
def map_to_array(file):
    speech, sr = torchaudio.load(file)
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
        speech = resampler(speech)
    # take the first channel and convert to numpy on every path,
    # not only when resampling was needed
    speech = speech[0].numpy()
    return speech
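# quick local sanity check (hypothetical path):
#   waveform = map_to_array("/path/to/sample.wav")  # 1-D float32 array at 16 kHz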
 
# run the CTC model on one clip and greedily decode the logits
def transcribe(audio_file):
    input_values = processor(
        map_to_array(audio_file.name),
        sampling_rate=16000,
        return_tensors="pt",
        padding="longest",
    ).input_values  # batch size 1
    with torch.no_grad():  # the forward pass itself should run without gradients
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.tokenizer.batch_decode(predicted_ids)[0]

def inference(audio_file, audio_file2):
    transcription1 = transcribe(audio_file) if audio_file else "N/A"
    transcription2 = transcribe(audio_file2) if audio_file2 else "N/A"
    return transcription1, transcription2

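# greedy argmax is the simplest CTC decoding strategy; batch_decode then
# collapses repeated tokens and strips the blank/pad symbol to produce the text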
gradio_ui = gr.Interface(
    fn=inference,
    title="Speech to Text Graduation Project\nSponsored by TensorGraph",
    inputs=[
        gr.inputs.Audio(source="microphone", type="file", optional=True),
        gr.inputs.Audio(source="upload", type="file", optional=True),
    ],
    outputs=[
        gr.outputs.Textbox(label="Microphone Transcript"),
        gr.outputs.Textbox(label="Upload Transcript"),
    ],
)

gradio_ui.launch(share=True)
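# share=True exposes the local app through a temporary public Gradio link,
# so the demo can be tried without deploying it anywhere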