maher13 committed on
Commit 97433fd · 1 Parent(s): e377670

Update app.py

Files changed (1): app.py +23 -27
app.py CHANGED
@@ -1,21 +1,22 @@
-import gradio as gr
-import librosa
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+import soundfile as sf
 import torch
-import torchaudio
-from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
-import numpy as np
+import gradio as gr
 
+
+# load model and processor
 processor = Wav2Vec2Processor.from_pretrained("maher13/arabic-iti")
 model = Wav2Vec2ForCTC.from_pretrained("maher13/arabic-iti").eval()
-
-def asr_transcript(audio_file, audio_file2):
-    transcript = ""
-
-    if audio_file :
-        wav, sr = librosa.load(audio_file.name, sr=16000)
-
-        input_values = processor(wav, sampling_rate=16000, return_tensors="pt", padding=True).input_values
+# define function to read in sound file
+def map_to_array(file):
+    speech, _ = sf.read(file)
+    return speech
+
+# tokenize
+def inference(audio_file, audio_file2):
+    if audio_file:
+        input_values = processor(map_to_array(audio_file.name), return_tensors="pt", padding="longest").input_values # Batch size 1
         logits = model(input_values).logits
 
         with torch.no_grad():
             predicted_ids = torch.argmax(logits, dim=-1)
@@ -23,11 +24,9 @@ def asr_transcript(audio_file, audio_file2):
         transcription1 = processor.tokenizer.batch_decode(predicted_ids)[0]
     else:
         transcription1 = "N/A"
-
-    if audio_file2 :
-        wav, sr = librosa.load(audio_file2.name, sr=16000)
-
-        input_values = processor(wav, sampling_rate=16000, return_tensors="pt", padding=True).input_values
+
+    if audio_file2:
+        input_values = processor(map_to_array(audio_file2.name), return_tensors="pt", padding="longest").input_values # Batch size 1
         logits = model(input_values).logits
 
         with torch.no_grad():
@@ -36,9 +35,11 @@ def asr_transcript(audio_file, audio_file2):
         transcription2 = processor.tokenizer.batch_decode(predicted_ids)[0]
     else :
         transcription2 = "N/A"
-
-    return transcription1, transcription2
-
+
+
+    return transcription1, transcription2
+
+
 gradio_ui = gr.Interface(
     fn=asr_transcript,
     title="Speech to Text Graduation project \n sponsored by TensorGraph",
@@ -52,9 +53,4 @@ gradio_ui = gr.Interface(
         gr.outputs.Textbox(label="Auto-Transcript")
     ],
 )
-
-
-
-#gradio_ui.launch(share=True)
-gradio_ui.launch(share=True)
-
+gradio_ui.launch(share=True)
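
As committed, gr.Interface still receives fn=asr_transcript even though the function was renamed to inference, so the app would raise a NameError at startup, and torch.no_grad() begins only after the forward pass has already run. The sketch below is a minimal, runnable reading of the post-commit app.py with those two points adjusted; it is not the author's code verbatim. The transcribe_one helper, the gr.inputs.File widgets (the inputs block around lines 45-52 falls outside the hunks shown), and the explicit sampling_rate=16000 (the commit drops librosa's resampling to 16 kHz) are assumptions.

```python
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import soundfile as sf
import torch
import gradio as gr

# load model and processor once at startup
processor = Wav2Vec2Processor.from_pretrained("maher13/arabic-iti")
model = Wav2Vec2ForCTC.from_pretrained("maher13/arabic-iti").eval()


def map_to_array(file):
    # soundfile returns (waveform, sample_rate); the commit drops librosa's
    # resampling, so uploads are assumed to already be 16 kHz mono
    speech, _ = sf.read(file)
    return speech


def transcribe_one(audio_file):
    # hypothetical helper (not in the commit) so both uploads share one code path
    if not audio_file:
        return "N/A"
    input_values = processor(
        map_to_array(audio_file.name),
        sampling_rate=16000,  # assumed, matching the rate the old librosa code enforced
        return_tensors="pt",
        padding="longest",
    ).input_values  # Batch size 1
    with torch.no_grad():  # wrap the forward pass too, not just the argmax
        logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
    return processor.tokenizer.batch_decode(predicted_ids)[0]


def inference(audio_file, audio_file2):
    return transcribe_one(audio_file), transcribe_one(audio_file2)


gradio_ui = gr.Interface(
    fn=inference,  # the commit still passes fn=asr_transcript, which no longer exists
    title="Speech to Text Graduation project \n sponsored by TensorGraph",
    inputs=[
        # assumed widgets: the inputs block lies outside the diff hunks
        gr.inputs.File(label="Audio file 1", optional=True),
        gr.inputs.File(label="Audio file 2", optional=True),
    ],
    outputs=[
        gr.outputs.Textbox(label="Auto-Transcript"),
        gr.outputs.Textbox(label="Auto-Transcript"),
    ],
)

gradio_ui.launch(share=True)
```

Moving the model call inside torch.no_grad() keeps inference from building an autograd graph; everything else mirrors the committed logic.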