Harveenchadha committed on
Commit
34b86ea
·
1 Parent(s): 39e9f1f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -5
app.py CHANGED
@@ -9,7 +9,7 @@ import yaml
9
  import tensorflow as tf
10
  from tensorflow_tts.inference import TFAutoModel
11
  from tensorflow_tts.inference import AutoProcessor
12
-
13
 
14
 
15
 
@@ -63,10 +63,20 @@ def translate(text):
63
 
64
  processor = Wav2Vec2Processor.from_pretrained("Harveenchadha/vakyansh-wav2vec2-hindi-him-4200")
65
  model = Wav2Vec2ForCTC.from_pretrained("Harveenchadha/vakyansh-wav2vec2-hindi-him-4200")
 
 
 
 
 
 
 
 
 
66
  def parse_transcription(wav_file):
67
- filename = wav_file.name.split('.')[0]
68
- convert(wav_file.name, filename + "16k.wav")
69
- speech, _ = sf.read(filename + "16k.wav")
 
70
  input_values = processor(speech, sampling_rate=16_000, return_tensors="pt").input_values
71
  logits = model(input_values).logits
72
  predicted_ids = torch.argmax(logits, dim=-1)
@@ -81,7 +91,7 @@ def parse_transcription(wav_file):
81
  output1 = gr.outputs.Textbox(label="Hindi Output from ASR")
82
  output2 = gr.outputs.Textbox(label="English Translated Output")
83
 
84
- input_ = gr.inputs.Audio(source="microphone", type="file")
85
 
86
 
87
  output_audio = gr.outputs.Audio(type="file", label="Output Audio")
 
9
  import tensorflow as tf
10
  from tensorflow_tts.inference import TFAutoModel
11
  from tensorflow_tts.inference import AutoProcessor
12
+ import scipy.signal as sps
13
 
14
 
15
 
 
63
 
64
  processor = Wav2Vec2Processor.from_pretrained("Harveenchadha/vakyansh-wav2vec2-hindi-him-4200")
65
  model = Wav2Vec2ForCTC.from_pretrained("Harveenchadha/vakyansh-wav2vec2-hindi-him-4200")
66
+
67
def read_file(wav):
    """Convert a gradio numpy-audio tuple to a 16 kHz mono signal.

    Parameters
    ----------
    wav : tuple[int, numpy.ndarray]
        ``(sample_rate, signal)`` as produced by
        ``gr.inputs.Audio(source="microphone", type="numpy")``.  The signal
        is shaped ``(n,)`` for mono or ``(n, channels)`` for multi-channel.

    Returns
    -------
    numpy.ndarray
        The signal down-mixed to mono and resampled to 16 kHz
        (FFT-based resampling via ``scipy.signal.resample``).
    """
    sample_rate, signal = wav
    # Down-mix only when a channel axis actually exists: calling
    # .mean(-1) on an already-mono 1-D array would collapse the whole
    # recording to a single scalar and break len(signal) below.
    if signal.ndim > 1:
        signal = signal.mean(-1)
    number_of_samples = round(len(signal) * 16000 / sample_rate)
    return sps.resample(signal, number_of_samples)
73
+
74
+
75
  def parse_transcription(wav_file):
76
+ #filename = wav_file.name.split('.')[0]
77
+ #convert(wav_file.name, filename + "16k.wav")
78
+ #speech, _ = sf.read(filename + "16k.wav")
79
+ speech = read_file(wav_file)
80
  input_values = processor(speech, sampling_rate=16_000, return_tensors="pt").input_values
81
  logits = model(input_values).logits
82
  predicted_ids = torch.argmax(logits, dim=-1)
 
91
  output1 = gr.outputs.Textbox(label="Hindi Output from ASR")
92
  output2 = gr.outputs.Textbox(label="English Translated Output")
93
 
94
+ input_ = gr.inputs.Audio(source="microphone", type="numpy")
95
 
96
 
97
  output_audio = gr.outputs.Audio(type="file", label="Output Audio")