Shubham09 committed
Commit f4b4907 · 1 Parent(s): dc4b312

Upload app.py

Files changed (1)
  1. app.py +54 -0
app.py ADDED
@@ -0,0 +1,54 @@
+ import nltk
+ import librosa
+ import torch
+ import gradio as gr
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
+
+ nltk.download("punkt")
+
+ model_name = "Shubham09/whisper31filescheck"
+ processor = WhisperProcessor.from_pretrained(model_name)
+ model = WhisperForConditionalGeneration.from_pretrained(model_name)
+
+ def load_data(input_file):
+     # Read the audio file
+     speech, sample_rate = librosa.load(input_file)
+     # Make it 1-D: average the channels if the signal is stereo
+     if len(speech.shape) > 1:
+         speech = speech.mean(axis=0)
+     # Resample the audio to 16 kHz, the rate the Whisper feature extractor expects
+     if sample_rate != 16000:
+         speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)
+     return speech
+
+ def correct_casing(input_sentence):
+     # Capitalize the first letter of every sentence
+     sentences = nltk.sent_tokenize(input_sentence)
+     return ' '.join([s.replace(s[0], s[0].capitalize(), 1) for s in sentences])
+
+ def asr_transcript(input_file):
+     speech = load_data(input_file)
+     # Extract log-Mel input features
+     input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
+     # Generate token ids with the Whisper decoder
+     with torch.no_grad():
+         predicted_ids = model.generate(input_features)
+     # Decode the token ids to text
+     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+     # Correct the letter casing
+     transcription = correct_casing(transcription.lower())
+     return transcription
+
+ gr.Interface(asr_transcript,
+              inputs=gr.inputs.Audio(source="microphone", type="filepath", optional=True, label="Speaker"),
+              outputs=gr.outputs.Textbox(label="Output Text"),
+              title="ASR using Whisper",
+              description="This application displays transcribed text for the given audio input",
+              examples=[["Test_File1.wav"], ["Test_File2.wav"], ["Test_File3.wav"]],
+              theme="grass").launch()
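A quick local smoke test of the same inference path can be kept separate from app.py, since running that file also launches the Gradio UI. This is a minimal sketch, not part of the commit; it assumes the Space's dependencies (transformers, torch, librosa) are installed and that one of the example files referenced above, Test_File1.wav, is available locally:

    # Local smoke test (sketch, not part of this commit)
    import librosa
    import torch
    from transformers import WhisperProcessor, WhisperForConditionalGeneration

    model_name = "Shubham09/whisper31filescheck"
    processor = WhisperProcessor.from_pretrained(model_name)
    model = WhisperForConditionalGeneration.from_pretrained(model_name)

    # Test_File1.wav is assumed to be present locally (it is listed in app.py's examples)
    speech, _ = librosa.load("Test_File1.wav", sr=16000)
    features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
    with torch.no_grad():
        predicted_ids = model.generate(features)
    print(processor.batch_decode(predicted_ids, skip_special_tokens=True)[0])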