micknikolic committed on
Commit
836722f
1 Parent(s): 23672a2

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -0
app.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import accelerate
2
+ import gradio as gr
3
+ import time
4
+ import io
5
+ import librosa
6
+ import torch
7
+ import soundfile as sf
8
+
9
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
10
+
11
# Checkpoint used for the model, the processor and the pipeline.
MODEL_ID = "openai/whisper-large-v3"

# Pick the device and the precision together so they always agree:
# half precision on a CUDA GPU, full precision on CPU. The original moved the
# model to "cuda" unconditionally, which fails on CPU-only hosts.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Instantiating the model object.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    pretrained_model_name_or_path=MODEL_ID,
    torch_dtype=torch_dtype,
    use_safetensors=True,
).to(device)

# Instantiating the processor object (provides the tokenizer and the
# feature extractor handed to the pipeline below).
processor = AutoProcessor.from_pretrained(pretrained_model_name_or_path=MODEL_ID)

# Instantiating the transformers pipeline object.
# The already-loaded `model` is passed in (rather than the checkpoint name)
# so the weights are not loaded a second time.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)
33
+
34
+ #Defining speech-to-text function.
35
+
36
def convert(audio, state=""):
    """
    Transcribe a recording and append the text to the running transcript.

    This function performs speech to text conversion and is used as the
    function of Gradio's Interface.

    Parameters:
    - audio: the recording to transcribe, passed straight to the pipeline
      (the Audio input is configured with type="filepath"), or None when
      nothing has been recorded yet.
    - state: a string representing the accumulated text from previous
      conversions. None or an empty value is treated as an empty string.

    Returns:
    - a (text, state) tuple. On success both items are the updated
      transcript. On failure the first item is an error message and the
      second is the transcript unchanged.
    """
    # The stored state can be None before anything has been written to it
    # (assumed from Gradio's State default -- confirm); concatenating onto
    # None would raise and be reported as a recording error.
    state = state or ""

    # Nothing recorded yet: answer immediately instead of relying on the
    # pipeline raising an exception.
    if audio is None:
        return "Error processing audio: Please start recording!", state

    # NOTE(review): fixed delay kept from the original implementation; its
    # purpose is not documented here -- confirm it is still needed.
    time.sleep(3)

    try:
        transcribed_text = pipe(audio)["text"]
    except Exception as exc:
        # UI boundary: show the actual failure to the user instead of a
        # misleading "start recording" hint, and keep the transcript intact.
        return f"Error processing audio: {exc}", state

    state += transcribed_text + " "
    return state, state
52
+
53
# Building the Gradio Interface.

# Microphone recorder component, configured with type="filepath".
microphone_input = gr.Audio(
    label="Please Record Your Speech Here!",
    sources="microphone",
    type="filepath",
)

# `convert` takes the recording plus the stored transcript ("state") and
# returns the text for the textbox plus the new transcript to store.
gr_interface = gr.Interface(
    fn=convert,
    title="Automatic Speech-to-Text",
    description="### Record your speech and watch it get converted to text!",
    inputs=[microphone_input, "state"],
    outputs=["textbox", "state"],
    theme="dark",
    live=True,
)

# Launching the app with the default launch options.
gr_interface.launch()