Pooya-Fallah committed
Commit 4f02add · verified · 1 Parent(s): 9dc2477

Update app.py

Files changed (1)
  1. app.py +75 -74
app.py CHANGED
@@ -1,75 +1,76 @@
  import torch
  import nemo.collections.asr as nemo_asr
  import gc
  import numpy as np
  import torchaudio
+ import gradio as gr

  pretrained_model_path="./stt_fa_fastconformer_hybrid_large_finetuned.nemo"

  # Clear up memory
  torch.cuda.empty_cache()
  gc.collect()
  model = nemo_asr.models.EncDecHybridRNNTCTCModel.restore_from(pretrained_model_path)
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  # device = 'cpu' # You can transcribe even longer samples on the CPU, though it will take much longer!
  model = model.to(device)
  model.freeze()

  def transcribe(stream, new_chunk):
      if new_chunk is None:
          return None, ""
      # 'new_chunk' is a tuple: (sample_rate, data)
      sample_rate, data = new_chunk

      # Ensure the model is on the correct device
      device = 'cuda' if torch.cuda.is_available() else 'cpu'

      # Convert audio data to the expected format
      if isinstance(data, np.ndarray):
          audio_tensor = torch.tensor(data, dtype=torch.float32)
      else:
          raise ValueError("Audio data must be a numpy array")

      # Resample if sample rate is not 16000
      target_sample_rate = 16000
      if sample_rate != target_sample_rate:
          resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
          audio_tensor = resampler(audio_tensor)

      if stream is not None:
          stream['audio'] = torch.cat([stream['audio'], audio_tensor], dim=-1)
      else:
          stream = {"text": ""}
          stream['audio'] = audio_tensor


      max_length = 5 * target_sample_rate  # 5 seconds
      new_text = ""

      # Process all chunks that fit max_length
      while stream['audio'].shape[-1] > max_length:
          # Extract first max_length samples
          audio_chunk = stream['audio'][..., :max_length]

          # Transcribe
          with torch.no_grad():
              transcript = model.transcribe(audio_chunk)  # Add batch dimension if needed

          # Update text (adjust based on model's output format)
          new_text += " " + transcript[0][0].strip()  # Example adjustment

          # Remove processed audio from buffer
          stream['audio'] = stream['audio'][..., max_length:]

      stream['text'] += new_text
      return stream, stream['text'].strip()


  interface = gr.Interface(
      fn=transcribe,
      inputs=['state', gr.Audio(sources="microphone", streaming=True, type="numpy")],
      outputs=["state", "text"],
      live=True,
  )

  interface.launch()
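
Note on the input format: with type="numpy", gr.Audio delivers each chunk as a (sample_rate, data) tuple whose data is typically int16 PCM and may be two-channel, while NeMo models expect mono float waveforms in [-1, 1]; the torch.tensor(data, dtype=torch.float32) cast above keeps the raw int16 scale. A minimal normalization sketch that could run before the resampling step (to_float_mono is a hypothetical helper, not part of this commit):

import numpy as np
import torch

def to_float_mono(data: np.ndarray) -> torch.Tensor:
    # Scale int16 PCM to float32 in [-1, 1].
    if data.dtype == np.int16:
        data = data.astype(np.float32) / 32768.0
    else:
        data = data.astype(np.float32)
    # Average (samples, channels) down to mono.
    if data.ndim == 2:
        data = data.mean(axis=1)
    return torch.from_numpy(data)

Swapping this in for the plain cast (audio_tensor = to_float_mono(data)) keeps the buffered waveform in the amplitude range ASR models are normally trained on.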
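
Note on the transcribe call: model.transcribe(audio_chunk) assumes the installed NeMo build accepts in-memory tensors; the long-stable interface across NeMo releases takes a list of audio file paths, and hybrid RNNT/CTC models have variously returned plain strings, Hypothesis objects, or a (best_hypotheses, all_hypotheses) pair, which is what the transcript[0][0] indexing above relies on. A fallback sketch under those assumptions, using soundfile as an extra dependency that app.py does not import:

import tempfile
import soundfile as sf  # assumed extra dependency, not imported in app.py

def transcribe_chunk(model, audio_chunk, sample_rate=16000):
    # Round-trip the buffered chunk through a temporary WAV file,
    # for NeMo builds whose transcribe() expects file paths.
    with tempfile.NamedTemporaryFile(suffix=".wav") as tmp:
        sf.write(tmp.name, audio_chunk.cpu().numpy(), sample_rate)
        hyps = model.transcribe([tmp.name])
    # Some releases return (best_hypotheses, all_hypotheses); keep the best.
    if isinstance(hyps, tuple):
        hyps = hyps[0]
    first = hyps[0]
    # Newer releases return Hypothesis objects carrying a .text field.
    return first.text if hasattr(first, "text") else str(first)

Inside the while loop this would replace the two lines around model.transcribe, e.g. new_text += " " + transcribe_chunk(model, audio_chunk).strip().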