Ritwika-Das-Gupta committed
Commit 3cae852 · verified · 1 Parent(s): 61bec3e

Upload grdio_audio_integration.py

Files changed (1):
  1. grdio_audio_integration.py +188 -0

grdio_audio_integration.py ADDED
@@ -0,0 +1,188 @@
+ #!/usr/bin/env python
+ # coding: utf-8
+
+ # In[65]:
+
+
+ import os
+ import gradio as gr
+ import torch
+ import re
+ import soundfile as sf
+ import numpy as np
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer, AutoTokenizer, AutoModelForCausalLM
+ import noisereduce as nr
+ import librosa
+
+ # Load the ASR model (wav2vec2) and the chat model (pragna-1b) with their tokenizers
+ model1 = Wav2Vec2ForCTC.from_pretrained("ai4bharat/indicwav2vec-hindi")
+ tokenizer1 = Wav2Vec2Tokenizer.from_pretrained("ai4bharat/indicwav2vec-hindi")
+ tokenizer = AutoTokenizer.from_pretrained("soketlabs/pragna-1b", token=os.environ.get('HF_TOKEN'))
+ model = AutoModelForCausalLM.from_pretrained("soketlabs/pragna-1b", revision='3c5b8b1309f7d89710331ba2f164570608af0de7')
+ model.load_adapter('soketlabs/pragna-1b-it-v0.1', token=os.environ.get('HF_TOKEN'))
+ model.to('cuda')
+
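+ # Note: loading the soketlabs checkpoints assumes HF_TOKEN is set in the environment,
+ # and model.to('cuda') assumes a CUDA-capable GPU is available.
+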
+ # Function to transcribe audio
+ def transcribe_audio(audio_data):
+     input_audio = torch.tensor(audio_data).float()
+     input_values = tokenizer1(input_audio.squeeze(), return_tensors="pt").input_values
+     with torch.no_grad():
+         logits = model1(input_values).logits
+     predicted_ids = torch.argmax(logits, dim=-1)
+     transcription = tokenizer1.batch_decode(predicted_ids)[0]
+     return transcription
+
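+ # Quick sanity check (illustrative; "sample.wav" is a placeholder, not in the upload):
+ #   audio, _ = librosa.load("sample.wav", sr=16000)  # the ASR model expects 16 kHz mono
+ #   print(transcribe_audio(audio))
+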
+ # Function to generate a response to the transcription with the chat model
+ def generate_response(transcription):
+     try:
+         messages = [
+             {"role": "system", "content": "you are a friendly bot to help the user"},
+             {"role": "user", "content": transcription},
+         ]
+         tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
+         input_ids = tokenized_chat[0].to('cuda')
+         if len(input_ids.shape) == 1:
+             input_ids = input_ids.unsqueeze(0)
+         with torch.no_grad():
+             output = model.generate(
+                 input_ids,
+                 max_new_tokens=100,
+                 num_return_sequences=1,
+                 temperature=0.1,
+                 top_k=50,
+                 top_p=0.5,
+                 repetition_penalty=1.2,
+                 do_sample=True
+             )
+         generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
+         return find_last_sentence(generated_text)
+     except Exception as e:
+         print("Error during response generation:", e)
+         return "Response generation error: " + str(e)
+
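+ # Note (illustrative, not from the original): output[0] also contains the prompt
+ # tokens, so the decoded text includes the chat template. To keep only new tokens:
+ #   tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True)
+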
+ # Function to find the last complete sentence in generated text
+ def find_last_sentence(text):
+     sentence_endings = re.finditer(r'[।?!]', text)
+     end_positions = [ending.end() for ending in sentence_endings]
+     if end_positions:
+         return text[:end_positions[-1]]
+     return text
+
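+ # Example (illustrative): for "नमस्ते! मैं मदद कर सकता हूँ। लेकिन" this returns
+ # "नमस्ते! मैं मदद कर सकता हूँ।", dropping the trailing unfinished clause.
+
+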
+ # In[16]:
+
+
+ get_ipython().system('pip install noisereduce')
+
+
+ # In[76]:
+
+
+ import pyloudnorm as pyln
+
+ def spectral_subtraction(audio_data, sample_rate):
+     # Compute the short-time Fourier transform (STFT)
+     stft = librosa.stft(audio_data)
+
+     # Compute the power spectrogram
+     power_spec = np.abs(stft)**2
+
+     # Estimate the noise power spectrum as the per-bin median over time
+     noise_power = np.median(power_spec, axis=1)
+
+     # Apply spectral subtraction, clamping over-subtracted bins at zero
+     alpha = 2.0  # over-subtraction factor, typically between 1.0 and 2.0
+     denoised_spec = np.maximum(power_spec - alpha * noise_power[:, np.newaxis], 0)
+
+     # Inverse STFT, reusing the noisy phase, to obtain the denoised audio
+     denoised_audio = librosa.istft(np.sqrt(denoised_spec) * np.exp(1j * np.angle(stft)))
+
+     return denoised_audio
+
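+ # In symbols, each time-frequency bin is denoised as
+ #   P̂(f, t) = max(|S(f, t)|² − α · N(f), 0)
+ # where N(f) is the median power of bin f over time; √P̂ is recombined with the
+ # phase of S before the inverse STFT.
+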
+ def apply_compression(audio_data, sample_rate):
+     # Measure integrated loudness with a BS.1770 meter
+     meter = pyln.Meter(sample_rate)
+     loudness = meter.integrated_loudness(audio_data)
+
+     # Normalize audio to a target loudness of -24 LUFS (a static gain,
+     # not true dynamic range compression despite the function name)
+     loud_norm = pyln.normalize.loudness(audio_data, loudness, -24.0)
+
+     return loud_norm
+
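+ # Quick check (illustrative): the normalized signal should measure near the target
+ #   pyln.Meter(sample_rate).integrated_loudness(loud_norm)  # ≈ -24.0 LUFS
+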
+ def process_audio(audio_file_path):
+     try:
+         # Read audio data (librosa resamples to 22050 Hz by default)
+         audio_data, sample_rate = librosa.load(audio_file_path)
+         print(f"Read audio data: {audio_file_path}, Sample Rate: {sample_rate}")
+
+         # Apply noise reduction using noisereduce
+         reduced_noise = nr.reduce_noise(y=audio_data, sr=sample_rate)
+         print("Noise reduction applied")
+
+         # Apply spectral subtraction for additional noise reduction
+         denoised_audio = spectral_subtraction(reduced_noise, sample_rate)
+         print("Spectral subtraction applied")
+
+         # Apply loudness normalization to make the foreground louder
+         compressed_audio = apply_compression(denoised_audio, sample_rate)
+         print("Dynamic range compression applied")
+
+         # Trim leading and trailing silence
+         final_audio = librosa.effects.trim(compressed_audio)[0]
+         print("Silences trimmed")
+
+         # Save the final processed audio to a file with a fixed name
+         processed_file_path = 'processed_audio.wav'
+         sf.write(processed_file_path, final_audio, sample_rate)
+         print(f"Processed audio saved to: {processed_file_path}")
+
+         # Check that the file exists to confirm it was saved
+         if not os.path.isfile(processed_file_path):
+             raise FileNotFoundError(f"Processed file not found: {processed_file_path}")
+
+         # Reload the processed audio at 16 kHz, the rate the wav2vec2 model expects
+         processed_audio_data, _ = librosa.load(processed_file_path, sr=16000)
+         print(f"Processed audio reloaded for transcription: {processed_file_path}")
+
+         # Transcribe audio
+         transcription = transcribe_audio(processed_audio_data)
+         print("Transcription completed")
+
+         # Generate response
+         response = generate_response(transcription)
+         print("Response generated")
+
+         return processed_file_path, transcription, response
+     except Exception as e:
+         print("Error during audio processing:", e)
+         # Return three values so the Gradio outputs stay aligned on error
+         return None, "Error during audio processing", str(e)
+
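+ # Local smoke test (illustrative; the file name is a placeholder):
+ #   path, transcription, response = process_audio("recording.wav")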
+
+ # Create the Gradio interface
+ iface = gr.Interface(
+     fn=process_audio,
+     inputs=gr.Audio(label="Record Audio", type="filepath"),
+     outputs=[gr.Audio(label="Processed Audio"), gr.Textbox(label="Transcription"), gr.Textbox(label="Response")]
+ )
+
+ iface.launch(share=True)
+
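+ # share=True serves the app locally and also publishes a temporary public
+ # *.gradio.live link.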
+
+ # In[45]:
+
+
+ get_ipython().system('pip install pyloudnorm')
+
+
+ # In[ ]:
+
+
+
+