Upload grdio_audio_integration.py
grdio_audio_integration.py +188 -0
grdio_audio_integration.py
ADDED
@@ -0,0 +1,188 @@
#!/usr/bin/env python
# coding: utf-8

# In[65]:


import os  # os was missing but is used below for HF_TOKEN and file checks
import re

import gradio as gr
import librosa
import noisereduce as nr
import numpy as np
import soundfile as sf
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer, AutoTokenizer, AutoModelForCausalLM

# Load the ASR model/tokenizer (Hindi wav2vec2) and the chat model (pragna-1b)
model1 = Wav2Vec2ForCTC.from_pretrained("ai4bharat/indicwav2vec-hindi")
tokenizer1 = Wav2Vec2Tokenizer.from_pretrained("ai4bharat/indicwav2vec-hindi")
tokenizer = AutoTokenizer.from_pretrained("soketlabs/pragna-1b", token=os.environ.get('HF_TOKEN'))
model = AutoModelForCausalLM.from_pretrained("soketlabs/pragna-1b", revision='3c5b8b1309f7d89710331ba2f164570608af0de7')
model.load_adapter('soketlabs/pragna-1b-it-v0.1', token=os.environ.get('HF_TOKEN'))
model.to('cuda')

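# Note (added): model1, the ASR model, is left on the CPU here, so
# transcribe_audio below runs on CPU tensors; only the chat model is
# moved to 'cuda'.
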
# Function to transcribe audio
def transcribe_audio(audio_data):
    input_audio = torch.tensor(audio_data).float()
    input_values = tokenizer1(input_audio.squeeze(), return_tensors="pt").input_values
    with torch.no_grad():
        logits = model1(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = tokenizer1.batch_decode(predicted_ids)[0]
    return transcription

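# Usage sketch (added; "sample.wav" is a hypothetical local file): wav2vec2
# checkpoints expect 16 kHz mono input, so resample on load.
#
#   audio, _ = librosa.load("sample.wav", sr=16000)
#   print(transcribe_audio(audio))
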
# Function to generate a chat response from the transcription
def generate_response(transcription):
    try:
        messages = [
            {"role": "system", "content": "You are a friendly bot to help the user."},
            {"role": "user", "content": transcription},
        ]
        tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
        input_ids = tokenized_chat[0].to('cuda')
        if len(input_ids.shape) == 1:
            input_ids = input_ids.unsqueeze(0)
        with torch.no_grad():
            output = model.generate(
                input_ids,
                max_new_tokens=100,
                num_return_sequences=1,
                temperature=0.1,
                top_k=50,
                top_p=0.5,
                repetition_penalty=1.2,
                do_sample=True
            )
        # output[0] contains the prompt tokens followed by the generated
        # tokens, so decode only the newly generated portion
        generated_text = tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True)
        return find_last_sentence(generated_text)
    except Exception as e:
        print("Error during response generation:", e)
        return "Response generation error: " + str(e)

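# Usage sketch (added): the reply is truncated at the last sentence
# boundary by find_last_sentence, defined below.
#
#   print(generate_response("नमस्ते, आप कैसे हैं?"))
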
# Function to find the last complete sentence in generated text
def find_last_sentence(text):
    sentence_endings = re.finditer(r'[।?!]', text)
    end_positions = [ending.end() for ending in sentence_endings]
    if end_positions:
        return text[:end_positions[-1]]
    return text

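# Example (added): the regex matches the Devanagari danda '।' as well as
# '?' and '!', so a trailing incomplete clause is dropped:
#
#   find_last_sentence("नमस्ते। आप कैसे")  # -> "नमस्ते।"
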
# In[16]:


# The notebook installed noisereduce here with
# get_ipython().system('pip install noisereduce'), which fails when this
# export runs as a plain script; list noisereduce in requirements.txt instead.


# In[76]:


# The other imports from this cell (soundfile, librosa, noisereduce, numpy,
# gradio) are already in scope from the first cell; only pyloudnorm is new
import pyloudnorm as pyln

+
def spectral_subtraction(audio_data, sample_rate):
|
91 |
+
# Compute short-time Fourier transform (STFT)
|
92 |
+
stft = librosa.stft(audio_data)
|
93 |
+
|
94 |
+
# Compute power spectrogram
|
95 |
+
power_spec = np.abs(stft)**2
|
96 |
+
|
97 |
+
# Estimate noise power spectrum
|
98 |
+
noise_power = np.median(power_spec, axis=1)
|
99 |
+
|
100 |
+
# Apply spectral subtraction
|
101 |
+
alpha = 2.0 # Adjustment factor, typically between 1.0 and 2.0
|
102 |
+
denoised_spec = np.maximum(power_spec - alpha * noise_power[:, np.newaxis], 0)
|
103 |
+
|
104 |
+
# Inverse STFT to obtain denoised audio
|
105 |
+
denoised_audio = librosa.istft(np.sqrt(denoised_spec) * np.exp(1j * np.angle(stft)))
|
106 |
+
|
107 |
+
return denoised_audio
|
108 |
+
|
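# Note (added): the reconstruction above keeps the noisy phase. The power
# spectrum is denoised as max(|X|^2 - alpha * P_noise, 0), its square root
# gives the cleaned magnitude, and exp(1j * angle(X)) restores the phase
# before the inverse STFT. A larger alpha suppresses more noise but can
# introduce "musical noise" artifacts.
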
def apply_compression(audio_data, sample_rate):
    # Loudness normalization: pyloudnorm applies a single gain so the clip
    # hits the target level (it does not compress dynamics sample-by-sample)
    meter = pyln.Meter(sample_rate)  # create BS.1770 meter
    loudness = meter.integrated_loudness(audio_data)

    # Normalize audio to a target loudness of -24 LUFS
    loud_norm = pyln.normalize.loudness(audio_data, loudness, -24.0)

    return loud_norm

def process_audio(audio_file_path):
    try:
        # Read audio data
        audio_data, sample_rate = librosa.load(audio_file_path)
        print(f"Read audio data: {audio_file_path}, Sample Rate: {sample_rate}")

        # Apply noise reduction using noisereduce
        reduced_noise = nr.reduce_noise(y=audio_data, sr=sample_rate)
        print("Noise reduction applied")

        # Apply spectral subtraction for additional noise reduction
        denoised_audio = spectral_subtraction(reduced_noise, sample_rate)
        print("Spectral subtraction applied")

        # Apply loudness normalization to bring the foreground up
        compressed_audio = apply_compression(denoised_audio, sample_rate)
        print("Dynamic range compression applied")

        # Trim leading and trailing silence
        final_audio = librosa.effects.trim(compressed_audio)[0]
        print("Silences trimmed")

        # Save the final processed audio to a file with a fixed name
        processed_file_path = 'processed_audio.wav'
        sf.write(processed_file_path, final_audio, sample_rate)
        print(f"Processed audio saved to: {processed_file_path}")

        # Check that the file exists to confirm it was saved
        if not os.path.isfile(processed_file_path):
            raise FileNotFoundError(f"Processed file not found: {processed_file_path}")

        # Reload the processed audio at 16 kHz for the wav2vec2 transcriber
        processed_audio_data, _ = librosa.load(processed_file_path, sr=16000)
        print(f"Processed audio reloaded for transcription: {processed_file_path}")

        # Transcribe audio
        transcription = transcribe_audio(processed_audio_data)
        print("Transcription completed")

        # Generate response
        response = generate_response(transcription)
        print("Response generated")

        return processed_file_path, transcription, response
    except Exception as e:
        print("Error during audio processing:", e)
        # Return three values to match the three Gradio outputs; None leaves
        # the audio component empty
        return None, "Error during audio processing", str(e)

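# Usage sketch (added; "input.wav" is a hypothetical local recording):
#
#   path, text, reply = process_audio("input.wav")
#   print(text, reply)
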
# Create Gradio interface
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(label="Record Audio", type="filepath"),
    outputs=[gr.Audio(label="Processed Audio"), gr.Textbox(label="Transcription"), gr.Textbox(label="Response")]
)

iface.launch(share=True)

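# Note (added): share=True asks Gradio to open a public tunnel, which is
# useful for local runs; on Hugging Face Spaces the app is already hosted,
# so iface.launch() alone should suffice.
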
# In[45]:


# The notebook installed pyloudnorm here with
# get_ipython().system('pip install pyloudnorm'); list pyloudnorm in
# requirements.txt when running as a script.


# In[ ]: