#!/usr/bin/env python
# coding: utf-8
import os
import re

import gradio as gr
import librosa
import noisereduce as nr
import numpy as np
import pyloudnorm as pyln
import soundfile as sf
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer, AutoTokenizer, AutoModelForCausalLM
# Load the ASR model and tokenizer (AI4Bharat IndicWav2Vec for Hindi)
model1 = Wav2Vec2ForCTC.from_pretrained("ai4bharat/indicwav2vec-hindi")
tokenizer1 = Wav2Vec2Tokenizer.from_pretrained("ai4bharat/indicwav2vec-hindi")

# Load the chat tokenizer and model from the Hugging Face model hub
tokenizer = AutoTokenizer.from_pretrained("soketlabs/pragna-1b", token=os.environ.get('HF_TOKEN'))
model = AutoModelForCausalLM.from_pretrained(
    "soketlabs/pragna-1b",
    token=os.environ.get('HF_TOKEN'),
    revision='3c5b8b1309f7d89710331ba2f164570608af0de7',
)
model.load_adapter('soketlabs/pragna-1b-it-v0.1', token=os.environ.get('HF_TOKEN'))

# Use CUDA when available, otherwise fall back to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
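
# Optional (an assumption about intended use: inference only): putting both
# models in eval mode disables dropout and gives deterministic forward passes.
model.eval()
model1.eval()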
# Function to transcribe a 16 kHz mono waveform with the Wav2Vec2 CTC model
def transcribe_audio(audio_data):
    input_audio = torch.tensor(audio_data).float()
    input_values = tokenizer1(input_audio.squeeze(), return_tensors="pt").input_values
    with torch.no_grad():
        logits = model1(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = tokenizer1.batch_decode(predicted_ids)[0]
    return transcription
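
# Example usage (illustrative; "sample_hi.wav" is a placeholder path — the model
# expects 16 kHz mono input, matching how process_audio() reloads audio below):
#   audio, _ = librosa.load("sample_hi.wav", sr=16000)
#   print(transcribe_audio(audio))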
# Function to generate a chat response for the transcription
def generate_response(transcription):
    try:
        messages = [
            {"role": "system", "content": "You are a friendly bot to help the user."},
            {"role": "user", "content": transcription},
        ]
        tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
        # Move inputs to the same device as the model (CUDA or CPU)
        input_ids = tokenized_chat[0].to(device)
        if len(input_ids.shape) == 1:
            input_ids = input_ids.unsqueeze(0)
        with torch.no_grad():
            output = model.generate(
                input_ids,
                max_new_tokens=100,
                num_return_sequences=1,
                temperature=0.1,
                top_k=50,
                top_p=0.5,
                repetition_penalty=1.2,
                do_sample=True,
            )
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
        return find_last_sentence(generated_text)
    except Exception as e:
        print("Error during response generation:", e)
        return "Response generation error: " + str(e)
# Function to cut generated text at the last complete sentence
# (ends with a Devanagari danda, question mark, or exclamation mark)
def find_last_sentence(text):
    sentence_endings = re.finditer(r'[।?!]', text)
    end_positions = [ending.end() for ending in sentence_endings]
    if end_positions:
        return text[:end_positions[-1]]
    return text
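
# Example (illustrative): a trailing fragment after the last sentence ending
# is dropped:
#   find_last_sentence("नमस्ते। आप कैसे हैं? मैं")  ->  "नमस्ते। आप कैसे हैं?"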
def spectral_subtraction(audio_data, sample_rate):
    # Compute the short-time Fourier transform (STFT)
    stft = librosa.stft(audio_data)
    # Compute the power spectrogram
    power_spec = np.abs(stft)**2
    # Estimate the noise power spectrum as the per-bin median over time
    noise_power = np.median(power_spec, axis=1)
    # Subtract the scaled noise estimate, flooring at zero
    alpha = 2.0  # over-subtraction factor, typically between 1.0 and 2.0
    denoised_spec = np.maximum(power_spec - alpha * noise_power[:, np.newaxis], 0)
    # Inverse STFT with the original phase to recover the denoised waveform
    denoised_audio = librosa.istft(np.sqrt(denoised_spec) * np.exp(1j * np.angle(stft)))
    return denoised_audio
def apply_compression(audio_data, sample_rate):
    # Loudness normalization via pyloudnorm (note: despite the function name,
    # this normalizes overall loudness rather than applying true dynamic
    # range compression)
    meter = pyln.Meter(sample_rate)  # create a BS.1770 loudness meter
    loudness = meter.integrated_loudness(audio_data)
    # Normalize audio to a target loudness of -24 LUFS
    loud_norm = pyln.normalize.loudness(audio_data, loudness, -24.0)
    return loud_norm
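
# If actual dynamic range compression is wanted (reducing peaks relative to
# the foreground), a minimal per-sample sketch follows. It is instantaneous
# (no attack/release smoothing), and the threshold/ratio defaults are
# illustrative, not tuned values from this project.
def simple_compressor(audio_data, threshold_db=-20.0, ratio=4.0):
    # Convert sample magnitudes to dB; eps avoids log10(0) on silent samples
    eps = 1e-10
    magnitude_db = 20.0 * np.log10(np.abs(audio_data) + eps)
    # Amount by which each sample exceeds the threshold (0 below threshold)
    over = np.maximum(magnitude_db - threshold_db, 0.0)
    # Attenuate the excess so it grows at 1/ratio above the threshold
    gain_db = -over * (1.0 - 1.0 / ratio)
    return audio_data * (10.0 ** (gain_db / 20.0))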
def process_audio(audio_file_path):
    try:
        # Read audio data
        audio_data, sample_rate = librosa.load(audio_file_path)
        print(f"Read audio data: {audio_file_path}, Sample Rate: {sample_rate}")

        # Apply noise reduction using noisereduce
        reduced_noise = nr.reduce_noise(y=audio_data, sr=sample_rate)
        print("Noise reduction applied")

        # Apply spectral subtraction for additional noise reduction
        denoised_audio = spectral_subtraction(reduced_noise, sample_rate)
        print("Spectral subtraction applied")

        # Normalize loudness so the foreground is clearly audible
        compressed_audio = apply_compression(denoised_audio, sample_rate)
        print("Loudness normalization applied")

        # Trim leading and trailing silence
        final_audio = librosa.effects.trim(compressed_audio)[0]
        print("Silences trimmed")

        # Save the final processed audio to a file with a fixed name
        processed_file_path = 'processed_audio.wav'
        sf.write(processed_file_path, final_audio, sample_rate)
        print(f"Processed audio saved to: {processed_file_path}")

        # Check that the file exists to confirm it was saved
        if not os.path.isfile(processed_file_path):
            raise FileNotFoundError(f"Processed file not found: {processed_file_path}")

        # Reload the processed audio at 16 kHz, the rate the ASR model expects
        processed_audio_data, _ = librosa.load(processed_file_path, sr=16000)
        print(f"Processed audio reloaded for transcription: {processed_file_path}")

        # Transcribe audio
        transcription = transcribe_audio(processed_audio_data)
        print("Transcription completed")

        # Generate response
        response = generate_response(transcription)
        print("Response generated")

        return processed_file_path, transcription, response
    except Exception as e:
        print("Error during audio processing:", e)
        # Return three values so the Gradio outputs stay aligned on failure
        return None, "Error during audio processing", str(e)
# Create the Gradio interface: recorded audio in, processed audio plus text out
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(label="Record Audio", type="filepath"),
    outputs=[
        gr.Audio(label="Processed Audio"),
        gr.Textbox(label="Transcription"),
        gr.Textbox(label="Response"),
    ],
)

if __name__ == "__main__":
    iface.launch(share=True)