import os

import torch
import gradio as gr
from gtts import gTTS
from groq import Groq
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Setup device and dtype: use the GPU (with half precision) when available
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Initialize the Groq client with the API key, read from the environment
# rather than hard-coded, so the secret is not exposed in shared code
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)
# Load model and processor
model_id = "openai/whisper-medium"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
model_id,
torch_dtype=torch_dtype,
low_cpu_mem_usage=True,
use_safetensors=True
)
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)
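# The processor bundles Whisper's tokenizer and feature extractor; the ASR
# pipeline below reuses both instead of loading them a second time.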
# Load ASR pipeline
asr_pipe = pipeline(
"automatic-speech-recognition",
model=model,
tokenizer=processor.tokenizer,
feature_extractor=processor.feature_extractor,
torch_dtype=torch_dtype,
device=device,
)
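# Given an audio file path, the pipeline returns a dict of the form
# {"text": "<transcription>"} (plus timestamps if requested).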
# Text-to-Speech function
def text_to_speech(text):
try:
# Convert text to speech using gTTS
        tts = gTTS(text, lang='hi')  # the Hindi voice is used to pronounce the Roman-Urdu reply
tts.save("response.mp3")
return "response.mp3" # Return the MP3 file path for playback in Gradio
except Exception as e:
print(f"Text-to-speech error: {e}")
return None
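# Example (hypothetical input): text_to_speech("Aap kaise hain?") writes
# response.mp3 and returns its path, or returns None if synthesis fails.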
# Function to process audio, get model response, and return TTS output
def process_audio(audio):
# Convert audio to text
print("Converting audio to text...")
result = asr_pipe(audio, generate_kwargs={"language": "urdu"})
# Check if audio-to-text conversion was successful
if "text" in result and result["text"].strip():
user_ques = result["text"]
print("Audio-to-text conversion successful. User Question:", user_ques)
# Prepare messages for model input
messages = [
{
"role": "system",
"content": "You are a helpful assistant named SSk BOT that stands for (sehar bot) who mostly answers in Roman Urdu. Be professional. No emojis; just Urdu written in English letters, and if you receive a prompt in Urdu font, answer only in English (Roman Urdu).",
},
{
"role": "user",
"content": user_ques,
}
]
# Get response from Groq model
print("Getting response from the model...")
response = client.chat.completions.create(
messages=messages,
model="gemma2-9b-it",
)
        # Extract the model's response; the Groq SDK returns a typed object,
        # so use attribute access rather than dict subscripting
        model_response = response.choices[0].message.content
print("Model:", model_response)
# Convert model's response to speech
audio_path = text_to_speech(model_response)
return model_response, audio_path
else:
print("Audio-to-text conversion failed or produced no text.")
return "Audio-to-text conversion failed or no text was detected.", None
# Gradio interface
interface = gr.Interface(
fn=process_audio,
inputs=gr.Audio(type="filepath"),
outputs=[gr.Textbox(label="Model Response"), gr.Audio(label="Response Audio")],
title="Real-time ASR to Language Model Response",
    description="Upload an audio file in Urdu, get a text response from the model in Roman Urdu, and hear the response read aloud."
)
# Launch the Gradio Interface
interface.launch()
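# Note: in a hosted notebook (e.g. Colab), interface.launch(share=True)
# exposes a temporary public URL for the demo.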