import os

import torch
from gtts import gTTS
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from groq import Groq
import gradio as gr

# Select device and dtype: GPU with float16 when available, otherwise CPU with float32
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Initialize the Groq client. The API key is read from the environment
# instead of being hardcoded in the source.
client = Groq(api_key=os.environ["GROQ_API_KEY"])
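# Quick sanity check for the Groq client, kept commented out so the script
# runs unchanged. It uses the same gemma2-9b-it model as the main flow; the
# prompt text is illustrative only:
# resp = client.chat.completions.create(
#     model="gemma2-9b-it",
#     messages=[{"role": "user", "content": "Salam! Aap kaun hain?"}],
# )
# print(resp.choices[0].message.content)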

# Load model and processor
model_id = "openai/whisper-medium"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True
)
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)


# Load ASR pipeline
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)
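# Optional smoke test for the ASR pipeline, commented out by default. The
# filename is a placeholder for any local speech recording, not a file from
# the original script:
# print(asr_pipe("sample_urdu.wav", generate_kwargs={"language": "urdu"})["text"])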


# Text-to-Speech function
def text_to_speech(text):
    try:
        # Voice the reply with gTTS; the 'hi' (Hindi) voice is used here to
        # read the Roman Urdu text aloud
        tts = gTTS(text, lang='hi')
        tts.save("response.mp3")
        return "response.mp3"  # Return the MP3 file path for playback in Gradio
    except Exception as e:
        print(f"Text-to-speech error: {e}")
        return None
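# Example usage, commented out so the script runs unchanged. The text is
# illustrative; the call writes response.mp3 in the working directory:
# text_to_speech("Assalam o alaikum, aap kaise hain?")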

# Function to process audio, get model response, and return TTS output
def process_audio(audio):
    # Convert audio to text
    print("Converting audio to text...")
    result = asr_pipe(audio, generate_kwargs={"language": "urdu"})

    # Check if audio-to-text conversion was successful
    if "text" in result and result["text"].strip():
        user_ques = result["text"]
        print("Audio-to-text conversion successful. User Question:", user_ques)

        # Prepare messages for model input
        messages = [
            {
                "role": "system",
                "content": "You are a helpful assistant named SSk BOT that stands for (sehar bot) who mostly answers in Roman Urdu. Be professional. No emojis; just Urdu written in English letters, and if you receive a prompt in Urdu font, answer only in English (Roman Urdu).",
            },
            {
                "role": "user",
                "content": user_ques,
            }
        ]

        # Get response from Groq model
        print("Getting response from the model...")
        response = client.chat.completions.create(
            messages=messages,
            model="gemma2-9b-it",
        )

        # Extract model's response
        model_response = response.choices[0].message.content
        print("Model:", model_response)

        # Convert model's response to speech
        audio_path = text_to_speech(model_response)
        return model_response, audio_path

    else:
        print("Audio-to-text conversion failed or produced no text.")
        return "Audio-to-text conversion failed or no text was detected.", None

# Gradio interface
interface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[gr.Textbox(label="Model Response"), gr.Audio(label="Response Audio")],
    title="Real-time ASR to Language Model Response",
    description="Upload or record an audio question in Urdu; the model replies in Roman Urdu, returned as both text and speech."
)

# Launch the Gradio Interface
interface.launch()
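
# When running in a hosted notebook (e.g. Colab), launching with share=True
# serves the app through a temporary public URL instead:
# interface.launch(share=True)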