"""Gradio demo: Punjabi speech-to-text using a Hugging Face ASR pipeline.

The model is loaded once at import time into ``asr_pipeline``; the Gradio
interface ``app`` forwards each uploaded audio file to ``transcribe``.

NOTE: the pipeline is given a file path and decodes the audio itself. Per the
transformers documentation this relies on ffmpeg being installed on the host.
"""

from typing import Optional

import gradio as gr
import torch
from transformers import pipeline

# Checkpoint used for recognition.
MODEL_NAME = "cdactvm/w2v-bert-punjabi"

# Single source of truth for device placement: GPU when available, else CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the ASR pipeline once; it is reused for every request.
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model=MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device=device,
)


def transcribe(audio_path: Optional[str]) -> str:
    """Return the transcription of the audio file at *audio_path*.

    Args:
        audio_path: Path of the uploaded audio file as supplied by the Gradio
            ``Audio`` component (``type="filepath"``). It is ``None`` when the
            form is submitted without a file.

    Returns:
        The ``"text"`` field of the pipeline result, or an empty string when
        no audio was provided.
    """
    # Guard against an empty submission instead of letting the pipeline raise.
    if not audio_path:
        return ""

    result = asr_pipeline(audio_path)
    return result["text"]


# Gradio interface: one audio upload in, plain text out.
app = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs="text",
    title="Punjabi Speech-to-Text",
    description="Upload an audio file and get the transcription in Punjabi.",
)

if __name__ == "__main__":
    app.launch()