Spaces:
Sleeping
Sleeping
File size: 1,510 Bytes
2ed7223 2ba8923 c621812 039f770 ee53056 011a958 60f64df dc03737 2ba8923 d649fba ee53056 d649fba 2ba8923 ee53056 60f64df ee53056 60f64df ee53056 60f64df ee53056 60f64df 2ba8923 ee53056 60f64df d649fba ee53056 60f64df 2ed7223 ab07d9e 60f64df 2ed7223 60f64df 2ed7223 c621812 d649fba |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
import transformers
import gradio as gr
import librosa
import torch
import spaces
import numpy as np
@spaces.GPU(duration=20)
def transcribe_and_respond(audio_file):
    """Run the Shuka v1 audio model on a recorded clip and return its reply.

    Args:
        audio_file: Filesystem path of the recorded audio, as supplied by the
            Gradio audio input. May be ``None`` or empty when no recording is
            available.

    Returns:
        The pipeline output on success, otherwise a string starting with
        ``"Error: "`` that describes what went wrong. Exceptions are never
        propagated to the caller.
    """
    # Nothing was recorded (or the input was cleared): answer immediately
    # instead of letting librosa fail on a missing path.
    if not audio_file:
        return "Error: no audio received. Please record something and try again."

    try:
        # Load and validate the audio *before* building the pipeline so that
        # an unreadable or empty clip fails fast without loading the model.
        audio, sr = librosa.load(audio_file, sr=16000)
        # Print audio properties for debugging
        print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")

        if audio.size == 0:
            return "Error: the recording contains no audio samples."

        pipe = transformers.pipeline(
            model='sarvamai/shuka_v1',
            trust_remote_code=True,
            device=0,
            torch_dtype=torch.bfloat16,
        )

        # The '<|audio|>' placeholder in the user turn marks where the clip
        # is inserted into the conversation.
        turns = [
            {'role': 'system', 'content': 'Respond naturally and informatively.'},
            {'role': 'user', 'content': '<|audio|>'},
        ]
        # Debug: Print the initial turns
        print(f"Initial turns: {turns}")

        # Call the model with the audio and prompt
        output = pipe(
            {'audio': audio, 'turns': turns, 'sampling_rate': sr},
            max_new_tokens=512,
        )
        # Debug: Print the final output from the model
        print(f"Model output: {output}")
        return output
    except Exception as e:
        # Top-level UI boundary: report the failure to the user as text, but
        # also log it so the cause is visible in the application logs.
        print(f"Error while processing audio: {e!r}")
        return f"Error: {e}"
# Audio input component: records from the microphone and passes the clip to
# the callback as a file path.
_microphone_input = gr.Audio(type="filepath", sources="microphone")

# Wire the callback to a simple audio-in / text-out interface.
iface = gr.Interface(
    title="Live Transcription and Response",
    description="Speak into your microphone, and the model will respond naturally and informatively.",
    fn=transcribe_and_respond,
    inputs=_microphone_input,
    outputs="text",
    live=True,
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
|