Spaces:
Sleeping
Sleeping
File size: 1,783 Bytes
2ed7223 2ba8923 c621812 039f770 011a958 dc03737 2ba8923 dc03737 d649fba 2ba8923 d649fba 2ba8923 d649fba 2ba8923 fa48096 2ba8923 d649fba 2ba8923 d649fba 2ba8923 2ed7223 ab07d9e 2ba8923 2ed7223 d649fba 2ed7223 2ba8923 c621812 2ba8923 2ed7223 d649fba c621812 d649fba |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
import transformers
import gradio as gr
import librosa
import torch
import spaces
@spaces.GPU(duration=120)
def transcribe_and_respond(audio_file):
    """Generate a model response to a recorded audio clip.

    The clip is loaded with ``librosa.load`` at ``sr=16000`` and passed,
    together with a fixed system prompt and an ``<|audio|>`` user turn, to
    the ``sarvamai/shuka_v1`` transformers pipeline.

    Args:
        audio_file: Path of the recording to process. ``None`` or an empty
            value is treated as "nothing was recorded".

    Returns:
        The pipeline output on success. Otherwise a string starting with
        ``"Error: "`` that describes what went wrong; exceptions are never
        propagated to the caller.
    """
    # A live interface may invoke the callback with no recording (for
    # example after the input is cleared). Bail out before doing any
    # expensive work instead of letting the audio loader fail on it.
    if not audio_file:
        return "Error: no audio received."

    try:
        # Print the path of the audio file
        print(f"Audio file path: {audio_file}")

        # Load the audio first: it is cheap compared to building the model
        # pipeline, so an unreadable file fails fast.
        audio, sr = librosa.load(audio_file, sr=16000)

        # NOTE(review): the pipeline is constructed on every call; confirm
        # whether it can be built once and reused under spaces.GPU.
        pipe = transformers.pipeline(
            model='sarvamai/shuka_v1',
            trust_remote_code=True,
            device=0,
            torch_dtype=torch.bfloat16,
        )

        # Prepare turns with a placeholder for the audio
        turns = [
            {'role': 'system', 'content': 'Respond naturally and informatively.'},
            {'role': 'user', 'content': '<|audio|>'},
        ]
        print(f"Constructed prompt: {turns}")

        # Run the pipeline with the audio and constructed prompt
        output = pipe(
            {'audio': audio, 'turns': turns, 'sampling_rate': sr},
            max_new_tokens=512,
        )
        print(f"Model output: {output}")

        # Return the output for the Gradio interface
        return output
    except Exception as e:
        # UI boundary: report the failure as text rather than crashing the
        # interface callback.
        return f"Error: {e}"
# Input component: record from the microphone and hand the callback a file
# path (rather than raw samples).
_microphone_input = gr.Audio(sources="microphone", type="filepath")

# Gradio interface wiring the microphone input to the model callback and
# showing the result as plain text. live=True re-runs the callback when the
# input changes instead of waiting for a submit button.
iface = gr.Interface(
    fn=transcribe_and_respond,
    inputs=_microphone_input,
    outputs="text",
    title="Live Transcription and Response",
    description="Speak into your microphone, and the model will respond naturally and informatively.",
    live=True,
)

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()
|