Spaces:

archit11
/

shuka_demo

Sleeping

App Files Files Community

shuka_demo / app.py

archit11

Update app.py

011a958 verified 11 months ago

raw

history blame

1.9 kB

	from typing import Dict, List, Optional, Tuple, Union

	import gradio as gr
	import numpy as np
	import spaces
	import torch
	import transformers

	# Constants
	MODEL_NAME = 'sarvamai/shuka_v1'
	SAMPLE_RATE = 16000
	MAX_NEW_TOKENS = 256

	# Load the ShukaPipeline
	def load_pipeline():
	    """Create the "shuka-pipeline" transformers pipeline for MODEL_NAME.

	    The model is loaded with ``trust_remote_code=True``, i.e. code shipped
	    in the model repository is executed locally.

	    Returns:
	        The object returned by ``transformers.pipeline``.
	    """
	    shuka_model = transformers.AutoModel.from_pretrained(
	        MODEL_NAME,
	        trust_remote_code=True,
	    )

	    # Device index 0 when CUDA is reported available, otherwise -1
	    # (the transformers convention for running on CPU).
	    if torch.cuda.is_available():
	        target_device = 0
	    else:
	        target_device = -1

	    return transformers.pipeline(
	        "shuka-pipeline",
	        model=shuka_model,
	        torch_dtype=torch.float16,
	        device=target_device,
	    )

	# Built once at import time; transcribe_and_respond() calls this shared
	# module-level pipeline for every request.
	pipe = load_pipeline()

	def create_conversation_turns(prompt: str) -> List[Dict[str, str]]:
	    """Return a two-turn chat: a fixed system instruction, then the user prompt.

	    Args:
	        prompt: Text placed in the ``content`` field of the user turn.

	    Returns:
	        A list of two ``{'role': ..., 'content': ...}`` dicts, system turn
	        first and user turn second.
	    """
	    system_turn = {
	        'role': 'system',
	        'content': 'Respond naturally and informatively.',
	    }
	    user_turn = {'role': 'user', 'content': prompt}
	    return [system_turn, user_turn]

	def _prepare_audio(audio: Union[np.ndarray, Tuple[int, np.ndarray]]) -> np.ndarray:
	    """Convert microphone input to a mono float32 waveform at SAMPLE_RATE.

	    Args:
	        audio: Either a bare sample array (assumed to already be at
	            SAMPLE_RATE) or the ``(sample_rate, samples)`` tuple that
	            Gradio's ``Audio(type="numpy")`` component delivers.

	    Returns:
	        A 1-D float32 array. Integer PCM is scaled to roughly [-1, 1).
	    """
	    if isinstance(audio, tuple):
	        source_rate, samples = audio
	    else:
	        source_rate, samples = SAMPLE_RATE, audio
	    samples = np.asarray(samples)

	    if np.issubdtype(samples.dtype, np.integer):
	        # Integer PCM (Gradio normally sends int16): scale by the type's
	        # full range instead of just casting, which would leave values in
	        # the tens of thousands. Assumes signed PCM.
	        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
	        samples = samples.astype(np.float32) / full_scale
	    else:
	        samples = samples.astype(np.float32)

	    if samples.ndim > 1:
	        # Multi-channel recordings arrive as (num_samples, channels);
	        # average the channels down to mono.
	        samples = samples.mean(axis=1)

	    if source_rate != SAMPLE_RATE and samples.size:
	        # Dependency-free linear-interpolation resample. There is no
	        # anti-aliasing filter, so this is approximate; swap in a proper
	        # resampler if one becomes available to the project.
	        target_len = max(1, int(round(samples.shape[0] * SAMPLE_RATE / source_rate)))
	        source_times = np.arange(samples.shape[0]) / source_rate
	        target_times = np.arange(target_len) / SAMPLE_RATE
	        samples = np.interp(target_times, source_times, samples).astype(np.float32)

	    return samples


	@spaces.GPU(duration=120)
	def transcribe_and_respond(
	    audio: Optional[Union[np.ndarray, Tuple[int, np.ndarray]]],
	) -> str:
	    """Run the Shuka pipeline on a recording and return its response.

	    Args:
	        audio: The value supplied by the Gradio audio input: ``None`` when
	            nothing was recorded, a ``(sample_rate, samples)`` tuple, or a
	            bare sample array assumed to be at SAMPLE_RATE.

	    Returns:
	        The value returned by the pipeline call, or a human-readable
	        message when no audio was supplied or processing failed.
	    """
	    if audio is None:
	        return "No audio received."

	    # Deliberately broad: this is the UI boundary, and any failure is
	    # reported to the user as text rather than crashing the handler.
	    try:
	        waveform = _prepare_audio(audio)
	        if waveform.size == 0:
	            return "No audio received."

	        # "<|audio|>" is the placeholder the user turn carries for the clip.
	        turns = create_conversation_turns("<|audio|>")
	        inputs = {
	            'audio': waveform,
	            'turns': turns,
	            'sampling_rate': SAMPLE_RATE,
	        }

	        return pipe(
	            inputs,
	            max_new_tokens=MAX_NEW_TOKENS,
	            temperature=0.7,
	            repetition_penalty=1.1,
	        )
	    except Exception as e:
	        return f"Error processing audio: {e}"

	# Create the Gradio interface
	# Microphone in, plain text out. live=True asks Gradio to invoke the
	# handler when the input changes rather than waiting for a submit click.
	iface = gr.Interface(
	    fn=transcribe_and_respond,
	    # NOTE(review): confirm that the pinned Gradio version accepts a
	    # `sampling_rate` keyword on gr.Audio.
	    inputs=gr.Audio(
	        sources="microphone",
	        type="numpy",
	        sampling_rate=SAMPLE_RATE,
	    ),
	    outputs="text",
	    live=True,
	    title="Live Voice Input for Transcription and Response",
	    description="Speak into your microphone, and the model will respond naturally and informatively.",
	)

	# Launch the app
	if __name__ == "__main__":
	    # Start the Gradio server only when this file is executed as a script,
	    # not when it is imported.
	    iface.launch()