File size: 1,897 Bytes
2ed7223
011a958
2ed7223
c621812
8b70c99
011a958
039f770
011a958
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62dda31
dc03737
011a958
dc03737
8b70c99
 
 
c621812
dc03737
2ed7223
011a958
 
 
 
 
 
 
 
 
 
 
 
 
ab07d9e
8b70c99
2ed7223
011a958
2ed7223
 
011a958
 
 
c621812
011a958
2ed7223
 
011a958
c621812
011a958
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import transformers

import gradio as gr
import torch
import numpy as np
from typing import Dict, List
import spaces

# Constants
MODEL_NAME = 'sarvamai/shuka_v1'
SAMPLE_RATE = 16000
MAX_NEW_TOKENS = 256

# Load the ShukaPipeline
def load_pipeline():
    """Build and return the Shuka voice pipeline, on GPU when one is available."""
    base_model = transformers.AutoModel.from_pretrained(
        MODEL_NAME, trust_remote_code=True
    )
    target_device = 0 if torch.cuda.is_available() else -1
    return transformers.pipeline(
        "shuka-pipeline",
        model=base_model,
        torch_dtype=torch.float16,
        device=target_device,
    )

# Instantiate the pipeline once at import time so every Gradio callback reuses it.
pipe = load_pipeline()

def create_conversation_turns(prompt: str) -> List[Dict[str, str]]:
    """Wrap *prompt* in a two-turn chat: a fixed system message plus the user turn."""
    system_turn = {'role': 'system', 'content': 'Respond naturally and informatively.'}
    user_turn = {'role': 'user', 'content': prompt}
    return [system_turn, user_turn]

@spaces.GPU(duration=120)
def transcribe_and_respond(audio: np.ndarray) -> str:
    """Run the Shuka pipeline on microphone audio and return the model's reply.

    Gradio's ``Audio(type="numpy")`` component delivers ``(sample_rate, samples)``
    tuples, so accept either that tuple or a bare array. On failure the exception
    text is returned so the UI shows an error instead of crashing the stream.
    """
    try:
        # live=True fires callbacks between recordings with no audio payload.
        if audio is None:
            return "Error processing audio: no audio received"

        # Unpack the (sample_rate, samples) tuple gr.Audio(type="numpy") sends.
        sampling_rate = SAMPLE_RATE
        if isinstance(audio, tuple):
            sampling_rate, audio = audio

        # Microphone capture is integer PCM (typically int16); scale to
        # float32 in [-1, 1] rather than casting the raw sample values.
        if np.issubdtype(audio.dtype, np.integer):
            audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
        elif audio.dtype != np.float32:
            audio = audio.astype(np.float32)

        # Collapse multi-channel capture to a 1-D mono waveform.
        # assumes shape is (samples, channels) — gradio's numpy layout.
        if audio.ndim > 1:
            audio = audio.mean(axis=1)

        # Create input for the pipeline
        turns = create_conversation_turns("<|audio|>")
        inputs = {
            'audio': audio,
            'turns': turns,
            'sampling_rate': sampling_rate,
        }

        # Generate response
        response = pipe(
            inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            temperature=0.7,
            repetition_penalty=1.1,
        )
        return response
    except Exception as e:
        # Surface the failure in the UI instead of killing the live interface.
        return f"Error processing audio: {str(e)}"

# Create the Gradio interface.
iface = gr.Interface(
    fn=transcribe_and_respond,
    # NOTE: gr.Audio takes no `sampling_rate` kwarg (passing one raises
    # TypeError at startup); resampling to SAMPLE_RATE must happen in the
    # callback. `sources` expects a list of input sources in gradio 4.x.
    inputs=gr.Audio(sources=["microphone"], type="numpy"),
    outputs="text",
    title="Live Voice Input for Transcription and Response",
    description="Speak into your microphone, and the model will respond naturally and informatively.",
    live=True,
)

# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()