archit11 committed
Commit 011a958 · verified · 1 Parent(s): 06c5535

Update app.py

Files changed (1)
  1. app.py +47 -31
app.py CHANGED
@@ -1,54 +1,70 @@
 import transformers
+
 import gradio as gr
-import librosa
 import torch
 import numpy as np
+from typing import Dict, List
 import spaces
-from typing import Tuple
+
+# Constants
+MODEL_NAME = 'sarvamai/shuka_v1'
+SAMPLE_RATE = 16000
+MAX_NEW_TOKENS = 256
+
+# Load the ShukaPipeline
+def load_pipeline():
+    model = transformers.AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True)
+    pipeline = transformers.pipeline(
+        "shuka-pipeline",
+        model=model,
+        torch_dtype=torch.float16,
+        device=0 if torch.cuda.is_available() else -1,
+    )
+    return pipeline
+
+pipe = load_pipeline()
+
+def create_conversation_turns(prompt: str) -> List[Dict[str, str]]:
+    return [
+        {'role': 'system', 'content': 'Respond naturally and informatively.'},
+        {'role': 'user', 'content': prompt}
+    ]
 
 @spaces.GPU(duration=120)
-def transcribe_and_respond(audio_input: Tuple[np.ndarray, int]) -> str:
+def transcribe_and_respond(audio: np.ndarray) -> str:
     try:
-        pipe = transformers.pipeline(
-            model='sarvamai/shuka_v1',
-            trust_remote_code=True,
-            device=0,
-            torch_dtype=torch.bfloat16
-        )
-        # Unpack the audio input
-        audio, sr = audio_input
-
         # Ensure audio is float32
         if audio.dtype != np.float32:
             audio = audio.astype(np.float32)
-
-        # Resample if necessary
-        if sr != 16000:
-            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
-
-        # Define conversation turns
-        turns = [
-            {'role': 'system', 'content': 'Respond naturally and informatively.'},
-            {'role': 'user', 'content': ''}
-        ]
 
-        # Run the pipeline with the audio and conversation turns
-        output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': 16000}, max_new_tokens=512)
 
-        # Return the model's response
-        return output
 
+
+        # Create input for the pipeline
+        turns = create_conversation_turns("<|audio|>")
+        inputs = {
+            'audio': audio,
+            'turns': turns,
+            'sampling_rate': SAMPLE_RATE
+        }
+
+        # Generate response
+        response = pipe(inputs, max_new_tokens=MAX_NEW_TOKENS, temperature=0.7, repetition_penalty=1.1)
+
+        return response
     except Exception as e:
         return f"Error processing audio: {str(e)}"
 
+# Create the Gradio interface
 iface = gr.Interface(
     fn=transcribe_and_respond,
-    inputs=gr.Audio(sources="microphone", type="numpy"),
-    outputs="text",
-    title="Live Transcription and Response",
+    inputs=gr.Audio(sources="microphone", type="numpy", sampling_rate=SAMPLE_RATE),
+    outputs="text",
+    title="Live Voice Input for Transcription and Response",
     description="Speak into your microphone, and the model will respond naturally and informatively.",
-    live=True  # Enable live processing
+    live=True
 )
 
+# Launch the app
 if __name__ == "__main__":
-    iface.launch()
+    iface.launch()
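
A caveat worth noting against the new version of the callback: with `type="numpy"`, `gr.Audio` passes a `(sample_rate, data)` tuple rather than a bare array (the removed code unpacked it as `audio, sr`, the reverse of the documented order), and `gr.Audio` does not appear to accept a `sampling_rate` argument, so the capture rate is not guaranteed to be 16 kHz. The previous revision resampled with librosa; this commit drops that step. Below is a minimal adapter sketch in the spirit of the old code; the helper name `prepare_audio` is hypothetical, and the [-1, 1] scaling for integer PCM is an assumption about the model's expected input range.

import librosa
import numpy as np

def prepare_audio(audio_input, target_sr: int = 16000) -> np.ndarray:
    # Hypothetical helper, not part of this commit: restores the unpacking
    # and resampling that the previous revision did inline.
    sr, audio = audio_input  # gr.Audio(type="numpy") yields (sample_rate, data)
    if np.issubdtype(audio.dtype, np.integer):
        # int16 PCM -> float32 in [-1, 1]; assumed input range for the model
        audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
    else:
        audio = audio.astype(np.float32)
    if audio.ndim > 1:  # mix stereo down to mono
        audio = audio.mean(axis=1)
    if sr != target_sr:  # resample, as the previous revision did
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
    return audio

With this helper, `transcribe_and_respond` could accept the raw tuple again and call `audio = prepare_audio(audio_input)` before building `inputs`.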