"""Streaming voice-command demo.

Microphone audio is streamed in through Gradio, a rolling window of the most
recent audio is transcribed with Whisper, and the transcription is mapped onto
one of a fixed set of lighting commands with a zero-shot classifier.
"""

import time

import gradio as gr
import numpy as np
from transformers import pipeline

# Initialize the pipelines
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-tiny.en")
classifier = pipeline("zero-shot-classification", model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli")

candidate_labels = [
    "dim the light",
    "turn on light fully",
    "turn off light fully",
    "raise the light",
    "not about lighting",
]

# Minimum number of seconds between two transcription/classification passes.
UPDATE_INTERVAL_SECONDS = 5
# Length, in seconds, of the rolling audio window handed to the transcriber.
WINDOW_SECONDS = 5

# Start one interval in the past so that the first chunk triggers an update.
last_update_time = time.time() - UPDATE_INTERVAL_SECONDS

# Most recent results, re-emitted on every call between two updates.
# NOTE: these are module-level globals, so they are shared by every caller of
# transcribe_and_classify in this process.
last_transcription = ""
last_classification = ""


def _prepare_chunk(samples):
    """Return *samples* as a mono float32 array, peak-normalised when possible."""
    samples = np.asarray(samples).astype(np.float32)

    # Down-mix multi-channel audio so the rolling buffer stays one-dimensional.
    # NOTE(review): assumes multi-channel input is laid out as
    # (samples, channels) - confirm against the Gradio version in use.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    # An empty chunk has no maximum, and an all-zero (silent) chunk would
    # divide by zero and poison the buffer with NaN/inf, so both are left
    # untouched.
    if samples.size:
        peak = np.max(np.abs(samples))
        if peak > 0:
            samples /= peak

    return samples


def transcribe_and_classify(stream, new_chunk):
    """Append an audio chunk to the rolling buffer and refresh the results.

    Args:
        stream: Audio accumulated so far as a 1-D float32 array, or ``None``
            on the first call.
        new_chunk: ``(sample_rate, samples)`` tuple for the newly recorded
            audio, or ``None`` when there is nothing new to process.

    Returns:
        A ``(stream, transcription, classification)`` tuple: the updated
        rolling buffer followed by the most recent transcription and the most
        recent ``"LABEL, score: 0.00"`` classification string.
    """
    global last_update_time, last_transcription, last_classification

    # Guard against a missing chunk instead of failing on tuple unpacking.
    # NOTE(review): presumably Gradio can deliver None here (e.g. when the
    # recording stops) - confirm.
    if new_chunk is None:
        return stream, last_transcription, last_classification

    sr, y = new_chunk
    y = _prepare_chunk(y)

    # Concatenate the new audio chunk to the stream
    stream = y if stream is None else np.concatenate([stream, y])

    # Keep only the most recent WINDOW_SECONDS of audio
    max_samples = WINDOW_SECONDS * sr
    if len(stream) > max_samples:
        stream = stream[-max_samples:]

    current_time = time.time()

    # Run the models at most once per UPDATE_INTERVAL_SECONDS, and only when
    # there is audio to transcribe.
    if len(stream) > 0 and current_time - last_update_time >= UPDATE_INTERVAL_SECONDS:
        last_update_time = current_time

        # Transcribe the buffered audio window
        transcription = transcriber(
            {"sampling_rate": sr, "task": "transcribe", "language": "english", "raw": stream}
        )["text"]
        last_transcription = transcription  # Update the buffer

        # Classify the transcribed text; when the transcription is blank the
        # previous classification is intentionally left in place.
        if transcription.strip():
            output = classifier(transcription, candidate_labels, multi_label=False)
            top_label = output['labels'][0]
            top_score = output['scores'][0]
            last_classification = f"{top_label.upper()}, score: {top_score:.2f}"

    # Return the last updated transcription and classification
    return stream, last_transcription, last_classification


# Define the Gradio interface
demo = gr.Interface(
    fn=transcribe_and_classify,
    inputs=[
        "state",
        gr.Audio(sources=["microphone"], streaming=True),
    ],
    outputs=[
        "state",
        "text",
        "text",
    ],
    live=True,
)

# Launch the demo
demo.launch(debug=True)