import argparse
import json
import os
import threading
import time
from pathlib import Path
from typing import List

import librosa
import numpy as np
import websocket

# Default WebSocket endpoint for the streaming transcription service.
DEFAULT_WS_URL = "ws://localhost:8000/v1/ws_transcribe_streaming"


def parse_arguments():
    """
    Parse command-line arguments for the streaming transcription client.

    Returns:
        argparse.Namespace with: audio_file, url, model, language,
        response_format, temperature, vad_filter, chunk_duration.
    """
    parser = argparse.ArgumentParser(description="Stream audio to the transcription WebSocket endpoint.")
    parser.add_argument("audio_file", help="Path to the input audio file.")
    parser.add_argument("--url", default=DEFAULT_WS_URL, help="WebSocket endpoint URL.")
    parser.add_argument("--model", type=str, help="Model name to use for transcription.")
    parser.add_argument("--language", type=str, help="Language code for transcription.")
    parser.add_argument(
        "--response_format",
        type=str,
        default="verbose_json",
        choices=["text", "json", "verbose_json"],
        help="Response format.",
    )
    parser.add_argument("--temperature", type=float, default=0.0, help="Temperature for transcription.")
    parser.add_argument("--vad_filter", action="store_true", help="Enable voice activity detection filter.")
    parser.add_argument("--chunk_duration", type=float, default=1.0, help="Duration of each audio chunk in seconds.")
    return parser.parse_args()
# """ # chunk_samples = int(chunk_duration * sr) # total_samples = len(audio_data) # chunks = [ # audio_data[i:i + chunk_samples] # for i in range(0, total_samples, chunk_samples) # ] # print(f"Split audio into {len(chunks)} chunks of {chunk_duration} seconds each.") # return chunks def read_audio_in_chunks(audio_file, target_sr=16000, chunk_duration=1) -> List[np.ndarray]: """ Reads a 16kHz mono audio file in 1-second chunks and returns them as little-endian 16-bit integer arrays. Args: file_path (str): Path to the audio file. expected_sr (int): Expected sample rate (16000 by default). expected_mono (bool): Expect the file to be mono (True by default). chunk_duration (int): Duration of each chunk in seconds (1 second by default). Returns: List of numpy arrays: Each array is a 1-second chunk of the audio as 16-bit integers. Raises: ValueError: If the audio file's sample rate or number of channels doesn't match expectations. """ if not str(audio_file).endswith(".wav"): # Convert MP3 to WAV using ffmpeg wav_file = Path(audio_file).with_suffix(".wav") if not wav_file.exists(): command = f'ffmpeg -i "{audio_file}" -ac 1 -ar {target_sr} "{wav_file}"' print(f"Converting MP3 to WAV: {command}") os.system(command) audio_file = wav_file # Load the audio file audio_data, sr = librosa.load(audio_file, sr=None, mono=True) # Raise an exception if the sample rate doesn't match if sr != target_sr: raise ValueError(f"Unexpected sample rate {sr}. 
Expected {target_sr}.") # Convert the audio data to 16-bit PCM (little-endian) audio_data_int16 = (audio_data * 32767).astype(np.int16) # Check if the current byte order is little-endian if audio_data_int16.dtype.byteorder == '>' or ( audio_data_int16.dtype.byteorder == '=' and np.dtype(np.int16).byteorder == '>'): print("Byte swap performed to convert to little-endian.") # Ensure little-endian format (if the current format is big-endian) audio_data_little_endian = audio_data_int16.byteswap().newbyteorder('L') else: print("No byte swap needed. Already little-endian.") audio_data_little_endian = audio_data_int16 # Calculate the number of samples per chunk samples_per_chunk = target_sr * chunk_duration # Split the audio into chunks chunks = [ audio_data_little_endian[i:i + samples_per_chunk] for i in range(0, len(audio_data_little_endian), samples_per_chunk) ] return chunks def build_query_params(args): """ Build the query parameters for the WebSocket URL based on command-line arguments. """ params = {} if args.model: params["model"] = args.model if args.language: params["language"] = args.language if args.response_format: params["response_format"] = args.response_format if args.temperature is not None: params["temperature"] = str(args.temperature) if args.vad_filter: params["vad_filter"] = "true" return params def websocket_url_with_params(base_url, params): """ Append query parameters to the WebSocket URL. """ from urllib.parse import urlencode if params: query_string = urlencode(params) url = f"{base_url}?{query_string}" else: url = base_url return url def on_message(ws, message): """ Callback function when a message is received from the server. 
""" try: data = json.loads(message) # Accumulate transcriptions if ws.args.response_format == "verbose_json": segments = data.get('segments', []) ws.transcriptions.extend(segments) for segment in segments: print(f"Received segment: {segment['text']}") else: # For 'json' or 'text' format ws.transcriptions.append(data) print(f"Transcription: {data['text']}") except json.JSONDecodeError: print(f"Received non-JSON message: {message}") def on_error(ws, error): """ Callback function when an error occurs. """ print(f"WebSocket error: {error}") def on_close(ws, close_status_code, close_msg): """ Callback function when the WebSocket connection is closed. """ print("WebSocket connection closed") def on_open(ws): """ Callback function when the WebSocket connection is opened. """ print("WebSocket connection opened") ws.transcriptions = [] # Initialize the list to store transcriptions def send_audio_chunks(ws, audio_chunks, sr): """ Send audio chunks to the WebSocket server. """ for idx, chunk in enumerate(audio_chunks): # Ensure little-endian format audio_bytes = chunk.astype(' {end_time}") print(f"{text}\n") def run_websocket_client(args): """ Run the WebSocket client to stream audio and receive transcriptions. 
""" try: audio_chunks = read_audio_in_chunks(args.audio_file) # params = build_query_params(args) # ws_url = websocket_url_with_params(args.url, params) ws_url = args.url ws = websocket.WebSocketApp( ws_url, on_open=on_open, on_message=on_message, on_error=on_error, on_close=on_close, ) ws.args = args # Attach args to ws to access inside callbacks # Run the WebSocket in a separate thread to allow sending and receiving simultaneously ws_thread = threading.Thread(target=ws.run_forever) ws_thread.start() # Wait for the connection to open while not ws.sock or not ws.sock.connected: time.sleep(0.1) # Send the audio chunks send_audio_chunks(ws, audio_chunks, 16000) except Exception as e: print(f"An error occurred: {e}") # Wait for the WebSocket thread to finish ws_thread.join() # Generate SRT if transcriptions are available if hasattr(ws, 'transcriptions') and ws.transcriptions: generate_srt(ws.transcriptions) else: print("No transcriptions received.") if __name__ == "__main__": args = parse_arguments() run_websocket_client(args)