Spaces:

Gigaverse
/

ivrit-ai-streaming

Sleeping

App Files Files Community

aviadr1 committed on Oct 16, 2024

Commit

9d710fb

1 Parent(s): e8aa012

WIP

Browse files

Files changed (4) hide show

faster-whisper-server-client.py +104 -42
pyproject.toml +1 -1
ws_client.py +288 -0
ws_server.py +111 -47

faster-whisper-server-client.py CHANGED Viewed

@@ -2,6 +2,9 @@ import argparse
 import json
 import threading
 import time
 import websocket
 import os
@@ -31,34 +34,91 @@ def parse_arguments():
     return parser.parse_args()
-def preprocess_audio(audio_file, target_sr=16000):
     """
-    Load the audio file, convert to mono 16kHz, and return the audio data.
     """
-    if audio_file.endswith(".mp3"):
         # Convert MP3 to WAV using ffmpeg
-        wav_file = audio_file.replace(".mp3", ".wav")
-        if not os.path.exists(wav_file):
             command = f'ffmpeg -i "{audio_file}" -ac 1 -ar {target_sr} "{wav_file}"'
             print(f"Converting MP3 to WAV: {command}")
             os.system(command)
         audio_file = wav_file
-    print(f"Loading audio file {audio_file}")
-    audio_data, sr = librosa.load(audio_file, sr=target_sr, mono=True)
-    return audio_data, sr
-def chunk_audio(audio_data, sr, chunk_duration):
-    """
-    Split the audio data into chunks of specified duration.
-    """
-    chunk_samples = int(chunk_duration * sr)
-    total_samples = len(audio_data)
     chunks = [
-        audio_data[i:i + chunk_samples]
-        for i in range(0, total_samples, chunk_samples)
     ]
-    print(f"Split audio into {len(chunks)} chunks of {chunk_duration} seconds each.")
     return chunks
@@ -184,31 +244,33 @@ def run_websocket_client(args):
     """
     Run the WebSocket client to stream audio and receive transcriptions.
     """
-    audio_data, sr = preprocess_audio(args.audio_file)
-    audio_chunks = chunk_audio(audio_data, sr, args.chunk_duration)
-    params = build_query_params(args)
-    ws_url = websocket_url_with_params(args.url, params)
-    ws = websocket.WebSocketApp(
-        ws_url,
-        on_open=on_open,
-        on_message=on_message,
-        on_error=on_error,
-        on_close=on_close,
-    )
-    ws.args = args  # Attach args to ws to access inside callbacks
-    # Run the WebSocket in a separate thread to allow sending and receiving simultaneously
-    ws_thread = threading.Thread(target=ws.run_forever)
-    ws_thread.start()
-    # Wait for the connection to open
-    while not ws.sock or not ws.sock.connected:
-        time.sleep(0.1)
-    # Send the audio chunks
-    send_audio_chunks(ws, audio_chunks, sr)
     # Wait for the WebSocket thread to finish
     ws_thread.join()

 import json
 import threading
 import time
+from pathlib import Path
+from typing import List
 import websocket
 import os
     return parser.parse_args()
+# def preprocess_audio(audio_file, target_sr=16000):
+#     """
+#     Load the audio file, convert to mono 16kHz, and return the audio data.
+#     """
+#     if audio_file.endswith(".mp3"):
+#         # Convert MP3 to WAV using ffmpeg
+#         wav_file = audio_file.replace(".mp3", ".wav")
+#         if not os.path.exists(wav_file):
+#             command = f'ffmpeg -i "{audio_file}" -ac 1 -ar {target_sr} "{wav_file}"'
+#             print(f"Converting MP3 to WAV: {command}")
+#             os.system(command)
+#         audio_file = wav_file
+#
+#     print(f"Loading audio file {audio_file}")
+#     audio_data, sr = librosa.load(audio_file, sr=target_sr, mono=True)
+#     return audio_data, sr
+#
+# def chunk_audio(audio_data, sr, chunk_duration):
+#     """
+#     Split the audio data into chunks of specified duration.
+#     """
+#     chunk_samples = int(chunk_duration * sr)
+#     total_samples = len(audio_data)
+#     chunks = [
+#         audio_data[i:i + chunk_samples]
+#         for i in range(0, total_samples, chunk_samples)
+#     ]
+#     print(f"Split audio into {len(chunks)} chunks of {chunk_duration} seconds each.")
+#     return chunks
+def read_audio_in_chunks(audio_file, target_sr=16000, chunk_duration=1) -> List[np.ndarray]:
     """
+    Read a mono audio file and split it into fixed-duration chunks of 16-bit PCM samples.
+
+    Any input whose name does not end in ".wav" is first converted with ffmpeg to a mono
+    WAV at ``target_sr`` next to the original file; an already existing WAV with the same
+    stem is reused instead of being regenerated.
+
+    Args:
+        audio_file (str | Path): Path to the input audio file.
+        target_sr (int): Required sample rate of the loaded audio (16000 by default).
+        chunk_duration (int | float): Duration of each chunk in seconds (1 second by default).
+    Returns:
+        List of numpy arrays: little-endian int16 chunks; the last chunk may be shorter.
+    Raises:
+        ValueError: If ``chunk_duration`` covers no samples, or the file's sample rate
+            is not ``target_sr``.
+        RuntimeError: If the ffmpeg conversion fails.
     """
+    # Local import: subprocess is not part of this module's import block.
+    import subprocess
+
+    # range() needs a positive integer step; a float duration (e.g. 1.0 from argparse) is allowed.
+    samples_per_chunk = int(target_sr * chunk_duration)
+    if samples_per_chunk <= 0:
+        raise ValueError(f"chunk_duration must cover at least one sample, got {chunk_duration}.")
+    if not str(audio_file).endswith(".wav"):
+        # Convert the input to a mono WAV at the target rate using ffmpeg
+        wav_file = Path(audio_file).with_suffix(".wav")
+        if not wav_file.exists():
+            # Argument list instead of a shell string: file names need no quoting or escaping.
+            command = ["ffmpeg", "-i", str(audio_file), "-ac", "1", "-ar", str(target_sr), str(wav_file)]
+            print(f"Converting MP3 to WAV: {' '.join(command)}")
+            try:
+                subprocess.run(command, check=True)
+            except (OSError, subprocess.CalledProcessError) as err:
+                raise RuntimeError(f"ffmpeg conversion of {audio_file} failed") from err
         audio_file = wav_file
+    # Load at the file's native rate so a mismatch is detected rather than silently resampled
+    audio_data, sr = librosa.load(audio_file, sr=None, mono=True)
+    if sr != target_sr:
+        raise ValueError(f"Unexpected sample rate {sr}. Expected {target_sr}.")
+    # Clip first so out-of-range samples saturate instead of wrapping around, then cast
+    # straight to little-endian int16 ('<i2'), which is correct on any host byte order.
+    pcm = (np.clip(audio_data, -1.0, 1.0) * 32767).astype("<i2")
+    # Split the audio into chunks
+    return [
+        pcm[start:start + samples_per_chunk]
+        for start in range(0, len(pcm), samples_per_chunk)
+    ]
     """
     Run the WebSocket client to stream audio and receive transcriptions.
     """
+    try:
+        audio_chunks = read_audio_in_chunks(args.audio_file)
+        params = build_query_params(args)
+        ws_url = websocket_url_with_params(args.url, params)
+        ws = websocket.WebSocketApp(
+            ws_url,
+            on_open=on_open,
+            on_message=on_message,
+            on_error=on_error,
+            on_close=on_close,
+        )
+        ws.args = args  # Attach args to ws to access inside callbacks
+        # Run the WebSocket in a separate thread to allow sending and receiving simultaneously
+        ws_thread = threading.Thread(target=ws.run_forever)
+        ws_thread.start()
+        # Wait for the connection to open
+        while not ws.sock or not ws.sock.connected:
+            time.sleep(0.1)
+        # Send the audio chunks
+        send_audio_chunks(ws, audio_chunks, 16000)
+    except Exception as e:
+        print(f"An error occurred: {e}")
     # Wait for the WebSocket thread to finish
     ws_thread.join()

pyproject.toml CHANGED Viewed

@@ -32,7 +32,7 @@ transformers = "^4.44.2"
 soundfile = "^0.12.1"
 faster-whisper = "^1.0.3"
 fastapi = "^0.114.2"
-websockets = "^13.0.1"
 #websocket-client = "^1.8.0"
 librosa = "^0.10.2.post1"
 uvicorn = "^0.30.6"

 soundfile = "^0.12.1"
 faster-whisper = "^1.0.3"
 fastapi = "^0.114.2"
+#websockets = "^13.0.1"
 #websocket-client = "^1.8.0"
 librosa = "^0.10.2.post1"
 uvicorn = "^0.30.6"

ws_client.py ADDED Viewed

	@@ -0,0 +1,288 @@

+import argparse
+import json
+import threading
+import time
+from pathlib import Path
+from typing import List
+import websocket
+import os
+import librosa
+import numpy as np
+# Define the default WebSocket endpoint
+DEFAULT_WS_URL = "ws://localhost:8000/v1/ws_transcribe_streaming"
+def parse_arguments():
+    """
+    Parse the command-line options of the streaming client.
+    Returns:
+        argparse.Namespace: the audio file path, the endpoint URL and the transcription options.
+    """
+    cli = argparse.ArgumentParser(
+        description="Stream audio to the transcription WebSocket endpoint.",
+    )
+    add = cli.add_argument
+    add("audio_file", help="Path to the input audio file.")
+    add("--url", default=DEFAULT_WS_URL, help="WebSocket endpoint URL.")
+    add("--model", type=str, help="Model name to use for transcription.")
+    add("--language", type=str, help="Language code for transcription.")
+    add(
+        "--response_format",
+        type=str,
+        default="verbose_json",
+        choices=["text", "json", "verbose_json"],
+        help="Response format.",
+    )
+    add("--temperature", type=float, default=0.0, help="Temperature for transcription.")
+    add("--vad_filter", action="store_true", help="Enable voice activity detection filter.")
+    add("--chunk_duration", type=float, default=1.0, help="Duration of each audio chunk in seconds.")
+    return cli.parse_args()
+# def preprocess_audio(audio_file, target_sr=16000):
+#     """
+#     Load the audio file, convert to mono 16kHz, and return the audio data.
+#     """
+#     if audio_file.endswith(".mp3"):
+#         # Convert MP3 to WAV using ffmpeg
+#         wav_file = audio_file.replace(".mp3", ".wav")
+#         if not os.path.exists(wav_file):
+#             command = f'ffmpeg -i "{audio_file}" -ac 1 -ar {target_sr} "{wav_file}"'
+#             print(f"Converting MP3 to WAV: {command}")
+#             os.system(command)
+#         audio_file = wav_file
+#
+#     print(f"Loading audio file {audio_file}")
+#     audio_data, sr = librosa.load(audio_file, sr=target_sr, mono=True)
+#     return audio_data, sr
+#
+# def chunk_audio(audio_data, sr, chunk_duration):
+#     """
+#     Split the audio data into chunks of specified duration.
+#     """
+#     chunk_samples = int(chunk_duration * sr)
+#     total_samples = len(audio_data)
+#     chunks = [
+#         audio_data[i:i + chunk_samples]
+#         for i in range(0, total_samples, chunk_samples)
+#     ]
+#     print(f"Split audio into {len(chunks)} chunks of {chunk_duration} seconds each.")
+#     return chunks
+def read_audio_in_chunks(audio_file, target_sr=16000, chunk_duration=1) -> List[np.ndarray]:
+    """
+    Read a mono audio file and split it into fixed-duration chunks of 16-bit PCM samples.
+
+    Any input whose name does not end in ".wav" is first converted with ffmpeg to a mono
+    WAV at ``target_sr`` next to the original file; an already existing WAV with the same
+    stem is reused instead of being regenerated.
+
+    Args:
+        audio_file (str | Path): Path to the input audio file.
+        target_sr (int): Required sample rate of the loaded audio (16000 by default).
+        chunk_duration (int | float): Duration of each chunk in seconds (1 second by default).
+    Returns:
+        List of numpy arrays: little-endian int16 chunks; the last chunk may be shorter.
+    Raises:
+        ValueError: If ``chunk_duration`` covers no samples, or the file's sample rate
+            is not ``target_sr``.
+        RuntimeError: If the ffmpeg conversion fails.
+    """
+    # Local import: subprocess is not part of this module's import block.
+    import subprocess
+
+    # range() needs a positive integer step; a float duration (e.g. 1.0 from argparse) is allowed.
+    samples_per_chunk = int(target_sr * chunk_duration)
+    if samples_per_chunk <= 0:
+        raise ValueError(f"chunk_duration must cover at least one sample, got {chunk_duration}.")
+    if not str(audio_file).endswith(".wav"):
+        # Convert the input to a mono WAV at the target rate using ffmpeg
+        wav_file = Path(audio_file).with_suffix(".wav")
+        if not wav_file.exists():
+            # Argument list instead of a shell string: file names need no quoting or escaping.
+            command = ["ffmpeg", "-i", str(audio_file), "-ac", "1", "-ar", str(target_sr), str(wav_file)]
+            print(f"Converting MP3 to WAV: {' '.join(command)}")
+            try:
+                subprocess.run(command, check=True)
+            except (OSError, subprocess.CalledProcessError) as err:
+                raise RuntimeError(f"ffmpeg conversion of {audio_file} failed") from err
+        audio_file = wav_file
+    # Load at the file's native rate so a mismatch is detected rather than silently resampled
+    audio_data, sr = librosa.load(audio_file, sr=None, mono=True)
+    if sr != target_sr:
+        raise ValueError(f"Unexpected sample rate {sr}. Expected {target_sr}.")
+    # Clip first so out-of-range samples saturate instead of wrapping around, then cast
+    # straight to little-endian int16 ('<i2'), which is correct on any host byte order.
+    pcm = (np.clip(audio_data, -1.0, 1.0) * 32767).astype("<i2")
+    # Split the audio into chunks
+    return [
+        pcm[start:start + samples_per_chunk]
+        for start in range(0, len(pcm), samples_per_chunk)
+    ]
+def build_query_params(args):
+    """
+    Collect the transcription options that are set on ``args`` into a query-parameter dict.
+    Returns:
+        dict: string-valued parameters; unset (falsy) options are left out, the temperature
+        is included whenever it is not None, and vad_filter is sent as "true".
+    """
+    # Options that are copied verbatim when they hold a truthy value
+    query = {
+        name: getattr(args, name)
+        for name in ("model", "language", "response_format")
+        if getattr(args, name)
+    }
+    if args.temperature is not None:
+        query["temperature"] = str(args.temperature)
+    if args.vad_filter:
+        query["vad_filter"] = "true"
+    return query
+def websocket_url_with_params(base_url, params):
+    """
+    Return ``base_url`` with ``params`` URL-encoded and appended as a query string.
+    An empty ``params`` leaves the URL untouched.
+    """
+    from urllib.parse import urlencode
+    if not params:
+        return base_url
+    return f"{base_url}?{urlencode(params)}"
+def on_message(ws, message):
+    """
+    Handle one server message: decode it as JSON, store it on ``ws.transcriptions`` and print it.
+    Messages that are not valid JSON are only reported.
+    """
+    try:
+        payload = json.loads(message)
+    except json.JSONDecodeError:
+        print(f"Received non-JSON message: {message}")
+        return
+    if ws.args.response_format != "verbose_json":
+        # 'json' or 'text' format: the payload itself is the transcription
+        ws.transcriptions.append(payload)
+        print(f"Transcription: {payload['text']}")
+        return
+    # 'verbose_json' format: accumulate every segment of the payload
+    received = payload.get('segments', [])
+    ws.transcriptions.extend(received)
+    for item in received:
+        print(f"Received segment: {item['text']}")
+def on_error(ws, error):
+    """
+    Report an error raised on the WebSocket connection.
+    """
+    report = f"WebSocket error: {error}"
+    print(report)
+def on_close(ws, close_status_code, close_msg):
+    """
+    Report that the WebSocket connection was closed; the status code and message are not used.
+    """
+    notice = "WebSocket connection closed"
+    print(notice)
+def on_open(ws):
+    """
+    Report the opened connection and give ``ws`` an empty ``transcriptions`` list.
+    """
+    print("WebSocket connection opened")
+    # Fresh accumulator for the transcriptions received on this connection
+    ws.transcriptions = list()
+def send_audio_chunks(ws, audio_chunks, sr):
+    """
+    Send audio chunks to the WebSocket server as raw little-endian 16-bit PCM, then close.
+    Args:
+        ws: connected WebSocket app; must provide send() and close().
+        audio_chunks: sequence of numpy arrays. Integer chunks are sent as they are;
+            floating-point chunks are treated as samples in [-1, 1] and scaled to int16.
+        sr: sample rate of the chunks; currently unused, kept for interface compatibility.
+    """
+    total = len(audio_chunks)
+    for idx, chunk in enumerate(audio_chunks):
+        samples = np.asarray(chunk)
+        if np.issubdtype(samples.dtype, np.floating):
+            # Float audio: saturate and scale to the int16 range before the cast
+            samples = np.clip(samples, -1.0, 1.0) * 32767
+        # '<i2' = little-endian int16. Casting int16 chunks to '<f4' (as before) produced
+        # float32 bytes that a raw PCM_16 decoder cannot interpret.
+        audio_bytes = samples.astype('<i2').tobytes()
+        ws.send(audio_bytes, opcode=websocket.ABNF.OPCODE_BINARY)
+        print(f"Sent chunk {idx + 1}/{total}")
+        time.sleep(0.1)  # Small delay to simulate real-time streaming
+    print("All audio chunks sent")
+    # Optionally, wait to receive remaining messages
+    time.sleep(2)
+    ws.close()
+    print("Closed WebSocket connection")
+def format_timestamp(seconds):
+    """
+    Render a duration given in seconds as an SRT timestamp (HH:MM:SS,mmm).
+    Fractions below one millisecond are truncated.
+    """
+    remaining_ms = int(seconds * 1000)
+    # Peel off hours, minutes and seconds; what is left are the milliseconds
+    hh, remaining_ms = divmod(remaining_ms, 3600 * 1000)
+    mm, remaining_ms = divmod(remaining_ms, 60 * 1000)
+    ss, ms = divmod(remaining_ms, 1000)
+    return f"{hh:02}:{mm:02}:{ss:02},{ms:03}"
+def generate_srt(transcriptions):
+    """
+    Print the segments in ``transcriptions`` as numbered SRT entries.
+    Each segment must provide 'start', 'end' and 'text'.
+    """
+    print("\nGenerated SRT:")
+    for number, entry in enumerate(transcriptions, start=1):
+        begin = format_timestamp(entry['start'])
+        finish = format_timestamp(entry['end'])
+        caption = entry['text']
+        print(f"{number}")
+        print(f"{begin} --> {finish}")
+        print(f"{caption}\n")
+def run_websocket_client(args):
+    """
+    Run the WebSocket client to stream audio and receive transcriptions.
+
+    Loads and chunks ``args.audio_file``, connects to ``args.url``, streams the chunks and
+    finally prints the received transcriptions as SRT. Any error is reported on stdout
+    instead of being raised.
+    """
+    # Bound up front so the cleanup below also works when an early step fails
+    ws = None
+    ws_thread = None
+    try:
+        audio_chunks = read_audio_in_chunks(args.audio_file)
+        # params = build_query_params(args)
+        # ws_url = websocket_url_with_params(args.url, params)
+        ws_url = args.url
+        ws = websocket.WebSocketApp(
+            ws_url,
+            on_open=on_open,
+            on_message=on_message,
+            on_error=on_error,
+            on_close=on_close,
+        )
+        ws.args = args  # Attach args to ws to access inside callbacks
+        # Run the WebSocket in a separate thread to allow sending and receiving simultaneously
+        ws_thread = threading.Thread(target=ws.run_forever)
+        ws_thread.start()
+        # Wait for the connection to open
+        while True:
+            sock = ws.sock
+            if sock and sock.connected:
+                break
+            # run_forever() has returned without a connection: waiting longer cannot succeed
+            if not ws_thread.is_alive():
+                raise ConnectionError(f"Could not connect to {ws_url}")
+            time.sleep(0.1)
+        # Send the audio chunks
+        send_audio_chunks(ws, audio_chunks, 16000)
+    except Exception as e:
+        print(f"An error occurred: {e}")
+        if ws is not None:
+            # Stop run_forever() so the join below cannot block on a half-open connection
+            ws.close()
+    # Wait for the WebSocket thread to finish
+    if ws_thread is not None:
+        ws_thread.join()
+    # Generate SRT if transcriptions are available
+    if ws is not None and getattr(ws, 'transcriptions', None):
+        generate_srt(ws.transcriptions)
+    else:
+        print("No transcriptions received.")
+# Script entry point: parse the command-line arguments and run the streaming client.
+if __name__ == "__main__":
+    args = parse_arguments()
+    run_websocket_client(args)

ws_server.py CHANGED Viewed

@@ -1,11 +1,13 @@
 # Import the necessary components from whisper_online.py
 import logging
 import os
 import librosa
 import soundfile
 import uvicorn
 from fastapi import FastAPI, WebSocket
 from starlette.websockets import WebSocketDisconnect
 from libs.whisper_streaming.whisper_online import (
@@ -25,22 +27,51 @@ import argparse
 import sys
 import numpy as np
 import io
-import soundfile as sf
 import wave
 import requests
 import argparse
 logger = logging.getLogger(__name__)
 SAMPLING_RATE = 16000
 WARMUP_FILE = "mono16k.test_hebrew.wav"
 AUDIO_FILE_URL = "https://raw.githubusercontent.com/AshDavid12/runpod-serverless-forked/main/test_hebrew.wav"
-is_first = True
-asr, online = None, None
-min_limit = None  # min_chunk*SAMPLING_RATE
 app = FastAPI()
 def convert_to_mono_16k(input_wav: str, output_wav: str) -> None:
     """
@@ -78,28 +109,68 @@ def download_warmup_file():
         convert_to_mono_16k(audio_file_path, WARMUP_FILE)
-async def receive_audio_chunk(self, websocket: WebSocket):
     # receive all audio that is available by this time
     # blocks operation if less than self.min_chunk seconds is available
     # unblocks if connection is closed or a chunk is available
     out = []
-    while sum(len(x) for x in out) < min_limit:
-        raw_bytes = await websocket.receive_bytes()
         if not raw_bytes:
             break
         sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
         audio, _ = librosa.load(sf,sr=SAMPLING_RATE,dtype=np.float32)
         out.append(audio)
     if not out:
         return None
-    conc = np.concatenate(out)
-    if self.is_first and len(conc) < min_limit:
         return None
-    self.is_first = False
-    return conc
 # Define WebSocket endpoint
 @app.websocket("/ws_transcribe_streaming")
@@ -108,46 +179,37 @@ async def websocket_transcribe(websocket: WebSocket):
     await websocket.accept()
     logger.info("WebSocket connection established successfully.")
     asr, online = asr_factory(args)
     # warm up the ASR because the very first transcribe takes more time than the others.
     # Test results in https://github.com/ufal/whisper_streaming/pull/81
     a = load_audio_chunk(WARMUP_FILE, 0, 1)
     asr.transcribe(a)
     logger.info("Whisper is warmed up.")
-    global min_limit
-    min_limit = args.min_chunk_size * SAMPLING_RATE
     try:
-        out = []
         while True:
             try:
-                # Receive JSON data
-                raw_bytes = await websocket.receive_json()
-                sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1, endian="LITTLE", samplerate=SAMPLING_RATE,
-                                         subtype="PCM_16", format="RAW")
-                audio, _ = librosa.load(sf, sr=SAMPLING_RATE, dtype=np.float32)
-                out.append(audio)
-                # Call the transcribe function
-                # segments, info = await asyncio.to_thread(model.transcribe,
-                segments, info = model.transcribe(
-                    audio_file_path,
-                    language='he',
-                    initial_prompt=input_data.init_prompt,
-                    beam_size=5,
-                    word_timestamps=True,
-                    condition_on_previous_text=True
-                )
-                # Convert segments to list and serialize
-                segments_list = list(segments)
-                segments_serializable = [segment_to_dict(s) for s in segments_list]
-                logger.info(get_raw_words_from_segments(segments_list))
-                # Send the serialized segments back to the client
-                await websocket.send_json(segments_serializable)
             except WebSocketDisconnect:
                 logger.info("WebSocket connection closed by the client.")
                 break
@@ -158,8 +220,11 @@ async def websocket_transcribe(websocket: WebSocket):
         logger.info("Cleaning up and closing WebSocket connection.")
 def main():
-    args = argparse.ArgumentParser()
-    args = add_shared_args(args)
     args.parse_args([
         '--lan', 'he',
         '--model', 'ivrit-ai/faster-whisper-v2-d4',
@@ -168,8 +233,7 @@ def main():
         # '--vac', '--buffer_trimming', 'segment', '--buffer_trimming_sec', '15', '--min_chunk_size', '1.0', '--vac_chunk_size', '0.04', '--start_at', '0.0', '--offline', '--comp_unaware', '--log_level', 'DEBUG'
     ])
-    global asr, online
-    uvicorn.run(app)

 # Import the necessary components from whisper_online.py
 import logging
 import os
+from typing import Optional
 import librosa
 import soundfile
 import uvicorn
 from fastapi import FastAPI, WebSocket
+from pydantic import BaseModel, ConfigDict
 from starlette.websockets import WebSocketDisconnect
 from libs.whisper_streaming.whisper_online import (
 import sys
 import numpy as np
 import io
+import soundfile
 import wave
 import requests
 import argparse
+# from libs.whisper_streaming.whisper_online_server import online
 logger = logging.getLogger(__name__)
 SAMPLING_RATE = 16000
 WARMUP_FILE = "mono16k.test_hebrew.wav"
 AUDIO_FILE_URL = "https://raw.githubusercontent.com/AshDavid12/runpod-serverless-forked/main/test_hebrew.wav"
 app = FastAPI()
+args = argparse.ArgumentParser()
+add_shared_args(args)
+def drop_option_from_parser(parser, option_name):
+    """
+    Build a copy of ``parser`` that lacks one option.
+    The copy reuses the original's description, epilog and formatter class and receives
+    every action of the original except the help action and the dropped option.
+    Args:
+        parser (argparse.ArgumentParser): The original argument parser.
+        option_name (str): The option string to drop (e.g., '--model').
+    Returns:
+        argparse.ArgumentParser: A new parser without the specified option.
+    """
+    clone = argparse.ArgumentParser(
+        description=parser.description,
+        epilog=parser.epilog,
+        formatter_class=parser.formatter_class
+    )
+    for action in parser._actions:
+        flags = action.option_strings
+        # The new parser already has its own -h; the named option is the one being removed
+        if "-h" in flags or option_name in flags:
+            continue
+        clone._add_action(action)
+    return clone
 def convert_to_mono_16k(input_wav: str, output_wav: str) -> None:
     """
         convert_to_mono_16k(audio_file_path, WARMUP_FILE)
+class State(BaseModel):
+    """Per-connection state shared by the audio-receiving and transcript-formatting helpers."""
+    # Needed because the fields below hold non-pydantic objects
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    # Connection the audio bytes are received from
+    websocket: WebSocket
+    # ASR backend and online processor, as returned together by asr_factory(args)
+    asr: ASRBase
+    online: OnlineASRProcessor
+    # Minimum number of samples receive_audio_chunk gathers before returning
+    min_limit: int
+    # True until receive_audio_chunk has returned its first chunk
+    is_first: bool = True
+    # End time of the last formatted segment (set by format_output_transcript), None before the first
+    last_end: Optional[float] = None
+async def receive_audio_chunk(state: State) -> Optional[np.ndarray]:
+    """
+    Receive binary messages from state.websocket until at least state.min_limit samples are buffered.
+    Each message is decoded as raw mono little-endian PCM_16 at SAMPLING_RATE and loaded as float32.
+    Returns:
+        The concatenated samples, or None when nothing was received or when the very first
+        chunk is still shorter than state.min_limit. Clears state.is_first once a chunk is returned.
+    """
     # receive all audio that is available by this time
     # blocks operation if less than self.min_chunk seconds is available
     # unblocks if connection is closed or a chunk is available
     out = []
+    while sum(len(x) for x in out) < state.min_limit:
+        raw_bytes = await state.websocket.receive_bytes()
         if not raw_bytes:
             break
+#            print("received audio:",len(raw_bytes), "bytes", raw_bytes[:10])
         sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1,endian="LITTLE",samplerate=SAMPLING_RATE, subtype="PCM_16",format="RAW")
         audio, _ = librosa.load(sf,sr=SAMPLING_RATE,dtype=np.float32)
         out.append(audio)
     if not out:
         return None
+    flat_out = np.concatenate(out)
+    if state.is_first and len(flat_out) < state.min_limit:
+        return None
+    state.is_first = False
+    return flat_out
+def format_output_transcript(state, o) -> Optional[dict]:
+    """
+    Convert one result tuple into a JSON-serializable segment and echo it to stderr.
+    Args:
+        state: per-connection state; its last_end is read and updated here.
+        o: indexable result with o[0]=begin, o[1]=end, o[2]=text; o[0] is None when there is
+            no text (presumably the output of online.process_iter(), in seconds -- TODO confirm).
+    Returns:
+        dict with "start" and "end" (the times multiplied by 1000, formatted with "%1.0f")
+        and "text", or None when the segment has no text.
+    """
+    # output format on stderr is like:
+    # 0 1720 Takhle to je
+    # - the first two words are:
+    #    - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway
+    # - the next words: segment transcript
+    # This function differs from whisper_online.output_transcript in the following:
+    # succeeding [beg,end] intervals are not overlapping because ELITR protocol (implemented in online-text-flow events) requires it.
+    # Therefore, beg, is max of previous end and current beg outputed by Whisper.
+    # Usually it differs negligibly, by appx 20 ms.
+    if o[0] is not None:
+        beg, end = o[0]*1000,o[1]*1000
+        if state.last_end is not None:
+            beg = max(beg, state.last_end)
+        state.last_end = end
+        print("%1.0f %1.0f %s" % (beg,end,o[2]),flush=True,file=sys.stderr)
+        return {
+            "start": "%1.0f" % beg,
+            "end": "%1.0f" % end,
+            "text": "%s" % o[2],
+        }
+    else:
+        logger.debug("No text in this segment")
         return None
 # Define WebSocket endpoint
 @app.websocket("/ws_transcribe_streaming")
     await websocket.accept()
     logger.info("WebSocket connection established successfully.")
+    # initialize the ASR model
+    logger.info("Loading whisper model...")
     asr, online = asr_factory(args)
+    state = State(
+        websocket=websocket,
+        asr=asr,
+        online=online,
+        min_limit=args.min_chunk_size * SAMPLING_RATE,
+    )
     # warm up the ASR because the very first transcribe takes more time than the others.
     # Test results in https://github.com/ufal/whisper_streaming/pull/81
+    logger.info("Warming up the whisper model...")
     a = load_audio_chunk(WARMUP_FILE, 0, 1)
     asr.transcribe(a)
     logger.info("Whisper is warmed up.")
     try:
         while True:
+            a = await receive_audio_chunk(state)
+            if a is None:
+                break
+            state.online.insert_audio_chunk(a)
+            o = online.process_iter()
             try:
+                if result := format_output_transcript(state, o):
+                    await websocket.send_json(result)
+            except BrokenPipeError:
+                logger.info("broken pipe -- connection closed?")
+                break
             except WebSocketDisconnect:
                 logger.info("WebSocket connection closed by the client.")
                 break
         logger.info("Cleaning up and closing WebSocket connection.")
 def main():
+    """
+    Build the final argument namespace and start the uvicorn server.
+
+    The module-level ``args`` starts out as the shared ArgumentParser. Here its '--model'
+    option is replaced, the fixed option list is parsed, and ``args`` is rebound to the
+    resulting namespace, which the WebSocket endpoint reads (``args.min_chunk_size``,
+    ``asr_factory(args)``).
+    """
+    global args
+    parser = drop_option_from_parser(args, '--model')
+    parser.add_argument('--model', type=str,
+                        help="Name size of the Whisper model to use. The model is automatically downloaded from the model hub if not present in model cache dir.")
+    # Keep the parse result: discarding it left `args` as a parser without option attributes
+    args = parser.parse_args([
         '--lan', 'he',
         '--model', 'ivrit-ai/faster-whisper-v2-d4',
         # '--vac', '--buffer_trimming', 'segment', '--buffer_trimming_sec', '15', '--min_chunk_size', '1.0', '--vac_chunk_size', '0.04', '--start_at', '0.0', '--offline', '--comp_unaware', '--log_level', 'DEBUG'
     ])
+    uvicorn.run(app)
+# Script entry point: configure the arguments and start the server.
+if __name__ == "__main__":
+    main()