AshDavid12 committed
Commit 92ce07c · Parent(s): b0d532b

added chunk tracking

Files changed (2):
  1. client.py +14 -5
  2. infer.py +16 -11
client.py CHANGED
@@ -7,15 +7,24 @@ import ssl
 AUDIO_FILE_URL = "https://raw.githubusercontent.com/AshDavid12/runpod_serverless_whisper/main/me-hebrew.wav"  # Use WAV file
 
 async def send_audio(websocket):
-    # Stream the audio file in real-time
+    buffer_size = 512 * 1024  # Buffer audio chunks up to 512KB before sending
+    audio_buffer = bytearray()
+
     with requests.get(AUDIO_FILE_URL, stream=True, allow_redirects=False) as response:
         if response.status_code == 200:
             print("Starting to stream audio file...")
 
-            for chunk in response.iter_content(chunk_size=8192):  # Stream in chunks of 8192 bytes
+            for chunk in response.iter_content(chunk_size=8192):  # Stream in chunks
                 if chunk:
-                    await websocket.send(chunk)  # Send each chunk over WebSocket
-                    print(f"Sent audio chunk of size {len(chunk)} bytes")
+                    audio_buffer.extend(chunk)
+                    print(f"Received audio chunk of size {len(chunk)} bytes.")
+
+                    # Send buffered audio data once it's large enough
+                    if len(audio_buffer) >= buffer_size:
+                        await websocket.send(audio_buffer)
+                        print(f"Sent {len(audio_buffer)} bytes of audio data.")
+                        audio_buffer.clear()
+                        await asyncio.sleep(0.01)
 
             print("Finished sending audio.")
         else:
@@ -42,7 +51,7 @@ async def send_heartbeat(websocket):
 
 
 async def run_client():
-    uri = ("wss://gigaverse-ivrit-ai-streaming.hf.space/ws/transcribe")  # WebSocket URL
+    uri = ("wss://gigaverse-ivrit-ai-streaming.hf.space/wtranscribe")  # WebSocket URL
     ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
     ssl_context.check_hostname = False
     ssl_context.verify_mode = ssl.CERT_NONE
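
For reference, the new sending loop amounts to this pattern: accumulate the 8 KB HTTP chunks in a bytearray and only push a WebSocket frame once 512 KB has built up, yielding briefly after each send. Below is a minimal self-contained sketch of that pattern, assuming the `requests` and `websockets` packages; unlike the committed code, it also flushes the final partial buffer so the tail of the file is not dropped.

# Minimal sketch of the buffered-sender pattern introduced in this commit.
# Assumes the `requests` and `websockets` packages; URL and sizes are from the diff.
import asyncio
import ssl

import requests
import websockets

AUDIO_FILE_URL = "https://raw.githubusercontent.com/AshDavid12/runpod_serverless_whisper/main/me-hebrew.wav"
WS_URI = "wss://gigaverse-ivrit-ai-streaming.hf.space/wtranscribe"
BUFFER_SIZE = 512 * 1024  # flush threshold: 512 KB


async def stream_buffered():
    # Same permissive TLS setup as run_client() in the diff
    ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE

    async with websockets.connect(WS_URI, ssl=ssl_context) as ws:
        buffer = bytearray()
        with requests.get(AUDIO_FILE_URL, stream=True) as response:
            response.raise_for_status()
            for chunk in response.iter_content(chunk_size=8192):
                buffer.extend(chunk)
                if len(buffer) >= BUFFER_SIZE:
                    await ws.send(bytes(buffer))  # one large frame instead of many small ones
                    buffer.clear()
                    await asyncio.sleep(0.01)  # yield so other tasks (e.g. a receiver) can run
        if buffer:  # flush the tail that never reached the threshold (not in the committed code)
            await ws.send(bytes(buffer))


asyncio.run(stream_buffered())

At 16 kHz mono 16-bit PCM (the assumption infer.py makes below), each 512 KB frame carries roughly 16 seconds of audio.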
infer.py CHANGED
@@ -9,7 +9,7 @@ from fastapi import FastAPI, HTTPException, WebSocket,WebSocketDisconnect
 import websockets
 from pydantic import BaseModel
 from typing import Optional
-import sys
+import sys
 import asyncio
 
 # Configure logging
@@ -186,7 +186,7 @@ def transcribe_core_ws(audio_file, last_transcribed_time):
 import tempfile
 
 
-@app.websocket("/ws/transcribe")
+@app.websocket("/wtranscribe")
 async def websocket_transcribe(websocket: WebSocket):
     logging.info("New WebSocket connection request received.")
     await websocket.accept()
@@ -195,6 +195,8 @@ async def websocket_transcribe(websocket: WebSocket):
     try:
         processed_segments = []  # Keeps track of the segments already transcribed
         accumulated_audio_size = 0  # Track how much audio data has been buffered
+        accumulated_audio_time = 0  # Track the total audio duration accumulated
+        min_transcription_time = 5.0  # Minimum duration of audio in seconds before transcription starts
 
         # A temporary file to store the growing audio data
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
@@ -208,20 +210,23 @@ async def websocket_transcribe(websocket: WebSocket):
                     logging.warning("Received empty audio chunk, skipping processing.")
                     continue
 
-                # Write audio chunk to file and accumulate size
+                # Write audio chunk to file and accumulate size and time
                 temp_audio_file.write(audio_chunk)
                 temp_audio_file.flush()
                 accumulated_audio_size += len(audio_chunk)
-                logging.info(
-                    f"Received and buffered {len(audio_chunk)} bytes, total buffered: {accumulated_audio_size} bytes")
 
-                # Buffer at least 512KB before transcription
-                if accumulated_audio_size >= (512 * 1024):  # Adjust this size as needed
-                    logging.info("Buffered enough data, starting transcription.")
-
-                    partial_result, processed_segments = transcribe_core_ws(temp_audio_file.name,
-                                                                            processed_segments)
-                    accumulated_audio_size = 0  # Reset the accumulated audio size
+                # Estimate the duration of the chunk based on its size (e.g., 16kHz audio)
+                chunk_duration = len(audio_chunk) / (16000 * 2)  # Assuming 16kHz mono WAV (2 bytes per sample)
+                accumulated_audio_time += chunk_duration
+                logging.info(f"Received and buffered {len(audio_chunk)} bytes, total buffered: {accumulated_audio_size} bytes, total time: {accumulated_audio_time:.2f} seconds")
+
+                # Transcribe when enough time (audio) is accumulated (e.g., at least 5 seconds of audio)
+                if accumulated_audio_time >= min_transcription_time:
+                    logging.info("Buffered enough audio time, starting transcription.")
+
+                    # Call the transcription function with the last processed time
+                    partial_result, processed_segments = transcribe_core_ws(temp_audio_file.name, processed_segments)
+                    accumulated_audio_time = 0  # Reset the accumulated audio time
 
                 # Send the transcription result back to the client
                 logging.info(f"Sending {len(partial_result['new_segments'])} new segments to the client.")