AshDavid12 commited on
Commit
d7b2452
·
1 Parent(s): b157f40

pcm change

Browse files

Changed the client to download the WAV file and then send the PCM chunks; the server now accepts the PCM chunks, converts them to WAV, and sends the audio to Whisper.

Files changed (3) hide show
  1. client.py +29 -15
  2. downloaded_audio.wav +0 -1
  3. infer.py +51 -60
client.py CHANGED
@@ -1,5 +1,6 @@
1
  import asyncio
2
  import json
 
3
 
4
  import websockets
5
  import requests
@@ -9,29 +10,42 @@ import ssl
9
  AUDIO_FILE_URL = "https://raw.githubusercontent.com/AshDavid12/runpod-serverless-forked/main/test_hebrew.wav" # Use WAV file
10
 
11
  async def send_audio(websocket):
12
- buffer_size = 1024*512 # Buffer audio chunks up to 512KB before sending
13
- audio_buffer = bytearray()
14
 
15
- with requests.get(AUDIO_FILE_URL, stream=True, allow_redirects=False) as response:
 
16
  if response.status_code == 200:
17
- print("Starting to stream audio file...")
 
 
 
18
 
19
- for chunk in response.iter_content(chunk_size=1024): # Stream in chunks
20
- if chunk:
21
- audio_buffer.extend(chunk)
22
- #print(f"Received audio chunk of size {len(chunk)} bytes.")
 
 
 
23
 
24
- # Send buffered audio data once it's large enough
25
- if len(audio_buffer) >= buffer_size:
26
- await websocket.send(audio_buffer)
27
- #print(f"Sent {len(audio_buffer)} bytes of audio data.")
28
- audio_buffer.clear()
29
- await asyncio.sleep(0.01)
 
 
 
 
 
 
 
30
 
31
- print("Finished sending audio.")
32
  else:
33
  print(f"Failed to download audio file. Status code: {response.status_code}")
34
 
 
35
  async def receive_transcription(websocket):
36
  while True:
37
  try:
 
1
  import asyncio
2
  import json
3
+ import wave
4
 
5
  import websockets
6
  import requests
 
10
  AUDIO_FILE_URL = "https://raw.githubusercontent.com/AshDavid12/runpod-serverless-forked/main/test_hebrew.wav" # Use WAV file
11
 
12
async def send_audio(websocket):
    """Download the source WAV file and stream its raw PCM frames to *websocket*.

    First sends a JSON metadata message (sample rate, channel count, sample
    width) so the receiver can reconstruct a WAV header, then streams the raw
    PCM data in ~16KB chunks, sleeping briefly between sends to approximate
    real-time delivery.

    Parameters:
        websocket: an open websocket connection exposing an async ``send``.
    """
    target_chunk_bytes = 1024 * 16  # aim for ~16KB of PCM per websocket message

    # Download the WAV file locally.
    with requests.get(AUDIO_FILE_URL, stream=True) as response:
        if response.status_code == 200:
            with open('downloaded_audio.wav', 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024):
                    f.write(chunk)
            print("Audio file downloaded successfully.")

            # Open the downloaded WAV file and extract PCM data.
            with wave.open('downloaded_audio.wav', 'rb') as wav_file:
                metadata = {
                    'sample_rate': wav_file.getframerate(),
                    'channels': wav_file.getnchannels(),
                    'sampwidth': wav_file.getsampwidth(),
                }

                # Send metadata to the server before sending the audio.
                await websocket.send(json.dumps(metadata))
                print(f"Sent metadata: {metadata}")

                # BUG FIX: wave.Wave_read.readframes() takes a FRAME count,
                # not a byte count. The original passed 1024*16 intending
                # "16KB", which actually read 16384 frames (~32KB for 16-bit
                # mono audio). Convert the byte budget into whole frames.
                bytes_per_frame = wav_file.getsampwidth() * wav_file.getnchannels()
                frames_per_chunk = max(1, target_chunk_bytes // bytes_per_frame)

                # Send the PCM audio data in chunks.
                while True:
                    pcm_chunk = wav_file.readframes(frames_per_chunk)
                    if not pcm_chunk:
                        break  # End of file

                    await websocket.send(pcm_chunk)  # Send raw PCM data chunk
                    print(f"Sent PCM chunk of size {len(pcm_chunk)} bytes.")
                    await asyncio.sleep(0.01)  # Simulate real-time sending

        else:
            print(f"Failed to download audio file. Status code: {response.status_code}")
47
 
48
+
49
  async def receive_transcription(websocket):
50
  while True:
51
  try:
downloaded_audio.wav DELETED
@@ -1 +0,0 @@
1
- {"error":"File not found"}
 
 
infer.py CHANGED
@@ -185,76 +185,67 @@ async def websocket_transcribe(websocket: WebSocket):
185
 
186
  try:
187
  processed_segments = [] # Keeps track of the segments already transcribed
188
- accumulated_audio_size = 0 # Track how much audio data has been buffered
189
  accumulated_audio_time = 0 # Track the total audio duration accumulated
190
  last_transcribed_time = 0.0
191
- #min_transcription_time = 5.0 # Minimum duration of audio in seconds before transcription starts
192
-
193
- # A temporary file to store the growing audio data
194
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
195
- logging.info(f"Temporary audio file created at {temp_audio_file.name}")
196
- #temp_audio_filename = os.path.basename(temp_audio_file.name)
197
- output_directory = "/tmp"
198
- os.makedirs(output_directory, exist_ok=True)
199
- chunk_counter = 0
200
-
201
- while True:
202
- try:
203
- # Receive the next chunk of audio data
204
- audio_chunk = await websocket.receive_bytes()
205
- if not audio_chunk:
206
- logging.warning("Received empty audio chunk, skipping processing hey.")
207
- continue
208
-
209
-
210
- # Create a new file for the chunk
211
- chunk_filename = os.path.join(output_directory, f"audio_chunk_{chunk_counter}.wav")
212
- chunk_counter += 1
213
-
214
- with wave.open(chunk_filename, 'wb') as wav_file:
215
- wav_file.setnchannels(1) # Mono channel
216
- wav_file.setsampwidth(2) # 2 bytes per sample (16-bit audio)
217
- wav_file.setframerate(16000) # 16 kHz sample rate
218
- wav_file.writeframes(audio_chunk)
219
-
220
- # with open(chunk_filename, 'wb') as audio_file:
221
- # audio_file.write(audio_chunk)
222
-
223
- # Write audio chunk to file and accumulate size and time
224
- temp_audio_file.write(audio_chunk)
225
- temp_audio_file.flush()
226
- accumulated_audio_size += len(audio_chunk)
227
-
228
- # Estimate the duration of the chunk based on its size (e.g., 16kHz audio)
229
- chunk_duration = len(audio_chunk) / (16000 * 2) # Assuming 16kHz mono WAV (2 bytes per sample)
230
- accumulated_audio_time += chunk_duration
231
- logging.info(f"Received and buffered {len(audio_chunk)} bytes, total buffered: {accumulated_audio_size} bytes, total time: {accumulated_audio_time:.2f} seconds")
232
-
233
- # Transcribe when enough time (audio) is accumulated (e.g., at least 5 seconds of audio)
234
- #if accumulated_audio_time >= min_transcription_time:
235
- #logging.info("Buffered enough audio time, starting transcription.")
236
-
237
-
238
- # Call the transcription function with the last processed time
239
- partial_result, last_transcribed_time = transcribe_core_ws(temp_audio_file.name, last_transcribed_time)
240
- accumulated_audio_time = 0 # Reset the accumulated audio time
241
- processed_segments.extend(partial_result['new_segments'])
242
-
243
- # Reset the accumulated audio size after transcription
244
- accumulated_audio_size = 0
245
 
246
  # Send the transcription result back to the client with both new and all processed segments
247
  response = {
248
  "new_segments": partial_result['new_segments'],
249
- "processed_segments": processed_segments,
250
- "download_url": f"https://gigaverse-ivrit-ai-streaming.hf.space/download_audio/{os.path.basename(chunk_filename)}"
251
  }
252
  logging.info(f"Sending {len(partial_result['new_segments'])} new segments to the client.")
253
  await websocket.send_json(response)
254
 
255
- except WebSocketDisconnect:
256
- logging.info("WebSocket connection closed by the client.")
257
- break
258
 
259
  except Exception as e:
260
  logging.error(f"Unexpected error during WebSocket transcription: {e}")
 
185
 
186
  try:
187
  processed_segments = [] # Keeps track of the segments already transcribed
 
188
  accumulated_audio_time = 0 # Track the total audio duration accumulated
189
  last_transcribed_time = 0.0
190
+ min_transcription_time = 5.0 # Minimum duration of audio in seconds before transcription starts
191
+
192
+ # A buffer to store raw PCM audio data
193
+ pcm_audio_buffer = bytearray()
194
+
195
+ # Metadata for the incoming PCM data (sample rate, channels, and sample width should be consistent)
196
+ sample_rate = 16000 # 16kHz
197
+ channels = 1 # Mono
198
+ sample_width = 2 # 2 bytes per sample (16-bit audio)
199
+
200
+ while True:
201
+ try:
202
+ # Receive the next chunk of PCM audio data
203
+ audio_chunk = await websocket.receive_bytes()
204
+ if not audio_chunk:
205
+ logging.warning("Received empty audio chunk, skipping processing.")
206
+ continue
207
+
208
+ # Accumulate the raw PCM data into the buffer
209
+ pcm_audio_buffer.extend(audio_chunk)
210
+
211
+ # Estimate the duration of the chunk based on its size
212
+ chunk_duration = len(audio_chunk) / (sample_rate * channels * sample_width)
213
+ accumulated_audio_time += chunk_duration
214
+ logging.info(f"Received and buffered {len(audio_chunk)} bytes, total buffered: {len(pcm_audio_buffer)} bytes, total time: {accumulated_audio_time:.2f} seconds")
215
+
216
+ # Transcribe when enough time (audio) is accumulated (e.g., at least 5 seconds of audio)
217
+ if accumulated_audio_time >= min_transcription_time:
218
+ logging.info("Buffered enough audio time, starting transcription.")
219
+
220
+ # Create a temporary WAV file from the accumulated PCM data
221
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav_file:
222
+ with wave.open(temp_wav_file.name, 'wb') as wav_file:
223
+ wav_file.setnchannels(channels)
224
+ wav_file.setsampwidth(sample_width)
225
+ wav_file.setframerate(sample_rate)
226
+ wav_file.writeframes(pcm_audio_buffer)
227
+
228
+ logging.info(f"Temporary WAV file created at {temp_wav_file.name} for transcription.")
229
+
230
+ # Call the transcription function with the WAV file
231
+ partial_result, last_transcribed_time = transcribe_core_ws(temp_wav_file.name, last_transcribed_time)
232
+ processed_segments.extend(partial_result['new_segments'])
233
+
234
+ # Clear the buffer after transcription
235
+ pcm_audio_buffer.clear()
236
+ accumulated_audio_time = 0 # Reset accumulated time
 
 
 
 
 
 
 
237
 
238
  # Send the transcription result back to the client with both new and all processed segments
239
  response = {
240
  "new_segments": partial_result['new_segments'],
241
+ "processed_segments": processed_segments
 
242
  }
243
  logging.info(f"Sending {len(partial_result['new_segments'])} new segments to the client.")
244
  await websocket.send_json(response)
245
 
246
+ except WebSocketDisconnect:
247
+ logging.info("WebSocket connection closed by the client.")
248
+ break
249
 
250
  except Exception as e:
251
  logging.error(f"Unexpected error during WebSocket transcription: {e}")