AshDavid12 commited on
Commit
d7b2452
·
1 Parent(s): b157f40

pcm change

Browse files

Changed the client to download the WAV file and then send the PCM chunks; the server now accepts the PCM chunks, converts them to WAV, and sends the audio to Whisper.

Files changed (3) hide show
  1. client.py +29 -15
  2. downloaded_audio.wav +0 -1
  3. infer.py +51 -60
client.py CHANGED
@@ -1,5 +1,6 @@
1
  import asyncio
2
  import json
 
3
 
4
  import websockets
5
  import requests
@@ -9,29 +10,42 @@ import ssl
9
  AUDIO_FILE_URL = "https://raw.githubusercontent.com/AshDavid12/runpod-serverless-forked/main/test_hebrew.wav" # Use WAV file
10
 
11
  async def send_audio(websocket):
12
- buffer_size = 1024*512 # Buffer audio chunks up to 512KB before sending
13
- audio_buffer = bytearray()
14
 
15
- with requests.get(AUDIO_FILE_URL, stream=True, allow_redirects=False) as response:
 
16
  if response.status_code == 200:
17
- print("Starting to stream audio file...")
 
 
 
18
 
19
- for chunk in response.iter_content(chunk_size=1024): # Stream in chunks
20
- if chunk:
21
- audio_buffer.extend(chunk)
22
- #print(f"Received audio chunk of size {len(chunk)} bytes.")
 
 
 
23
 
24
- # Send buffered audio data once it's large enough
25
- if len(audio_buffer) >= buffer_size:
26
- await websocket.send(audio_buffer)
27
- #print(f"Sent {len(audio_buffer)} bytes of audio data.")
28
- audio_buffer.clear()
29
- await asyncio.sleep(0.01)
 
 
 
 
 
 
 
30
 
31
- print("Finished sending audio.")
32
  else:
33
  print(f"Failed to download audio file. Status code: {response.status_code}")
34
 
 
35
  async def receive_transcription(websocket):
36
  while True:
37
  try:
 
1
  import asyncio
2
  import json
3
+ import wave
4
 
5
  import websockets
6
  import requests
 
10
  AUDIO_FILE_URL = "https://raw.githubusercontent.com/AshDavid12/runpod-serverless-forked/main/test_hebrew.wav" # Use WAV file
11
 
12
async def send_audio(websocket):
    """Download the source WAV file and stream its raw PCM frames to *websocket*.

    First sends a JSON metadata message (sample rate, channel count, sample
    width) so the receiver can reconstruct a WAV header, then streams the raw
    PCM data in ~16KB chunks, sleeping briefly between sends to approximate
    real-time delivery.

    Parameters:
        websocket: an open websocket connection exposing an async ``send``.
    """
    target_chunk_bytes = 1024 * 16  # aim for ~16KB of PCM per websocket message

    # Download the WAV file locally.
    with requests.get(AUDIO_FILE_URL, stream=True) as response:
        if response.status_code == 200:
            with open('downloaded_audio.wav', 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024):
                    f.write(chunk)
            print("Audio file downloaded successfully.")

            # Open the downloaded WAV file and extract PCM data.
            with wave.open('downloaded_audio.wav', 'rb') as wav_file:
                metadata = {
                    'sample_rate': wav_file.getframerate(),
                    'channels': wav_file.getnchannels(),
                    'sampwidth': wav_file.getsampwidth(),
                }

                # Send metadata to the server before sending the audio.
                await websocket.send(json.dumps(metadata))
                print(f"Sent metadata: {metadata}")

                # BUG FIX: wave.Wave_read.readframes() takes a FRAME count,
                # not a byte count. The original passed 1024*16 intending
                # "16KB", which actually read 16384 frames (~32KB for 16-bit
                # mono audio). Convert the byte budget into whole frames.
                bytes_per_frame = wav_file.getsampwidth() * wav_file.getnchannels()
                frames_per_chunk = max(1, target_chunk_bytes // bytes_per_frame)

                # Send the PCM audio data in chunks.
                while True:
                    pcm_chunk = wav_file.readframes(frames_per_chunk)
                    if not pcm_chunk:
                        break  # End of file

                    await websocket.send(pcm_chunk)  # Send raw PCM data chunk
                    print(f"Sent PCM chunk of size {len(pcm_chunk)} bytes.")
                    await asyncio.sleep(0.01)  # Simulate real-time sending

        else:
            print(f"Failed to download audio file. Status code: {response.status_code}")
47
 
48
+
49
  async def receive_transcription(websocket):
50
  while True:
51
  try:
downloaded_audio.wav DELETED
@@ -1 +0,0 @@
1
- {"error":"File not found"}
 
 
infer.py CHANGED
@@ -185,76 +185,67 @@ async def websocket_transcribe(websocket: WebSocket):
185
 
186
  try:
187
  processed_segments = [] # Keeps track of the segments already transcribed
188
- accumulated_audio_size = 0 # Track how much audio data has been buffered
189
  accumulated_audio_time = 0 # Track the total audio duration accumulated
190
  last_transcribed_time = 0.0
191
- #min_transcription_time = 5.0 # Minimum duration of audio in seconds before transcription starts
192
-
193
- # A temporary file to store the growing audio data
194
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
195
- logging.info(f"Temporary audio file created at {temp_audio_file.name}")
196
- #temp_audio_filename = os.path.basename(temp_audio_file.name)
197
- output_directory = "/tmp"
198
- os.makedirs(output_directory, exist_ok=True)
199
- chunk_counter = 0
200
-
201
- while True:
202
- try:
203
- # Receive the next chunk of audio data
204
- audio_chunk = await websocket.receive_bytes()
205
- if not audio_chunk:
206
- logging.warning("Received empty audio chunk, skipping processing hey.")
207
- continue
208
-
209
-
210
- # Create a new file for the chunk
211
- chunk_filename = os.path.join(output_directory, f"audio_chunk_{chunk_counter}.wav")
212
- chunk_counter += 1
213
-
214
- with wave.open(chunk_filename, 'wb') as wav_file:
215
- wav_file.setnchannels(1) # Mono channel
216
- wav_file.setsampwidth(2) # 2 bytes per sample (16-bit audio)
217
- wav_file.setframerate(16000) # 16 kHz sample rate
218
- wav_file.writeframes(audio_chunk)
219
-
220
- # with open(chunk_filename, 'wb') as audio_file:
221
- # audio_file.write(audio_chunk)
222
-
223
- # Write audio chunk to file and accumulate size and time
224
- temp_audio_file.write(audio_chunk)
225
- temp_audio_file.flush()
226
- accumulated_audio_size += len(audio_chunk)
227
-
228
- # Estimate the duration of the chunk based on its size (e.g., 16kHz audio)
229
- chunk_duration = len(audio_chunk) / (16000 * 2) # Assuming 16kHz mono WAV (2 bytes per sample)
230
- accumulated_audio_time += chunk_duration
231
- logging.info(f"Received and buffered {len(audio_chunk)} bytes, total buffered: {accumulated_audio_size} bytes, total time: {accumulated_audio_time:.2f} seconds")
232
-
233
- # Transcribe when enough time (audio) is accumulated (e.g., at least 5 seconds of audio)
234
- #if accumulated_audio_time >= min_transcription_time:
235
- #logging.info("Buffered enough audio time, starting transcription.")
236
-
237
-
238
- # Call the transcription function with the last processed time
239
- partial_result, last_transcribed_time = transcribe_core_ws(temp_audio_file.name, last_transcribed_time)
240
- accumulated_audio_time = 0 # Reset the accumulated audio time
241
- processed_segments.extend(partial_result['new_segments'])
242
-
243
- # Reset the accumulated audio size after transcription
244
- accumulated_audio_size = 0
245
 
246
  # Send the transcription result back to the client with both new and all processed segments
247
  response = {
248
  "new_segments": partial_result['new_segments'],
249
- "processed_segments": processed_segments,
250
- "download_url": f"https://gigaverse-ivrit-ai-streaming.hf.space/download_audio/{os.path.basename(chunk_filename)}"
251
  }
252
  logging.info(f"Sending {len(partial_result['new_segments'])} new segments to the client.")
253
  await websocket.send_json(response)
254
 
255
- except WebSocketDisconnect:
256
- logging.info("WebSocket connection closed by the client.")
257
- break
258
 
259
  except Exception as e:
260
  logging.error(f"Unexpected error during WebSocket transcription: {e}")
 
185
 
186
  try:
187
  processed_segments = [] # Keeps track of the segments already transcribed
 
188
  accumulated_audio_time = 0 # Track the total audio duration accumulated
189
  last_transcribed_time = 0.0
190
+ min_transcription_time = 5.0 # Minimum duration of audio in seconds before transcription starts
191
+
192
+ # A buffer to store raw PCM audio data
193
+ pcm_audio_buffer = bytearray()
194
+
195
+ # Metadata for the incoming PCM data (sample rate, channels, and sample width should be consistent)
196
+ sample_rate = 16000 # 16kHz
197
+ channels = 1 # Mono
198
+ sample_width = 2 # 2 bytes per sample (16-bit audio)
199
+
200
+ while True:
201
+ try:
202
+ # Receive the next chunk of PCM audio data
203
+ audio_chunk = await websocket.receive_bytes()
204
+ if not audio_chunk:
205
+ logging.warning("Received empty audio chunk, skipping processing.")
206
+ continue
207
+
208
+ # Accumulate the raw PCM data into the buffer
209
+ pcm_audio_buffer.extend(audio_chunk)
210
+
211
+ # Estimate the duration of the chunk based on its size
212
+ chunk_duration = len(audio_chunk) / (sample_rate * channels * sample_width)
213
+ accumulated_audio_time += chunk_duration
214
+ logging.info(f"Received and buffered {len(audio_chunk)} bytes, total buffered: {len(pcm_audio_buffer)} bytes, total time: {accumulated_audio_time:.2f} seconds")
215
+
216
+ # Transcribe when enough time (audio) is accumulated (e.g., at least 5 seconds of audio)
217
+ if accumulated_audio_time >= min_transcription_time:
218
+ logging.info("Buffered enough audio time, starting transcription.")
219
+
220
+ # Create a temporary WAV file from the accumulated PCM data
221
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav_file:
222
+ with wave.open(temp_wav_file.name, 'wb') as wav_file:
223
+ wav_file.setnchannels(channels)
224
+ wav_file.setsampwidth(sample_width)
225
+ wav_file.setframerate(sample_rate)
226
+ wav_file.writeframes(pcm_audio_buffer)
227
+
228
+ logging.info(f"Temporary WAV file created at {temp_wav_file.name} for transcription.")
229
+
230
+ # Call the transcription function with the WAV file
231
+ partial_result, last_transcribed_time = transcribe_core_ws(temp_wav_file.name, last_transcribed_time)
232
+ processed_segments.extend(partial_result['new_segments'])
233
+
234
+ # Clear the buffer after transcription
235
+ pcm_audio_buffer.clear()
236
+ accumulated_audio_time = 0 # Reset accumulated time
 
 
 
 
 
 
 
237
 
238
  # Send the transcription result back to the client with both new and all processed segments
239
  response = {
240
  "new_segments": partial_result['new_segments'],
241
+ "processed_segments": processed_segments
 
242
  }
243
  logging.info(f"Sending {len(partial_result['new_segments'])} new segments to the client.")
244
  await websocket.send_json(response)
245
 
246
+ except WebSocketDisconnect:
247
+ logging.info("WebSocket connection closed by the client.")
248
+ break
249
 
250
  except Exception as e:
251
  logging.error(f"Unexpected error during WebSocket transcription: {e}")