Spaces:
Sleeping
Sleeping
AshDavid12
committed on
Commit
·
d7b2452
1
Parent(s):
b157f40
pcm change
Browse files
Changed the client to download the WAV file and then send the PCM chunks; the server now accepts the PCM chunks, converts them to WAV, and sends the result to Whisper.
- client.py +29 -15
- downloaded_audio.wav +0 -1
- infer.py +51 -60
client.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import asyncio
|
2 |
import json
|
|
|
3 |
|
4 |
import websockets
|
5 |
import requests
|
@@ -9,29 +10,42 @@ import ssl
|
|
9 |
AUDIO_FILE_URL = "https://raw.githubusercontent.com/AshDavid12/runpod-serverless-forked/main/test_hebrew.wav" # Use WAV file
|
10 |
|
11 |
async def send_audio(websocket):
|
12 |
-
buffer_size = 1024*
|
13 |
-
audio_buffer = bytearray()
|
14 |
|
15 |
-
|
|
|
16 |
if response.status_code == 200:
|
17 |
-
|
|
|
|
|
|
|
18 |
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
|
|
|
|
|
|
23 |
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
-
print("Finished sending audio.")
|
32 |
else:
|
33 |
print(f"Failed to download audio file. Status code: {response.status_code}")
|
34 |
|
|
|
35 |
async def receive_transcription(websocket):
|
36 |
while True:
|
37 |
try:
|
|
|
1 |
import asyncio
|
2 |
import json
|
3 |
+
import wave
|
4 |
|
5 |
import websockets
|
6 |
import requests
|
|
|
10 |
AUDIO_FILE_URL = "https://raw.githubusercontent.com/AshDavid12/runpod-serverless-forked/main/test_hebrew.wav" # Use WAV file
|
11 |
|
12 |
async def send_audio(websocket):
|
13 |
+
buffer_size = 1024 * 16 # Send smaller chunks (16KB) for real-time processing
|
|
|
14 |
|
15 |
+
# Download the WAV file locally
|
16 |
+
with requests.get(AUDIO_FILE_URL, stream=True) as response:
|
17 |
if response.status_code == 200:
|
18 |
+
with open('downloaded_audio.wav', 'wb') as f:
|
19 |
+
for chunk in response.iter_content(chunk_size=1024):
|
20 |
+
f.write(chunk)
|
21 |
+
print("Audio file downloaded successfully.")
|
22 |
|
23 |
+
# Open the downloaded WAV file and extract PCM data
|
24 |
+
with wave.open('downloaded_audio.wav', 'rb') as wav_file:
|
25 |
+
metadata = {
|
26 |
+
'sample_rate': wav_file.getframerate(),
|
27 |
+
'channels': wav_file.getnchannels(),
|
28 |
+
'sampwidth': wav_file.getsampwidth(),
|
29 |
+
}
|
30 |
|
31 |
+
# Send metadata to the server before sending the audio
|
32 |
+
await websocket.send(json.dumps(metadata))
|
33 |
+
print(f"Sent metadata: {metadata}")
|
34 |
+
|
35 |
+
# Send the PCM audio data in chunks
|
36 |
+
while True:
|
37 |
+
pcm_chunk = wav_file.readframes(buffer_size)
|
38 |
+
if not pcm_chunk:
|
39 |
+
break # End of file
|
40 |
+
|
41 |
+
await websocket.send(pcm_chunk) # Send raw PCM data chunk
|
42 |
+
print(f"Sent PCM chunk of size {len(pcm_chunk)} bytes.")
|
43 |
+
await asyncio.sleep(0.01) # Simulate real-time sending
|
44 |
|
|
|
45 |
else:
|
46 |
print(f"Failed to download audio file. Status code: {response.status_code}")
|
47 |
|
48 |
+
|
49 |
async def receive_transcription(websocket):
|
50 |
while True:
|
51 |
try:
|
downloaded_audio.wav
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
{"error":"File not found"}
|
|
|
|
infer.py
CHANGED
@@ -185,76 +185,67 @@ async def websocket_transcribe(websocket: WebSocket):
|
|
185 |
|
186 |
try:
|
187 |
processed_segments = [] # Keeps track of the segments already transcribed
|
188 |
-
accumulated_audio_size = 0 # Track how much audio data has been buffered
|
189 |
accumulated_audio_time = 0 # Track the total audio duration accumulated
|
190 |
last_transcribed_time = 0.0
|
191 |
-
|
192 |
-
|
193 |
-
# A
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
#
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
#
|
236 |
-
|
237 |
-
|
238 |
-
# Call the transcription function with the last processed time
|
239 |
-
partial_result, last_transcribed_time = transcribe_core_ws(temp_audio_file.name, last_transcribed_time)
|
240 |
-
accumulated_audio_time = 0 # Reset the accumulated audio time
|
241 |
-
processed_segments.extend(partial_result['new_segments'])
|
242 |
-
|
243 |
-
# Reset the accumulated audio size after transcription
|
244 |
-
accumulated_audio_size = 0
|
245 |
|
246 |
# Send the transcription result back to the client with both new and all processed segments
|
247 |
response = {
|
248 |
"new_segments": partial_result['new_segments'],
|
249 |
-
"processed_segments": processed_segments
|
250 |
-
"download_url": f"https://gigaverse-ivrit-ai-streaming.hf.space/download_audio/{os.path.basename(chunk_filename)}"
|
251 |
}
|
252 |
logging.info(f"Sending {len(partial_result['new_segments'])} new segments to the client.")
|
253 |
await websocket.send_json(response)
|
254 |
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
|
259 |
except Exception as e:
|
260 |
logging.error(f"Unexpected error during WebSocket transcription: {e}")
|
|
|
185 |
|
186 |
try:
|
187 |
processed_segments = [] # Keeps track of the segments already transcribed
|
|
|
188 |
accumulated_audio_time = 0 # Track the total audio duration accumulated
|
189 |
last_transcribed_time = 0.0
|
190 |
+
min_transcription_time = 5.0 # Minimum duration of audio in seconds before transcription starts
|
191 |
+
|
192 |
+
# A buffer to store raw PCM audio data
|
193 |
+
pcm_audio_buffer = bytearray()
|
194 |
+
|
195 |
+
# Metadata for the incoming PCM data (sample rate, channels, and sample width should be consistent)
|
196 |
+
sample_rate = 16000 # 16kHz
|
197 |
+
channels = 1 # Mono
|
198 |
+
sample_width = 2 # 2 bytes per sample (16-bit audio)
|
199 |
+
|
200 |
+
while True:
|
201 |
+
try:
|
202 |
+
# Receive the next chunk of PCM audio data
|
203 |
+
audio_chunk = await websocket.receive_bytes()
|
204 |
+
if not audio_chunk:
|
205 |
+
logging.warning("Received empty audio chunk, skipping processing.")
|
206 |
+
continue
|
207 |
+
|
208 |
+
# Accumulate the raw PCM data into the buffer
|
209 |
+
pcm_audio_buffer.extend(audio_chunk)
|
210 |
+
|
211 |
+
# Estimate the duration of the chunk based on its size
|
212 |
+
chunk_duration = len(audio_chunk) / (sample_rate * channels * sample_width)
|
213 |
+
accumulated_audio_time += chunk_duration
|
214 |
+
logging.info(f"Received and buffered {len(audio_chunk)} bytes, total buffered: {len(pcm_audio_buffer)} bytes, total time: {accumulated_audio_time:.2f} seconds")
|
215 |
+
|
216 |
+
# Transcribe when enough time (audio) is accumulated (e.g., at least 5 seconds of audio)
|
217 |
+
if accumulated_audio_time >= min_transcription_time:
|
218 |
+
logging.info("Buffered enough audio time, starting transcription.")
|
219 |
+
|
220 |
+
# Create a temporary WAV file from the accumulated PCM data
|
221 |
+
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav_file:
|
222 |
+
with wave.open(temp_wav_file.name, 'wb') as wav_file:
|
223 |
+
wav_file.setnchannels(channels)
|
224 |
+
wav_file.setsampwidth(sample_width)
|
225 |
+
wav_file.setframerate(sample_rate)
|
226 |
+
wav_file.writeframes(pcm_audio_buffer)
|
227 |
+
|
228 |
+
logging.info(f"Temporary WAV file created at {temp_wav_file.name} for transcription.")
|
229 |
+
|
230 |
+
# Call the transcription function with the WAV file
|
231 |
+
partial_result, last_transcribed_time = transcribe_core_ws(temp_wav_file.name, last_transcribed_time)
|
232 |
+
processed_segments.extend(partial_result['new_segments'])
|
233 |
+
|
234 |
+
# Clear the buffer after transcription
|
235 |
+
pcm_audio_buffer.clear()
|
236 |
+
accumulated_audio_time = 0 # Reset accumulated time
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
237 |
|
238 |
# Send the transcription result back to the client with both new and all processed segments
|
239 |
response = {
|
240 |
"new_segments": partial_result['new_segments'],
|
241 |
+
"processed_segments": processed_segments
|
|
|
242 |
}
|
243 |
logging.info(f"Sending {len(partial_result['new_segments'])} new segments to the client.")
|
244 |
await websocket.send_json(response)
|
245 |
|
246 |
+
except WebSocketDisconnect:
|
247 |
+
logging.info("WebSocket connection closed by the client.")
|
248 |
+
break
|
249 |
|
250 |
except Exception as e:
|
251 |
logging.error(f"Unexpected error during WebSocket transcription: {e}")
|