Spaces:
Sleeping
Sleeping
AshDavid12
committed on
Commit
·
f4a3257
1
Parent(s):
1ad41b2
reverting back to partial trans
Browse files
client.py
CHANGED
@@ -1,161 +1,66 @@
|
|
1 |
import asyncio
|
2 |
-
import io
|
3 |
-
import json
|
4 |
-
|
5 |
-
import numpy as np
|
6 |
import websockets
|
7 |
import requests
|
8 |
import ssl
|
9 |
-
import wave
|
10 |
-
import logging
|
11 |
-
import sys
|
12 |
-
import sounddevice as sd
|
13 |
-
|
14 |
|
15 |
# Parameters for reading and sending the audio
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
|
29 |
ssl_context.check_hostname = False
|
30 |
ssl_context.verify_mode = ssl.CERT_NONE
|
31 |
-
logger.info(f"Connecting to server at {uri}")
|
32 |
-
try:
|
33 |
-
async with websockets.connect(uri,ssl=ssl_context) as websocket:
|
34 |
-
logger.info("WebSocket connection established")
|
35 |
-
# Start tasks for sending and receiving
|
36 |
-
send_task = asyncio.create_task(send_audio(websocket))
|
37 |
-
receive_task = asyncio.create_task(receive_transcriptions(websocket))
|
38 |
-
await asyncio.gather(send_task, receive_task)
|
39 |
-
except Exception as e:
|
40 |
-
logger.error(f"WebSocket connection error: {e}")
|
41 |
-
max_size_bytes = 50_000_000 # 10 MB
|
42 |
-
|
43 |
-
SAMPLE_RATE = 16000
|
44 |
-
CHUNK_SIZE =1024
|
45 |
-
|
46 |
-
async def send_audio_chunks(websocket):
|
47 |
-
"""Capture audio and send chunks to the server via WebSocket."""
|
48 |
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
websocket.send(json.dumps(audio_chunk)), asyncio.get_event_loop()
|
55 |
)
|
56 |
|
57 |
-
|
58 |
-
with sd.InputStream(callback=audio_callback, channels=1, samplerate=SAMPLE_RATE, blocksize=CHUNK_SIZE):
|
59 |
-
await asyncio.Future() # Keep the stream open and running
|
60 |
-
|
61 |
-
|
62 |
-
async def receive_transcriptions(websocket):
|
63 |
-
try:
|
64 |
-
logger.info("Starting to receive transcriptions")
|
65 |
-
async for message in websocket: # This is the same as websocket.recv()
|
66 |
-
logger.info(f"Received transcription: {message}")
|
67 |
-
print(f"Transcription: {message}")
|
68 |
-
except Exception as e:
|
69 |
-
logger.error(f"Receive transcription error: {e}")
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
if __name__ == "__main__":
|
76 |
-
asyncio.run(send_receive())
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
# async def send_audio(websocket):
|
92 |
-
# buffer_size = 512 * 1024 #HAVE TO HAVE 512!!
|
93 |
-
# audio_buffer = bytearray()
|
94 |
-
#
|
95 |
-
# with requests.get(AUDIO_FILE_URL, stream=True, allow_redirects=False) as response:
|
96 |
-
# if response.status_code == 200:
|
97 |
-
# print("Starting to stream audio file...")
|
98 |
-
#
|
99 |
-
# for chunk in response.iter_content(chunk_size=1024): # Stream in chunks
|
100 |
-
# if chunk:
|
101 |
-
# audio_buffer.extend(chunk)
|
102 |
-
# #print(f"Received audio chunk of size {len(chunk)} bytes.")
|
103 |
-
#
|
104 |
-
# # Send buffered audio data once it's large enough
|
105 |
-
# if len(audio_buffer) >= buffer_size:
|
106 |
-
# await websocket.send(audio_buffer)
|
107 |
-
# #print(f"Sent {len(audio_buffer)} bytes of audio data.")
|
108 |
-
# audio_buffer.clear()
|
109 |
-
# await asyncio.sleep(0.01)
|
110 |
-
#
|
111 |
-
# print("Finished sending audio.")
|
112 |
-
# else:
|
113 |
-
# print(f"Failed to download audio file. Status code: {response.status_code}")
|
114 |
-
#
|
115 |
-
#
|
116 |
-
# async def receive_transcription(websocket):
|
117 |
-
# while True:
|
118 |
-
# try:
|
119 |
-
#
|
120 |
-
# transcription = await websocket.recv()
|
121 |
-
# # Receive transcription from the server
|
122 |
-
# print(f"Transcription: {transcription}")
|
123 |
-
# except Exception as e:
|
124 |
-
# print(f"Error receiving transcription: {e}")
|
125 |
-
# #await asyncio.sleep(30)
|
126 |
-
# break
|
127 |
-
#
|
128 |
-
#
|
129 |
-
# async def send_heartbeat(websocket):
|
130 |
-
# while True:
|
131 |
-
# try:
|
132 |
-
# await websocket.ping()
|
133 |
-
# print("Sent keepalive ping")
|
134 |
-
# except websockets.ConnectionClosed:
|
135 |
-
# print("Connection closed, stopping heartbeat")
|
136 |
-
# break
|
137 |
-
# await asyncio.sleep(30) # Send ping every 30 seconds (adjust as needed)
|
138 |
-
#
|
139 |
-
#
|
140 |
-
# async def run_client():
|
141 |
-
# uri = ("wss://gigaverse-ivrit-ai-streaming.hf.space/wtranscribe") # WebSocket URL
|
142 |
-
# ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
|
143 |
-
# ssl_context.check_hostname = False
|
144 |
-
# ssl_context.verify_mode = ssl.CERT_NONE
|
145 |
-
# while True:
|
146 |
-
# try:
|
147 |
-
# async with websockets.connect(uri, ssl=ssl_context, ping_timeout=1000, ping_interval=50) as websocket:
|
148 |
-
# await asyncio.gather(
|
149 |
-
# send_audio(websocket),
|
150 |
-
# receive_transcription(websocket),
|
151 |
-
# send_heartbeat(websocket)
|
152 |
-
# )
|
153 |
-
# except websockets.ConnectionClosedError as e:
|
154 |
-
# print(f"WebSocket closed with error: {e}")
|
155 |
-
# # except Exception as e:
|
156 |
-
# # print(f"Unexpected error: {e}")
|
157 |
-
# #
|
158 |
-
# # print("Reconnecting in 5 seconds...")
|
159 |
-
# # await asyncio.sleep(5) # Wait 5 seconds before reconnecting
|
160 |
-
#
|
161 |
-
# asyncio.run(run_client())
|
|
|
1 |
import asyncio
|
|
|
|
|
|
|
|
|
2 |
import websockets
|
3 |
import requests
|
4 |
import ssl
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
# Parameters for reading and sending the audio
|
7 |
+
AUDIO_FILE_URL = "https://raw.githubusercontent.com/AshDavid12/runpod-serverless-forked/main/test_hebrew.wav" # Use WAV file
|
8 |
+
|
9 |
+
async def send_audio(websocket):
    """Download the audio file at AUDIO_FILE_URL and forward it over the WebSocket.

    The HTTP response is read in 1 KB pieces that are collected in a local
    buffer. Each time the buffer reaches ``buffer_size`` bytes it is sent as
    one binary WebSocket message. Whatever is still buffered when the download
    ends is sent as a final, smaller message so the end of the file is not
    lost.

    :param websocket: Open connection object exposing an awaitable ``send()``.
    :return: None. A non-200 HTTP status is reported on stdout and nothing is sent.
    """
    buffer_size = 1024 * 512  # Buffer audio chunks up to 512KB before sending
    audio_buffer = bytearray()

    # NOTE(review): requests is a blocking client; while iter_content() waits
    # on the network the event loop cannot run the receive/heartbeat tasks.
    with requests.get(AUDIO_FILE_URL, stream=True, allow_redirects=False) as response:
        if response.status_code != 200:
            print(f"Failed to download audio file. Status code: {response.status_code}")
            return

        print("Starting to stream audio file...")

        for chunk in response.iter_content(chunk_size=1024):  # Stream in chunks
            if not chunk:
                continue

            audio_buffer.extend(chunk)
            print(f"Received audio chunk of size {len(chunk)} bytes.")

            # Send buffered audio data once it's large enough
            if len(audio_buffer) >= buffer_size:
                # Send an immutable copy so clearing the buffer below can
                # never affect data that is still being written out.
                await websocket.send(bytes(audio_buffer))
                print(f"Sent {len(audio_buffer)} bytes of audio data.")
                audio_buffer.clear()
                await asyncio.sleep(0.01)  # yield to the other client tasks

        # Flush the tail: without this, the last (< buffer_size) bytes of the
        # file -- or the entire file when it is smaller than buffer_size --
        # would never reach the server.
        if audio_buffer:
            await websocket.send(bytes(audio_buffer))
            print(f"Sent {len(audio_buffer)} bytes of audio data.")
            audio_buffer.clear()

        print("Finished sending audio.")
|
32 |
+
|
33 |
+
async def receive_transcription(websocket):
    """Print each message received from the server until receiving fails.

    :param websocket: Connection object exposing an awaitable ``recv()``.
    :return: None. The coroutine ends after the first exception raised while
        receiving or printing a message; that exception is reported on stdout
        rather than propagated.
    """
    keep_listening = True
    while keep_listening:
        try:
            # Receive transcription from the server
            message = await websocket.recv()
            print(f"Transcription: {message}")
        except Exception as exc:
            print(f"Error receiving transcription: {exc}")
            keep_listening = False
|
41 |
+
|
42 |
+
async def send_heartbeat(websocket):
    """Send a WebSocket ping every 30 seconds until the connection closes.

    :param websocket: Connection object exposing an awaitable ``ping()``.
    :return: None, once ``ping()`` raises ``websockets.ConnectionClosed``.
    """
    ping_interval_seconds = 30  # Send ping every 30 seconds (adjust as needed)
    while True:
        try:
            await websocket.ping()
        except websockets.ConnectionClosed:
            print("Connection closed, stopping heartbeat")
            return
        else:
            print("Sent keepalive ping")
        await asyncio.sleep(ping_interval_seconds)
|
51 |
+
|
52 |
+
|
53 |
+
async def run_client():
    """Open the WebSocket to the transcription server and run the client tasks.

    Three coroutines run concurrently on the same connection: ``send_audio``
    uploads the audio, ``receive_transcription`` prints server messages and
    ``send_heartbeat`` pings the server. The function returns when all
    three have finished, or raises as soon as one of them raises.
    """
    uri = ("wss://gigaverse-ivrit-ai-streaming.hf.space/wtranscribe")  # WebSocket URL

    # SECURITY NOTE(review): hostname checking and certificate verification are
    # both switched off, so the server's identity is not authenticated. Kept
    # as-is to preserve behaviour; confirm this is intended outside of testing.
    ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE

    # `close_timeout` is the current name of the argument formerly called
    # `timeout` (renamed in websockets 7.0); the old spelling is a legacy
    # alias and is not accepted by the newer asyncio client.
    async with websockets.connect(uri, ssl=ssl_context, close_timeout=60) as websocket:
        await asyncio.gather(
            send_audio(websocket),
            receive_transcription(websocket),
            send_heartbeat(websocket)
        )
|
65 |
|
66 |
+
asyncio.run(run_client())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
infer.py
CHANGED
@@ -123,129 +123,250 @@ async def read_root():
|
|
123 |
import tempfile
|
124 |
|
125 |
|
126 |
-
@app.websocket("/ws")
|
127 |
-
async def websocket_endpoint(websocket: WebSocket):
|
128 |
-
"""WebSocket endpoint to handle client connections."""
|
129 |
-
await websocket.accept()
|
130 |
-
client_ip = websocket.client.host
|
131 |
-
logger.info(f"Client connected: {client_ip}")
|
132 |
-
sys.stdout.flush()
|
133 |
-
try:
|
134 |
-
await process_audio_stream(websocket)
|
135 |
-
except WebSocketDisconnect:
|
136 |
-
logger.info(f"Client disconnected: {client_ip}")
|
137 |
-
except Exception as e:
|
138 |
-
logger.error(f"Unexpected error: {e}")
|
139 |
-
await websocket.close()
|
140 |
-
|
141 |
-
async def process_audio_stream(websocket: WebSocket):
|
142 |
-
"""Continuously receive audio chunks and initiate transcription tasks."""
|
143 |
-
sampling_rate = 16000
|
144 |
-
min_chunk_size = 5 # in seconds
|
145 |
-
|
146 |
-
transcription_task = None
|
147 |
-
chunk_counter = 0
|
148 |
-
total_bytes_received = 0
|
149 |
-
|
150 |
-
while True:
|
151 |
-
try:
|
152 |
-
# Receive audio data from client
|
153 |
-
data = await websocket.receive_bytes()
|
154 |
-
if not data:
|
155 |
-
logger.info("No data received, closing connection")
|
156 |
-
break
|
157 |
-
chunk_counter += 1
|
158 |
-
chunk_size = len(data)
|
159 |
-
total_bytes_received += chunk_size
|
160 |
-
#logger.debug(f"Received chunk {chunk_counter}: {chunk_size} bytes")
|
161 |
-
|
162 |
-
audio_chunk = process_received_audio(data)
|
163 |
-
#logger.debug(f"Processed audio chunk {chunk_counter}: {len(audio_chunk)} samples")
|
164 |
-
# Check if enough audio has been buffered
|
165 |
-
if transcription_task is None or transcription_task.done():
|
166 |
-
# Start a new transcription task
|
167 |
-
# logger.info(f"Starting transcription task for {len(audio_buffer)} samples")
|
168 |
-
transcription_task = asyncio.create_task(
|
169 |
-
transcribe_and_send(websocket, audio_chunk)
|
170 |
-
)
|
171 |
-
|
172 |
-
#logger.debug(f"Audio buffer size: {len(audio_buffer)} samples")
|
173 |
-
except Exception as e:
|
174 |
-
logger.error(f"Error receiving data: {e}")
|
175 |
-
break
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
async def transcribe_and_send(websocket: WebSocket, audio_data):
|
180 |
-
"""Run transcription in a separate thread and send the result to the client."""
|
181 |
-
logger.debug(f"Transcription task started for {len(audio_data)} samples")
|
182 |
-
transcription_result = await asyncio.to_thread(sync_transcribe_audio, audio_data)
|
183 |
-
if transcription_result:
|
184 |
-
try:
|
185 |
-
# Send the result as JSON
|
186 |
-
await websocket.send_json(transcription_result)
|
187 |
-
logger.info(f"Transcription JSON sent to client {transcription_result}")
|
188 |
-
except Exception as e:
|
189 |
-
logger.error(f"Error sending transcription: {e}")
|
190 |
-
else:
|
191 |
-
logger.warning("No transcription result to send")
|
192 |
-
|
193 |
-
def sync_transcribe_audio(audio_data):
|
194 |
-
"""Synchronously transcribe audio data using the ASR model and format the result."""
|
195 |
-
try:
|
196 |
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
)
|
201 |
-
logger.info('Transcription completed')
|
202 |
|
203 |
-
|
204 |
-
|
|
|
|
|
|
|
205 |
|
206 |
-
|
207 |
-
|
208 |
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
216 |
|
217 |
seg = {
|
218 |
-
'id':
|
219 |
-
'
|
220 |
-
'
|
221 |
-
'end': float(s.end),
|
222 |
-
'text': s.text,
|
223 |
-
'avg_logprob': float(s.avg_logprob),
|
224 |
-
'compression_ratio': float(s.compression_ratio),
|
225 |
-
'no_speech_prob': float(s.no_speech_prob),
|
226 |
-
'words': words
|
227 |
}
|
228 |
-
|
229 |
-
ret['
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
230 |
|
231 |
-
logger.debug(f"Total segments in transcription result: {len(ret['segments'])}")
|
232 |
-
return ret
|
233 |
except Exception as e:
|
234 |
-
|
235 |
-
|
|
|
|
|
|
|
236 |
|
237 |
-
def process_received_audio(data):
|
238 |
-
"""Convert received bytes into normalized float32 NumPy array."""
|
239 |
-
#logger.debug(f"Processing received audio data of size {len(data)} bytes")
|
240 |
-
audio_int16 = np.frombuffer(data, dtype=np.int16)
|
241 |
-
#logger.debug(f"Converted to int16 NumPy array with {len(audio_int16)} samples")
|
242 |
|
243 |
-
audio_float32 = audio_int16.astype(np.float32) / 32768.0 # Normalize to [-1, 1]
|
244 |
-
#logger.debug(f"Normalized audio data to float32 with {len(audio_float32)} samples")
|
245 |
|
246 |
-
return audio_float32
|
247 |
|
248 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
249 |
|
250 |
|
251 |
|
|
|
123 |
import tempfile
|
124 |
|
125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
126 |
|
127 |
+
def _segment_to_dict(s):
    """Convert one transcription segment into a dict of builtin Python types."""
    # Numeric fields are cast explicitly so the result can always be passed to
    # json.dumps; presumably the model may hand back NumPy scalars here
    # (TODO confirm against the model library in use).
    words = [
        {
            'start': float(w.start),
            'end': float(w.end),
            'word': w.word,
            'probability': float(w.probability),
        }
        for w in s.words
    ]
    return {
        'id': int(s.id), 'seek': int(s.seek), 'start': float(s.start), 'end': float(s.end), 'text': s.text,
        'avg_logprob': float(s.avg_logprob), 'compression_ratio': float(s.compression_ratio),
        'no_speech_prob': float(s.no_speech_prob), 'words': words
    }


def transcribe_core_ws(audio_file, last_transcribed_time):
    """
    Transcribe the audio file and return only the segments that have not been processed yet.

    The whole file is transcribed on every call; segments whose start time is
    earlier than ``last_transcribed_time`` are then discarded.

    :param audio_file: Path to the growing audio file.
    :param last_transcribed_time: The last time (in seconds) that was transcribed.
    :return: Tuple ``(ret, new_last_transcribed_time)`` where ``ret`` is
        ``{'new_segments': [...]}`` and the second item is the largest segment
        end time seen among the new segments (or the input value if none).
    :raises Exception: Whatever ``model.transcribe`` raises, after logging it.
    """
    logging.info(f"Starting transcription for file: {audio_file} from {last_transcribed_time} seconds.")

    ret = {'new_segments': []}
    new_last_transcribed_time = last_transcribed_time

    try:
        # Transcribe the entire audio file
        logging.debug(f"Initiating model transcription for file: {audio_file}")
        segs, _ = model.transcribe(audio_file, language='he', word_timestamps=True)
        logging.info('Transcription completed successfully.')
    except Exception as e:
        # logging.exception records the traceback; bare `raise` re-raises the
        # original exception unchanged.
        logging.exception(f"Error during transcription: {e}")
        raise

    # NOTE(review): if `segs` is a lazy generator (as in faster-whisper), the
    # decoding actually happens during this loop, outside the try above --
    # confirm whether errors raised here should be logged as well.
    # Track the new segments and update the last transcribed time
    for s in segs:
        logging.info(f"Processing segment with start time: {s.start} and end time: {s.end}")

        # Only process segments that start after the last transcribed time
        if s.start < last_transcribed_time:
            continue

        logging.info(f"New segment found starting at {s.start} seconds.")
        seg = _segment_to_dict(s)
        logging.info(f'Adding new transcription segment: {seg}')
        ret['new_segments'].append(seg)

        # Update the last transcribed time to the end of the current segment
        new_last_transcribed_time = max(new_last_transcribed_time, seg['end'])
        logging.debug(f"Updated last transcribed time to: {new_last_transcribed_time} seconds")

    return ret, new_last_transcribed_time
|
172 |
+
|
173 |
+
|
174 |
+
import tempfile
|
175 |
+
|
176 |
+
|
177 |
+
@app.websocket("/wtranscribe")
async def websocket_transcribe(websocket: WebSocket):
    """Receive audio over a WebSocket and stream back incremental transcriptions.

    Every binary message from the client is appended to a temporary ``.wav``
    file. After each append the whole file is passed to ``transcribe_core_ws``
    and a JSON message is sent back containing the segments that are new since
    the previous call (``new_segments``) and every segment sent so far
    (``processed_segments``).

    The loop ends when the client disconnects. Any other exception is logged
    and reported to the client as ``{"error": ...}`` on a best-effort basis.
    The temporary file is removed when the handler exits.

    :param websocket: The incoming WebSocket connection.
    """
    import os  # local import: only needed for temp-file cleanup below

    logging.info("New WebSocket connection request received.")
    await websocket.accept()
    logging.info("WebSocket connection established successfully.")

    temp_audio_path = None  # set once the temp file exists, used for cleanup

    try:
        processed_segments = []  # Keeps track of the segments already transcribed
        accumulated_audio_size = 0  # Track how much audio data has been buffered
        accumulated_audio_time = 0  # Track the total audio duration accumulated
        last_transcribed_time = 0.0

        # A temporary file to store the growing audio data. delete=False keeps
        # the path valid for the transcriber to open while this handle is
        # still open; removal is done explicitly in `finally`.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
            temp_audio_path = temp_audio_file.name
            logging.info(f"Temporary audio file created at {temp_audio_file.name}")

            while True:
                try:
                    # Receive the next chunk of audio data
                    audio_chunk = await websocket.receive_bytes()
                    if not audio_chunk:
                        logging.warning("Received empty audio chunk, skipping processing.")
                        continue

                    # Write audio chunk to file and accumulate size and time.
                    # flush() makes the bytes visible to the transcriber, which
                    # opens the file by path.
                    temp_audio_file.write(audio_chunk)
                    temp_audio_file.flush()
                    accumulated_audio_size += len(audio_chunk)

                    # Estimate the duration of the chunk based on its size
                    chunk_duration = len(audio_chunk) / (16000 * 2)  # Assuming 16kHz mono WAV (2 bytes per sample)
                    accumulated_audio_time += chunk_duration
                    logging.info(f"Received and buffered {len(audio_chunk)} bytes, total buffered: {accumulated_audio_size} bytes, total time: {accumulated_audio_time:.2f} seconds")

                    # NOTE(review): this is a synchronous call inside a
                    # coroutine, so the event loop is blocked for the duration
                    # of each transcription -- confirm this is acceptable.
                    # Call the transcription function with the last processed time
                    partial_result, last_transcribed_time = transcribe_core_ws(temp_audio_file.name, last_transcribed_time)
                    accumulated_audio_time = 0  # Reset the accumulated audio time
                    processed_segments.extend(partial_result['new_segments'])

                    # Reset the accumulated audio size after transcription
                    accumulated_audio_size = 0

                    # Send the transcription result back to the client with both new and all processed segments
                    response = {
                        "new_segments": partial_result['new_segments'],
                        "processed_segments": processed_segments
                    }
                    logging.info(f"Sending {len(partial_result['new_segments'])} new segments to the client.")
                    await websocket.send_json(response)

                except WebSocketDisconnect:
                    logging.info("WebSocket connection closed by the client.")
                    break

    except Exception as e:
        logging.error(f"Unexpected error during WebSocket transcription: {e}")
        # The connection may already be unusable (that can be the very reason
        # we are here), so a failure to report must not raise a second error.
        try:
            await websocket.send_json({"error": str(e)})
        except Exception as send_error:
            logging.warning(f"Could not deliver error message to the client: {send_error}")

    finally:
        logging.info("Cleaning up and closing WebSocket connection.")
        # Remove the temp file; otherwise every connection leaves one behind.
        if temp_audio_path is not None:
            try:
                os.remove(temp_audio_path)
            except OSError as cleanup_error:
                logging.warning(f"Could not remove temporary audio file {temp_audio_path}: {cleanup_error}")
|
243 |
|
|
|
|
|
|
|
|
|
|
|
244 |
|
|
|
|
|
245 |
|
|
|
246 |
|
247 |
|
248 |
+
# @app.websocket("/ws")
|
249 |
+
# async def websocket_endpoint(websocket: WebSocket):
|
250 |
+
# """WebSocket endpoint to handle client connections."""
|
251 |
+
# await websocket.accept()
|
252 |
+
# client_ip = websocket.client.host
|
253 |
+
# logger.info(f"Client connected: {client_ip}")
|
254 |
+
# sys.stdout.flush()
|
255 |
+
# try:
|
256 |
+
# await process_audio_stream(websocket)
|
257 |
+
# except WebSocketDisconnect:
|
258 |
+
# logger.info(f"Client disconnected: {client_ip}")
|
259 |
+
# except Exception as e:
|
260 |
+
# logger.error(f"Unexpected error: {e}")
|
261 |
+
# await websocket.close()
|
262 |
+
#
|
263 |
+
# async def process_audio_stream(websocket: WebSocket):
|
264 |
+
# """Continuously receive audio chunks and initiate transcription tasks."""
|
265 |
+
# sampling_rate = 16000
|
266 |
+
# min_chunk_size = 5 # in seconds
|
267 |
+
#
|
268 |
+
# transcription_task = None
|
269 |
+
# chunk_counter = 0
|
270 |
+
# total_bytes_received = 0
|
271 |
+
#
|
272 |
+
# while True:
|
273 |
+
# try:
|
274 |
+
# # Receive audio data from client
|
275 |
+
# data = await websocket.receive_bytes()
|
276 |
+
# if not data:
|
277 |
+
# logger.info("No data received, closing connection")
|
278 |
+
# break
|
279 |
+
# chunk_counter += 1
|
280 |
+
# chunk_size = len(data)
|
281 |
+
# total_bytes_received += chunk_size
|
282 |
+
# #logger.debug(f"Received chunk {chunk_counter}: {chunk_size} bytes")
|
283 |
+
#
|
284 |
+
# audio_chunk = process_received_audio(data)
|
285 |
+
# #logger.debug(f"Processed audio chunk {chunk_counter}: {len(audio_chunk)} samples")
|
286 |
+
# # Check if enough audio has been buffered
|
287 |
+
# # if transcription_task is None or transcription_task.done():
|
288 |
+
# # # Start a new transcription task
|
289 |
+
# # # logger.info(f"Starting transcription task for {len(audio_buffer)} samples")
|
290 |
+
# transcription_task = asyncio.create_task(
|
291 |
+
# transcribe_and_send(websocket, audio_chunk)
|
292 |
+
# )
|
293 |
+
#
|
294 |
+
# #logger.debug(f"Audio buffer size: {len(audio_buffer)} samples")
|
295 |
+
# except Exception as e:
|
296 |
+
# logger.error(f"Error receiving data: {e}")
|
297 |
+
# break
|
298 |
+
#
|
299 |
+
#
|
300 |
+
# async def transcribe_and_send(websocket: WebSocket, audio_data):
|
301 |
+
# """Run transcription in a separate thread and send the result to the client."""
|
302 |
+
# logger.debug(f"Transcription task started for {len(audio_data)} samples")
|
303 |
+
# transcription_result = await asyncio.to_thread(sync_transcribe_audio, audio_data)
|
304 |
+
# if transcription_result:
|
305 |
+
# try:
|
306 |
+
# # Send the result as JSON
|
307 |
+
# await websocket.send_json(transcription_result)
|
308 |
+
# logger.info(f"Transcription JSON sent to client {transcription_result}")
|
309 |
+
# except Exception as e:
|
310 |
+
# logger.error(f"Error sending transcription: {e}")
|
311 |
+
# else:
|
312 |
+
# logger.warning("No transcription result to send")
|
313 |
+
#
|
314 |
+
# def sync_transcribe_audio(audio_data):
|
315 |
+
# """Synchronously transcribe audio data using the ASR model and format the result."""
|
316 |
+
# try:
|
317 |
+
#
|
318 |
+
# logger.info('Starting transcription...')
|
319 |
+
# segments, info = model.transcribe(
|
320 |
+
# audio_data, language="he",compression_ratio_threshold=2.5, word_timestamps=True
|
321 |
+
# )
|
322 |
+
# logger.info('Transcription completed')
|
323 |
+
#
|
324 |
+
# # Build the transcription result as per your requirement
|
325 |
+
# ret = {'segments': []}
|
326 |
+
#
|
327 |
+
# for s in segments:
|
328 |
+
# logger.debug(f"Processing segment {s.id} with start time: {s.start} and end time: {s.end}")
|
329 |
+
#
|
330 |
+
# # Process words in the segment
|
331 |
+
# words = [{
|
332 |
+
# 'start': float(w.start),
|
333 |
+
# 'end': float(w.end),
|
334 |
+
# 'word': w.word,
|
335 |
+
# 'probability': float(w.probability)
|
336 |
+
# } for w in s.words]
|
337 |
+
#
|
338 |
+
# seg = {
|
339 |
+
# 'id': int(s.id),
|
340 |
+
# 'seek': int(s.seek),
|
341 |
+
# 'start': float(s.start),
|
342 |
+
# 'end': float(s.end),
|
343 |
+
# 'text': s.text,
|
344 |
+
# 'avg_logprob': float(s.avg_logprob),
|
345 |
+
# 'compression_ratio': float(s.compression_ratio),
|
346 |
+
# 'no_speech_prob': float(s.no_speech_prob),
|
347 |
+
# 'words': words
|
348 |
+
# }
|
349 |
+
# logger.debug(f'Adding new transcription segment: {seg}')
|
350 |
+
# ret['segments'].append(seg)
|
351 |
+
#
|
352 |
+
# logger.debug(f"Total segments in transcription result: {len(ret['segments'])}")
|
353 |
+
# return ret
|
354 |
+
# except Exception as e:
|
355 |
+
# logger.error(f"Transcription error: {e}")
|
356 |
+
# return {}
|
357 |
+
#
|
358 |
+
# def process_received_audio(data):
|
359 |
+
# """Convert received bytes into normalized float32 NumPy array."""
|
360 |
+
# #logger.debug(f"Processing received audio data of size {len(data)} bytes")
|
361 |
+
# audio_int16 = np.frombuffer(data, dtype=np.int16)
|
362 |
+
# #logger.debug(f"Converted to int16 NumPy array with {len(audio_int16)} samples")
|
363 |
+
#
|
364 |
+
# audio_float32 = audio_int16.astype(np.float32) / 32768.0 # Normalize to [-1, 1]
|
365 |
+
# #logger.debug(f"Normalized audio data to float32 with {len(audio_float32)} samples")
|
366 |
+
#
|
367 |
+
# return audio_float32
|
368 |
+
#
|
369 |
+
#
|
370 |
|
371 |
|
372 |
|