Spaces:
Sleeping
Sleeping
Merge branch 'main' of https://huggingface.co/spaces/Gigaverse/ivrit-ai-streaming
Browse files- .gitignore +2 -0
- client.py +104 -29
- downloaded_audio.wav +0 -1
- infer.py +148 -99
- pyproject.toml +7 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
*.wav
|
2 |
+
*.ogg
|
client.py
CHANGED
@@ -1,36 +1,111 @@
|
|
1 |
import asyncio
|
2 |
import json
|
|
|
|
|
3 |
|
4 |
import websockets
|
5 |
import requests
|
6 |
import ssl
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
# Parameters for reading and sending the audio
|
9 |
AUDIO_FILE_URL = "https://raw.githubusercontent.com/AshDavid12/runpod-serverless-forked/main/test_hebrew.wav" # Use WAV file
|
10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
async def send_audio(websocket):
|
12 |
-
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
-
|
16 |
-
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
-
|
25 |
-
|
26 |
-
await websocket.send(audio_buffer)
|
27 |
-
#print(f"Sent {len(audio_buffer)} bytes of audio data.")
|
28 |
-
audio_buffer.clear()
|
29 |
-
await asyncio.sleep(0.01)
|
30 |
|
31 |
-
print("Finished sending audio.")
|
32 |
-
else:
|
33 |
-
print(f"Failed to download audio file. Status code: {response.status_code}")
|
34 |
|
35 |
async def receive_transcription(websocket):
|
36 |
while True:
|
@@ -38,17 +113,17 @@ async def receive_transcription(websocket):
|
|
38 |
transcription = await websocket.recv() # Receive transcription from the server
|
39 |
print(f"Transcription: {transcription}")
|
40 |
transcription = json.loads(transcription)
|
41 |
-
download_url = transcription.get('download_url')
|
42 |
-
if download_url:
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
except Exception as e:
|
53 |
print(f"Error receiving transcription: {e}")
|
54 |
break
|
|
|
1 |
import asyncio
|
2 |
import json
|
3 |
+
import logging
|
4 |
+
import wave
|
5 |
|
6 |
import websockets
|
7 |
import requests
|
8 |
import ssl
|
9 |
+
import sys
|
10 |
+
|
11 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s',
|
12 |
+
handlers=[logging.StreamHandler(sys.stdout)], force=True)
|
13 |
+
logger = logging.getLogger(__name__)
|
14 |
|
15 |
# Parameters for reading and sending the audio
|
16 |
AUDIO_FILE_URL = "https://raw.githubusercontent.com/AshDavid12/runpod-serverless-forked/main/test_hebrew.wav" # Use WAV file
|
17 |
|
18 |
+
from pydub import AudioSegment
|
19 |
+
|
20 |
+
|
21 |
+
# Convert and resample audio before writing it to WAV
|
22 |
+
# Convert and resample audio before writing it to WAV
|
23 |
+
def convert_to_mono_16k(audio_file_path):
    """Load a WAV file and return it downmixed to mono and resampled to 16 kHz.

    Args:
        audio_file_path: Path of the audio file to load. It is opened with
            ``AudioSegment.from_file(..., format="wav")``, so it must be WAV.

    Returns:
        The converted ``AudioSegment`` (1 channel, 16000 Hz frame rate).

    Raises:
        Exception: Any error raised while loading or converting the audio is
            logged and then re-raised unchanged for the caller to handle.
    """
    logging.info(f"Starting audio conversion to mono and resampling to 16kHz for file: {audio_file_path}")

    try:
        # Load the audio file into an AudioSegment object.
        audio_segment = AudioSegment.from_file(audio_file_path, format="wav")

        # Convert the audio to mono and resample it to 16kHz.
        audio_segment = audio_segment.set_channels(1).set_frame_rate(16000)
    except Exception as e:
        logging.error(f"Error during audio conversion: {e}")
        # Bare ``raise`` re-raises the active exception with its original
        # traceback instead of re-binding it through ``raise e``.
        raise

    logging.info("Audio conversion to mono and 16kHz completed successfully.")
    return audio_segment
|
40 |
+
|
41 |
+
|
42 |
async def send_audio(websocket):
    """Convert the local test recording and stream it to the server as raw PCM.

    The file ``test_copy.wav`` is converted with ``convert_to_mono_16k``; a JSON
    metadata message describing the PCM format is sent first, followed by the
    raw sample bytes in fixed-size chunks.

    Args:
        websocket: An open connection object exposing an awaitable
            ``send(data)`` method.

    Returns:
        None. Conversion and send failures are logged rather than raised.
    """
    buffer_size = 1024 * 16  # Send smaller chunks (16KB) for real-time processing
    logging.info("Converting the audio to mono and 16kHz.")

    try:
        converted_audio = convert_to_mono_16k('test_copy.wav')
    except Exception as e:
        logging.error(f"Failed to convert audio: {e}")
        return

    # Describe the stream using the converted segment's real properties. The
    # conversion only fixes channel count and frame rate, so the sample width
    # must be read back instead of being assumed to be 16-bit.
    metadata = {
        'sample_rate': converted_audio.frame_rate,
        'channels': converted_audio.channels,
        'sampwidth': converted_audio.sample_width,
    }
    await websocket.send(json.dumps(metadata))
    logging.info(f"Sent metadata: {metadata}")

    try:
        raw_data = converted_audio.raw_data
        logging.info(f"Starting to send raw PCM audio data. Total data size: {len(raw_data)} bytes.")

        for offset in range(0, len(raw_data), buffer_size):
            pcm_chunk = raw_data[offset:offset + buffer_size]
            await websocket.send(pcm_chunk)  # Send raw PCM data chunk
            # Short pause between chunks so the sender yields to other tasks.
            # NOTE: 16 KiB of 16 kHz mono 16-bit audio is ~0.5 s, so this is
            # considerably faster than real-time playback.
            await asyncio.sleep(0.01)

        logging.info("Completed sending all audio data.")
    except Exception as e:
        logging.error(f"Error while sending audio data: {e}")
|
75 |
+
|
76 |
+
# Download the WAV file locally
|
77 |
+
# with requests.get(AUDIO_FILE_URL, stream=True) as response:
|
78 |
+
# if response.status_code == 200:
|
79 |
+
# with open('downloaded_audio.wav', 'wb') as f:
|
80 |
+
# for chunk in response.iter_content(chunk_size=1024):
|
81 |
+
# f.write(chunk)
|
82 |
+
# print("Audio file downloaded successfully.")
|
83 |
|
84 |
+
# Open the downloaded WAV file and extract PCM data
|
85 |
+
# with wave.open('test_copy.wav', 'rb') as wav_file:
|
86 |
+
# metadata = {
|
87 |
+
# 'sample_rate': wav_file.getframerate(),
|
88 |
+
# 'channels': wav_file.getnchannels(),
|
89 |
+
# 'sampwidth': wav_file.getsampwidth(),
|
90 |
+
# }
|
91 |
+
#
|
92 |
+
# # Send metadata to the server before sending the audio
|
93 |
+
# await websocket.send(json.dumps(metadata))
|
94 |
+
# print(f"Sent metadata: {metadata}")
|
95 |
|
96 |
+
# # Send the PCM audio data in chunks
|
97 |
+
# while True:
|
98 |
+
# pcm_chunk = wav_file.readframes(buffer_size)
|
99 |
+
# if not pcm_chunk:
|
100 |
+
# break # End of file
|
101 |
+
#
|
102 |
+
# await websocket.send(pcm_chunk) # Send raw PCM data chunk
|
103 |
+
# #print(f"Sent PCM chunk of size {len(pcm_chunk)} bytes.")
|
104 |
+
# await asyncio.sleep(0.01) # Simulate real-time sending
|
105 |
|
106 |
+
# else:
|
107 |
+
# print(f"Failed to download audio file. Status code: {response.status_code}")
|
|
|
|
|
|
|
|
|
108 |
|
|
|
|
|
|
|
109 |
|
110 |
async def receive_transcription(websocket):
|
111 |
while True:
|
|
|
113 |
transcription = await websocket.recv() # Receive transcription from the server
|
114 |
print(f"Transcription: {transcription}")
|
115 |
transcription = json.loads(transcription)
|
116 |
+
#download_url = transcription.get('download_url')
|
117 |
+
# if download_url:
|
118 |
+
# print(f"Download URL: {download_url}")
|
119 |
+
# # Download the audio file
|
120 |
+
# response = requests.get(download_url)
|
121 |
+
# if response.status_code == 200:
|
122 |
+
# with open("downloaded_audio.wav", "wb") as f:
|
123 |
+
# f.write(response.content)
|
124 |
+
# print("File downloaded successfully")
|
125 |
+
# else:
|
126 |
+
# print(f"Failed to download file. Status code: {response.status_code}")
|
127 |
except Exception as e:
|
128 |
print(f"Error receiving transcription: {e}")
|
129 |
break
|
downloaded_audio.wav
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
{"error":"File not found"}
|
|
|
|
infer.py
CHANGED
@@ -21,7 +21,7 @@ import asyncio
|
|
21 |
from model import segment_to_dict
|
22 |
|
23 |
# Configure logging
|
24 |
-
logging.basicConfig(level=logging.
|
25 |
handlers=[logging.StreamHandler(sys.stdout)], force=True)
|
26 |
logger = logging.getLogger(__name__)
|
27 |
#logging.getLogger("asyncio").setLevel(logging.DEBUG)
|
@@ -184,24 +184,14 @@ async def read_root():
|
|
184 |
import tempfile
|
185 |
|
186 |
|
|
|
187 |
|
188 |
-
|
189 |
-
"""
|
190 |
-
Transcribe the audio file and return only the segments that have not been processed yet.
|
191 |
-
|
192 |
-
:param audio_file: Path to the growing audio file.
|
193 |
-
:param last_transcribed_time: The last time (in seconds) that was transcribed.
|
194 |
-
:return: Newly transcribed segments and the updated last transcribed time.
|
195 |
-
"""
|
196 |
-
logging.info(f"Starting transcription for file: {audio_file} from {last_transcribed_time} seconds.")
|
197 |
-
|
198 |
-
ret = {'new_segments': []}
|
199 |
-
new_last_transcribed_time = last_transcribed_time
|
200 |
|
201 |
try:
|
202 |
# Transcribe the entire audio file
|
203 |
logging.debug(f"Initiating model transcription for file: {audio_file}")
|
204 |
-
segs, _ = model.transcribe
|
205 |
logging.info('Transcription completed successfully.')
|
206 |
except Exception as e:
|
207 |
logging.error(f"Error during transcription: {e}")
|
@@ -210,31 +200,62 @@ def transcribe_core_ws(audio_file, last_transcribed_time):
|
|
210 |
# Track the new segments and update the last transcribed time
|
211 |
for s in segs:
|
212 |
logging.info(f"Processing segment with start time: {s.start} and end time: {s.end}")
|
|
|
213 |
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
|
|
|
|
|
|
218 |
|
219 |
-
seg = {
|
220 |
-
'id': s.id, 'seek': s.seek, 'start': s.start, 'end': s.end, 'text': s.text,
|
221 |
-
'avg_logprob': s.avg_logprob, 'compression_ratio': s.compression_ratio,
|
222 |
-
'no_speech_prob': s.no_speech_prob, 'words': words
|
223 |
-
}
|
224 |
-
logging.info(f'Adding new transcription segment: {seg}')
|
225 |
-
ret['new_segments'].append(seg)
|
226 |
-
|
227 |
-
# Update the last transcribed time to the end of the current segment
|
228 |
-
new_last_transcribed_time = max(new_last_transcribed_time, s.end)
|
229 |
-
logging.debug(f"Updated last transcribed time to: {new_last_transcribed_time} seconds")
|
230 |
|
231 |
#logging.info(f"Returning {len(ret['new_segments'])} new segments and updated last transcribed time.")
|
232 |
-
return ret
|
233 |
|
234 |
|
235 |
import tempfile
|
236 |
|
237 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
238 |
@app.websocket("/wtranscribe")
|
239 |
async def websocket_transcribe(websocket: WebSocket):
|
240 |
logging.info("New WebSocket connection request received.")
|
@@ -242,77 +263,111 @@ async def websocket_transcribe(websocket: WebSocket):
|
|
242 |
logging.info("WebSocket connection established successfully.")
|
243 |
|
244 |
try:
|
245 |
-
|
246 |
-
accumulated_audio_size = 0 # Track how much audio data has been buffered
|
247 |
accumulated_audio_time = 0 # Track the total audio duration accumulated
|
248 |
last_transcribed_time = 0.0
|
249 |
-
|
250 |
-
|
251 |
-
# A
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
303 |
|
304 |
# Send the transcription result back to the client with both new and all processed segments
|
305 |
response = {
|
306 |
-
"
|
307 |
-
"processed_segments": processed_segments,
|
308 |
-
"download_url": f"https://gigaverse-ivrit-ai-streaming.hf.space/download_audio/{os.path.basename(chunk_filename)}"
|
309 |
}
|
310 |
-
logging.info(f"Sending {len(partial_result['
|
311 |
await websocket.send_json(response)
|
312 |
|
313 |
-
|
314 |
-
|
315 |
-
|
|
|
|
|
|
|
|
|
|
|
316 |
|
317 |
except Exception as e:
|
318 |
logging.error(f"Unexpected error during WebSocket transcription: {e}")
|
@@ -459,12 +514,6 @@ async def download_audio(filename: str):
|
|
459 |
#
|
460 |
|
461 |
|
462 |
-
|
463 |
-
|
464 |
-
|
465 |
-
|
466 |
-
|
467 |
-
|
468 |
# @app.websocket("/wtranscribe")
|
469 |
# async def websocket_transcribe(websocket: WebSocket):
|
470 |
# logging.info("New WebSocket connection request received.")
|
|
|
21 |
from model import segment_to_dict
|
22 |
|
23 |
# Configure logging
|
24 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s',
|
25 |
handlers=[logging.StreamHandler(sys.stdout)], force=True)
|
26 |
logger = logging.getLogger(__name__)
|
27 |
#logging.getLogger("asyncio").setLevel(logging.DEBUG)
|
|
|
184 |
import tempfile
|
185 |
|
186 |
|
187 |
+
async def transcribe_core_ws(audio_file):
|
188 |
|
189 |
+
ret = {'segments': []}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
190 |
|
191 |
try:
|
192 |
# Transcribe the entire audio file
|
193 |
logging.debug(f"Initiating model transcription for file: {audio_file}")
|
194 |
+
segs, _ = await asyncio.to_thread(model.transcribe,audio_file, language='he', word_timestamps=True)
|
195 |
logging.info('Transcription completed successfully.')
|
196 |
except Exception as e:
|
197 |
logging.error(f"Error during transcription: {e}")
|
|
|
200 |
# Track the new segments and update the last transcribed time
|
201 |
for s in segs:
|
202 |
logging.info(f"Processing segment with start time: {s.start} and end time: {s.end}")
|
203 |
+
words = [{'start': w.start, 'end': w.end, 'word': w.word, 'probability': w.probability} for w in s.words]
|
204 |
|
205 |
+
seg = {
|
206 |
+
'id': s.id, 'seek': s.seek, 'start': s.start, 'end': s.end, 'text': s.text,
|
207 |
+
'avg_logprob': s.avg_logprob, 'compression_ratio': s.compression_ratio,
|
208 |
+
'no_speech_prob': s.no_speech_prob, 'words': words
|
209 |
+
}
|
210 |
+
logging.info(f'Adding new transcription segment: {seg}')
|
211 |
+
ret['segments'].append(seg)
|
212 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
213 |
|
214 |
#logging.info(f"Returning {len(ret['new_segments'])} new segments and updated last transcribed time.")
|
215 |
+
return ret
|
216 |
|
217 |
|
218 |
import tempfile
|
219 |
|
220 |
|
221 |
+
# Function to verify if the PCM data is valid
|
222 |
+
def validate_pcm_data(pcm_audio_buffer, sample_rate, channels, sample_width):
    """Check that a raw PCM buffer is non-empty and plausibly well-formed.

    Args:
        pcm_audio_buffer: Bytes-like object holding the accumulated PCM data.
        sample_rate: Frames per second of the stream.
        channels: Number of interleaved channels.
        sample_width: Bytes per sample for a single channel.

    Returns:
        ``False`` when the buffer is empty or the format parameters are not
        positive; ``True`` otherwise. A buffer whose length is not a whole
        number of frames is still accepted, but a warning is logged.
    """
    logging.info(f"Validating PCM data: total size = {len(pcm_audio_buffer)} bytes.")

    actual_size = len(pcm_audio_buffer)
    if actual_size == 0:
        logging.error("Received PCM data is empty.")
        return False

    # One frame holds one sample for every channel.
    frame_size = channels * sample_width
    if sample_rate <= 0 or frame_size <= 0:
        # Guard against a division by zero below and against nonsensical formats.
        logging.error(
            f"Invalid PCM format parameters: sample_rate={sample_rate}, "
            f"channels={channels}, sample_width={sample_width}.")
        return False

    bytes_per_second = sample_rate * frame_size
    logging.info(f"Expected sample size per second: {bytes_per_second} bytes.")

    # Streamed audio is almost never a whole number of seconds long, so the
    # meaningful alignment unit is a single frame, not one second of audio.
    if actual_size % frame_size != 0:
        logging.warning(
            f"PCM data size {actual_size} is not a multiple of the frame size ({frame_size} bytes). Data may be corrupted or incomplete.")

    return True
|
241 |
+
|
242 |
+
|
243 |
+
# Function to validate if the created WAV file is valid
|
244 |
+
def validate_wav_file(wav_file_path):
    """Report whether ``wav_file_path`` can be opened and parsed as a WAV file.

    Args:
        wav_file_path: Path of the WAV file to check.

    Returns:
        ``True`` when the file opens with the ``wave`` module and its header
        fields can be read; ``False`` when the file is missing, unreadable,
        truncated, or not a valid WAV file.
    """
    try:
        with wave.open(wav_file_path, 'rb') as wav_file:
            sample_rate = wav_file.getframerate()
            channels = wav_file.getnchannels()
            sample_width = wav_file.getsampwidth()
            logging.info(
                f"WAV file details - Sample Rate: {sample_rate}, Channels: {channels}, Sample Width: {sample_width}")
        return True
    except (wave.Error, EOFError, OSError) as e:
        # wave.Error: malformed header; EOFError: empty or truncated file;
        # OSError: missing or unreadable path. All mean "not a usable WAV".
        logging.error(f"Error reading WAV file: {e}")
        return False
|
257 |
+
|
258 |
+
|
259 |
@app.websocket("/wtranscribe")
|
260 |
async def websocket_transcribe(websocket: WebSocket):
|
261 |
logging.info("New WebSocket connection request received.")
|
|
|
263 |
logging.info("WebSocket connection established successfully.")
|
264 |
|
265 |
try:
|
266 |
+
segments = [] # Keeps track of the segments already transcribed
|
|
|
267 |
accumulated_audio_time = 0 # Track the total audio duration accumulated
|
268 |
last_transcribed_time = 0.0
|
269 |
+
min_transcription_time = 5.0 # Minimum duration of audio in seconds before transcription starts
|
270 |
+
|
271 |
+
# A buffer to store raw PCM audio data
|
272 |
+
pcm_audio_buffer = bytearray()
|
273 |
+
logging.info("im here, is it failing?.")
|
274 |
+
|
275 |
+
# Metadata for the incoming PCM data (sample rate, channels, and sample width should be consistent)
|
276 |
+
sample_rate = 16000 # 16kHz
|
277 |
+
channels = 1 # Mono
|
278 |
+
sample_width = 2 # 2 bytes per sample (16-bit audio)
|
279 |
+
|
280 |
+
# Ensure the /tmp directory exists
|
281 |
+
tmp_directory = "/tmp"
|
282 |
+
if not os.path.exists(tmp_directory):
|
283 |
+
logging.info(f"Creating /tmp directory: {tmp_directory}")
|
284 |
+
os.makedirs(tmp_directory)
|
285 |
+
logging.info("im here, is it failing?2.")
|
286 |
+
while True:
|
287 |
+
logging.info("in while true")
|
288 |
+
try:
|
289 |
+
# Receive the next chunk of PCM audio data
|
290 |
+
logging.info("in try before recive ")
|
291 |
+
audio_chunk = await asyncio.wait_for(websocket.receive_bytes(), timeout=10.0)
|
292 |
+
|
293 |
+
logging.info("after recieve")
|
294 |
+
sys.stdout.flush()
|
295 |
+
if not audio_chunk:
|
296 |
+
logging.warning("Received empty audio chunk, skipping processing.")
|
297 |
+
continue
|
298 |
+
|
299 |
+
# Accumulate the raw PCM data into the buffer
|
300 |
+
pcm_audio_buffer.extend(audio_chunk)
|
301 |
+
print(f"len of pcm buffer: {len(pcm_audio_buffer)}")
|
302 |
+
logging.info("after buffer extend")
|
303 |
+
|
304 |
+
# Validate the PCM data after each chunk
|
305 |
+
if not validate_pcm_data(pcm_audio_buffer, sample_rate, channels, sample_width):
|
306 |
+
logging.error("Invalid PCM data received. Aborting transcription.")
|
307 |
+
await websocket.send_json({"error": "Invalid PCM data received."})
|
308 |
+
return
|
309 |
+
|
310 |
+
# Estimate the duration of the chunk based on its size
|
311 |
+
chunk_duration = len(audio_chunk) / (sample_rate * channels * sample_width)
|
312 |
+
accumulated_audio_time += chunk_duration
|
313 |
+
logging.info(
|
314 |
+
f"Received and buffered {len(audio_chunk)} bytes, total buffered: {len(pcm_audio_buffer)} bytes, total time: {accumulated_audio_time:.2f} seconds")
|
315 |
+
|
316 |
+
# Transcribe when enough time (audio) is accumulated (e.g., at least 5 seconds of audio)
|
317 |
+
if accumulated_audio_time >= min_transcription_time:
|
318 |
+
logging.info("Buffered enough audio time, starting transcription.")
|
319 |
+
|
320 |
+
# Create a temporary WAV file in /tmp for transcription
|
321 |
+
|
322 |
+
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False, dir="/tmp") as temp_wav_file:
|
323 |
+
logging.info(f"Temporary audio file created at {temp_wav_file.name}")
|
324 |
+
|
325 |
+
with wave.open(temp_wav_file.name, 'wb') as wav_file:
|
326 |
+
wav_file.setnchannels(channels)
|
327 |
+
wav_file.setsampwidth(sample_width)
|
328 |
+
wav_file.setframerate(sample_rate)
|
329 |
+
wav_file.writeframes(pcm_audio_buffer)
|
330 |
+
temp_wav_file.flush()
|
331 |
+
|
332 |
+
if not validate_wav_file(temp_wav_file.name):
|
333 |
+
logging.error(f"Invalid WAV file created: {temp_wav_file.name}")
|
334 |
+
await websocket.send_json({"error": "Invalid WAV file created."})
|
335 |
+
return
|
336 |
+
|
337 |
+
logging.info(f"Temporary WAV file created at {temp_wav_file.name} for transcription.")
|
338 |
+
|
339 |
+
# Log to confirm that the file exists and has the expected size
|
340 |
+
if os.path.exists(temp_wav_file.name):
|
341 |
+
file_size = os.path.getsize(temp_wav_file.name)
|
342 |
+
logging.info(f"Temporary WAV file size: {file_size} bytes.")
|
343 |
+
else:
|
344 |
+
logging.error(f"Temporary WAV file {temp_wav_file.name} does not exist.")
|
345 |
+
raise Exception(f"Temporary WAV file {temp_wav_file.name} not found.")
|
346 |
+
|
347 |
+
with open(temp_wav_file.name, 'rb') as audio_file:
|
348 |
+
audio_data = audio_file.read()
|
349 |
+
partial_result = await asyncio.to_thread(transcribe_core_ws,audio_data)
|
350 |
+
segments.extend(partial_result['segments'])
|
351 |
+
|
352 |
+
# Clear the buffer after transcription
|
353 |
+
pcm_audio_buffer.clear()
|
354 |
+
accumulated_audio_time = 0 # Reset accumulated time
|
355 |
|
356 |
# Send the transcription result back to the client with both new and all processed segments
|
357 |
response = {
|
358 |
+
"segments": segments
|
|
|
|
|
359 |
}
|
360 |
+
logging.info(f"Sending {len(partial_result['segments'])} segments to the client.")
|
361 |
await websocket.send_json(response)
|
362 |
|
363 |
+
# Optionally delete the temporary WAV file after processing
|
364 |
+
if os.path.exists(temp_wav_file.name):
|
365 |
+
os.remove(temp_wav_file.name)
|
366 |
+
logging.info(f"Temporary WAV file {temp_wav_file.name} removed.")
|
367 |
+
|
368 |
+
except WebSocketDisconnect:
|
369 |
+
logging.info("WebSocket connection closed by the client.")
|
370 |
+
break
|
371 |
|
372 |
except Exception as e:
|
373 |
logging.error(f"Unexpected error during WebSocket transcription: {e}")
|
|
|
514 |
#
|
515 |
|
516 |
|
|
|
|
|
|
|
|
|
|
|
|
|
517 |
# @app.websocket("/wtranscribe")
|
518 |
# async def websocket_transcribe(websocket: WebSocket):
|
519 |
# logging.info("New WebSocket connection request received.")
|
pyproject.toml
CHANGED
@@ -37,6 +37,13 @@ librosa = "^0.10.2.post1"
|
|
37 |
uvicorn = "^0.30.6"
|
38 |
torchaudio = "^2.4.1"
|
39 |
silero-vad = "^5.1"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
|
42 |
|
|
|
37 |
uvicorn = "^0.30.6"
|
38 |
torchaudio = "^2.4.1"
|
39 |
silero-vad = "^5.1"
|
40 |
+
#openai = "^1.42.0"
|
41 |
+
#numpy = "^1.22.0"
|
42 |
+
#torch = "2.1.0"
|
43 |
+
#sounddevice = "^0.5.0"
|
44 |
+
#pydub = "^0.25.1"
|
45 |
+
#ffmpeg = "^1.4"
|
46 |
+
|
47 |
|
48 |
|
49 |
|