AshDavid12 committed on
Commit
4c42c49
·
1 Parent(s): 35b4964
Files changed (2) hide show
  1. client.py +9 -29
  2. infer.py +23 -7
client.py CHANGED
@@ -2,38 +2,14 @@ import asyncio
2
  import websockets
3
  import requests
4
  import ssl
 
5
 
6
  # Parameters for reading and sending the audio
7
  AUDIO_FILE_URL = "https://raw.githubusercontent.com/AshDavid12/runpod-serverless-forked/main/test_hebrew.wav" # Use WAV file
8
 
9
- processed_segments = set()
10
-
11
- def process_transcription_results(transcription_result):
12
- global processed_segments # Ensure we use the same set across multiple calls
13
- new_segments = []
14
-
15
- # Iterate over all segments in the transcription result
16
- for segment in transcription_result.get("segments", []):
17
- # You can use a unique identifier like 'id' or a combination of 'start' and 'end' times
18
- segment_id = segment.get("id")
19
-
20
- # Check if the segment is already processed
21
- if segment_id not in processed_segments:
22
- # Process the new segment (do your actual processing here)
23
- new_segments.append(segment)
24
-
25
- # Mark the segment as processed by adding its 'id' to the set
26
- processed_segments.add(segment_id)
27
- print(f"Processed segment ID: {segment_id}")
28
- else:
29
- print(f"Skipping already processed segment ID: {segment_id}")
30
-
31
- # Return only new segments that have not been processed before
32
- return new_segments
33
-
34
 
35
  async def send_audio(websocket):
36
- buffer_size = 512 * 1024 # Buffer audio chunks up to 512KB before sending
37
  audio_buffer = bytearray()
38
 
39
  with requests.get(AUDIO_FILE_URL, stream=True, allow_redirects=False) as response:
@@ -48,7 +24,7 @@ async def send_audio(websocket):
48
  # Send buffered audio data once it's large enough
49
  if len(audio_buffer) >= buffer_size:
50
  await websocket.send(audio_buffer)
51
- print(f"Sent {len(audio_buffer)} bytes of audio data.")
52
  audio_buffer.clear()
53
  await asyncio.sleep(0.01)
54
 
@@ -56,6 +32,7 @@ async def send_audio(websocket):
56
  else:
57
  print(f"Failed to download audio file. Status code: {response.status_code}")
58
 
 
59
  async def receive_transcription(websocket):
60
  while True:
61
  try:
@@ -70,6 +47,7 @@ async def receive_transcription(websocket):
70
  print(f"Error receiving transcription: {e}")
71
  break
72
 
 
73
  async def send_heartbeat(websocket):
74
  while True:
75
  try:
@@ -78,7 +56,7 @@ async def send_heartbeat(websocket):
78
  except websockets.ConnectionClosed:
79
  print("Connection closed, stopping heartbeat")
80
  break
81
- await asyncio.sleep(120) # Send ping every 30 seconds (adjust as needed)
82
 
83
 
84
  async def run_client():
@@ -87,11 +65,13 @@ async def run_client():
87
  ssl_context.check_hostname = False
88
  ssl_context.verify_mode = ssl.CERT_NONE
89
 
90
- async with websockets.connect(uri, ssl=ssl_context, timeout=600) as websocket:
 
91
  await asyncio.gather(
92
  send_audio(websocket),
93
  receive_transcription(websocket),
94
  send_heartbeat(websocket)
95
  )
96
 
 
97
  asyncio.run(run_client())
 
2
  import websockets
3
  import requests
4
  import ssl
5
+ import logging
6
 
7
  # Parameters for reading and sending the audio
8
  AUDIO_FILE_URL = "https://raw.githubusercontent.com/AshDavid12/runpod-serverless-forked/main/test_hebrew.wav" # Use WAV file
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  async def send_audio(websocket):
12
+ buffer_size = 1024 # Buffer audio chunks up to 512KB before sending
13
  audio_buffer = bytearray()
14
 
15
  with requests.get(AUDIO_FILE_URL, stream=True, allow_redirects=False) as response:
 
24
  # Send buffered audio data once it's large enough
25
  if len(audio_buffer) >= buffer_size:
26
  await websocket.send(audio_buffer)
27
+ #print(f"Sent {len(audio_buffer)} bytes of audio data.")
28
  audio_buffer.clear()
29
  await asyncio.sleep(0.01)
30
 
 
32
  else:
33
  print(f"Failed to download audio file. Status code: {response.status_code}")
34
 
35
+
36
  async def receive_transcription(websocket):
37
  while True:
38
  try:
 
47
  print(f"Error receiving transcription: {e}")
48
  break
49
 
50
+
51
  async def send_heartbeat(websocket):
52
  while True:
53
  try:
 
56
  except websockets.ConnectionClosed:
57
  print("Connection closed, stopping heartbeat")
58
  break
59
+ await asyncio.sleep(30) # Send ping every 30 seconds (adjust as needed)
60
 
61
 
62
  async def run_client():
 
65
  ssl_context.check_hostname = False
66
  ssl_context.verify_mode = ssl.CERT_NONE
67
 
68
+ async with websockets.connect(uri, ssl=ssl_context, timeout=120) as websocket:
69
+ print(f"here")
70
  await asyncio.gather(
71
  send_audio(websocket),
72
  receive_transcription(websocket),
73
  send_heartbeat(websocket)
74
  )
75
 
76
+
77
  asyncio.run(run_client())
infer.py CHANGED
@@ -160,14 +160,13 @@ def transcribe_core_ws(audio_file, last_transcribed_time):
160
 
161
  # Track the new segments and update the last transcribed time
162
  for s in segs:
163
- words= []
164
  logging.info(f"Processing segment with start time: {s.start} and end time: {s.end}")
165
 
166
  # Only process segments that start after the last transcribed time
167
  if s.start >= last_transcribed_time:
168
  logging.info(f"New segment found starting at {s.start} seconds.")
169
- for w in words:
170
- words.append({'start': w.start, 'end': w.end, 'word': w.word, 'probability': w.probability})
171
  seg = {
172
  'id': s.id, 'seek': s.seek, 'start': s.start, 'end': s.end, 'text': s.text,
173
  'avg_logprob': s.avg_logprob, 'compression_ratio': s.compression_ratio,
@@ -177,10 +176,10 @@ def transcribe_core_ws(audio_file, last_transcribed_time):
177
  ret['new_segments'].append(seg)
178
 
179
  # Update the last transcribed time to the end of the current segment
180
- new_last_transcribed_time = s.end
181
  logging.debug(f"Updated last transcribed time to: {new_last_transcribed_time} seconds")
182
 
183
- logging.info(f"Returning {len(ret['new_segments'])} new segments and updated last transcribed time.")
184
  return ret, new_last_transcribed_time
185
 
186
 
@@ -195,7 +194,10 @@ async def websocket_transcribe(websocket: WebSocket):
195
 
196
  try:
197
  processed_segments = [] # Keeps track of the segments already transcribed
 
 
198
  last_transcribed_time = 0.0
 
199
 
200
  # A temporary file to store the growing audio data
201
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
@@ -212,18 +214,33 @@ async def websocket_transcribe(websocket: WebSocket):
212
  # Write audio chunk to file and accumulate size and time
213
  temp_audio_file.write(audio_chunk)
214
  temp_audio_file.flush()
 
 
 
 
 
 
 
 
 
 
 
215
 
216
  # Call the transcription function with the last processed time
217
  partial_result, last_transcribed_time = transcribe_core_ws(temp_audio_file.name, last_transcribed_time)
218
  accumulated_audio_time = 0 # Reset the accumulated audio time
 
 
 
 
219
 
 
220
  response = {
221
  "new_segments": partial_result['new_segments'],
222
  "processed_segments": processed_segments
223
  }
224
  logging.info(f"Sending {len(partial_result['new_segments'])} new segments to the client.")
225
  await websocket.send_json(response)
226
- processed_segments.extend(partial_result['new_segments'])
227
 
228
  except WebSocketDisconnect:
229
  logging.info("WebSocket connection closed by the client.")
@@ -237,4 +254,3 @@ async def websocket_transcribe(websocket: WebSocket):
237
  logging.info("Cleaning up and closing WebSocket connection.")
238
 
239
 
240
-
 
160
 
161
  # Track the new segments and update the last transcribed time
162
  for s in segs:
 
163
  logging.info(f"Processing segment with start time: {s.start} and end time: {s.end}")
164
 
165
  # Only process segments that start after the last transcribed time
166
  if s.start >= last_transcribed_time:
167
  logging.info(f"New segment found starting at {s.start} seconds.")
168
+ words = [{'start': w.start, 'end': w.end, 'word': w.word, 'probability': w.probability} for w in s.words]
169
+
170
  seg = {
171
  'id': s.id, 'seek': s.seek, 'start': s.start, 'end': s.end, 'text': s.text,
172
  'avg_logprob': s.avg_logprob, 'compression_ratio': s.compression_ratio,
 
176
  ret['new_segments'].append(seg)
177
 
178
  # Update the last transcribed time to the end of the current segment
179
+ new_last_transcribed_time = max(new_last_transcribed_time, s.end)
180
  logging.debug(f"Updated last transcribed time to: {new_last_transcribed_time} seconds")
181
 
182
+ #logging.info(f"Returning {len(ret['new_segments'])} new segments and updated last transcribed time.")
183
  return ret, new_last_transcribed_time
184
 
185
 
 
194
 
195
  try:
196
  processed_segments = [] # Keeps track of the segments already transcribed
197
+ accumulated_audio_size = 0 # Track how much audio data has been buffered
198
+ accumulated_audio_time = 0 # Track the total audio duration accumulated
199
  last_transcribed_time = 0.0
200
+ #min_transcription_time = 5.0 # Minimum duration of audio in seconds before transcription starts
201
 
202
  # A temporary file to store the growing audio data
203
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
 
214
  # Write audio chunk to file and accumulate size and time
215
  temp_audio_file.write(audio_chunk)
216
  temp_audio_file.flush()
217
+ accumulated_audio_size += len(audio_chunk)
218
+
219
+ # Estimate the duration of the chunk based on its size (e.g., 16kHz audio)
220
+ chunk_duration = len(audio_chunk) / (16000 * 2) # Assuming 16kHz mono WAV (2 bytes per sample)
221
+ accumulated_audio_time += chunk_duration
222
+ #logging.info(f"Received and buffered {len(audio_chunk)} bytes, total buffered: {accumulated_audio_size} bytes, total time: {accumulated_audio_time:.2f} seconds")
223
+
224
+ # Transcribe when enough time (audio) is accumulated (e.g., at least 5 seconds of audio)
225
+ #if accumulated_audio_time >= min_transcription_time:
226
+ #logging.info("Buffered enough audio time, starting transcription.")
227
+
228
 
229
  # Call the transcription function with the last processed time
230
  partial_result, last_transcribed_time = transcribe_core_ws(temp_audio_file.name, last_transcribed_time)
231
  accumulated_audio_time = 0 # Reset the accumulated audio time
232
+ processed_segments.extend(partial_result['new_segments'])
233
+
234
+ # Reset the accumulated audio size after transcription
235
+ accumulated_audio_size = 0
236
 
237
+ # Send the transcription result back to the client with both new and all processed segments
238
  response = {
239
  "new_segments": partial_result['new_segments'],
240
  "processed_segments": processed_segments
241
  }
242
  logging.info(f"Sending {len(partial_result['new_segments'])} new segments to the client.")
243
  await websocket.send_json(response)
 
244
 
245
  except WebSocketDisconnect:
246
  logging.info("WebSocket connection closed by the client.")
 
254
  logging.info("Cleaning up and closing WebSocket connection.")
255
 
256