Spaces:

Gigaverse
/

ivrit-ai-streaming

Sleeping

App Files Files Community

AshDavid12 committed on Sep 16, 2024

Commit

f4a3257

1 Parent(s): 1ad41b2

reverting back to partial trans

Browse files

Files changed (2) hide show

client.py +54 -149
infer.py +230 -109

client.py CHANGED Viewed

@@ -1,161 +1,66 @@
 import asyncio
-import io
-import json
-import numpy as np
 import websockets
 import requests
 import ssl
-import wave
-import logging
-import sys
-import sounddevice as sd
 # Parameters for reading and sending the audio
-#AUDIO_FILE_URL = "https://raw.githubusercontent.com/AshDavid12/runpod-serverless-forked/main/test_hebrew.wav"  # Use WAV file
-AUDIO_FILE_URL = "https://raw.githubusercontent.com/AshDavid12/hugging_face_ivrit_streaming/main/long_hebrew.wav"
-# Set up logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s',
-                    handlers=[logging.StreamHandler(sys.stdout)], force=True)
-logger = logging.getLogger(__name__)
-async def send_receive():
-    uri = "wss://gigaverse-ivrit-ai-streaming.hf.space/ws"  # Update with your server's address if needed
     ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
     ssl_context.check_hostname = False
     ssl_context.verify_mode = ssl.CERT_NONE
-    logger.info(f"Connecting to server at {uri}")
-    try:
-        async with websockets.connect(uri,ssl=ssl_context) as websocket:
-            logger.info("WebSocket connection established")
-            # Start tasks for sending and receiving
-            send_task = asyncio.create_task(send_audio(websocket))
-            receive_task = asyncio.create_task(receive_transcriptions(websocket))
-            await asyncio.gather(send_task, receive_task)
-    except Exception as e:
-        logger.error(f"WebSocket connection error: {e}")
-max_size_bytes = 50_000_000  # 10 MB
-SAMPLE_RATE = 16000
-CHUNK_SIZE =1024
-async def send_audio_chunks(websocket):
-    """Capture audio and send chunks to the server via WebSocket."""
-    def audio_callback(indata, frames, time, status):
-        """Callback function called when new audio is available."""
-        # Convert the audio input to a JSON-serializable format (e.g., list of samples)
-        audio_chunk = indata[:, 0].tolist()  # Use only the first channel
-        asyncio.run_coroutine_threadsafe(
-            websocket.send(json.dumps(audio_chunk)), asyncio.get_event_loop()
         )
-    # Start the audio stream
-    with sd.InputStream(callback=audio_callback, channels=1, samplerate=SAMPLE_RATE, blocksize=CHUNK_SIZE):
-        await asyncio.Future()  # Keep the stream open and running
-async def receive_transcriptions(websocket):
-    try:
-        logger.info("Starting to receive transcriptions")
-        async for message in websocket:  # This is the same as websocket.recv()
-            logger.info(f"Received transcription: {message}")
-            print(f"Transcription: {message}")
-    except Exception as e:
-        logger.error(f"Receive transcription error: {e}")
-if __name__ == "__main__":
-    asyncio.run(send_receive())
-# async def send_audio(websocket):
-#     buffer_size = 512 * 1024  #HAVE TO HAVE 512!!
-#     audio_buffer = bytearray()
-#
-#     with requests.get(AUDIO_FILE_URL, stream=True, allow_redirects=False) as response:
-#         if response.status_code == 200:
-#             print("Starting to stream audio file...")
-#
-#             for chunk in response.iter_content(chunk_size=1024):  # Stream in chunks
-#                 if chunk:
-#                     audio_buffer.extend(chunk)
-#                     #print(f"Received audio chunk of size {len(chunk)} bytes.")
-#
-#                     # Send buffered audio data once it's large enough
-#                 if len(audio_buffer) >= buffer_size:
-#                     await websocket.send(audio_buffer)
-#                         #print(f"Sent {len(audio_buffer)} bytes of audio data.")
-#                     audio_buffer.clear()
-#                     await asyncio.sleep(0.01)
-#
-#             print("Finished sending audio.")
-#         else:
-#             print(f"Failed to download audio file. Status code: {response.status_code}")
-#
-#
-# async def receive_transcription(websocket):
-#     while True:
-#         try:
-#
-#             transcription = await websocket.recv()
-#             # Receive transcription from the server
-#             print(f"Transcription: {transcription}")
-#         except Exception as e:
-#             print(f"Error receiving transcription: {e}")
-#             #await asyncio.sleep(30)
-#             break
-#
-#
-# async def send_heartbeat(websocket):
-#     while True:
-#         try:
-#             await websocket.ping()
-#             print("Sent keepalive ping")
-#         except websockets.ConnectionClosed:
-#             print("Connection closed, stopping heartbeat")
-#             break
-#         await asyncio.sleep(30)  # Send ping every 30 seconds (adjust as needed)
-#
-#
-# async def run_client():
-#     uri = ("wss://gigaverse-ivrit-ai-streaming.hf.space/wtranscribe")  # WebSocket URL
-#     ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
-#     ssl_context.check_hostname = False
-#     ssl_context.verify_mode = ssl.CERT_NONE
-#     while True:
-#         try:
-#             async with websockets.connect(uri, ssl=ssl_context, ping_timeout=1000, ping_interval=50) as websocket:
-#                 await asyncio.gather(
-#                     send_audio(websocket),
-#                     receive_transcription(websocket),
-#                     send_heartbeat(websocket)
-#                 )
-#         except websockets.ConnectionClosedError as e:
-#             print(f"WebSocket closed with error: {e}")
-#         # except Exception as e:
-#         #     print(f"Unexpected error: {e}")
-#         #
-#         # print("Reconnecting in 5 seconds...")
-#         # await asyncio.sleep(5)  # Wait 5 seconds before reconnecting
-#
-# asyncio.run(run_client())

 import asyncio
 import websockets
 import requests
 import ssl
 # Parameters for reading and sending the audio
+AUDIO_FILE_URL = "https://raw.githubusercontent.com/AshDavid12/runpod-serverless-forked/main/test_hebrew.wav"  # Use WAV file
+async def send_audio(websocket):
+    """
+    Download the WAV file at AUDIO_FILE_URL and forward it over the WebSocket.
+
+    The download is read in 1 KB pieces that are collected in a local buffer;
+    each time the buffer reaches 512 KB it is sent as one message and emptied.
+    Whatever is still buffered when the download ends is sent as a final,
+    smaller message so the tail of the file is not lost.
+    If the HTTP status is not 200, nothing is sent and a message is printed.
+
+    :param websocket: Open connection whose ``send`` coroutine takes the audio bytes.
+    """
+    buffer_size = 1024*512  # Buffer audio chunks up to 512KB before sending
+    audio_buffer = bytearray()
+    # NOTE(review): requests is a synchronous client, so reading the body here
+    # runs inside the coroutine without yielding between pieces - confirm this
+    # is acceptable next to the receiver and heartbeat tasks.
+    with requests.get(AUDIO_FILE_URL, stream=True, allow_redirects=False) as response:
+        if response.status_code != 200:
+            print(f"Failed to download audio file. Status code: {response.status_code}")
+            return
+        print("Starting to stream audio file...")
+        for chunk in response.iter_content(chunk_size=1024):  # Stream in chunks
+            if not chunk:
+                continue
+            audio_buffer.extend(chunk)
+            print(f"Received audio chunk of size {len(chunk)} bytes.")
+            # Send buffered audio data once it's large enough
+            if len(audio_buffer) >= buffer_size:
+                await websocket.send(audio_buffer)
+                print(f"Sent {len(audio_buffer)} bytes of audio data.")
+                audio_buffer.clear()
+                await asyncio.sleep(0.01)
+        # Flush the remainder: the last part of the file rarely fills the
+        # buffer exactly, and without this it would never reach the server.
+        if audio_buffer:
+            await websocket.send(audio_buffer)
+            print(f"Sent {len(audio_buffer)} bytes of audio data.")
+            audio_buffer.clear()
+        print("Finished sending audio.")
+async def receive_transcription(websocket):
+    """
+    Print every message received from the server until receiving fails.
+
+    Any exception (including the connection closing) is printed and ends the
+    coroutine; nothing is re-raised.
+
+    :param websocket: Open connection whose ``recv`` coroutine yields messages.
+    """
+    try:
+        # The loop only ends through an exception, which is handled once below.
+        while True:
+            message = await websocket.recv()  # Receive transcription from the server
+            print(f"Transcription: {message}")
+    except Exception as err:
+        print(f"Error receiving transcription: {err}")
+async def send_heartbeat(websocket):
+    """
+    Send a ping on the connection every 30 seconds until it is closed.
+
+    A ``websockets.ConnectionClosed`` raised while pinging is printed and ends
+    the coroutine; other exceptions propagate to the caller.
+
+    :param websocket: Open connection whose ``ping`` coroutine is awaited.
+    """
+    connection_open = True
+    while connection_open:
+        try:
+            await websocket.ping()
+            print("Sent keepalive ping")
+        except websockets.ConnectionClosed:
+            print("Connection closed, stopping heartbeat")
+            connection_open = False
+        else:
+            # Only wait when the ping went out; a closed connection exits at once.
+            await asyncio.sleep(30)  # Send ping every 30 seconds (adjust as needed)
+async def run_client():
+    """
+    Connect to the transcription server and run the three client tasks together.
+
+    Opens one WebSocket connection and awaits ``send_audio``,
+    ``receive_transcription`` and ``send_heartbeat`` concurrently on it; the
+    connection is closed when all three have finished or one of them raises.
+    """
+    uri = ("wss://gigaverse-ivrit-ai-streaming.hf.space/wtranscribe")  # WebSocket URL
+    # NOTE(review): the context built below disables hostname checking and
+    # certificate verification, so the server's identity is not validated.
+    # Confirm this is intended; ssl.create_default_context() would verify it.
     ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
     ssl_context.check_hostname = False
     ssl_context.verify_mode = ssl.CERT_NONE
+    # `close_timeout` is the current name of the option previously passed as
+    # `timeout`, which websockets only keeps as a deprecated alias.
+    async with websockets.connect(uri, ssl=ssl_context, close_timeout=60) as websocket:
+        await asyncio.gather(
+            send_audio(websocket),
+            receive_transcription(websocket),
+            send_heartbeat(websocket)
         )
+asyncio.run(run_client())

infer.py CHANGED Viewed

@@ -123,129 +123,250 @@ async def read_root():
 import tempfile
-@app.websocket("/ws")
-async def websocket_endpoint(websocket: WebSocket):
-    """WebSocket endpoint to handle client connections."""
-    await websocket.accept()
-    client_ip = websocket.client.host
-    logger.info(f"Client connected: {client_ip}")
-    sys.stdout.flush()
-    try:
-        await process_audio_stream(websocket)
-    except WebSocketDisconnect:
-        logger.info(f"Client disconnected: {client_ip}")
-    except Exception as e:
-        logger.error(f"Unexpected error: {e}")
-        await websocket.close()
-async def process_audio_stream(websocket: WebSocket):
-    """Continuously receive audio chunks and initiate transcription tasks."""
-    sampling_rate = 16000
-    min_chunk_size = 5  # in seconds
-    transcription_task = None
-    chunk_counter = 0
-    total_bytes_received = 0
-    while True:
-        try:
-            # Receive audio data from client
-            data = await websocket.receive_bytes()
-            if not data:
-                logger.info("No data received, closing connection")
-                break
-            chunk_counter += 1
-            chunk_size = len(data)
-            total_bytes_received += chunk_size
-            #logger.debug(f"Received chunk {chunk_counter}: {chunk_size} bytes")
-            audio_chunk = process_received_audio(data)
-            #logger.debug(f"Processed audio chunk {chunk_counter}: {len(audio_chunk)} samples")
-            # Check if enough audio has been buffered
-            if transcription_task is None or transcription_task.done():
-                # Start a new transcription task
-                # logger.info(f"Starting transcription task for {len(audio_buffer)} samples")
-                transcription_task = asyncio.create_task(
-                    transcribe_and_send(websocket, audio_chunk)
-                )
-            #logger.debug(f"Audio buffer size: {len(audio_buffer)} samples")
-        except Exception as e:
-            logger.error(f"Error receiving data: {e}")
-            break
-async def transcribe_and_send(websocket: WebSocket, audio_data):
-    """Run transcription in a separate thread and send the result to the client."""
-    logger.debug(f"Transcription task started for {len(audio_data)} samples")
-    transcription_result = await asyncio.to_thread(sync_transcribe_audio, audio_data)
-    if transcription_result:
-        try:
-            # Send the result as JSON
-            await websocket.send_json(transcription_result)
-            logger.info(f"Transcription JSON sent to client {transcription_result}")
-        except Exception as e:
-            logger.error(f"Error sending transcription: {e}")
-    else:
-        logger.warning("No transcription result to send")
-def sync_transcribe_audio(audio_data):
-    """Synchronously transcribe audio data using the ASR model and format the result."""
-    try:
-        logger.info('Starting transcription...')
-        segments, info = model.transcribe(
-            audio_data, language="he",compression_ratio_threshold=2.5, word_timestamps=True
-        )
-        logger.info('Transcription completed')
-        # Build the transcription result as per your requirement
-        ret = {'segments': []}
-        for s in segments:
-            logger.debug(f"Processing segment {s.id} with start time: {s.start} and end time: {s.end}")
-            # Process words in the segment
-            words = [{
-                'start': float(w.start),
-                'end': float(w.end),
-                'word': w.word,
-                'probability': float(w.probability)
-            } for w in s.words]
             seg = {
-                'id': int(s.id),
-                'seek': int(s.seek),
-                'start': float(s.start),
-                'end': float(s.end),
-                'text': s.text,
-                'avg_logprob': float(s.avg_logprob),
-                'compression_ratio': float(s.compression_ratio),
-                'no_speech_prob': float(s.no_speech_prob),
-                'words': words
             }
-            logger.debug(f'Adding new transcription segment: {seg}')
-            ret['segments'].append(seg)
-            logger.debug(f"Total segments in transcription result: {len(ret['segments'])}")
-            return ret
     except Exception as e:
-        logger.error(f"Transcription error: {e}")
-        return {}
-def process_received_audio(data):
-    """Convert received bytes into normalized float32 NumPy array."""
-    #logger.debug(f"Processing received audio data of size {len(data)} bytes")
-    audio_int16 = np.frombuffer(data, dtype=np.int16)
-    #logger.debug(f"Converted to int16 NumPy array with {len(audio_int16)} samples")
-    audio_float32 = audio_int16.astype(np.float32) / 32768.0  # Normalize to [-1, 1]
-    #logger.debug(f"Normalized audio data to float32 with {len(audio_float32)} samples")
-    return audio_float32

 import tempfile
+def transcribe_core_ws(audio_file, last_transcribed_time):
+    """
+    Transcribe the audio file and return only the segments that have not been processed yet.
+
+    The whole file is passed to the model on every call; a segment is kept only
+    when its start time is at or after ``last_transcribed_time``.
+    :param audio_file: Path to the growing audio file.
+    :param last_transcribed_time: The last time (in seconds) that was transcribed.
+    :return: A tuple ``(ret, new_last_transcribed_time)`` where ``ret`` is
+             ``{'new_segments': [...]}`` and the second item is the largest
+             end time among the kept segments (or the input value if none were kept).
+    :raises Exception: Whatever ``model.transcribe`` raises, after it is logged.
+    """
+    logging.info(f"Starting transcription for file: {audio_file} from {last_transcribed_time} seconds.")
+    ret = {'new_segments': []}
+    new_last_transcribed_time = last_transcribed_time
+    try:
+        # Transcribe the entire audio file
+        logging.debug(f"Initiating model transcription for file: {audio_file}")
+        segs, _ = model.transcribe(audio_file, language='he', word_timestamps=True)
+        logging.info('Transcription completed successfully.')
+    except Exception as e:
+        # logging.exception keeps the traceback in the log; the bare raise
+        # re-raises the active exception unchanged for the caller to handle.
+        logging.exception(f"Error during transcription: {e}")
+        raise
+    # NOTE(review): if model.transcribe returns a lazy iterator, decoding errors
+    # would surface in the loop below, outside the try above - TODO confirm.
+    # Track the new segments and update the last transcribed time
+    for s in segs:
+        logging.info(f"Processing segment with start time: {s.start} and end time: {s.end}")
+        # Only process segments that start after the last transcribed time
+        if s.start >= last_transcribed_time:
+            logging.info(f"New segment found starting at {s.start} seconds.")
+            words = [{'start': w.start, 'end': w.end, 'word': w.word, 'probability': w.probability} for w in s.words]
             seg = {
+                'id': s.id, 'seek': s.seek, 'start': s.start, 'end': s.end, 'text': s.text,
+                'avg_logprob': s.avg_logprob, 'compression_ratio': s.compression_ratio,
+                'no_speech_prob': s.no_speech_prob, 'words': words
             }
+            logging.info(f'Adding new transcription segment: {seg}')
+            ret['new_segments'].append(seg)
+            # Update the last transcribed time to the end of the current segment
+            new_last_transcribed_time = max(new_last_transcribed_time, s.end)
+            logging.debug(f"Updated last transcribed time to: {new_last_transcribed_time} seconds")
+    return ret, new_last_transcribed_time
+import tempfile
+@app.websocket("/wtranscribe")
+async def websocket_transcribe(websocket: WebSocket):
+    """
+    Receive binary audio chunks over a WebSocket and reply with new transcription segments.
+
+    Every received chunk is appended to one temporary .wav file, the file is
+    passed to ``transcribe_core_ws``, and a JSON object is sent back with
+    "new_segments" (segments found for this chunk) and "processed_segments"
+    (all segments sent so far on this connection). The loop ends when the
+    client disconnects. On any other error a best-effort ``{"error": ...}``
+    message is sent. The temporary file is deleted when the handler exits.
+    :param websocket: The client connection to serve.
+    """
+    import os  # Local import: only needed to delete the temporary file in `finally`
+    logging.info("New WebSocket connection request received.")
+    await websocket.accept()
+    logging.info("WebSocket connection established successfully.")
+    temp_audio_path = None  # Filled in once the file exists so cleanup knows what to remove
+    try:
+        processed_segments = []  # Keeps track of the segments already transcribed
+        accumulated_audio_size = 0  # Track how much audio data has been buffered
+        accumulated_audio_time = 0  # Track the total audio duration accumulated
+        last_transcribed_time = 0.0
+        # A temporary file to store the growing audio data; delete=False keeps it
+        # on disk after the `with` block, so it is removed explicitly in `finally`.
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio_file:
+            temp_audio_path = temp_audio_file.name
+            logging.info(f"Temporary audio file created at {temp_audio_file.name}")
+            while True:
+                try:
+                    # Receive the next chunk of audio data
+                    audio_chunk = await websocket.receive_bytes()
+                    if not audio_chunk:
+                        logging.warning("Received empty audio chunk, skipping processing.")
+                        continue
+                    # Write audio chunk to file and flush so the data is on disk before transcribing
+                    temp_audio_file.write(audio_chunk)
+                    temp_audio_file.flush()
+                    accumulated_audio_size += len(audio_chunk)
+                    # Estimate the duration of the chunk based on its size (e.g., 16kHz audio)
+                    chunk_duration = len(audio_chunk) / (16000 * 2)  # Assuming 16kHz mono WAV (2 bytes per sample)
+                    accumulated_audio_time += chunk_duration
+                    logging.info(f"Received and buffered {len(audio_chunk)} bytes, total buffered: {accumulated_audio_size} bytes, total time: {accumulated_audio_time:.2f} seconds")
+                    # NOTE(review): transcribe_core_ws is a plain synchronous call made
+                    # from this coroutine - confirm blocking here is acceptable.
+                    # Call the transcription function with the last processed time
+                    partial_result, last_transcribed_time = transcribe_core_ws(temp_audio_file.name, last_transcribed_time)
+                    accumulated_audio_time = 0  # Reset the accumulated audio time
+                    processed_segments.extend(partial_result['new_segments'])
+                    # Reset the accumulated audio size after transcription
+                    accumulated_audio_size = 0
+                    # Send the transcription result back to the client with both new and all processed segments
+                    response = {
+                        "new_segments": partial_result['new_segments'],
+                        "processed_segments": processed_segments
+                    }
+                    logging.info(f"Sending {len(partial_result['new_segments'])} new segments to the client.")
+                    await websocket.send_json(response)
+                except WebSocketDisconnect:
+                    logging.info("WebSocket connection closed by the client.")
+                    break
     except Exception as e:
+        logging.error(f"Unexpected error during WebSocket transcription: {e}")
+        try:
+            await websocket.send_json({"error": str(e)})
+        except Exception as send_error:
+            # The socket may already be unusable; reporting the error is best effort
+            # and must not hide the original failure or skip the cleanup below.
+            logging.warning(f"Could not report the error to the client: {send_error}")
+    finally:
+        logging.info("Cleaning up and closing WebSocket connection.")
+        if temp_audio_path is not None:
+            try:
+                os.unlink(temp_audio_path)
+            except OSError as cleanup_error:
+                logging.warning(f"Could not remove temporary audio file {temp_audio_path}: {cleanup_error}")
+# @app.websocket("/ws")
+# async def websocket_endpoint(websocket: WebSocket):
+#     """WebSocket endpoint to handle client connections."""
+#     await websocket.accept()
+#     client_ip = websocket.client.host
+#     logger.info(f"Client connected: {client_ip}")
+#     sys.stdout.flush()
+#     try:
+#         await process_audio_stream(websocket)
+#     except WebSocketDisconnect:
+#         logger.info(f"Client disconnected: {client_ip}")
+#     except Exception as e:
+#         logger.error(f"Unexpected error: {e}")
+#         await websocket.close()
+#
+# async def process_audio_stream(websocket: WebSocket):
+#     """Continuously receive audio chunks and initiate transcription tasks."""
+#     sampling_rate = 16000
+#     min_chunk_size = 5  # in seconds
+#
+#     transcription_task = None
+#     chunk_counter = 0
+#     total_bytes_received = 0
+#
+#     while True:
+#         try:
+#             # Receive audio data from client
+#             data = await websocket.receive_bytes()
+#             if not data:
+#                 logger.info("No data received, closing connection")
+#                 break
+#             chunk_counter += 1
+#             chunk_size = len(data)
+#             total_bytes_received += chunk_size
+#             #logger.debug(f"Received chunk {chunk_counter}: {chunk_size} bytes")
+#
+#             audio_chunk = process_received_audio(data)
+#             #logger.debug(f"Processed audio chunk {chunk_counter}: {len(audio_chunk)} samples")
+#             # Check if enough audio has been buffered
+#             # if transcription_task is None or transcription_task.done():
+#             #     # Start a new transcription task
+#         #     # logger.info(f"Starting transcription task for {len(audio_buffer)} samples")
+#             transcription_task = asyncio.create_task(
+#                 transcribe_and_send(websocket, audio_chunk)
+#             )
+#
+#             #logger.debug(f"Audio buffer size: {len(audio_buffer)} samples")
+#         except Exception as e:
+#             logger.error(f"Error receiving data: {e}")
+#             break
+#
+#
+# async def transcribe_and_send(websocket: WebSocket, audio_data):
+#     """Run transcription in a separate thread and send the result to the client."""
+#     logger.debug(f"Transcription task started for {len(audio_data)} samples")
+#     transcription_result = await asyncio.to_thread(sync_transcribe_audio, audio_data)
+#     if transcription_result:
+#         try:
+#             # Send the result as JSON
+#             await websocket.send_json(transcription_result)
+#             logger.info(f"Transcription JSON sent to client {transcription_result}")
+#         except Exception as e:
+#             logger.error(f"Error sending transcription: {e}")
+#     else:
+#         logger.warning("No transcription result to send")
+#
+# def sync_transcribe_audio(audio_data):
+#     """Synchronously transcribe audio data using the ASR model and format the result."""
+#     try:
+#
+#         logger.info('Starting transcription...')
+#         segments, info = model.transcribe(
+#             audio_data, language="he",compression_ratio_threshold=2.5, word_timestamps=True
+#         )
+#         logger.info('Transcription completed')
+#
+#         # Build the transcription result as per your requirement
+#         ret = {'segments': []}
+#
+#         for s in segments:
+#             logger.debug(f"Processing segment {s.id} with start time: {s.start} and end time: {s.end}")
+#
+#             # Process words in the segment
+#             words = [{
+#                 'start': float(w.start),
+#                 'end': float(w.end),
+#                 'word': w.word,
+#                 'probability': float(w.probability)
+#             } for w in s.words]
+#
+#             seg = {
+#                 'id': int(s.id),
+#                 'seek': int(s.seek),
+#                 'start': float(s.start),
+#                 'end': float(s.end),
+#                 'text': s.text,
+#                 'avg_logprob': float(s.avg_logprob),
+#                 'compression_ratio': float(s.compression_ratio),
+#                 'no_speech_prob': float(s.no_speech_prob),
+#                 'words': words
+#             }
+#             logger.debug(f'Adding new transcription segment: {seg}')
+#             ret['segments'].append(seg)
+#
+#             logger.debug(f"Total segments in transcription result: {len(ret['segments'])}")
+#             return ret
+#     except Exception as e:
+#         logger.error(f"Transcription error: {e}")
+#         return {}
+#
+# def process_received_audio(data):
+#     """Convert received bytes into normalized float32 NumPy array."""
+#     #logger.debug(f"Processing received audio data of size {len(data)} bytes")
+#     audio_int16 = np.frombuffer(data, dtype=np.int16)
+#     #logger.debug(f"Converted to int16 NumPy array with {len(audio_int16)} samples")
+#
+#     audio_float32 = audio_int16.astype(np.float32) / 32768.0  # Normalize to [-1, 1]
+#     #logger.debug(f"Normalized audio data to float32 with {len(audio_float32)} samples")
+#
+#     return audio_float32
+#
+#