CRYSTAL-R1 / SoundScribe /transcribe.py
crystal-technologies's picture
Upload 1653 files
714d948
from utils import transcribe as transcribe_api
# from SoundScribe.speakerID import find_user
import sounddevice as sd
import soundfile as sf
import numpy as np
import threading
import whisper
import time
# Audio capture settings, shared by the input stream opened in live_listen()
# and by the recording file below.
SAMPLE_RATE = 16000  # passed as `samplerate` to both the stream and the file
CHANNELS = 1
BLOCKSIZE = 8000  # frames per stream block; 8000 / 16000 = 0.5 s of audio
DURATION = 0.5  # seconds of audio read for the loudness check in listen()
THRESHOLD = 0.015  # mean absolute amplitude above which a chunk counts as sound
SILENT_THRESHOLD = 3  # value silence_duration must reach before listen() finalises
# Running silence total; listen() adds BLOCKSIZE / SAMPLE_RATE per quiet chunk.
silence_duration = 0
# Recording that listen() appends captured audio to. NOTE: this opens (and
# truncates) the file in write mode as a side effect of importing the module.
output_file = sf.SoundFile(
    'database/recording.wav', mode='w', samplerate=SAMPLE_RATE, channels=CHANNELS)
# Set by listen() before it starts a worker thread; cleared by transcription().
transcription_in_progress = False
# Set by listen() when sound is detected while a transcription is running.
queued = False
# Read by transcribe() to decide whether the Whisper model must be loaded.
first_run = True
# Latest text produced by transcription(); written to input.txt by listen().
transcription_text = ""
# Whisper model cached across calls; loaded lazily by transcribe().
_whisper_model = None


def transcribe(audio):
    """Transcribe *audio* with the local Whisper "base" model.

    The model is loaded on the first call and reused on later calls.

    Args:
        audio: Input forwarded unchanged to the Whisper model's
            ``transcribe`` method.

    Returns:
        A ``(text, user)`` tuple: the ``'text'`` entry of the Whisper
        result and the speaker name (currently hard-coded to "Vatsal").
    """
    # Both names are rebound below, so they must be declared global.
    # Without the declaration Python treats `first_run` as a local and the
    # `if` test raises UnboundLocalError; the loaded model would also be
    # lost when the function returns.
    global first_run, _whisper_model
    if first_run or _whisper_model is None:
        _whisper_model = whisper.load_model("base")
        first_run = False
    result = _whisper_model.transcribe(audio)
    # Speaker identification is disabled for now:
    # user = find_user("database/recording.wav")
    user = "Vatsal"
    return result['text'], user
def transcription():
    """Transcribe the current recording and publish the result.

    Passes 'database/recording.wav' to ``transcribe_api``, stores the
    returned text in the module-level ``transcription_text``, prints the
    text together with the reported user, and finally clears
    ``transcription_in_progress``.

    Any exception raised by ``transcribe_api`` or by printing propagates
    to the caller (or terminates the worker thread) after the flag has
    been cleared.
    """
    global transcription_in_progress
    global transcription_text
    try:
        transcription_text, user = transcribe_api('database/recording.wav')
        print("-"*100)
        print(f'Transcription: {transcription_text} from user {user}')
        print("-"*100)
    finally:
        # Always release the flag. If it stayed True after a failure,
        # listen() would only ever set `queued` and never start another
        # transcription thread.
        transcription_in_progress = False
def listen(stream):
    """Capture one round of audio from *stream* and react to its loudness.

    Two chunks are read and appended to the recording file, with a 0.5 s
    sleep in between; only the second chunk is analysed.

    * Mean absolute amplitude above ``THRESHOLD``: the silence counter is
      reset and a transcription is started in a background thread, or
      marked as queued if one is already running.
    * Mean absolute amplitude below ``THRESHOLD``: the silence counter
      grows by ``BLOCKSIZE / SAMPLE_RATE``. Once it reaches
      ``SILENT_THRESHOLD`` any queued transcription is run synchronously,
      the latest text (minus its first character) is written to
      './database/input.txt', and the recording file is reopened in write
      mode.
    * Amplitude exactly equal to ``THRESHOLD``: nothing further happens.

    NOTE(review): the nesting of the silence branch was reconstructed from
    source whose indentation had been lost -- confirm against the original.

    Args:
        stream: Object whose ``read(frames)`` returns a
            ``(data, flag)`` pair; the second element is ignored.
    """
    global transcription_in_progress, queued, silence_duration, output_file

    # Lead-in chunk: recorded, never inspected.
    lead_chunk, _ = stream.read(BLOCKSIZE)
    output_file.write(lead_chunk)
    time.sleep(0.5)

    # Probe chunk: recorded and used for the loudness decision.
    probe_chunk, _ = stream.read(int(DURATION * SAMPLE_RATE))
    output_file.write(probe_chunk)
    level = float(np.abs(probe_chunk).mean())

    if level > THRESHOLD:
        silence_duration = 0
        print('Audio detected! Transcribing...')
        if transcription_in_progress:
            # A worker is already busy; remember to transcribe again later.
            queued = True
        else:
            transcription_in_progress = True
            threading.Thread(target=transcription).start()
        return

    if not (level < THRESHOLD):
        # Level sits exactly on the threshold: neither sound nor silence.
        return

    silence_duration += BLOCKSIZE / float(SAMPLE_RATE)
    if silence_duration < SILENT_THRESHOLD:
        return

    # Enough silence has accumulated: flush pending work and start over.
    if queued:
        transcription()
        queued = False
    with open('./database/input.txt', 'w', encoding="utf-8") as write_to:
        write_to.write(transcription_text[1:])
    silence_duration = 0
    output_file.close()
    output_file = sf.SoundFile(
        'database/recording.wav',
        mode='w',
        samplerate=SAMPLE_RATE,
        channels=CHANNELS,
    )
def live_listen():
    """Open the input stream and run listen() on it indefinitely.

    The stream is created with the module's ``SAMPLE_RATE``, ``CHANNELS``
    and ``BLOCKSIZE`` settings. The loop has no exit condition of its own;
    it ends only when an exception propagates out of listen().
    """
    mic = sd.InputStream(
        channels=CHANNELS,
        blocksize=BLOCKSIZE,
        samplerate=SAMPLE_RATE,
    )
    with mic as stream:
        print("STARTING LIVE TRANSCRIPTION")
        while True:
            listen(stream)
# Start the live transcription loop only when this file is run as a script,
# not when it is imported.
if __name__ == "__main__":
    live_listen()