from utils import transcribe as transcribe_api
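# NOTE: `utils.transcribe` is assumed here to take a path to a WAV file and
# return a (text, user) tuple, mirroring the local `transcribe` fallback below.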
# from SoundScribe.speakerID import find_user
import sounddevice as sd
import soundfile as sf
import numpy as np
import threading
import whisper
import time
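
# Live transcription loop: capture half-second blocks from the microphone,
# append them to database/recording.wav, and transcribe in a background thread
# whenever a block is loud enough to count as speech. After a few seconds of
# silence, the latest transcription is written to database/input.txt and the
# recording file is rotated.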
SAMPLE_RATE = 16000   # Whisper expects 16 kHz audio
CHANNELS = 1          # mono capture
BLOCKSIZE = 8000      # frames per read: 0.5 s at 16 kHz
DURATION = 0.5        # seconds of audio handled per loop iteration
THRESHOLD = 0.015     # mean absolute amplitude above which a block counts as speech
SILENT_THRESHOLD = 3  # seconds of continuous silence before the recording is rotated
silence_duration = 0               # seconds of continuous silence seen so far
transcription_in_progress = False  # a transcription thread is currently running
queued = False                     # speech arrived while a transcription was running
transcription_text = ""            # most recent transcription result
model = None                       # Whisper model, loaded lazily on first use

# Rolling recording that the transcriber reads from; the database/ directory
# must already exist.
output_file = sf.SoundFile(
    'database/recording.wav', mode='w', samplerate=SAMPLE_RATE, channels=CHANNELS)

def transcribe(audio):
    """Local Whisper transcription; `utils.transcribe` is used by default."""
    global model
    # Load the model once and cache it for every later call.
    if model is None:
        model = whisper.load_model("base")
    result = model.transcribe(audio)
    transcription = result['text']
    # user = find_user("database/recording.wav")
    user = "Vatsal"  # speaker identification is disabled, so hard-code the user
    return transcription, user

def transcription():
    global transcription_in_progress
    global transcription_text
    transcription_text, user = transcribe_api('database/recording.wav')
    print("-" * 100)
    print(f'Transcription: {transcription_text} from user {user}')
    print("-" * 100)
    transcription_in_progress = False

def listen(stream):
    global transcription_in_progress
    global queued
    global silence_duration
    global output_file
    # stream.read blocks until a full half-second block is available, so no
    # extra sleeping is needed between reads.
    audio_data, _ = stream.read(BLOCKSIZE)
    output_file.write(audio_data)
    if float(np.abs(audio_data).mean()) > THRESHOLD:
        silence_duration = 0
        if transcription_in_progress:
            # A transcription is already running; queue one more pass so the
            # new audio is picked up when it finishes.
            print('Audio detected! Transcribing...')
            queued = True
        else:
            transcription_in_progress = True
            print('Audio detected! Transcribing...')
            threading.Thread(target=transcription).start()
    else:
        silence_duration += BLOCKSIZE / float(SAMPLE_RATE)
        if silence_duration >= SILENT_THRESHOLD:
            if queued:
                transcription()
                queued = False
            # Whisper prefixes its output with a space; drop it before saving.
            with open('./database/input.txt', 'w', encoding="utf-8") as write_to:
                write_to.write(transcription_text[1:])
            silence_duration = 0
            # Rotate the recording: start a fresh file for the next utterance.
            output_file.close()
            output_file = sf.SoundFile(
                'database/recording.wav', mode='w', samplerate=SAMPLE_RATE, channels=CHANNELS)

def live_listen():
    with sd.InputStream(channels=CHANNELS, blocksize=BLOCKSIZE, samplerate=SAMPLE_RATE) as stream:
        print("STARTING LIVE TRANSCRIPTION")
        while True:
            listen(stream)

if __name__ == "__main__":
    live_listen()