"""Live microphone transcription: record audio in blocks, distinguish speech
from silence, and hand finished recordings to utils.transcribe."""

import threading
import time

import numpy as np
import sounddevice as sd
import soundfile as sf
import whisper

# utils.transcribe is expected to take the path of a WAV file and return a
# (transcription_text, user) tuple, mirroring the local transcribe() fallback below.
from utils import transcribe as transcribe_api

SAMPLE_RATE = 16000      # Hz
CHANNELS = 1             # mono input
BLOCKSIZE = 8000         # frames per read (0.5 s at 16 kHz)
DURATION = 0.5           # seconds captured by the follow-up read
THRESHOLD = 0.015        # mean absolute amplitude above which a block counts as speech
SILENT_THRESHOLD = 3     # seconds of silence before the recording is finalised

silence_duration = 0
output_file = sf.SoundFile(
    'database/recording.wav', mode='w', samplerate=SAMPLE_RATE, channels=CHANNELS)

transcription_in_progress = False
queued = False
first_run = True
transcription_text = ""
model = None             # Whisper model, loaded lazily on first use of transcribe()

def transcribe(audio):
    # Local Whisper-based fallback; the live loop below uses utils.transcribe instead.
    global model, first_run
    if first_run:
        model = whisper.load_model("base")
        first_run = False
    result = model.transcribe(audio)
    transcription = result['text']

    user = "Vatsal"
    return transcription, user

def transcription():
    # Transcribe the current recording and publish the result via module globals.
    global transcription_in_progress
    global transcription_text
    transcription_text, user = transcribe_api('database/recording.wav')
    print("-" * 100)
    print(f'Transcription: {transcription_text} from user {user}')
    print("-" * 100)
    transcription_in_progress = False

def listen(stream):
    global transcription_in_progress
    global queued
    global silence_duration
    global output_file

    # Capture one block, wait, then capture a second block, so each call
    # appends roughly one second of audio to the recording.
    audio_data, _ = stream.read(BLOCKSIZE)
    output_file.write(audio_data)

    time.sleep(0.5)

    audio_data, _ = stream.read(int(DURATION * SAMPLE_RATE))
    output_file.write(audio_data)

    level = float(np.abs(audio_data).mean())
    if level > THRESHOLD:
        silence_duration = 0
        if transcription_in_progress:
            print('Audio detected! Transcription already in progress, queueing...')
            queued = True
        else:
            transcription_in_progress = True
            print('Audio detected! Transcribing...')
            threading.Thread(target=transcription).start()

    else:
        silence_duration += BLOCKSIZE / float(SAMPLE_RATE)
        if silence_duration >= SILENT_THRESHOLD:
            # Enough silence: flush any queued transcription, persist the text,
            # and start a fresh recording file.
            if queued:
                transcription()
                queued = False

            with open('./database/input.txt', 'w', encoding="utf-8") as write_to:
                # Whisper output typically begins with a leading space; strip it.
                write_to.write(transcription_text.lstrip())
            silence_duration = 0
            output_file.close()
            output_file = sf.SoundFile(
                'database/recording.wav', mode='w', samplerate=SAMPLE_RATE, channels=CHANNELS)

def live_listen():
    # Continuously read from the default input device and hand blocks to listen().
    with sd.InputStream(channels=CHANNELS, blocksize=BLOCKSIZE, samplerate=SAMPLE_RATE) as stream:
        print("STARTING LIVE TRANSCRIPTION")
        while True:
            listen(stream)


if __name__ == "__main__":
    live_listen()