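"""Live microphone transcription loop.

Records continuously from the default input device into
database/recording.wav, launches a transcription in a background thread
whenever the signal level rises above THRESHOLD, and flushes the
transcript to database/input.txt after SILENT_THRESHOLD seconds of
silence before starting a fresh recording.
"""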
from utils import transcribe as transcribe_api
# from SoundScribe.speakerID import find_user
import sounddevice as sd
import soundfile as sf
import numpy as np
import threading
import whisper

SAMPLE_RATE = 16000   # Whisper models expect 16 kHz audio
CHANNELS = 1          # mono capture
BLOCKSIZE = 8000      # frames per read: 0.5 s at 16 kHz
DURATION = 0.5        # seconds per block (BLOCKSIZE / SAMPLE_RATE)
THRESHOLD = 0.015     # mean |amplitude| above which a block counts as speech
SILENT_THRESHOLD = 3  # seconds of silence before the transcript is flushed
silence_duration = 0.0
output_file = sf.SoundFile(
    'database/recording.wav', mode='w', samplerate=SAMPLE_RATE, channels=CHANNELS)

transcription_in_progress = False
queued = False
first_run = True
transcription_text = ""
model = None          # Whisper model, loaded lazily on first use


def transcribe(audio):
    """Transcribe audio locally with Whisper (alternative to transcribe_api)."""
    global first_run, model
    if first_run:
        # Load the model once and reuse it on subsequent calls.
        model = whisper.load_model("base")
        first_run = False
    result = model.transcribe(audio)
    transcription = result['text']
    # user = find_user("database/recording.wav")
    user = "Vatsal"
    return transcription, user


def transcription():
    global transcription_in_progress
    global transcription_text
    # utils.transcribe is expected to return (text, user) for the wav file.
    transcription_text, user = transcribe_api('database/recording.wav')
    print("-" * 100)
    print(f'Transcription: {transcription_text} from user {user}')
    print("-" * 100)
    transcription_in_progress = False


def listen(stream):
    global transcription_in_progress
    global queued
    global silence_duration
    global output_file

    # stream.read blocks until a full block (0.5 s) is available, so the
    # loop is paced by the audio device and no extra sleep is needed.
    audio_data, _ = stream.read(BLOCKSIZE)
    output_file.write(audio_data)

    level = float(np.abs(audio_data).mean())
    if level > THRESHOLD:
        silence_duration = 0.0
        if transcription_in_progress:
            # A transcription is already running; re-run it once it finishes.
            print('Audio detected! Transcription queued...')
            queued = True
        else:
            transcription_in_progress = True
            print('Audio detected! Transcribing...')
            threading.Thread(target=transcription).start()
    else:
        silence_duration += DURATION
        if silence_duration >= SILENT_THRESHOLD:
            if queued:
                transcription()
                queued = False

            # Whisper prepends a space to its output; drop it before saving.
            with open('./database/input.txt', 'w', encoding="utf-8") as write_to:
                write_to.write(transcription_text[1:])
            silence_duration = 0.0
            # Start a fresh recording for the next utterance.
            output_file.close()
            output_file = sf.SoundFile(
                'database/recording.wav', mode='w', samplerate=SAMPLE_RATE, channels=CHANNELS)


def live_listen():
    with sd.InputStream(channels=CHANNELS, blocksize=BLOCKSIZE, samplerate=SAMPLE_RATE) as stream:
        print("STARTING LIVE TRANSCRIPTION")
        while True:
            listen(stream)


if __name__ == "__main__":
    live_listen()
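
# A specific microphone can be selected by passing `device` to
# sd.InputStream; the name below is a placeholder for whatever
# `python -m sounddevice` lists on the host machine:
#     sd.InputStream(device="USB Microphone", channels=CHANNELS,
#                    blocksize=BLOCKSIZE, samplerate=SAMPLE_RATE)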