import streamlit as st
import sounddevice as sd
import soundfile as sf
from faster_whisper import WhisperModel
import io
import os
from langchain_community.llms import Ollama
import pyttsx3
# Work around the "duplicate OpenMP runtime" (libiomp) error when several libraries bundle it
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# Initialize WhisperModel and Ollama
model_size = "base.en"
model = WhisperModel(model_size, device="cpu", compute_type="int8", num_workers=5)
llm = Ollama(model="tinyllama")
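# NOTE: "base.en" is the English-only base Whisper model; int8 quantization keeps CPU inference light.
# NOTE: Ollama talks to a locally running Ollama server (default http://localhost:11434);
# pull the model first with `ollama pull tinyllama`.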

# Initialize text-to-speech engine
engine = pyttsx3.init('sapi5')
voices = engine.getProperty('voices')
engine.setProperty('voice', voices[0].id)
engine.setProperty('rate', 180)
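# NOTE: 'sapi5' is the Windows speech driver; on macOS/Linux, pyttsx3.init() with no
# argument falls back to the platform default (nsss/espeak).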

def speak(audio):
    engine.say(audio)
    engine.runAndWait()

# Play back the previous recording, if one exists
if os.path.exists("recorded_audio.wav"):
    st.audio("recorded_audio.wav", format="audio/wav", start_time=0)

# Record and transcribe audio
if st.button("Record"):
    with st.spinner("Recording..."):
        # Capture 5 seconds of 16-bit stereo audio at 44.1 kHz
        recorded_audio = sd.rec(int(5 * 44100), samplerate=44100, channels=2, dtype="int16")
        sd.wait()
        sf.write("recorded_audio.wav", recorded_audio, samplerate=44100)

    st.audio("recorded_audio.wav", format="audio/wav", start_time=0)

    # Transcribe the recording, send the text to the LLM, and speak the response
    with open("recorded_audio.wav", "rb") as audio_file:
        segments, info = model.transcribe(io.BytesIO(audio_file.read()), beam_size=10)

    # Collect the transcribed text from all segments
    prompt = ""
    for segment in segments:
        print(segment.text)
        st.text(segment.text)
        prompt += segment.text

    if prompt.strip():
        response = llm.invoke(prompt)
        st.success("Response: " + response)
        speak(response)
    else:
        st.error("Failed to transcribe audio.")