import streamlit as st
import sounddevice as sd
import soundfile as sf
from faster_whisper import WhisperModel
import io
import os
from langchain_community.llms import Ollama
import pyttsx3
# Set environment variable to handle duplicate libraries
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
# Initialize WhisperModel and Ollama
model_size = "base.en"
model = WhisperModel(model_size, device="cpu", compute_type="int8", num_workers=5)
llm = Ollama(model="tinyllama")
# Initialize text-to-speech engine
engine = pyttsx3.init('sapi5')  # 'sapi5' is the Windows speech driver
voices = engine.getProperty('voices')
engine.setProperty('voice', voices[0].id)
engine.setProperty('rate', 180)
def speak(audio):
    engine.say(audio)
    engine.runAndWait()
# Record and transcribe audio
# Show the previous recording (if one exists) before capturing a new one
if os.path.exists("recorded_audio.wav"):
    st.audio("recorded_audio.wav", format="audio/wav", start_time=0)
if st.button("Record"):
    with st.spinner("Recording..."):
        # Capture 5 seconds of stereo audio at 44.1 kHz
        recorded_audio = sd.rec(int(5 * 44100), samplerate=44100, channels=2, dtype="int16")
        sd.wait()
        sf.write("recorded_audio.wav", recorded_audio, samplerate=44100)
    st.audio("recorded_audio.wav", format="audio/wav", start_time=0)
    # Transcribe the recording and speak the LLM's response
    with open("recorded_audio.wav", "rb") as audio_file:
        segments, info = model.transcribe(io.BytesIO(audio_file.read()), beam_size=10)
        for segment in segments:
            prompt = segment.text
            print(prompt)
            st.text(prompt)
            if prompt:
                # Send the transcribed text to the local Ollama model
                response = llm.invoke(prompt)
                st.success("Response: " + response)
                speak(response)
                st.stop()
            else:
                st.error("Failed to transcribe audio.")