import os

import streamlit as st
import whisper
from audiorecorder import audiorecorder
from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS

load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

st.title("Avtarcoach Audio-to-text")

# Alternative recorder widget, kept for reference:
# audio_bytes = audio_recorder("Click to record", "Click to stop recording",
#                              neutral_color="#051082", icon_size="2x")
# if audio_bytes:
#     st.audio(audio_bytes, format="audio/wav")

audio = audiorecorder("Click to record", "Click to stop recording")

if len(audio) > 0:
    # Play the recording back in the frontend
    st.audio(audio.export().read())

    # Save the recording to a file with pydub's export method
    audio.export("audio.wav", format="wav")

    # Show audio properties from the pydub AudioSegment
    st.write(
        f"Frame rate: {audio.frame_rate}, Frame width: {audio.frame_width}, "
        f"Duration: {audio.duration_seconds} seconds"
    )

    model = whisper.load_model("base")

    # Load the audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio("audio.wav")
    audio = whisper.pad_or_trim(audio)

    # Make a log-Mel spectrogram and move it to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Detect the spoken language
    _, probs = model.detect_language(mel)
    st.write(f"Detected language: {max(probs, key=probs.get)}")

    # Decode the audio (fp16=False keeps decoding runnable on CPU)
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)

    # Show the recognized text
    st.write("You said: ", result.text)
    input_text = result.text

st.markdown("""
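# --- Hedged sketch, not the original author's code: the LangChain imports
# above (DirectoryLoader, RecursiveCharacterTextSplitter, OpenAIEmbeddings,
# FAISS, RetrievalQA, ChatOpenAI) are never used in this section, which
# suggests the transcript is meant to drive a retrieval-augmented QA step.
# Everything below is an assumption shown only to illustrate how `input_text`
# could be wired into RetrievalQA: the "docs/" folder, the chunk sizes, and
# the names `qa_chain` / `answer` are all hypothetical.
if len(audio) > 0:
    docs = DirectoryLoader("docs/").load()  # assumes a local docs/ folder of source files
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=100
    ).split_documents(docs)
    vectorstore = FAISS.from_documents(chunks, OpenAIEmbeddings(openai_api_key=api_key))
    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(openai_api_key=api_key),
        retriever=vectorstore.as_retriever(),
    )
    # Answer the spoken question against the indexed documents
    answer = qa_chain.run(input_text)
    st.write("Answer: ", answer)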