# ref: https://www.youtube.com/watch?v=3ZDVmzlM6Nc
# ref: https://github.com/plaban1981/Agents/blob/main/Audio_powered_RAG_using_langchain_groq.ipynb

import os
import chromadb
import streamlit as st
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_groq import ChatGroq
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from PyPDF2 import PdfReader
from groq import Groq
from streamlit_webrtc import webrtc_streamer, AudioProcessorBase, WebRtcMode
import av
from pydub import AudioSegment
from io import BytesIO

# Clear ChromaDB's cached system client to avoid the "tenant" error on Streamlit reruns
chromadb.api.client.SharedSystemClient.clear_system_cache()

# Ensure required environment variables are set
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    st.error("GROQ_API_KEY is not set. Please configure it in environment variables.")
    st.stop()

# Initialize the Groq client (audio transcription) and the chat LLM
groq_client = Groq(api_key=GROQ_API_KEY)
llm = ChatGroq(model="llama-3.1-70b-versatile", temperature=0, groq_api_key=GROQ_API_KEY)


def process_and_store_pdfs(uploaded_files):
    """Extract text from the uploaded PDFs and index it in a persistent Chroma vector store."""
    texts = []
    for uploaded_file in uploaded_files:
        reader = PdfReader(uploaded_file)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:  # extract_text() can return None or "" for image-only pages
                texts.append(page_text)
    # Each PDF page becomes one chunk; no additional text splitting is applied
    embeddings = HuggingFaceEmbeddings()
    vectorstore = Chroma.from_texts(texts, embedding=embeddings, persist_directory="vector_db_dir")
    return vectorstore


def chat_chain(vectorstore):
    """Build a conversational retrieval chain over the vector store with chat memory."""
    retriever = vectorstore.as_retriever()
    memory = ConversationBufferMemory(output_key="answer", memory_key="chat_history", return_messages=True)
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        memory=memory,
        verbose=True,
        return_source_documents=True,
    )
    return chain


def transcribe_audio(file_path):
    """Transcribe an audio file using Groq's Whisper model."""
    with open(file_path, "rb") as file:
        transcription = groq_client.audio.transcriptions.create(
            file=(file_path, file.read()),
            model="distil-whisper-large-v3-en",
            response_format="json",
            language="en",
        )
    return transcription.text


class AudioProcessor(AudioProcessorBase):
    """Accumulates raw audio from the WebRTC stream into an in-memory buffer."""

    def __init__(self):
        self.audio_buffer = BytesIO()

    def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
        # Append the frame's PCM data to the buffer (assumes 16-bit mono samples)
        audio_segment = AudioSegment(
            data=frame.to_ndarray().tobytes(),
            sample_width=2,
            frame_rate=frame.sample_rate,
            channels=1,
        )
        self.audio_buffer.write(audio_segment.raw_data)
        return frame

    def get_audio_data(self):
        return self.audio_buffer


# Streamlit UI
st.title("Chat with Docs via Speech/Text 🗣️📝📚")

uploaded_files = st.file_uploader("Upload PDF Files", accept_multiple_files=True, type=["pdf"])

if uploaded_files:
    vectorstore = process_and_store_pdfs(uploaded_files)
    chain = chat_chain(vectorstore)
    st.success("PDFs processed! Ready to chat.")
    input_method = st.radio("Choose Input Method", ["Text Input", "Record Audio", "Upload Audio File"])

    # Text input mode
    if input_method == "Text Input":
        query = st.text_input("Ask your question:")
        if query:
            with st.spinner("Thinking..."):
                response = chain({"question": query})["answer"]
            st.write(f"**Response:** {response}")

    # Record-audio mode
    elif input_method == "Record Audio":
        st.write("Record your audio query:")
        # Keep one AudioProcessor instance across Streamlit reruns so the recorded
        # buffer is not lost when the script re-executes
        if "audio_processor" not in st.session_state:
            st.session_state.audio_processor = AudioProcessor()
        audio_processor = st.session_state.audio_processor

        webrtc_ctx = webrtc_streamer(
            key="record",
            mode=WebRtcMode.SENDONLY,
            audio_processor_factory=lambda: audio_processor,
            media_stream_constraints={"audio": True, "video": False},
        )

        if webrtc_ctx.state.playing:
            st.write("Recording... Speak now.")
        else:
            # The stream is not playing; process whatever audio has been captured
            audio_data = audio_processor.get_audio_data()
            if audio_data.getbuffer().nbytes > 0:
                st.write("Recording stopped. Processing...")
                # Wrap the raw 16-bit mono PCM buffer as a WAV file
                # (assumes the WebRTC stream delivers 48 kHz audio, which is typical)
                audio_file_path = "recorded_audio.wav"
                audio_segment = AudioSegment.from_file(
                    BytesIO(audio_data.getvalue()),
                    format="raw",
                    frame_rate=48000,
                    channels=1,
                    sample_width=2,
                )
                audio_segment.export(audio_file_path, format="wav")
                st.success("Recording saved successfully!")

                # Transcribe and generate a response
                st.write("Transcribing audio...")
                transcription = transcribe_audio(audio_file_path)
                st.write(f"**You said:** {transcription}")

                with st.spinner("Generating response..."):
                    response = chain({"question": transcription})["answer"]
                st.write(f"**Response:** {response}")

    # Upload-audio-file mode
    elif input_method == "Upload Audio File":
        uploaded_audio = st.file_uploader("Upload an audio file (.wav, .mp3)", type=["wav", "mp3"])
        if uploaded_audio:
            # Persist the upload to disk so it can be replayed and sent for transcription
            audio_file_path = "uploaded_audio.wav"
            with open(audio_file_path, "wb") as f:
                f.write(uploaded_audio.read())

            st.audio(audio_file_path, format="audio/wav")
            st.write("Transcribing audio...")
            transcription = transcribe_audio(audio_file_path)
            st.write(f"**You said:** {transcription}")

            with st.spinner("Generating response..."):
                response = chain({"question": transcription})["answer"]
            st.write(f"**Response:** {response}")
else:
    st.info("Please upload PDF files to start chatting.")
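
# Usage (a minimal sketch, assuming this script is saved as app.py and the packages
# imported above are installed, e.g. via pip):
#   export GROQ_API_KEY="your-groq-api-key"
#   streamlit run app.py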