#ref: https://www.youtube.com/watch?v=3ZDVmzlM6Nc ; https://github.com/plaban1981/Agents/blob/main/Audio_powered_RAG_using_langchain_groq.ipynb

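"""Audio-powered RAG chat app: upload PDFs, then ask questions about them by
typed text, recorded speech, or an uploaded audio file. Answers come from a
ConversationalRetrievalChain over a Chroma vector store; speech is transcribed
with Groq's hosted Whisper.

Run with (assuming this file is saved as app.py):

    streamlit run app.py
"""
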
import os
import chromadb
import streamlit as st
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_groq import ChatGroq
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from PyPDF2 import PdfReader
from groq import Groq
from streamlit_webrtc import webrtc_streamer, AudioProcessorBase, WebRtcMode
import av
from pydub import AudioSegment
from io import BytesIO
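
# Assumed PyPI package names for the imports above (pin versions as needed):
#   pip install streamlit langchain langchain-chroma langchain-groq \
#       langchain-huggingface chromadb groq PyPDF2 streamlit-webrtc av pydub
# HuggingFaceEmbeddings additionally needs sentence-transformers, and pydub
# needs ffmpeg on the PATH to handle mp3 files.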

# Clear ChromaDB cache to fix tenant issue
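# ("Could not connect to tenant default_tenant" errors on Streamlit reruns)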
chromadb.api.client.SharedSystemClient.clear_system_cache()

# Ensure required environment variables are set
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    st.error("GROQ_API_KEY is not set. Please configure it in environment variables.")
    st.stop()

# Initialize Groq Client for transcription and LLM
groq_client = Groq(api_key=GROQ_API_KEY)
llm = ChatGroq(model="llama-3.1-70b-versatile", temperature=0, groq_api_key=GROQ_API_KEY)

# Function to process PDFs and set up the vectorstore
def process_and_store_pdfs(uploaded_files):
    texts = []
    for uploaded_file in uploaded_files:
        reader = PdfReader(uploaded_file)
        for page in reader.pages:
            text = page.extract_text()
            if text:  # extract_text() can return None/"" on image-only pages
                texts.append(text)

    embeddings = HuggingFaceEmbeddings()
    vectorstore = Chroma.from_texts(texts, embedding=embeddings, persist_directory="vector_db_dir")
    return vectorstore
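
# Possible refinement (a sketch; assumes langchain-text-splitters is installed):
# whole pages can be long, and splitting them into smaller chunks usually
# improves retrieval quality, e.g.
#
#   from langchain_text_splitters import RecursiveCharacterTextSplitter
#   splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
#   texts = [chunk for t in texts for chunk in splitter.split_text(t)]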

# Function to set up the chat chain
def chat_chain(vectorstore):
    retriever = vectorstore.as_retriever()
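    # output_key="answer" matters: the chain returns both "answer" and
    # "source_documents", and memory must know which one to store.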
    memory = ConversationBufferMemory(output_key="answer", memory_key="chat_history", return_messages=True)

    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        memory=memory,
        verbose=True,
        return_source_documents=True
    )
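    # chain_type="stuff" concatenates all retrieved chunks into one prompt;
    # simple, but it can exceed the context window for very large documents.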
    return chain

# Transcribe audio using Groq Whisper
def transcribe_audio(file_path):
    """Transcribe audio using Groq's Whisper model."""
    with open(file_path, "rb") as file:
        transcription = groq_client.audio.transcriptions.create(
            file=(file_path, file.read()),
            model="distil-whisper-large-v3-en",
            response_format="json",
            language="en"
        )
    return transcription.text
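
# Note: distil-whisper-large-v3-en is English-only; Groq also serves
# whisper-large-v3 for multilingual audio.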

# Audio Processor Class for Recording
class AudioProcessor(AudioProcessorBase):
    def __init__(self):
        self.audio_buffer = BytesIO()
        # Sensible defaults; overwritten with the actual frame layout in recv().
        self.sample_rate = 48000
        self.channels = 1

    def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
        # Append the frame's raw PCM to the buffer and remember its layout,
        # so the buffer can be decoded correctly once recording stops.
        # aiortc normally delivers 16-bit ("s16") PCM here.
        self.sample_rate = frame.sample_rate
        self.channels = len(frame.layout.channels)
        self.audio_buffer.write(frame.to_ndarray().tobytes())
        return frame

    def get_audio_data(self):
        return self.audio_buffer

# Streamlit UI
st.title("Chat with Docs via Speech/Text πŸ—£οΈπŸ“πŸ“š")

uploaded_files = st.file_uploader("Upload PDF Files", accept_multiple_files=True, type=["pdf"])

if uploaded_files:
    # Build the vectorstore and chain once per session; otherwise every
    # Streamlit rerun would re-embed the PDFs and reset the chat memory.
    if "chain" not in st.session_state:
        vectorstore = process_and_store_pdfs(uploaded_files)
        st.session_state.chain = chat_chain(vectorstore)
    chain = st.session_state.chain
    st.success("PDFs processed! Ready to chat.")

    input_method = st.radio("Choose Input Method", ["Text Input", "Record Audio", "Upload Audio File"])

    # Text Input Mode
    if input_method == "Text Input":
        query = st.text_input("Ask your question:")
        if query:
            with st.spinner("Thinking..."):
                response = chain.invoke({"question": query})["answer"]
                st.write(f"**Response:** {response}")

    # Record Audio
    elif input_method == "Record Audio":
        st.write("Record your audio query:")
        # Keep one processor instance per session: Streamlit reruns this script
        # on every interaction, and a fresh AudioProcessor would lose the
        # buffered audio between reruns.
        if "audio_processor" not in st.session_state:
            st.session_state.audio_processor = AudioProcessor()
        audio_processor = st.session_state.audio_processor

        # When deployed beyond localhost, webrtc_streamer typically also needs
        # an rtc_configuration with STUN/TURN servers.
        webrtc_ctx = webrtc_streamer(
            key="record",
            mode=WebRtcMode.SENDONLY,
            audio_processor_factory=lambda: audio_processor,
            media_stream_constraints={"audio": True, "video": False},
        )

        if webrtc_ctx.state.playing:
            st.write("Recording... Speak now.")
        else:
            audio_data = audio_processor.get_audio_data()

            # The streamer state exposes no reliable "stopped" flag, so check
            # whether any audio has actually been buffered before processing.
            if audio_data.getbuffer().nbytes > 0:
                st.write("Recording stopped. Processing...")
                # Save the raw PCM as WAV, using the layout captured in recv()
                audio_file_path = "recorded_audio.wav"
                audio_segment = AudioSegment(
                    data=audio_data.getvalue(),
                    sample_width=2,
                    frame_rate=audio_processor.sample_rate,
                    channels=audio_processor.channels,
                )
                audio_segment.export(audio_file_path, format="wav")
                st.success("Recording saved successfully!")

                # Transcribe and Generate Response
                st.write("Transcribing audio...")
                transcription = transcribe_audio(audio_file_path)
                st.write(f"**You said:** {transcription}")

                with st.spinner("Generating response..."):
                    response = chain.invoke({"question": transcription})["answer"]
                    st.write(f"**Response:** {response}")

    # Upload Audio File Mode
    elif input_method == "Upload Audio File":
        uploaded_audio = st.file_uploader("Upload an audio file (.wav, .mp3)", type=["wav", "mp3"])
        if uploaded_audio:
            # Keep the original extension so an uploaded .mp3 is not
            # mislabeled as .wav for playback and transcription
            suffix = os.path.splitext(uploaded_audio.name)[1].lower() or ".wav"
            audio_file_path = f"uploaded_audio{suffix}"
            with open(audio_file_path, "wb") as f:
                f.write(uploaded_audio.read())

            st.audio(audio_file_path, format="audio/mpeg" if suffix == ".mp3" else "audio/wav")
            st.write("Transcribing audio...")
            transcription = transcribe_audio(audio_file_path)
            st.write(f"**You said:** {transcription}")

            with st.spinner("Generating response..."):
                response = chain.invoke({"question": transcription})["answer"]
                st.write(f"**Response:** {response}")
else:
    st.info("Please upload PDF files to start chatting.")