edithram23 committed
Commit: 7661630 · Parent(s): 1b69370

initial comit

Files changed (4)
  1. app.py +87 -0
  2. requirements.txt +0 -0
  3. retriever.py +70 -0
  4. setup.py +218 -0
app.py ADDED
@@ -0,0 +1,87 @@
+import gradio as gr
+from dotenv import load_dotenv
+from gradio import ChatMessage
+from deepgram import DeepgramClient, SpeakOptions
+from setup import Script, Vector_db, Speech_Text
+from langchain_openai import ChatOpenAI
+load_dotenv()
+
+bot = Script()
+vector = Vector_db()
+transcriptor = Speech_Text()
+pdf_uploaded = False
+output_id = None
+
+# Function to generate chatbot response
+def generate_response(chat_history: list[ChatMessage], id=None):
+    user_input = chat_history[-1]["content"]
+    if len(chat_history) > 1:
+        chat = bot.history(chat_history[:-2])
+    else:
+        chat = ''
+    if id is not None:
+        rag_chain, question = bot.gpt_loaders_id(user_input, chat, id)
+    else:
+        rag_chain, question = bot.gpt_loaders(user_input, chat)
+    return rag_chain.invoke(question)
+
+
+def process(audio, input_text, pdfs, chat_history: list[ChatMessage]):
+    global pdf_uploaded, input_pdf, output_id
+    if pdfs is not None and not pdf_uploaded:
+        pdf_uploaded = True
+        pdf_path = pdfs.name
+        output_id = vector.upload_pdfs_user(pdf_path)
+        print(output_id)
+    if pdfs is None:
+        pdf_uploaded = False
+        output_id = None
+        print(output_id)
+    if audio is not None:
+        transcript = transcriptor.get_transcript(audio)
+        chat_history.append({"role": "user", "content": transcript})
+
+    elif input_text:
+        print(input_text)
+        chat_history.append({"role": "user", "content": input_text})
+
+    else:
+        response = 'Provide a query text or an audio to query.'
+        chat_history.append({"role": "assistant", "content": response})
+        audio_data = transcriptor.speech_synthesis(response)
+        return audio_data, chat_history
+
+    response = generate_response(chat_history, output_id)
+    chat_history.append({"role": "assistant", "content": response})
+    audio_data = transcriptor.speech_synthesis(response)
+    return audio_data, chat_history
+
+# Create Gradio Blocks interface
+with gr.Blocks() as demo:
+    gr.Markdown("""
+    # 🎤 Welcome to the ChatBot
+    This bot has a knowledge base of Indian taxation data by default and lets you chat with an AI assistant using either **text** or **voice**.<br>You can also upload your own PDF as a knowledge base via **Upload PDF** and talk to your data seamlessly.
+    """)
+    with gr.Row():
+        with gr.Column(scale=1, min_width=300):
+            input_pdf = gr.File(label="Upload PDF", file_types=[".pdf"], file_count='single')
+            gr.Markdown("_Use a PDF to enhance the chatbot's knowledge!_", visible=not pdf_uploaded)
+
+    with gr.Row():
+        chatbot = gr.Chatbot(label="Chatbot Conversation", type="messages", bubble_full_width=True, show_copy_button=True, autoscroll=True)
+
+    with gr.Row():
+        input_textbox = gr.Textbox(label="Input Text", placeholder="Type your message here...")
+        input_audio = gr.Audio(label="Input Audio", sources="microphone", type="numpy")
+
+    process_button = gr.Button("Submit Query")
+    output_audio = gr.Audio(label="Assistant's Response Audio", interactive=False, autoplay=True)
+
+    process_button.click(
+        fn=process,
+        inputs=[input_audio, input_textbox, input_pdf, chatbot],
+        outputs=[output_audio, chatbot]
+    )
+
+if __name__ == "__main__":
+    demo.launch()
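
A minimal sketch of exercising the text-only path of process outside the Gradio UI, assuming the same .env keys app.py loads (OPENAI_API_KEY, QDRANT_URL, QDRANT_API_KEY, GROQ_API_KEY, VOICE_API_KEY) are set; the query string is illustrative only:

    from app import process   # building the Blocks UI at import time is safe; launch() is guarded

    chat_history = []
    # No audio and no PDF: the default Indian-taxation collection is used.
    audio_bytes, chat_history = process(None, "How is GST levied on services?", None, chat_history)
    print(chat_history[-1]["content"])   # assistant's text reply
    # audio_bytes holds the Deepgram MP3 of the same reply (None if synthesis failed)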
requirements.txt ADDED
Binary file (5.15 kB).
 
retriever.py ADDED
@@ -0,0 +1,70 @@
+import os
+from langchain_openai import OpenAIEmbeddings
+from qdrant_client import QdrantClient
+from langchain_qdrant import QdrantVectorStore
+from qdrant_client.http import models
+
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv('.env')
+
+class Retriever():
+    def __init__(self):
+        # Initialize Qdrant client
+        qdrant_client = QdrantClient(
+            url=os.getenv("QDRANT_URL"),
+            api_key=os.getenv("QDRANT_API_KEY")
+        )
+        # Initialize Qdrant vector stores
+        self.vector_store = QdrantVectorStore(
+            client=qdrant_client,
+            collection_name="siel-ai-assignment",
+            embedding=OpenAIEmbeddings(),
+        )
+        self.vector_store_user = QdrantVectorStore(
+            client=qdrant_client,
+            collection_name="siel-ai-user",
+            embedding=OpenAIEmbeddings(),
+        )
+        self.filters = ['Taxation-Goods-and-service-Tax',
+                        'Taxation-INCOME-TAX-LAW',
+                        'Direct Tax Laws and International Taxation',
+                        'Indirect Tax Laws',
+                        'INDIAN Income Tax ACTS',
+                        'ONLINESITES']
+
+    def filter(self, query):
+        retriever1 = self.vector_store.as_retriever(
+            search_type="similarity_score_threshold",
+            search_kwargs={"k": 7,
+                           'score_threshold': 0.7,
+                           'filter': models.Filter(must=[models.FieldCondition(key="metadata.DOCUMENT_IS_ABOUT", match=models.MatchValue(value=self.filters[-1]))])
+                           },
+        )
+        retriever2 = self.vector_store.as_retriever(
+            search_type="similarity_score_threshold",
+            search_kwargs={"k": 17,
+                           'score_threshold': 0.7,
+                           'filter': models.Filter(must_not=[models.FieldCondition(key="metadata.DOCUMENT_IS_ABOUT", match=models.MatchValue(value=self.filters[-1]))])
+                           },
+        )
+        ret = retriever1.invoke(query) + retriever2.invoke(query)
+        return ret
+
+    def id_filter(self, query, id):
+        retriever1 = self.vector_store_user.as_retriever(
+            search_type="similarity_score_threshold",
+            search_kwargs={"k": 10,
+                           'score_threshold': 0.7,
+                           'filter': models.Filter(must=[models.FieldCondition(key="metadata.ID", match=models.MatchValue(value=id))])
+                           }
+        )
+        ret = retriever1.invoke(query)
+        return ret
+
+    def data_retrieve(self, query=''):
+        retrieved_docs = self.vector_store.similarity_search_with_score(query, k=20)
+        return [doc for doc, _ in retrieved_docs]
+
+
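
A short, hedged sketch of how these retrievers are meant to be called; the query text and id value are placeholders, and QDRANT_URL, QDRANT_API_KEY and an OpenAI key must be available via .env:

    from retriever import Retriever

    r = Retriever()
    # Default knowledge base: up to 7 ONLINESITES chunks plus up to 17 from the other tax sources.
    docs = r.filter("What is the income tax slab for individuals?")
    for doc in docs[:3]:
        print(doc.metadata.get("DOCUMENT_IS_ABOUT"), doc.page_content[:80])

    # User upload: only chunks whose metadata.ID matches the sha256 identifier returned at upload time.
    user_docs = r.id_filter("Summarise the first section", id="<identifier-from-upload>")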
setup.py ADDED
@@ -0,0 +1,218 @@
+from langchain_core.prompts import PromptTemplate
+from langchain_openai import ChatOpenAI
+from langchain_core.output_parsers import StrOutputParser
+from retriever import Retriever
+from qdrant_client import QdrantClient
+from qdrant_client.http.models import Distance, VectorParams
+import os
+import io
+from langchain_qdrant import QdrantVectorStore
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_openai import OpenAIEmbeddings
+from langchain_community.document_loaders import PyPDFLoader
+from openai import OpenAI
+from groq import Groq
+import soundfile as sf
+from deepgram import DeepgramClient, SpeakOptions
+from langchain_groq import ChatGroq
+import hashlib
+import time
+from uuid import uuid4
+from dotenv import load_dotenv
+
+load_dotenv('.env')
+
+class Script():
+    def __init__(self):
+        self.retriever = Retriever()
+        self.openai_client = ChatOpenAI(model="gpt-4o")
+        self.groq = ChatGroq(model='llama3-70b-8192')
+
+
+    def format_docs(self, format_results, id=False):
+        formatted_docs = []
+        for i, doc in enumerate(format_results, start=1):
+            if id == True:
+                metadata = doc.metadata['DOCUMENT_NAME']
+            else:
+                metadata = doc.metadata['DOCUMENT_IS_ABOUT']
+            page = doc.page_content.strip()
+            content = f"**DOC {i}. METADATA : This DOC is about {metadata} \n CONTENT:{page}**"
+            formatted_docs.append(content)
+        return "".join(formatted_docs)
+
+    def history(self, hist):
+        text = ''
+        for i in hist:
+            if i['content'] != 'Sorry! Unable to find an answer for your question. Try Again.':
+                text += '|Role:' + i['role'] + 'Content:' + i['content'] + '|'
+        return text
+    def gpt_loaders(self, query: str, history: str):
+        template = f"""
+        # You are an excellent Question & Answering BOT. Given a question and the context, you will answer the question only based on the given context.
+        # You will be given a user_query (or) User_question (or) User_scenario.
+        # TASK: Your task is to provide an Answer to the USER_QUERY with the given CONTEXT_DATA.
+        ===============================
+        #USER_QUERY : {{question}}
+        ===============================
+        #METADATA_OF_CONTEXT : -> The context given is related to INDIAN-TAXATIONS.
+        -> It may contain how to calculate tax for GOODS/SERVICES/INDIVIDUAL/CARS/TRAINS/etc, anything related to INDIAN TAXES.
+        -> Based on the user_query use the context accordingly.
+        -> You can also provide a rough calculation as an example if asked for tax calculations related to the CONTEXT (if it is available in the CONTEXT).
+        #CONTEXT : {{context}}
+        ===============================
+        You are also given previous ChatHistories (User question and corresponding AI answer) as extra data.
+        --# When to take the history as CONTEXT : Only if the history is relevant to the current question are you permitted to take the chat history as context.
+        --# If it is not relevant to the current question do not take it.
+        #Chat History : {{history}}
+        ===============================
+        -> You are allowed to provide the answer only from the given context.
+        -> Don't provide your own answer that is not in the given context.
+        -> If you are not able to answer the given question from the context => PROVIDE "Sorry! Unable to find an answer for your question. Try Again."
+        -> Try to be precise and provide a proper output for the question. Don't explain any answer at too much length (max 100 words).
+        -> Provide an answer only to the question that is asked.
+        ===============================
+        # OUTPUT FORMAT:
+        -> Your output may be given to a voice model for speech output. Try to be precise with your words. At the same time, make the answer complete for the user.
+        -> Don't provide any extra explanation apart from the answer output.
+        """
+        rag_prompt = PromptTemplate.from_template(template)
+        rag_chain = (
+            rag_prompt
+            | self.openai_client
+            | StrOutputParser()
+        )
+        question = {"context": self.format_docs(self.retriever.data_retrieve(query)), "question": query, "history": history}
+        return rag_chain, question
+
+    def gpt_loaders_id(self, query: str, history: str, id: str):
+        template = f"""
+        # You are an excellent Question & Answering BOT. Given a question and the context, you will answer the question only based on the given context.
+        # You will be given a user_query (or) User_question (or) User_scenario.
+        # TASK: Your task is to provide an Answer to the USER_QUERY with the given CONTEXT_DATA.
+        ===============================
+        #USER_QUERY : {{question}}
+        ===============================
+        #METADATA_OF_CONTEXT : -> The context given is taken from the user's PDF input.
+        -> Based on the user_query use the context accordingly.
+        #CONTEXT : {{context}}
+        ===============================
+        You are also given previous ChatHistories (User question and corresponding AI answer) as extra data.
+        --# When to take the history as CONTEXT : Only if the history is relevant to the current question are you permitted to take the chat history as context.
+        --# If it is not relevant to the current question do not take it.
+        #Chat History : {{history}}
+        ===============================
+        -> You are allowed to provide the answer only from the given context.
+        -> Don't provide your own answer that is not in the given context.
+        -> If you are not able to answer the given question from the context => PROVIDE "Sorry! Unable to find an answer for your question. Try Again."
+        -> Try to be precise and provide a proper output for the question. Don't explain any answer at too much length (max 100 words).
+        -> Provide an answer only to the question that is asked.
+        ===============================
+        # OUTPUT FORMAT:
+        -> Your output may be given to a voice model for speech output. Try to be precise with your words. At the same time, make the answer complete for the user.
+        -> Don't provide any extra explanation apart from the answer output.
+        """
+        rag_prompt = PromptTemplate.from_template(template)
+        rag_chain = (
+            rag_prompt
+            | self.groq
+            | StrOutputParser()
+        )
+        question = {"context": self.format_docs(self.retriever.id_filter(query, id), id=True), "question": query, "history": history}
+        return rag_chain, question
+
+class Vector_db():
+    def __init__(self):
+        self.text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1024,
+            chunk_overlap=256,
+            length_function=len,
+            is_separator_regex=False,
+        )
+        self.qdrant_client = QdrantClient(
+            url=os.getenv("QDRANT_URL"),
+            api_key=os.getenv("QDRANT_API_KEY")
+        )
+        self.openai_client = OpenAI()
+
+    def get_embed(self, texts):
+        return self.openai_client.embeddings.create(input=texts, model="text-embedding-3-large").data[0].embedding
+
+    def text_split(self, full_text, meta):
+        documents = self.text_splitter.create_documents([full_text], metadatas=[meta])
+        return documents
+
+    def load_data(self, pdf_path: str):
+        loader = PyPDFLoader(pdf_path)
+        file = loader.load()
+        text = ''
+        for i in file:
+            text += i.page_content
+        return text
+
+    def getdocs(self, about, filename):
+        text = self.load_data(filename)
+        data = (text + str(time.time())).encode('utf-8')
+        identifier = hashlib.sha256(data).hexdigest()
+        metadata = {'DOCUMENT_NAME': about, 'ID': str(identifier)}
+        documents = self.text_split(text, metadata)
+        return documents, identifier
+
+    def upload_pdfs_user(self, path, delete=False):
+        if delete == True:
+            if self.qdrant_client.collection_exists("siel-ai-user"):
+                self.qdrant_client.delete_collection("siel-ai-user")
+        if not self.qdrant_client.collection_exists("siel-ai-user"):
+            self.qdrant_client.create_collection(
+                collection_name="siel-ai-user",
+                vectors_config=VectorParams(size=1536,
+                                            distance=Distance.COSINE),
+            )
+        vector_store = QdrantVectorStore(
+            client=self.qdrant_client,
+            collection_name="siel-ai-user",
+            embedding=OpenAIEmbeddings(),
+        )
+        documents = []
+        meta_data = os.path.basename(path)
+        docs, identifier = self.getdocs(meta_data, path)
+        documents += docs
+        # One uuid4 point id per chunk; the sha256 identifier stored in the metadata groups all chunks of this upload so that particular doc alone can be used as context.
+        ids = [str(uuid4()) for _ in range(len(documents))]
+        vector_store.add_documents(documents=documents, ids=ids)
+        return identifier
+
+class Speech_Text():
+    def __init__(self):
+        self.client = Groq(api_key=os.getenv("GROQ_API_KEY"))
+        self.deepgram = DeepgramClient(os.environ.get("VOICE_API_KEY"))
+        self.options = SpeakOptions(
+            model="aura-luna-en",
+        )
+
+    # Function to get transcript from audio
+    def get_transcript(self, audio):
+        audio_buffer = io.BytesIO()
+        sf.write(audio_buffer, audio[1], samplerate=audio[0], format="MP3")
+        audio_buffer.seek(0)
+        translation = self.client.audio.transcriptions.create(
+            file=("audio.mp3", audio_buffer.read()),
+            model="distil-whisper-large-v3-en",
+            response_format="json",
+            temperature=0.0,
+        )
+
+        return translation.text
+
+    # Function for speech synthesis
+    def speech_synthesis(self, text: str):
+        TEXT = {"text": text}
+        FILENAME = "audio.mp3"
+        try:
+            self.deepgram.speak.v("1").save(FILENAME, TEXT, self.options)
+            with open(FILENAME, "rb") as audio_file:
+                audio_data = audio_file.read()
+            return audio_data
+        except Exception as e:
+            print(f"Exception: {e}")
+            return None
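
Taken together, a hedged end-to-end sketch of the user-PDF path that app.py drives; the file name is a placeholder and the same API keys as above are assumed to be set:

    from setup import Script, Vector_db, Speech_Text

    vector = Vector_db()
    doc_id = vector.upload_pdfs_user("my_notes.pdf")   # chunks, embeds and stores the PDF; returns its sha256 identifier

    bot = Script()
    rag_chain, question = bot.gpt_loaders_id("What does this document say about deductions?", history='', id=doc_id)
    answer = rag_chain.invoke(question)

    speech = Speech_Text()
    mp3_bytes = speech.speech_synthesis(answer)        # Deepgram TTS; None if the request fails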