edithram23 committed
Commit: 7661630 · Parent(s): 1b69370

initial comit

Files changed (4)
  1. app.py +87 -0
  2. requirements.txt +0 -0
  3. retriever.py +70 -0
  4. setup.py +218 -0
app.py ADDED
@@ -0,0 +1,87 @@
+import gradio as gr
+from dotenv import load_dotenv
+from gradio import ChatMessage
+from deepgram import DeepgramClient, SpeakOptions
+from setup import Script, Vector_db, Speech_Text
+from langchain_openai import ChatOpenAI
+load_dotenv()
+
+bot = Script()
+vector = Vector_db()
+transcriptor = Speech_Text()
+pdf_uploaded = False
+output_id = None
+
+# Function to generate chatbot response
+def generate_response(chat_history: list[ChatMessage], id=None):
+    user_input = chat_history[-1]["content"]
+    if len(chat_history) > 1:
+        chat = bot.history(chat_history[:-2])
+    else:
+        chat = ''
+    if id is not None:
+        rag_chain, question = bot.gpt_loaders_id(user_input, chat, id)
+    else:
+        rag_chain, question = bot.gpt_loaders(user_input, chat)
+    return rag_chain.invoke(question)
+
+
+def process(audio, input_text, pdfs, chat_history: list[ChatMessage]):
+    global pdf_uploaded, input_pdf, output_id
+    if pdfs is not None and not pdf_uploaded:
+        pdf_uploaded = True
+        pdf_path = pdfs.name
+        output_id = vector.upload_pdfs_user(pdf_path)
+        print(output_id)
+    if pdfs is None:
+        pdf_uploaded = False
+        output_id = None
+        print(output_id)
+    if audio is not None:
+        transcript = transcriptor.get_transcript(audio)
+        chat_history.append({"role": "user", "content": transcript})
+
+    elif input_text:
+        print(input_text)
+        chat_history.append({"role": "user", "content": input_text})
+
+    else:
+        response = 'Provide a query text or an audio to query.'
+        chat_history.append({"role": "assistant", "content": response})
+        audio_data = transcriptor.speech_synthesis(response)
+        return audio_data, chat_history
+
+    response = generate_response(chat_history, output_id)
+    chat_history.append({"role": "assistant", "content": response})
+    audio_data = transcriptor.speech_synthesis(response)
+    return audio_data, chat_history
+
+# Create Gradio Blocks interface
+with gr.Blocks() as demo:
+    gr.Markdown("""
+    # 🎤 Welcome to the ChatBot
+    This bot has a knowledge base of Indian taxation data by default and lets you chat with an AI assistant using either **text** or **voice**.<br>You can also upload your own PDF as a knowledge base via **Upload PDF** and talk to your data seamlessly.
+    """)
+    with gr.Row():
+        with gr.Column(scale=1, min_width=300):
+            input_pdf = gr.File(label="Upload PDF", file_types=[".pdf"], file_count='single')
+            gr.Markdown("_Use a PDF to enhance the chatbot's knowledge!_", visible=not pdf_uploaded)
+
+    with gr.Row():
+        chatbot = gr.Chatbot(label="Chatbot Conversation", type="messages", bubble_full_width=True, show_copy_button=True, autoscroll=True)
+
+    with gr.Row():
+        input_textbox = gr.Textbox(label="Input Text", placeholder="Type your message here...")
+        input_audio = gr.Audio(label="Input Audio", sources="microphone", type="numpy")
+
+    process_button = gr.Button("Submit Query")
+    output_audio = gr.Audio(label="Assistant's Response Audio", interactive=False, autoplay=True)
+
+    process_button.click(
+        fn=process,
+        inputs=[input_audio, input_textbox, input_pdf, chatbot],
+        outputs=[output_audio, chatbot]
+    )
+
+if __name__ == "__main__":
+    demo.launch()
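
A minimal sketch of exercising the text-only path of process outside the Gradio UI, assuming the same .env keys app.py loads (OPENAI_API_KEY, QDRANT_URL, QDRANT_API_KEY, GROQ_API_KEY, VOICE_API_KEY) are set; the query string is illustrative only:

    from app import process   # building the Blocks UI at import time is safe; launch() is guarded

    chat_history = []
    # No audio and no PDF: the default Indian-taxation collection is used.
    audio_bytes, chat_history = process(None, "How is GST levied on services?", None, chat_history)
    print(chat_history[-1]["content"])   # assistant's text reply
    # audio_bytes holds the Deepgram MP3 of the same reply (None if synthesis failed)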
requirements.txt ADDED
Binary file (5.15 kB).
 
retriever.py ADDED
@@ -0,0 +1,70 @@
+import os
+from langchain_openai import OpenAIEmbeddings
+from qdrant_client import QdrantClient
+from langchain_qdrant import QdrantVectorStore
+from qdrant_client.http import models
+
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv('.env')
+
+class Retriever():
+    def __init__(self):
+        # Initialize Qdrant client
+        qdrant_client = QdrantClient(
+            url=os.getenv("QDRANT_URL"),
+            api_key=os.getenv("QDRANT_API_KEY")
+        )
+        # Initialize Qdrant vector stores
+        self.vector_store = QdrantVectorStore(
+            client=qdrant_client,
+            collection_name="siel-ai-assignment",
+            embedding=OpenAIEmbeddings(),
+        )
+        self.vector_store_user = QdrantVectorStore(
+            client=qdrant_client,
+            collection_name="siel-ai-user",
+            embedding=OpenAIEmbeddings(),
+        )
+        self.filters = ['Taxation-Goods-and-service-Tax',
+                        'Taxation-INCOME-TAX-LAW',
+                        'Direct Tax Laws and International Taxation',
+                        'Indirect Tax Laws',
+                        'INDIAN Income Tax ACTS',
+                        'ONLINESITES']
+
+    def filter(self, query):
+        retriever1 = self.vector_store.as_retriever(
+            search_type="similarity_score_threshold",
+            search_kwargs={"k": 7,
+                           'score_threshold': 0.7,
+                           'filter': models.Filter(must=[models.FieldCondition(key="metadata.DOCUMENT_IS_ABOUT", match=models.MatchValue(value=self.filters[-1]))])
+                           },
+        )
+        retriever2 = self.vector_store.as_retriever(
+            search_type="similarity_score_threshold",
+            search_kwargs={"k": 17,
+                           'score_threshold': 0.7,
+                           'filter': models.Filter(must_not=[models.FieldCondition(key="metadata.DOCUMENT_IS_ABOUT", match=models.MatchValue(value=self.filters[-1]))])
+                           },
+        )
+        ret = retriever1.invoke(query) + retriever2.invoke(query)
+        return ret
+
+    def id_filter(self, query, id):
+        retriever1 = self.vector_store_user.as_retriever(
+            search_type="similarity_score_threshold",
+            search_kwargs={"k": 10,
+                           'score_threshold': 0.7,
+                           'filter': models.Filter(must=[models.FieldCondition(key="metadata.ID", match=models.MatchValue(value=id))])
+                           }
+        )
+        ret = retriever1.invoke(query)
+        return ret
+
+    def data_retrieve(self, query=''):
+        retrieved_docs = self.vector_store.similarity_search_with_score(query, k=20)
+        return [doc for doc, _ in retrieved_docs]
+
+
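
A short, hedged sketch of how these retrievers are meant to be called; the query text and id value are placeholders, and QDRANT_URL, QDRANT_API_KEY and an OpenAI key must be available via .env:

    from retriever import Retriever

    r = Retriever()
    # Default knowledge base: up to 7 ONLINESITES chunks plus up to 17 from the other tax sources.
    docs = r.filter("What is the income tax slab for individuals?")
    for doc in docs[:3]:
        print(doc.metadata.get("DOCUMENT_IS_ABOUT"), doc.page_content[:80])

    # User upload: only chunks whose metadata.ID matches the sha256 identifier returned at upload time.
    user_docs = r.id_filter("Summarise the first section", id="<identifier-from-upload>")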
setup.py ADDED
@@ -0,0 +1,218 @@
+from langchain_core.prompts import PromptTemplate
+from langchain_openai import ChatOpenAI
+from langchain_core.output_parsers import StrOutputParser
+from retriever import Retriever
+from qdrant_client import QdrantClient
+from qdrant_client.http.models import Distance, VectorParams
+import os
+import io
+from langchain_qdrant import QdrantVectorStore
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_openai import OpenAIEmbeddings
+from langchain_community.document_loaders import PyPDFLoader
+from openai import OpenAI
+from groq import Groq
+import soundfile as sf
+from deepgram import DeepgramClient, SpeakOptions
+from langchain_groq import ChatGroq
+import hashlib
+import time
+from uuid import uuid4
+from dotenv import load_dotenv
+
+load_dotenv('.env')
+
+class Script():
+    def __init__(self):
+        self.retriever = Retriever()
+        self.openai_client = ChatOpenAI(model="gpt-4o")
+        self.groq = ChatGroq(model='llama3-70b-8192')
+
+
+    def format_docs(self, format_results, id=False):
+        formatted_docs = []
+        for i, doc in enumerate(format_results, start=1):
+            if id == True:
+                metadata = doc.metadata['DOCUMENT_NAME']
+            else:
+                metadata = doc.metadata['DOCUMENT_IS_ABOUT']
+            page = doc.page_content.strip()
+            content = f"**DOC {i}. METADATA : This DOC is about {metadata} \n CONTENT:{page}**"
+            formatted_docs.append(content)
+        return "".join(formatted_docs)
+
+    def history(self, hist):
+        text = ''
+        for i in hist:
+            if i['content'] != 'Sorry! Unable to find an answer for your question. Try Again.':
+                text += '|Role:' + i['role'] + 'Content:' + i['content'] + '|'
+        return text
+    def gpt_loaders(self, query: str, history: str):
+        template = f"""
+        # You are an excellent Question & Answering BOT. Given a question and the context, you will answer the question only based on the given context.
+        # You will be given a user_query (or) User_question (or) User_scenario.
+        # TASK: Your task is to provide an Answer to the USER_QUERY with the given CONTEXT_DATA.
+        ===============================
+        #USER_QUERY : {{question}}
+        ===============================
+        #METADATA_OF_CONTEXT : -> The context given is related to INDIAN-TAXATIONS.
+        -> It may contain how to calculate tax for GOODS/SERVICES/INDIVIDUAL/CARS/TRAINS/etc, anything related to INDIAN TAXES.
+        -> Based on the user_query use the context accordingly.
+        -> You can also provide a rough calculation as an example if asked for tax calculations related to the CONTEXT (if it is available in the CONTEXT).
+        #CONTEXT : {{context}}
+        ===============================
+        You are also given previous ChatHistories (User question and corresponding AI answer) as extra data.
+        --# When to take the history as CONTEXT : Only if the history is relevant to the current question are you permitted to take the chat history as context.
+        --# If it is not relevant to the current question do not take it.
+        #Chat History : {{history}}
+        ===============================
+        -> You are allowed to provide the answer only from the given context.
+        -> Don't provide your own answer that is not in the given context.
+        -> If you are not able to answer the given question from the context => PROVIDE "Sorry! Unable to find an answer for your question. Try Again."
+        -> Try to be precise and provide a proper output for the question. Don't explain any answer at too much length (max 100 words).
+        -> Provide an answer only to the question that is asked.
+        ===============================
+        # OUTPUT FORMAT:
+        -> Your output may be given to a voice model for speech output. Try to be precise with your words. At the same time, make the answer complete for the user.
+        -> Don't provide any extra explanation apart from the answer output.
+        """
+        rag_prompt = PromptTemplate.from_template(template)
+        rag_chain = (
+            rag_prompt
+            | self.openai_client
+            | StrOutputParser()
+        )
+        question = {"context": self.format_docs(self.retriever.data_retrieve(query)), "question": query, "history": history}
+        return rag_chain, question
+
+    def gpt_loaders_id(self, query: str, history: str, id: str):
+        template = f"""
+        # You are an excellent Question & Answering BOT. Given a question and the context, you will answer the question only based on the given context.
+        # You will be given a user_query (or) User_question (or) User_scenario.
+        # TASK: Your task is to provide an Answer to the USER_QUERY with the given CONTEXT_DATA.
+        ===============================
+        #USER_QUERY : {{question}}
+        ===============================
+        #METADATA_OF_CONTEXT : -> The context given is taken from the user's PDF input.
+        -> Based on the user_query use the context accordingly.
+        #CONTEXT : {{context}}
+        ===============================
+        You are also given previous ChatHistories (User question and corresponding AI answer) as extra data.
+        --# When to take the history as CONTEXT : Only if the history is relevant to the current question are you permitted to take the chat history as context.
+        --# If it is not relevant to the current question do not take it.
+        #Chat History : {{history}}
+        ===============================
+        -> You are allowed to provide the answer only from the given context.
+        -> Don't provide your own answer that is not in the given context.
+        -> If you are not able to answer the given question from the context => PROVIDE "Sorry! Unable to find an answer for your question. Try Again."
+        -> Try to be precise and provide a proper output for the question. Don't explain any answer at too much length (max 100 words).
+        -> Provide an answer only to the question that is asked.
+        ===============================
+        # OUTPUT FORMAT:
+        -> Your output may be given to a voice model for speech output. Try to be precise with your words. At the same time, make the answer complete for the user.
+        -> Don't provide any extra explanation apart from the answer output.
+        """
+        rag_prompt = PromptTemplate.from_template(template)
+        rag_chain = (
+            rag_prompt
+            | self.groq
+            | StrOutputParser()
+        )
+        question = {"context": self.format_docs(self.retriever.id_filter(query, id), id=True), "question": query, "history": history}
+        return rag_chain, question
+
+class Vector_db():
+    def __init__(self):
+        self.text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1024,
+            chunk_overlap=256,
+            length_function=len,
+            is_separator_regex=False,
+        )
+        self.qdrant_client = QdrantClient(
+            url=os.getenv("QDRANT_URL"),
+            api_key=os.getenv("QDRANT_API_KEY")
+        )
+        self.openai_client = OpenAI()
+
+    def get_embed(self, texts):
+        return self.openai_client.embeddings.create(input=texts, model="text-embedding-3-large").data[0].embedding
+
+    def text_split(self, full_text, meta):
+        documents = self.text_splitter.create_documents([full_text], metadatas=[meta])
+        return documents
+
+    def load_data(self, pdf_path: str):
+        loader = PyPDFLoader(pdf_path)
+        file = loader.load()
+        text = ''
+        for i in file:
+            text += i.page_content
+        return text
+
+    def getdocs(self, about, filename):
+        text = self.load_data(filename)
+        data = (text + str(time.time())).encode('utf-8')
+        identifier = hashlib.sha256(data).hexdigest()
+        metadata = {'DOCUMENT_NAME': about, 'ID': str(identifier)}
+        documents = self.text_split(text, metadata)
+        return documents, identifier
+
+    def upload_pdfs_user(self, path, delete=False):
+        if delete == True:
+            if self.qdrant_client.collection_exists("siel-ai-user"):
+                self.qdrant_client.delete_collection("siel-ai-user")
+        if not self.qdrant_client.collection_exists("siel-ai-user"):
+            self.qdrant_client.create_collection(
+                collection_name="siel-ai-user",
+                vectors_config=VectorParams(size=1536,
+                                            distance=Distance.COSINE),
+            )
+        vector_store = QdrantVectorStore(
+            client=self.qdrant_client,
+            collection_name="siel-ai-user",
+            embedding=OpenAIEmbeddings(),
+        )
+        documents = []
+        meta_data = os.path.basename(path)
+        docs, identifier = self.getdocs(meta_data, path)
+        documents += docs
+        # One uuid4 point id per chunk; the sha256 identifier stored in the metadata groups all chunks of this upload so that particular doc alone can be used as context.
+        ids = [str(uuid4()) for _ in range(len(documents))]
+        vector_store.add_documents(documents=documents, ids=ids)
+        return identifier
+
+class Speech_Text():
+    def __init__(self):
+        self.client = Groq(api_key=os.getenv("GROQ_API_KEY"))
+        self.deepgram = DeepgramClient(os.environ.get("VOICE_API_KEY"))
+        self.options = SpeakOptions(
+            model="aura-luna-en",
+        )
+
+    # Function to get transcript from audio
+    def get_transcript(self, audio):
+        audio_buffer = io.BytesIO()
+        sf.write(audio_buffer, audio[1], samplerate=audio[0], format="MP3")
+        audio_buffer.seek(0)
+        translation = self.client.audio.transcriptions.create(
+            file=("audio.mp3", audio_buffer.read()),
+            model="distil-whisper-large-v3-en",
+            response_format="json",
+            temperature=0.0,
+        )
+
+        return translation.text
+
+    # Function for speech synthesis
+    def speech_synthesis(self, text: str):
+        TEXT = {"text": text}
+        FILENAME = "audio.mp3"
+        try:
+            self.deepgram.speak.v("1").save(FILENAME, TEXT, self.options)
+            with open(FILENAME, "rb") as audio_file:
+                audio_data = audio_file.read()
+            return audio_data
+        except Exception as e:
+            print(f"Exception: {e}")
+            return None
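
Taken together, a hedged end-to-end sketch of the user-PDF path that app.py drives; the file name is a placeholder and the same API keys as above are assumed to be set:

    from setup import Script, Vector_db, Speech_Text

    vector = Vector_db()
    doc_id = vector.upload_pdfs_user("my_notes.pdf")   # chunks, embeds and stores the PDF; returns its sha256 identifier

    bot = Script()
    rag_chain, question = bot.gpt_loaders_id("What does this document say about deductions?", history='', id=doc_id)
    answer = rag_chain.invoke(question)

    speech = Speech_Text()
    mp3_bytes = speech.speech_synthesis(answer)        # Deepgram TTS; None if the request fails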