Baskar2005 committed on
Commit
f5c9726
·
verified ·
1 Parent(s): 052b4dd

Upload py.py

Browse files
Files changed (1) hide show
  1. py.py +325 -0
py.py ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.text_splitter import CharacterTextSplitter
2
+ from langchain.vectorstores import FAISS
3
+ from langchain.chat_models import ChatOpenAI
4
+ from langchain_openai import AzureChatOpenAI,AzureOpenAIEmbeddings
5
+ from langchain.memory import ConversationBufferMemory
6
+ from langchain.chains import ConversationChain
7
+ from langchain.chains import ConversationalRetrievalChain
8
+ from langchain.document_loaders import UnstructuredFileLoader
9
+ from typing import List, Dict, Tuple
10
+ import gradio as gr
11
+ import validators
12
+ import requests
13
+ import mimetypes
14
+ import tempfile
15
+ import os
16
+ from langchain.chains.question_answering import load_qa_chain
17
+ from langchain.llms import OpenAI
18
+ from langchain.prompts import PromptTemplate
19
+ from langchain.prompts.prompt import PromptTemplate
20
+ import pandas as pd
21
+ from langchain_experimental.agents.agent_toolkits import create_csv_agent
22
+ from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
23
+ from langchain.agents import ZeroShotAgent, Tool, AgentExecutor
24
+ from langchain.agents.agent_types import AgentType
25
+ # from langchain.agents import create_csv_agent
26
+ from langchain import OpenAI, LLMChain
27
+ from openai import AzureOpenAI
28
+
29
# SECURITY: credentials must never be hard-coded in source control. The Azure
# OpenAI key has to be supplied through the AZURE_OPENAI_API_KEY environment
# variable (or a secret manager) before this module is run; the key that was
# previously committed here should be treated as leaked and rotated.
#
# The non-secret settings keep their previous values as defaults, but
# `setdefault` lets a deployment override them from the environment.
os.environ.setdefault('AZURE_OPENAI_ENDPOINT', "https://eastus2.api.cognitive.microsoft.com/")
os.environ.setdefault('OPENAI_API_VERSION', "2024-02-01")
32
+
33
class ChatDocumentQA:
    """Gradio chatbot that answers questions about uploaded files or web pages.

    Uploaded CSV files are wrapped in a LangChain CSV agent; every other
    upload (and every fetched URL) is loaded, chunked, embedded and stored in
    a FAISS vector store that backs a conversational retrieval chain. The
    active knowledge base is kept in a ``gr.State`` dict under the key
    ``"knowledge_base"``; the instance itself holds no state.
    """

    # Azure OpenAI deployment names used for chat completion and embeddings.
    _CHAT_DEPLOYMENT = "GPT-4o"
    _EMBEDDING_DEPLOYMENT = "text-embedding-3-large"
    # Seconds to wait for a remote URL before giving up instead of hanging.
    _REQUEST_TIMEOUT = 30
    # Reply shown when a question arrives before anything has been indexed.
    _NO_KNOWLEDGE_BASE_REPLY = "Please Upload Document or URL"

    def __init__(self) -> None:
        """Create the application object (there is nothing to initialise)."""

    def _get_empty_state(self) -> Dict[str, None]:
        """Create an empty knowledge base."""
        return {"knowledge_base": None}

    def _make_llm(self):
        """Return a chat model bound to the configured Azure deployment."""
        return AzureChatOpenAI(azure_deployment=self._CHAT_DEPLOYMENT)

    @staticmethod
    def _extension_for_content_type(content_type):
        """Map an HTTP ``Content-Type`` header value to a file extension.

        Parameters such as ``; charset=utf-8`` are stripped first because
        ``mimetypes.guess_extension`` only recognises the bare MIME type.

        Args:
            content_type: Raw header value, or None when the header is absent.

        Returns:
            The guessed extension (for example ``".pdf"``) or None.
        """
        if not content_type:
            return None
        mime_type = content_type.split(";", 1)[0].strip()
        return mimetypes.guess_extension(mime_type) if mime_type else None

    @staticmethod
    def _to_history_pairs(chat_history) -> List[Tuple[str, str]]:
        """Convert chatbot history rows into ``(question, answer)`` tuples.

        The rows may arrive as lists rather than tuples, and a missing entry
        is replaced by an empty string so the result is always plain text.
        """
        return [(turn[0] or "", turn[1] or "") for turn in (chat_history or [])]

    def _extract_text_from_pdfs(self, file_paths: List[str]) -> List:
        """Load the content of the given files.

        Despite the name, any file type accepted by ``UnstructuredFileLoader``
        is handled, not only PDFs.

        Args:
            file_paths (List[str]): List of file paths.

        Returns:
            List: Documents produced by the loaders, in input order.
        """
        docs = []
        for path in file_paths:
            docs.extend(UnstructuredFileLoader(path, strategy="fast").load())
        return docs

    def _get_content_from_url(self, urls: str) -> List:
        """Fetch content from given URLs.

        Each valid URL is downloaded to a temporary file (with an extension
        guessed from the response's content type) and then loaded like an
        uploaded file. Entries that are empty or fail URL validation are
        skipped.

        Args:
            urls (str): Comma-separated URLs.

        Returns:
            List: Documents loaded from the downloaded content.

        Raises:
            ValueError: If a URL responds with a status code other than 200.
        """
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',}
        file_paths = []
        try:
            for url in (part.strip() for part in (urls or "").split(',')):
                if not url or not validators.url(url):
                    continue
                r = requests.get(url, headers=headers, timeout=self._REQUEST_TIMEOUT)
                if r.status_code != 200:
                    raise ValueError("Check the url of your file; returned status code %s" % r.status_code)
                suffix = self._extension_for_content_type(r.headers.get("content-type"))
                # Closing the file (via `with`) flushes the download to disk
                # before the loader re-opens it by path.
                with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as temp_file:
                    file_paths.append(temp_file.name)
                    temp_file.write(r.content)

            print("File_Paths:", file_paths)
            return self._extract_text_from_pdfs(file_paths)
        finally:
            # The documents are fully loaded by now, so the downloads can go.
            for path in file_paths:
                try:
                    os.remove(path)
                except OSError:
                    pass

    def _split_text_into_chunks(self, text: List) -> List:
        """Split loaded documents into smaller chunks.

        Args:
            text (List): Documents to split (the name is historical; the
                value is passed to ``split_documents``, not a plain string).

        Returns:
            List: Chunked documents of at most 6000 characters, split on
            newlines with no overlap.
        """
        text_splitter = CharacterTextSplitter(separator="\n", chunk_size=6000, chunk_overlap=0, length_function=len)
        return text_splitter.split_documents(text)

    def _create_vector_store_from_text_chunks(self, text_chunks: List) -> "FAISS":
        """Create a vector store from document chunks.

        Args:
            text_chunks (List): Chunked documents to embed.

        Returns:
            FAISS: Vector store created from the chunks.
        """
        embeddings = AzureOpenAIEmbeddings(
            azure_deployment=self._EMBEDDING_DEPLOYMENT,
        )
        return FAISS.from_documents(documents=text_chunks, embedding=embeddings)

    def _create_conversation_chain(self, vectorstore):
        """Build a conversational retrieval chain over ``vectorstore``.

        No memory object is attached: a chain is built per question, so an
        attached memory would always be empty and would shadow the
        ``chat_history`` the caller supplies. The caller passes the history
        explicitly with every request instead.

        Args:
            vectorstore: Vector store exposing ``as_retriever()``.

        Returns:
            The configured ``ConversationalRetrievalChain``.
        """
        _template = (
            "Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.\n"
            "\n"
            "Chat History: {chat_history}\n"
            "Follow Up Input: {question}\n"
            "Standalone question:"
        )
        condense_question_prompt = PromptTemplate.from_template(_template)

        return ConversationalRetrievalChain.from_llm(
            llm=self._make_llm(),
            retriever=vectorstore.as_retriever(),
            condense_question_prompt=condense_question_prompt,
        )

    def _build_vector_state(self, documents: List) -> Tuple[str, Dict[str, object]]:
        """Chunk and index ``documents``, returning a status and the new state.

        When there is nothing to index, an empty state is returned instead of
        handing an empty list to the vector store.
        """
        chunks = self._split_text_into_chunks(documents) if documents else []
        if not chunks:
            return "no readable content found", self._get_empty_state()
        vectorstore = self._create_vector_store_from_text_chunks(chunks)
        return "file uploaded", {"knowledge_base": vectorstore}

    def _get_documents_knowledge_base(self, file_paths: List) -> Tuple[str, Dict[str, object]]:
        """Build knowledge base from uploaded files.

        The first file decides the mode: a ``.csv`` upload produces a CSV
        agent for that file, anything else indexes all uploaded files in a
        vector store.

        Args:
            file_paths (List): Uploaded files, either objects exposing a
                ``name`` attribute holding the path or plain path strings.

        Returns:
            Tuple[str, Dict]: Tuple containing a status message and the knowledge base.
        """
        if not file_paths:
            return "no file uploaded", self._get_empty_state()

        paths = [getattr(uploaded, "name", uploaded) for uploaded in file_paths]
        first_extension = os.path.splitext(paths[0])[1].lower()

        if first_extension == '.csv':
            return "file uploaded", {"knowledge_base": self.create_agent(paths[0])}

        return self._build_vector_state(self._extract_text_from_pdfs(paths))

    def _get_urls_knowledge_base(self, urls: str) -> Tuple[str, Dict[str, object]]:
        """Build knowledge base from URLs.

        Args:
            urls (str): Comma-separated URLs.

        Returns:
            Tuple[str, Dict]: Tuple containing a status message and the knowledge base.
        """
        return self._build_vector_state(self._get_content_from_url(urls))

    #************************
    # csv qa
    #************************
    def create_agent(self, file_path):
        """Create a zero-shot ReAct CSV agent for the file at ``file_path``."""
        return create_csv_agent(
            self._make_llm(),
            file_path,
            verbose=True,
            agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        )

    def get_agent_tools(self, agent):
        """Wrap ``agent.run`` in a single-element tool list."""
        return [
            Tool(
                name="dataframe qa",
                func=agent.run,
                description="useful for when you need to answer questions about table data and dataframe data",
            )
        ]

    def create_memory_for_csv_qa(self, tools):
        """Build the prompt and conversation memory for a tool-using agent.

        Args:
            tools: Tools the agent may call (see ``get_agent_tools``).

        Returns:
            Tuple of ``(memory, tools, prompt)``.
        """
        prefix = """Have a conversation with a human, answering the following questions about table data and dataframe data as best you can. You have access to the following tools:"""
        # The original text read `Begin!"`; the stray quote was a typo.
        suffix = (
            "Begin!\n"
            "\n"
            "{chat_history}\n"
            "Question: {input}\n"
            "{agent_scratchpad}"
        )

        prompt = ZeroShotAgent.create_prompt(
            tools,
            prefix=prefix,
            suffix=suffix,
            input_variables=["input", "chat_history", "agent_scratchpad"],
        )
        memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

        return memory, tools, prompt

    def create_agent_chain_for_csv_qa(self, memory, tools, prompt):
        """Assemble an ``AgentExecutor`` from a prompt, tools and memory."""
        llm_chain = LLMChain(llm=self._make_llm(), prompt=prompt)
        agent = ZeroShotAgent(llm_chain=llm_chain, tools=tools, verbose=True)
        return AgentExecutor.from_agent_and_tools(
            agent=agent, tools=tools, verbose=True, memory=memory
        )

    def _get_response(self, message: str, chat_history: List[Tuple[str, str]], state: Dict[str, object], file_paths) -> Tuple[str, List[Tuple[str, str]]]:
        """Get a response from the chatbot.

        The knowledge base stored in ``state`` decides how the question is
        answered: an object exposing ``as_retriever`` is queried through a
        conversational retrieval chain, anything else is treated as an agent
        and called through ``run``.

        Args:
            message (str): User's message/question.
            chat_history (List[Tuple[str, str]]): List of chat history as tuples of (user_message, bot_response).
            state (dict): State containing the knowledge base.
            file_paths: Current value of the upload widget. Kept for interface
                compatibility; routing relies on ``state`` so that a URL
                submitted after a CSV upload is still answered correctly.

        Returns:
            Tuple[str, List[Tuple[str, str]]]: An empty string (clears the
            question box) and the updated chat history.
        """
        import logging  # local import: keeps the module's import block untouched

        history = chat_history if chat_history is not None else []
        knowledge_base = (state or {}).get("knowledge_base")

        if knowledge_base is None:
            history.append((message, self._NO_KNOWLEDGE_BASE_REPLY))
            return "", history

        try:
            if hasattr(knowledge_base, "as_retriever"):
                chat = self._create_conversation_chain(knowledge_base)
                result = chat({"question": message, "chat_history": self._to_history_pairs(history)})
                answer = result["answer"]
            else:
                answer = knowledge_base.run(input=message)
        except Exception:
            # UI boundary: keep the chat alive, but record the real cause
            # rather than disguising it as a missing upload.
            logging.getLogger(__name__).exception("Failed to answer question")
            answer = "Sorry, something went wrong while answering your question. Please try again."

        history.append((message, answer))
        return "", history

    def gradio_interface(self) -> None:
        """Create a Gradio interface for the chatbot and launch it."""
        with gr.Blocks(css="#textbox_id textarea {color: white}", theme='SherlockRamos/Feliz') as demo:
            # Style overrides targeting Gradio's footer elements.
            gr.HTML("""
            <style>
            .footer {
            display: none !important;
            }
            footer {
            display: none !important;
            }
            #foot {
            display: none !important;
            }
            .svelte-1fzp3xt {
            display: none !important;
            }
            #root > div > div > div {
            padding-bottom: 0 !important;
            }
            .custom-footer {
            text-align: center;
            padding: 10px;
            font-size: 14px;
            color: #333;
            }
            </style>
            """)
            gr.HTML("""<div><img src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRUYJEAh2t0b2seQECPuBqkwA3e0NF8oSsfiA&s" alt="Intercontinental Exchange" style="float:left;width:80px;height:80px;"><h1 style="color:#000;margin-left:4in;padding-top:10px">Virtual Assistant Chatbot</h1></div>""")
            state = gr.State(self._get_empty_state())
            chatbot = gr.Chatbot()

            with gr.Row():
                with gr.Column(scale=0.85):
                    msg = gr.Textbox(label="Question", elem_id="textbox_id")
                with gr.Column(scale=0.15):
                    file_output = gr.Textbox(label="File Status")
            with gr.Row():
                with gr.Column(scale=0.85):
                    gr.ClearButton([msg, chatbot])
                with gr.Column(scale=0.15):
                    upload_button = gr.UploadButton(
                        "Browse File",
                        file_types=[".txt", ".pdf", ".doc", ".docx", ".csv"],
                        file_count="multiple", variant="primary"
                    )
            with gr.Row():
                with gr.Column(scale=1):
                    input_url = gr.Textbox(label="urls", elem_id="textbox_id")

            # Submitting URLs or uploading files rebuilds the knowledge base;
            # submitting a question queries whatever is currently stored.
            input_url.submit(self._get_urls_knowledge_base, input_url, [file_output, state])
            upload_button.upload(self._get_documents_knowledge_base, upload_button, [file_output, state])
            msg.submit(self._get_response, [msg, chatbot, state, upload_button], [msg, chatbot])

        demo.launch(debug=True, allowed_paths=["/content/"])
321
+
322
+
323
if __name__ == "__main__":
    # Executed as a script: build the app object and start the Gradio UI.
    chatdocumentqa = ChatDocumentQA()
    chatdocumentqa.gradio_interface()