mathpal123 committed on
Commit
1a6d322
1 Parent(s): f168fc7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +197 -57
app.py CHANGED
@@ -1,63 +1,203 @@
1
  import gradio as gr
2
  from huggingface_hub import InferenceClient
3
 
4
- """
5
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
- """
7
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
8
-
9
-
10
- def respond(
11
- message,
12
- history: list[tuple[str, str]],
13
- system_message,
14
- max_tokens,
15
- temperature,
16
- top_p,
17
- ):
18
- messages = [{"role": "system", "content": system_message}]
19
-
20
- for val in history:
21
- if val[0]:
22
- messages.append({"role": "user", "content": val[0]})
23
- if val[1]:
24
- messages.append({"role": "assistant", "content": val[1]})
25
-
26
- messages.append({"role": "user", "content": message})
27
-
28
- response = ""
29
-
30
- for message in client.chat_completion(
31
- messages,
32
- max_tokens=max_tokens,
33
- stream=True,
34
- temperature=temperature,
35
- top_p=top_p,
36
- ):
37
- token = message.choices[0].delta.content
38
-
39
- response += token
40
- yield response
41
-
42
- """
43
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
44
- """
45
- demo = gr.ChatInterface(
46
- respond,
47
- additional_inputs=[
48
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
49
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
50
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
51
- gr.Slider(
52
- minimum=0.1,
53
- maximum=1.0,
54
- value=0.95,
55
- step=0.05,
56
- label="Top-p (nucleus sampling)",
57
- ),
58
- ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  )
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
- if __name__ == "__main__":
63
- demo.launch()
 
 
 
 
 
 
 
1
  import gradio as gr
2
  from huggingface_hub import InferenceClient
3
 
4
+ import time
5
+ import random
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
+ from qdrant_client import QdrantClient
9
+ # from qdrant_client.http.models import VectorParams
10
+ from langchain.vectorstores import Qdrant
11
+ from langchain.embeddings import HuggingFaceEmbeddings
12
+ # from transformers import pipeline
13
+ from google.colab import userdata
14
+
15
+ from langchain import PromptTemplate
16
+ from langchain_groq import ChatGroq
17
+ # from langchain_community.llms import ChatGroq
18
+ from langchain_core.output_parsers import StrOutputParser
19
+ from langchain_core.runnables import RunnablePassthrough
20
+ from langchain_community.document_loaders import PyPDFLoader
21
+ from langchain.schema import Document
22
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
23
+ import PyPDF2
24
+ import os
25
+
26
# Credentials pulled from Colab's userdata secrets storage.
qdrant_url = userdata.get('QDRANT_URL')
# NOTE(review): this secret name mixes '-' and '_' ('QDRANT_API-KEY'),
# unlike the other two keys — confirm it matches the secret actually
# stored in Colab, otherwise this silently returns None.
qdrant_api_key = userdata.get('QDRANT_API-KEY')
groq_api_key = userdata.get('GROQ_API_KEY')
29
+
30
# Function to extract text from PDFs
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of *pdf_path*.

    Args:
        pdf_path: Filesystem path to a PDF file.

    Returns:
        str: Text of all pages joined in page order. Pages for which
        PyPDF2 cannot extract text (e.g. image-only pages, where
        ``extract_text()`` returns ``None``) contribute an empty
        string instead of raising a TypeError as the previous
        ``+=`` version did.
    """
    page_texts = []
    with open(pdf_path, "rb") as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Iterate pages directly rather than indexing via
        # range(len(...)); join once at the end to avoid quadratic
        # string concatenation on large PDFs.
        for page in reader.pages:
            page_texts.append(page.extract_text() or "")
    return "".join(page_texts)
38
+
39
# Function to load and extract text from different document types
def load_documents_from_directory(directory_path):
    """Load every ``.txt`` and ``.pdf`` file in a directory.

    Args:
        directory_path: Directory scanned (non-recursively) for files.

    Returns:
        list[Document]: One Document per readable file, with the source
        filename stored under the ``"filename"`` metadata key. Files
        with other extensions — and sub-directories — are skipped.
    """
    documents = []

    # Iterate over entries in the directory
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)

        # Skip sub-directories and other non-regular files, which would
        # otherwise crash open() / the PDF reader.
        if not os.path.isfile(file_path):
            continue

        lower_name = filename.lower()  # match extensions case-insensitively

        # Handling text files (.txt)
        if lower_name.endswith(".txt"):
            # Explicit encoding avoids platform-dependent decode errors
            # (the default encoding is locale-specific).
            with open(file_path, "r", encoding="utf-8") as file:
                content = file.read()
            documents.append(Document(page_content=content, metadata={"filename": filename}))

        # Handling PDF files (.pdf)
        elif lower_name.endswith(".pdf"):
            pdf_text = extract_text_from_pdf(file_path)
            documents.append(Document(page_content=pdf_text, metadata={"filename": filename}))

    return documents
61
+
62
# Step 1: Load documents from a directory (handling both .txt and .pdf)
# NOTE(review): this path is Colab/Google-Drive specific — the app can
# only start where that mount exists; confirm before deploying to Spaces.
directory_path = "/content/drive/Othercomputers/My Laptop/Training/Atomcamp/DS6_Bootcamp/Projects/FYP/Rules_and_Policies"
documents = load_documents_from_directory(directory_path)

# Step 2: Split the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=250)
split_docs = text_splitter.split_documents(documents)

# Step 3: Embed the document chunks using HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# # Step 3: Connect to Qdrant
# ##########################################
# # Run once!
# ##########################################
# NOTE(review): despite the "Run once!" note above, this re-embeds and
# re-uploads the whole corpus every time the module is imported —
# confirm whether the collection is meant to be rebuilt on each start.
qdrant = Qdrant.from_documents(
    split_docs,
    embedding = embeddings,
    url = qdrant_url,
    prefer_grpc = True,
    api_key = qdrant_api_key,
    collection_name = "university-rules-chatbot"
)
85
 
86
def format_docs(docs):
    """Render retrieved documents as one prompt-ready string.

    Each document becomes its page content followed by a line of the
    form ``Metadata: key: value, ...``; documents are separated from
    one another by a blank line.
    """
    # One rendered entry per document; metadata pairs are joined in
    # insertion order, exactly as stored on the Document.
    return "\n\n".join(
        f"{doc.page_content}\nMetadata: "
        + ", ".join(f"{key}: {value}" for key, value in doc.metadata.items())
        for doc in docs
    )
102
+
103
def retrieve_answer(question: str, bot: str):
    """
    Retrieve the answer to a question from the documents.

    Args:
        question (str): The question to answer.
        bot: Second positional argument supplied by gr.ChatInterface.
            NOTE(review): ChatInterface passes (message, history) to
            its fn, so this presumably receives the chat history, not
            a bot name; it is unused here — confirm.

    Returns:
        str: The generated answer.
    """

    # Prompt that constrains the LLM to answer strictly from the
    # retrieved context and to append source metadata to each answer.
    prompt = PromptTemplate(
        template = """
        # Your role
        You are a brilliant expert at understanding the intent of the questioner and the crux of the question, and providing the most optimal answer
        from the scraped content to the questioner's needs from the text you are given.


        # Instructions
        Your task is to answer the question using the following pieces of retrieved context delimited by XML tags.

        <retrieved context>
        Retrieved Context:
        {context}
        </retrieved context>


        # Constraint
        1. Think deeply and multiple times about the user's question\nUser's question:\n{question}\nYou must understand the intent of their question
        and provide the most appropriate answer.
        - Ask yourself why to understand the context of the question and why the questioner asked it, reflect on it, and provide an appropriate
        response based on what you understand.
        2. Choose the most relevant content(the key content that directly relates to the question) from the retrieved context and use it to generate an answer.
        3. Generate a concise, logical answer. When generating the answer, Do Not just list your selections, But rearrange them in context
        so that they become paragraphs with a natural flow.
        4. When you don't have retrieved context for the question or If you have a retrieved documents, but their content is irrelevant to the question,
        you should answer 'I can't find the answer to that question in the material I have'.
        5. Use five sentences maximum. Keep the answer concise but logical/natural/in-depth.
        6. At the end of the response provide metadata provided in the relevant docs,
        For example:"Metadata: page: 19, source: /content/OCR_RSCA/Analyse docs JVB + mails et convention FOOT INNOVATION.pdf'. Return Just the page and source

        Question: {question}
        Helpful Answer, formated in markdown:""",

        input_variables = ["context","question"]
    )

    # NOTE(review): the embedding model, Qdrant client and LLM are all
    # rebuilt on every call — hoisting them to module level would avoid
    # repeated model loads; left unchanged here.
    embeddings_model = HuggingFaceEmbeddings(model_name = "all-MiniLM-L6-v2")

    qdrant_client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)

    # Reconnect to the existing collection (no re-indexing here).
    qdrant = Qdrant(
        client=qdrant_client,
        collection_name="university-rules-chatbot",
        embeddings=embeddings_model
    )

    # Retrieve the 20 most similar chunks for the question.
    retriever = qdrant.as_retriever(search_kwargs={"k": 20})

    # docs = retriever.get_relevant_documents(query)
    # for doc in docs:
    #     print(f"Retrieved document:", doc.page_content)
    #     print('*' * 60)

    # llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0,openai_api_key=openai_api_key)

    groq_llm = ChatGroq(
        model="llama-3.1-70b-versatile", # llma-3.1-70b-versatile
        temperature=0,  # deterministic output for reproducible answers
        groq_api_key=groq_api_key,
        max_retries=2,
    )

    # RAG pipeline: retrieve -> format into context -> fill prompt ->
    # Groq LLM -> plain string.
    rag_chain = (
        {"context": retriever| format_docs, "question": RunnablePassthrough()}
        | prompt
        | groq_llm
        | StrOutputParser()
    )

    answer = rag_chain.invoke(question)

    return answer
186
+
187
# Seed history for the Gradio chatbot widget. gr.Chatbot's value is a
# list of (user_message, bot_message) pairs, so the greeting must sit in
# the BOT slot — the original tuple ("Hello! ...", "KIU-bot") displayed
# the greeting as a *user* turn with the literal text "KIU-bot" as the
# bot's reply.
messages = []

# Welcome message shown as the bot's opening turn (no user turn).
messages.append((None, "Hello! How can I help you today?"))

# Create Gradio chatbot with the seeded history
chatbot = gr.Chatbot(value=messages)

# Create Gradio interface; ChatInterface calls fn with (message, history).
gr.ChatInterface(
    fn=retrieve_answer,
    chatbot=chatbot,
    title="university-rules-chatbot",
    description="Ask any question related to Karakoram International University Gilgit-Baltistan.",
    examples=[["What courses does KIU offer?"]]
).launch(debug=True)