Spaces · Build error
mathpal123 committed · commit 1a6d322 · 1 parent: f168fc7
Update app.py

app.py CHANGED
@@ -1,63 +1,203 @@

Removed: old lines 4-63 (the previous app body; its text is not preserved in this capture). Old lines 1-3, the gradio and huggingface_hub imports, carry over unchanged.

Added (the new app.py, lines 1-203):
import gradio as gr
from huggingface_hub import InferenceClient
import time
import random
import requests
from bs4 import BeautifulSoup
from qdrant_client import QdrantClient
# from qdrant_client.http.models import VectorParams
from langchain.vectorstores import Qdrant
from langchain.embeddings import HuggingFaceEmbeddings
# from transformers import pipeline
from google.colab import userdata

from langchain import PromptTemplate
from langchain_groq import ChatGroq
# from langchain_community.llms import ChatGroq
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.document_loaders import PyPDFLoader
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import PyPDF2
import os

qdrant_url = userdata.get('QDRANT_URL')
qdrant_api_key = userdata.get('QDRANT_API-KEY')
groq_api_key = userdata.get('GROQ_API_KEY')
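
Note: `from google.colab import userdata` resolves only inside Google Colab; on a Hugging Face Space it raises ModuleNotFoundError at import time, which is consistent with the Build error status above. A minimal sketch of the usual Spaces pattern instead (not part of the commit), assuming the same three secrets are configured in the Space settings:

    # Spaces expose repository secrets as environment variables (os is already imported above).
    # Caveat: the dash in 'QDRANT_API-KEY' is unusual; many platforms only allow
    # letters, digits, and underscores in variable names, so it may need renaming.
    qdrant_url = os.environ["QDRANT_URL"]
    qdrant_api_key = os.environ["QDRANT_API-KEY"]
    groq_api_key = os.environ["GROQ_API_KEY"]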

# Function to extract text from PDFs
def extract_text_from_pdf(pdf_path):
    pdf_text = ""
    with open(pdf_path, "rb") as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        for page_num in range(len(reader.pages)):
            pdf_text += reader.pages[page_num].extract_text()
    return pdf_text
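
Aside (not in the commit): depending on the PyPDF2 version, a page with no extractable text may yield None, which would make the += above raise a TypeError. A harmless guard:

    pdf_text += reader.pages[page_num].extract_text() or ""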

# Function to load and extract text from different document types
def load_documents_from_directory(directory_path):
    documents = []

    # Iterate over files in the directory
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)

        # Handling text files (.txt)
        if filename.endswith(".txt"):
            with open(file_path, "r") as file:
                content = file.read()
            doc = Document(page_content=content, metadata={"filename": filename})
            documents.append(doc)

        # Handling PDF files (.pdf)
        elif filename.endswith(".pdf"):
            pdf_text = extract_text_from_pdf(file_path)
            doc = Document(page_content=pdf_text, metadata={"filename": filename})
            documents.append(doc)

    return documents

# Step 1: Load documents from a directory (handling both .txt and .pdf)
directory_path = "/content/drive/Othercomputers/My Laptop/Training/Atomcamp/DS6_Bootcamp/Projects/FYP/Rules_and_Policies"
documents = load_documents_from_directory(directory_path)
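
Note: this is a Google Drive mount path from the author's Colab session; it does not exist on a Space, so os.listdir would raise FileNotFoundError even once the import problem is fixed. A sketch (not part of the commit) assuming the policy files are committed to the Space repo in a hypothetical Rules_and_Policies folder:

    directory_path = "Rules_and_Policies"  # hypothetical in-repo folder
    documents = load_documents_from_directory(directory_path)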

# Step 2: Split the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=250)
split_docs = text_splitter.split_documents(documents)
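
For scale (an estimate, not part of the commit): with chunk_size=1500 and chunk_overlap=250 the splitter advances roughly 1,250 characters per chunk, so a 10,000-character policy document yields on the order of eight chunks, each sharing up to 250 characters with its neighbor:

    print(len(split_docs))  # roughly total_characters / 1250 across the corpus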

# Step 3: Embed the document chunks using HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
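
For orientation (not part of the commit): all-MiniLM-L6-v2 maps each text to a 384-dimensional vector, which is the dimensionality the Qdrant collection below ends up with:

    vec = embeddings.embed_query("library fine policy")
    print(len(vec))  # 384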

# # Step 3: Connect to Qdrant
# ##########################################
# # Run once!
# ##########################################
qdrant = Qdrant.from_documents(
    split_docs,
    embedding=embeddings,
    url=qdrant_url,
    prefer_grpc=True,
    api_key=qdrant_api_key,
    collection_name="university-rules-chatbot"
)
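
Note: the "Run once!" warning matters on a Space, which reruns app.py on every restart and would re-embed and re-upload the whole corpus each time. A sketch of a guard (not part of the commit), using the qdrant-client collections listing:

    client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
    existing = [c.name for c in client.get_collections().collections]
    if "university-rules-chatbot" not in existing:
        qdrant = Qdrant.from_documents(
            split_docs,
            embedding=embeddings,
            url=qdrant_url,
            prefer_grpc=True,
            api_key=qdrant_api_key,
            collection_name="university-rules-chatbot",
        )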

def format_docs(docs):
    formatted_docs = []
    for doc in docs:
        # Format the metadata into a string
        metadata_str = ', '.join(f"{key}: {value}" for key, value in doc.metadata.items())

        # Combine page content with its metadata
        doc_str = f"{doc.page_content}\nMetadata: {metadata_str}"

        # Append to the list of formatted documents
        formatted_docs.append(doc_str)

        # print(f"Formatted Document {len(formatted_docs)}:\n{doc_str}\n{formatted_docs}\n")  # my addition

    # Join all formatted documents with double newlines
    return "\n\n".join(formatted_docs)
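
For illustration (not part of the commit), format_docs turns retrieved chunks into the {context} string consumed by the prompt below:

    sample = [Document(page_content="Late submissions lose 10%.", metadata={"filename": "rules.txt"})]
    print(format_docs(sample))
    # Late submissions lose 10%.
    # Metadata: filename: rules.txt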

def retrieve_answer(question: str, bot: str):
    """
    Retrieve the answer to a question from the documents.

    Args:
        question (str): The question to answer.

    Returns:
        str: The generated answer.
    """

    prompt = PromptTemplate(
        template="""
        # Your role
        You are a brilliant expert at understanding the intent of the questioner and the crux of the question,
        and providing the most optimal answer from the scraped content to the questioner's needs from the text you are given.


        # Instructions
        Your task is to answer the question using the following pieces of retrieved context delimited by XML tags.

        <retrieved context>
        Retrieved Context:
        {context}
        </retrieved context>


        # Constraint
        1. Think deeply and multiple times about the user's question\nUser's question:\n{question}\nYou must understand
        the intent of their question and provide the most appropriate answer.
        - Ask yourself why to understand the context of the question and why the questioner asked it, reflect on it,
        and provide an appropriate response based on what you understand.
        2. Choose the most relevant content (the key content that directly relates to the question) from the retrieved
        context and use it to generate an answer.
        3. Generate a concise, logical answer. When generating the answer, do not just list your selections, but
        rearrange them in context so that they become paragraphs with a natural flow.
        4. When you have no retrieved context for the question, or the retrieved documents are irrelevant to it,
        answer 'I can't find the answer to that question in the material I have'.
        5. Use five sentences maximum. Keep the answer concise but logical/natural/in-depth.
        6. At the end of the response, provide the metadata given in the relevant docs, for example:
        "Metadata: page: 19, source: /content/OCR_RSCA/Analyse docs JVB + mails et convention FOOT INNOVATION.pdf".
        Return just the page and source.

        Question: {question}
        Helpful Answer, formatted in markdown:""",
        input_variables=["context", "question"]
    )

    embeddings_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    qdrant_client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)

    qdrant = Qdrant(
        client=qdrant_client,
        collection_name="university-rules-chatbot",
        embeddings=embeddings_model
    )

    retriever = qdrant.as_retriever(search_kwargs={"k": 20})

    # docs = retriever.get_relevant_documents(query)
    # for doc in docs:
    #     print(f"Retrieved document:", doc.page_content)
    #     print('*' * 60)

    # llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0, openai_api_key=openai_api_key)

    groq_llm = ChatGroq(
        model="llama-3.1-70b-versatile",
        temperature=0,
        groq_api_key=groq_api_key,
        max_retries=2,
    )

    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | groq_llm
        | StrOutputParser()
    )

    answer = rag_chain.invoke(question)

    return answer
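
Note (not part of the commit): gr.ChatInterface calls its fn as fn(message, history), so the parameter named bot actually receives the chat history, and every user turn rebuilds the embedding model, Qdrant connection, prompt, and LLM from scratch. The usual arrangement hoists those objects to module scope once and keeps the handler thin; a sketch, assuming rag_chain were built at import time as above:

    def respond(message, history):
        # history is the list of previous (user, bot) turns supplied by Gradio; unused here
        return rag_chain.invoke(message)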

# Create an empty list to store chatbot messages
messages = []

# Add initial instructions or welcome message
messages.append(("Hello! How can I help you today?", "KIU-bot"))

# Create Gradio chatbot with the messages list
chatbot = gr.Chatbot(value=messages)
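
Note: gr.Chatbot expects value entries as (user_message, bot_message) pairs, so the tuple above renders the greeting as if the user had said it, with "KIU-bot" as the reply. A sketch of a bot-side greeting (not part of the commit):

    chatbot = gr.Chatbot(value=[(None, "Hello! How can I help you today?")])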

# Create Gradio interface
gr.ChatInterface(
    fn=retrieve_answer,
    chatbot=chatbot,
    title="university-rules-chatbot",
    description="Ask any question related to Karakoram International University Gilgit-Baltistan.",
    examples=[["What courses does KIU offer?"]]
).launch(debug=True)
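
Finally, the Space also needs its third-party imports declared in requirements.txt, which this commit does not touch. A plausible list matching the imports above (package names only; pins are omitted, and this is an assumption, not part of the commit):

    beautifulsoup4
    qdrant-client
    langchain
    langchain-community
    langchain-groq
    PyPDF2
    sentence-transformers

gradio, huggingface_hub, and requests ship with the Gradio Space image; sentence-transformers is the package HuggingFaceEmbeddings loads all-MiniLM-L6-v2 through.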