import os

import gradio as gr
import PyPDF2
from qdrant_client import QdrantClient
# Vector store, embeddings, and prompt imports come from the community/core
# packages; the old top-level langchain.* paths are deprecated.
from langchain_community.vectorstores import Qdrant
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_groq import ChatGroq
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Secrets: on Hugging Face Spaces these are exposed as environment variables.
# (google.colab.userdata only exists inside Colab and breaks the Space build.)
qdrant_url = os.environ.get("QDRANT_URL")
qdrant_api_key = os.environ.get("QDRANT_API_KEY")
groq_api_key = os.environ.get("GROQ_API_KEY")
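# Optional sanity check (a minimal sketch, not part of the original app): fail
# fast with a clear message if a secret is missing, rather than with an opaque
# error deep inside the Qdrant or Groq clients.
for _name, _value in [("QDRANT_URL", qdrant_url),
                      ("QDRANT_API_KEY", qdrant_api_key),
                      ("GROQ_API_KEY", groq_api_key)]:
    if not _value:
        raise RuntimeError(f"Missing required secret: {_name}")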
# Function to extract text from PDFs
def extract_text_from_pdf(pdf_path):
    pdf_text = ""
    with open(pdf_path, "rb") as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        for page in reader.pages:
            # extract_text() can return None for pages with no extractable text
            pdf_text += page.extract_text() or ""
    return pdf_text
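# Example (hypothetical file name):
#   extract_text_from_pdf("admission_rules.pdf")
#   -> one string containing the text of every page, concatenated in page order.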
# Function to load and extract text from different document types
def load_documents_from_directory(directory_path):
    documents = []
    # Iterate over files in the directory; anything that is not .txt or .pdf is skipped
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        # Handling text files (.txt)
        if filename.endswith(".txt"):
            with open(file_path, "r") as file:
                content = file.read()
            doc = Document(page_content=content, metadata={"filename": filename})
            documents.append(doc)
        # Handling PDF files (.pdf)
        elif filename.endswith(".pdf"):
            pdf_text = extract_text_from_pdf(file_path)
            doc = Document(page_content=pdf_text, metadata={"filename": filename})
            documents.append(doc)
    return documents
# Step 1: Load documents from a directory (handling both .txt and .pdf)
# NOTE: this path comes from the original Colab/Drive setup; on Spaces it must
# point at a directory that exists inside the repository.
directory_path = "/content/drive/Othercomputers/My Laptop/Training/Atomcamp/DS6_Bootcamp/Projects/FYP/Rules_and_Policies"
documents = load_documents_from_directory(directory_path)
# Step 2: Split the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=250)
split_docs = text_splitter.split_documents(documents)
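# Chunks are capped at 1500 characters, and consecutive chunks share a
# 250-character overlap so that sentences straddling a chunk boundary remain
# retrievable from at least one chunk.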
# Step 3: Embed the document chunks using HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
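# all-MiniLM-L6-v2 maps each chunk to a 384-dimensional sentence embedding.
# The same model is re-instantiated at query time in retrieve_answer below,
# which is required for query vectors to live in the same space as the index.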
# Step 4: Index the chunks in Qdrant.
# NOTE: from_documents (re-)populates the collection, so this should run once,
# not on every app start.
qdrant = Qdrant.from_documents(
    split_docs,
    embedding=embeddings,
    url=qdrant_url,
    prefer_grpc=True,
    api_key=qdrant_api_key,
    collection_name="university-rules-chatbot",
)
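# To reuse the already-populated collection instead of re-ingesting on every
# launch, connect to it directly (this mirrors what retrieve_answer does below):
#   client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
#   qdrant = Qdrant(client=client,
#                   collection_name="university-rules-chatbot",
#                   embeddings=embeddings)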
def format_docs(docs):
    formatted_docs = []
    for doc in docs:
        # Format the metadata into a string
        metadata_str = ', '.join(f"{key}: {value}" for key, value in doc.metadata.items())
        # Combine page content with its metadata
        doc_str = f"{doc.page_content}\nMetadata: {metadata_str}"
        # Append to the list of formatted documents
        formatted_docs.append(doc_str)
    # Join all formatted documents with double newlines
    return "\n\n".join(formatted_docs)
def retrieve_answer(question: str, history):
    """
    Retrieve the answer to a question from the documents.

    Args:
        question (str): The question to answer.
        history: The chat history supplied by gr.ChatInterface (unused here).

    Returns:
        str: The generated answer.
    """
    prompt = PromptTemplate(
        template="""
# Your role
You are a brilliant expert at understanding the intent of the questioner and the crux of the question, and at providing the most optimal answer to the questioner's needs from the text you are given.

# Instructions
Your task is to answer the question using the following pieces of retrieved context, delimited by XML tags.

<retrieved context>
Retrieved Context:
{context}
</retrieved context>

# Constraints
1. Think deeply and multiple times about the user's question.
User's question:
{question}
You must understand the intent of the question and provide the most appropriate answer.
- Ask yourself why the questioner asked it, reflect on it, and provide an appropriate response based on what you understand.
2. Choose the most relevant content (the key content that directly relates to the question) from the retrieved context and use it to generate an answer.
3. Generate a concise, logical answer. When generating the answer, do not just list your selections, but rearrange them in context so that they become paragraphs with a natural flow.
4. When you have no retrieved context for the question, or the retrieved documents are irrelevant to it, answer 'I can't find the answer to that question in the material I have'.
5. Use five sentences maximum. Keep the answer concise but logical/natural/in-depth.
6. At the end of the response, provide the metadata of the relevant docs.
For example: "Metadata: page: 19, source: /content/OCR_RSCA/Analyse docs JVB + mails et convention FOOT INNOVATION.pdf". Return just the page and source.

Question: {question}
Helpful Answer, formatted in markdown:""",
        input_variables=["context", "question"],
    )
    embeddings_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    qdrant_client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
    qdrant = Qdrant(
        client=qdrant_client,
        collection_name="university-rules-chatbot",
        embeddings=embeddings_model,
    )
    # Retrieve the 20 most similar chunks for each question
    retriever = qdrant.as_retriever(search_kwargs={"k": 20})
    groq_llm = ChatGroq(
        model="llama-3.1-70b-versatile",  # swap in a current Groq model name if this one is retired
        temperature=0,
        groq_api_key=groq_api_key,
        max_retries=2,
    )
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | groq_llm
        | StrOutputParser()
    )
    answer = rag_chain.invoke(question)
    return answer
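# Usage sketch (hypothetical question; assumes the Qdrant collection is populated):
#   retrieve_answer("What is the attendance policy at KIU?", [])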
# Seed the chat with a welcome message. gr.Chatbot expects (user, bot) pairs,
# so the greeting goes in the bot slot with None as the user turn.
messages = [(None, "Hello! How can I help you today?")]

# Create Gradio chatbot with the messages list
chatbot = gr.Chatbot(value=messages)
# Create Gradio interface
gr.ChatInterface(
    fn=retrieve_answer,
    chatbot=chatbot,
    title="university-rules-chatbot",
    description="Ask any question related to Karakoram International University Gilgit-Baltistan.",
    examples=["What courses does KIU offer?"],
).launch(debug=True)