# NOTE: the three lines below are residue from the hosting web page
# (uploader name / commit message) and are not part of the program.
# hoan17's picture
# Upload 3 files
# b6e5245 verified
# -*- coding: utf-8 -*-
"""Chatbot_LLM_with_RAG Quyche_FINAL.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1kRGRGeOuF9JORajZPlEI2H0IrvcrgYr0
"""
import os
import textwrap
import chromadb
import langchain
import openai
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader, UnstructuredPDFLoader, YoutubeLoader, PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.llms import GPT4All
from pdf2image import convert_from_path
# !pip uninstall 'git+https://github.com/facebookresearch/detectron2.git@57bdb21249d5418c130d54e2ebdc94dda7a4c01a'
"""Download file pdf"""
# Download file pdf
# !gdown https://drive.google.com/uc?id=19_MlM7Cmw8z_j40dk80PQbITYNET3tL2
# !gdown https://drive.google.com/uc?id=1gdM3TfvyQPDXOuFjNS9n_DgD24ThDB84
# PDF containing the UIT regulations the chatbot answers questions about.
FILE_NAME = "quyche_uit_plus_removed.pdf"
"""Load Data & Model"""
from getpass import getpass

# SECURITY: never hard-code API keys in source (the original committed a live
# key). Read the key from the environment, falling back to an interactive
# prompt, then export it so langchain/openai pick it up.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Deterministic (temperature=0) chat model used by the QA chain below.
model = OpenAI(temperature=0, model_name="gpt-3.5-turbo")

# Render PDF pages to images (low dpi keeps them small); used only for
# ad-hoc visual inspection in the notebook (see the commented cells below).
images = convert_from_path(FILE_NAME, dpi=88)
"""Use UnstructuredPDFLoader to load PDFs"""
# Parse the local PDF into LangChain Document objects.
pdf_loader = UnstructuredPDFLoader(FILE_NAME)
pdf_pages = pdf_loader.load_and_split()
# Text Splitters: re-chunk into ~1024-character pieces with a 64-character
# overlap so neighbouring chunks share context for retrieval.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
texts = text_splitter.split_documents(pdf_pages)
# len(texts)
# texts[0]
# texts[-1]
"""Create Embeddings & Vectorstores"""
# Sentence-transformers model used to embed each chunk locally (no API cost).
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
hf_embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)
# Build a Chroma vector store over the chunk embeddings, persisted to ./db.
db = Chroma.from_documents(texts, hf_embeddings, persist_directory="db")
"""#Use a Chain"""
# Prompt for the QA chain (instructions are in Vietnamese): answer only from
# the supplied context, say "I don't know" rather than invent an answer, and
# always reply in Vietnamese. {context}/{question} are filled in by the chain.
custom_prompt_template = """Sử dụng các thông tin sau đây để trả lời câu hỏi của người dùng.
Nếu bạn không biết câu trả lời, chỉ cần nói rằng bạn không biết, đừng cố bịa ra câu trả lời.
Tất cả câu trả lời của bạn đều phải trả lời bằng tiếng việt
Context: {context}
Question: {question}
"""
from langchain import PromptTemplate
def set_custom_prompt():
    """Build the PromptTemplate used by the retrieval QA chain.

    Wires the module-level ``custom_prompt_template`` to the two slots the
    chain fills in: ``context`` and ``question``.
    """
    return PromptTemplate(
        template=custom_prompt_template,
        input_variables=["context", "question"],
    )
prompt = set_custom_prompt()
# "stuff" chain: the k=3 retrieved chunks are concatenated verbatim into the
# prompt's {context} slot before a single LLM call.
chain = RetrievalQA.from_chain_type(
llm=model,
chain_type="stuff",
retriever=db.as_retriever(search_kwargs={"k": 3}),
chain_type_kwargs={'prompt': prompt}
)
"""#QA Chatbot"""
def print_response(response: str):
    """Print *response* wrapped to a 100-character line width."""
    wrapped_lines = textwrap.wrap(response, width=100)
    print("\n".join(wrapped_lines))
# query = "Các môn bổ túc kiến thức của khóa cao học ngành khoa học máy tính gồm những môn nào?"
# response = chain.run(query)
# print_response(response)
# from langchain.chat_models import ChatOpenAI
from langchain.schema import AIMessage, HumanMessage
# import openai
import gradio as gr
def predict(message, history):
    """Gradio ChatInterface callback: answer *message* via the RAG chain.

    NOTE: RetrievalQA is stateless, so the conversation *history* Gradio
    supplies is accepted (the ChatInterface contract requires the parameter)
    but not forwarded to the chain. The original code converted *history*
    into HumanMessage/AIMessage objects and then never used them; that dead
    code has been removed.
    """
    return chain.run(message)
# Launch the Gradio chat UI; share=True additionally exposes a public
# tunnel URL (needed when running inside Colab).
chatbot=gr.ChatInterface(predict)
chatbot.launch(share=True)