# -*- coding: utf-8 -*-
"""Chatbot_LLM_with_RAG Quyche_FINAL.ipynb

Retrieval-augmented-generation (RAG) chatbot over a university-regulations
PDF: the PDF is split into overlapping text chunks, embedded with a
sentence-transformers model, indexed in a Chroma vector store, and served
through a Gradio chat UI backed by an OpenAI chat model.

Automatically generated by Colab. Original file is located at
https://colab.research.google.com/drive/1kRGRGeOuF9JORajZPlEI2H0IrvcrgYr0
"""

import os
import textwrap
from getpass import getpass

import chromadb
import gradio as gr
import langchain
import openai
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import (
    PyPDFLoader,
    TextLoader,
    UnstructuredPDFLoader,
    YoutubeLoader,
)
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain.llms import GPT4All, OpenAI
from langchain.schema import AIMessage, HumanMessage
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from pdf2image import convert_from_path

# The source PDF can be downloaded with gdown, e.g.:
#   gdown https://drive.google.com/uc?id=19_MlM7Cmw8z_j40dk80PQbITYNET3tL2
#   gdown https://drive.google.com/uc?id=1gdM3TfvyQPDXOuFjNS9n_DgD24ThDB84
FILE_NAME = "quyche_uit_plus_removed.pdf"

# --- Credentials ------------------------------------------------------------
# SECURITY: never hard-code an API key in source control (the original file
# embedded a live key). Read it from the environment, falling back to an
# interactive prompt.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# temperature=0 for deterministic, non-creative answers.
model = OpenAI(temperature=0, model_name="gpt-3.5-turbo")

# Render PDF pages as images (notebook leftover: the images were only
# inspected interactively and are never consumed below).
images = convert_from_path(FILE_NAME, dpi=88)

# --- Load & chunk the PDF ---------------------------------------------------
pdf_loader = UnstructuredPDFLoader(FILE_NAME)
pdf_pages = pdf_loader.load_and_split()

# 1024-char chunks with 64-char overlap so content spanning a chunk boundary
# is still retrievable.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
texts = text_splitter.split_documents(pdf_pages)

# --- Embeddings & vector store ----------------------------------------------
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
hf_embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)
db = Chroma.from_documents(texts, hf_embeddings, persist_directory="db")

# --- QA chain ----------------------------------------------------------------
# NOTE: this template is model input at runtime (Vietnamese) and is kept
# verbatim. It instructs the model to answer in Vietnamese and to say it
# does not know rather than fabricate an answer.
custom_prompt_template = """Sử dụng các thông tin sau đây để trả lời câu hỏi của người dùng.
Nếu bạn không biết câu trả lời, chỉ cần nói rằng bạn không biết, đừng cố bịa ra câu trả lời.
Tất cả câu trả lời của bạn đều phải trả lời bằng tiếng việt

Context: {context}
Question: {question}
"""


def set_custom_prompt():
    """Return the PromptTemplate used by the retrieval-QA chain.

    The chain fills in two variables: ``context`` (the retrieved chunks)
    and ``question`` (the user's query).
    """
    return PromptTemplate(
        template=custom_prompt_template,
        input_variables=["context", "question"],
    )


prompt = set_custom_prompt()

# "stuff" chain: the k=3 retrieved chunks are inserted directly into the
# prompt's {context} slot.
chain = RetrievalQA.from_chain_type(
    llm=model,
    chain_type="stuff",
    retriever=db.as_retriever(search_kwargs={"k": 3}),
    chain_type_kwargs={"prompt": prompt},
)


def print_response(response: str) -> None:
    """Pretty-print a model response, wrapped to 100 columns."""
    print("\n".join(textwrap.wrap(response, width=100)))


# Example query (notebook leftover):
# query = "Các môn bổ túc kiến thức của khóa cao học ngành khoa học máy tính gồm những môn nào?"
# print_response(chain.run(query))


def predict(message, history):
    """Gradio ChatInterface callback.

    ``history`` is a list of ``(human, ai)`` message pairs. It is converted
    to LangChain message objects, but the RetrievalQA chain is stateless, so
    only the current ``message`` is actually answered — the converted
    history is currently unused (TODO: wire it into a conversational chain).
    """
    history_langchain_format = []
    for human, ai in history:
        history_langchain_format.append(HumanMessage(content=human))
        history_langchain_format.append(AIMessage(content=ai))
    history_langchain_format.append(HumanMessage(content=message))
    return chain.run(message)


chatbot = gr.ChatInterface(predict)

if __name__ == "__main__":
    # Guarded so importing this module does not start a public server.
    chatbot.launch(share=True)