# File size: 3,891 Bytes
# b6e5245
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# -*- coding: utf-8 -*-
"""Chatbot_LLM_with_RAG Quyche_FINAL.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1kRGRGeOuF9JORajZPlEI2H0IrvcrgYr0
"""


import os
import textwrap

import chromadb
import langchain
import openai
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader, UnstructuredPDFLoader, YoutubeLoader, PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.llms import GPT4All
from pdf2image import convert_from_path



# !pip uninstall 'git+https://github.com/facebookresearch/detectron2.git@57bdb21249d5418c130d54e2ebdc94dda7a4c01a'

"""Download file pdf"""

# Download file pdf
# !gdown https://drive.google.com/uc?id=19_MlM7Cmw8z_j40dk80PQbITYNET3tL2
# !gdown https://drive.google.com/uc?id=1gdM3TfvyQPDXOuFjNS9n_DgD24ThDB84

FILE_NAME="quyche_uit_plus_removed.pdf"

"""Load Data & Model"""

from getpass import getpass
OPENAI_API_KEY = "sk-proj-jFDUBtItWEzg2vE9ZZhaT3BlbkFJi3l93u3z3FuQItueKZQp"

# OPENAI_API_KEY = getpass()

os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
model = OpenAI(temperature=0, model_name="gpt-3.5-turbo")
#  (trang)

images = convert_from_path(FILE_NAME, dpi=88)
# len(images)
# images[-1]

"""Use UnstructuredPDFLoader to load PDFs"""

# Use UnstructuredPDFLoader to load PDFs from the Internets
pdf_loader = UnstructuredPDFLoader(FILE_NAME)
pdf_pages = pdf_loader.load_and_split()

# Text Splitters
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
texts = text_splitter.split_documents(pdf_pages)
# len(texts)

# texts[0]

# texts[-1]

"""Create Embeddings & Vectorstores"""

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
hf_embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)

db = Chroma.from_documents(texts, hf_embeddings, persist_directory="db")

"""#Use a Chain"""

custom_prompt_template = """Sử dụng các thông tin sau đây để trả lời câu hỏi của người dùng.
Nếu bạn không biết câu trả lời, chỉ cần nói rằng bạn không biết, đừng cố bịa ra câu trả lời.
Tất cả câu trả lời của bạn đều phải trả lời bằng tiếng việt

Context: {context}
Question: {question}

"""

from langchain import PromptTemplate


def set_custom_prompt():
    """Build the PromptTemplate used by the QA retrieval chain.

    Wraps the module-level ``custom_prompt_template`` with its two expected
    placeholders, ``context`` and ``question``.
    """
    return PromptTemplate(
        template=custom_prompt_template,
        input_variables=['context', 'question'],
    )

prompt = set_custom_prompt()
# "stuff" chain: the top-k retrieved chunks are concatenated directly into
# the prompt (k=3 passages per query) and sent to the LLM in one call.
chain = RetrievalQA.from_chain_type(
    llm=model,
    chain_type="stuff",
    retriever=db.as_retriever(search_kwargs={"k": 3}),
    chain_type_kwargs={'prompt': prompt}
)

"""#QA Chatbot"""

def print_response(response: str) -> None:
    """Print *response* hard-wrapped to at most 100 columns per line."""
    wrapped_lines = textwrap.wrap(response, width=100)
    print("\n".join(wrapped_lines))

# query = "Các môn bổ túc kiến thức của khóa cao học ngành khoa học máy tính gồm những môn nào?"
# response = chain.run(query)
# print_response(response)


# from langchain.chat_models import ChatOpenAI
from langchain.schema import AIMessage, HumanMessage
# import openai
import gradio as gr

def predict(message, history):
    """Gradio ChatInterface callback: answer *message* via the RetrievalQA chain.

    Parameters
    ----------
    message : str
        The user's latest question.
    history : list[tuple[str, str]]
        Prior (human, ai) turns supplied by Gradio. Currently unused: the
        RetrievalQA chain is stateless and consumes only the new message.

    Returns
    -------
    str
        The chain's answer text.
    """
    # NOTE(review): the original converted `history` into LangChain
    # HumanMessage/AIMessage objects but never passed them anywhere (the llm
    # call consuming them was commented out) — that dead per-call work is
    # removed. Switch to a conversational chain if multi-turn context is
    # ever needed.
    return chain.run(message)

# Launch the Gradio chat UI around predict(); share=True additionally exposes
# a temporary public tunnel URL (anyone with the link can query the bot).
chatbot=gr.ChatInterface(predict)
chatbot.launch(share=True)