Yinong Liang committed on
Commit
e79acb8
·
unverified ·
1 Parent(s): a2bc835

Add files via upload

Browse files
Files changed (5) hide show
  1. Dockerfile +11 -0
  2. README.md +1 -0
  3. app.py +162 -0
  4. chainlit.md +3 -0
  5. chat_logo.jpg +0 -0
Dockerfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# Image for the Chainlit legal-PDF chatbot; serves the app on port 7860.
FROM python:3.11

# Run as an unprivileged user (uid 1000) instead of root.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH
WORKDIR $HOME/app

# Install dependencies first so this layer is reused when only the app code
# changes. The destination is spelled with $HOME: Docker does not expand "~"
# in COPY paths, so "~/app/..." would create a literal "~" directory.
COPY --chown=user ./requirements.txt $HOME/app/requirements.txt
RUN pip install -r requirements.txt

# Copy the application sources, owned by the runtime user so the app can
# create its on-disk vector database next to app.py.
COPY --chown=user . $HOME/app

CMD ["chainlit", "run", "app.py", "--port", "7860"]
README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ title: ChatWithYourLegalPDF
app.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import List
3
+ from langchain_community.document_loaders import PyMuPDFLoader
4
+ import uuid
5
+
6
+ from langchain_openai import OpenAIEmbeddings
7
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+ from langchain.vectorstores import Chroma
9
+ from langchain.chains import (
10
+ ConversationalRetrievalChain,
11
+ )
12
+ from langchain.document_loaders import PyPDFLoader
13
+ from langchain.chat_models import ChatOpenAI
14
+ from langchain.prompts.chat import (
15
+ ChatPromptTemplate,
16
+ SystemMessagePromptTemplate,
17
+ HumanMessagePromptTemplate,
18
+ )
19
+ from langchain.docstore.document import Document
20
+ from langchain.memory import ChatMessageHistory, ConversationBufferMemory
21
+ from chainlit.types import AskFileResponse
22
+
23
+ import chainlit as cl
24
+ from langchain_qdrant import QdrantVectorStore
25
+ from qdrant_client import QdrantClient
26
+ from qdrant_client.http.models import Distance, VectorParams
27
+
28
# System prompt for the question-answering step. "{summaries}" is the
# placeholder that receives the retrieved context; the model is told to always
# finish with a "SOURCES" section.
system_template = """Use the following pieces of context to answer the users question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.
The "SOURCES" part should be a reference to the source of the document from which you got your answer.
And if the user greets with greetings like Hi, hello, How are you, etc reply accordingly as well.
Example of your response should be:
The answer is foo
SOURCES: xyz
Begin!
----------------
{summaries}"""

# Chat prompt = system instructions followed by the user's question.
_system_prompt = SystemMessagePromptTemplate.from_template(system_template)
_human_prompt = HumanMessagePromptTemplate.from_template("{question}")
messages = [_system_prompt, _human_prompt]
prompt = ChatPromptTemplate.from_messages(messages)

# NOTE(review): chain_type_kwargs does not appear to be passed to the chain
# built elsewhere in this file -- confirm whether it is still needed.
chain_type_kwargs = dict(prompt=prompt)
45
+
46
+
47
# Process-wide handle to the opened vector store; populated by generate_vdb().
_VECTOR_STORE = None


def generate_vdb(chunks=None):
    """Return the Qdrant vector store for the legal corpus, building it on first use.

    The store lives on disk under ``./qdrant_vector_db``. If that directory
    already exists the collection is opened as-is and ``chunks`` is ignored;
    otherwise a new collection is created and ``chunks`` are embedded into it.

    The opened store is cached for the lifetime of the process. The embedded
    (``path=...``) Qdrant client locks its storage folder, so opening a second
    client for every new chat session would fail while an earlier one is
    still alive; later calls therefore reuse the first store.

    Args:
        chunks: Documents to index when the database has to be created.
            Required (and non-empty) only in that case.

    Returns:
        The ``QdrantVectorStore`` bound to the ``legal_data`` collection.

    Raises:
        ValueError: If no database exists on disk and ``chunks`` is empty.
    """
    global _VECTOR_STORE
    if _VECTOR_STORE is not None:
        return _VECTOR_STORE

    EMBEDDING_MODEL = "text-embedding-3-small"
    PERSIST_PATH = "./qdrant_vector_db"  # Directory to store Qdrant collection
    COLLECTION_NAME = "legal_data"
    VECTOR_SIZE = 1536  # must equal the embedding model's vector length

    embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

    # Check if the vector database already exists
    db_exists = os.path.exists(PERSIST_PATH)
    if not db_exists and not chunks:
        # Fail before anything is written: creating the collection and then
        # crashing on add_documents() would leave an empty database on disk
        # that every later start-up would load as if it were complete.
        raise ValueError(
            f"No Qdrant database at {PERSIST_PATH} and no chunks were given to build one"
        )

    if db_exists:
        print(f"Loading existing Qdrant database from {PERSIST_PATH}")
        qdrant_client = QdrantClient(path=PERSIST_PATH)  # Load the existing DB
    else:
        print(f"Creating new Qdrant database at {PERSIST_PATH}")
        qdrant_client = QdrantClient(path=PERSIST_PATH)  # Create a new DB
        qdrant_client.create_collection(
            collection_name=COLLECTION_NAME,
            vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE),
        )

    qdrant_vector_store = QdrantVectorStore(
        client=qdrant_client,
        collection_name=COLLECTION_NAME,
        embedding=embeddings,
    )
    if not db_exists:
        # Only a freshly created collection needs to be populated.
        qdrant_vector_store.add_documents(chunks)

    _VECTOR_STORE = qdrant_vector_store
    return qdrant_vector_store
77
+
78
+
79
@cl.on_chat_start
async def on_chat_start():
    """Set up a new chat session.

    Registers the bot avatar, obtains the vector store (indexing the two
    source PDFs first if no database directory exists yet), greets the user,
    and stores a conversational retrieval chain in the user session under
    the key "chain".
    """
    avatar = cl.Avatar(name="Chat Legal AI", path="./chat_logo.jpg")
    await avatar.send()

    pdf_links = [
        "https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf",
        "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf",
    ]

    if os.path.exists("./qdrant_vector_db"):
        # A database directory is already present: just open it.
        docsearch = generate_vdb()
    else:
        # First run: load every PDF, split it into overlapping chunks and index them.
        documents = []
        for pdf_link in pdf_links:
            documents.extend(PyMuPDFLoader(pdf_link).load())

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
        )
        docsearch = generate_vdb(text_splitter.split_documents(documents))

    # Let the user know that the system is ready
    welcome = cl.Message(
        content=f"Welcome to the AI Legal Chatbot! Ask me anything about the AI policy",
        disable_human_feedback=True,
        author="Chat Legal AI",
    )
    await welcome.send()

    # Per-session conversation memory; "answer" is the chain output that gets stored.
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        output_key="answer",
        chat_memory=ChatMessageHistory(),
        return_messages=True,
    )

    # Create a chain that retrieves from the vector store returned by generate_vdb()
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, streaming=True)
    chain = ConversationalRetrievalChain.from_llm(
        llm,
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        memory=memory,
        return_source_documents=True,
    )
    cl.user_session.set("chain", chain)
135
+
136
+
137
@cl.on_message
async def main(message):
    """Answer one user message with the session's retrieval chain.

    Runs the chain stored under "chain" in the user session on
    ``message.content``, attaches each returned source document as a text
    element named ``source_<index>``, and appends either the list of source
    names or "No sources found" to the answer before sending it.

    Args:
        message: The incoming Chainlit message; only ``.content`` is read.
    """
    chain = cl.user_session.get("chain")  # type: ConversationalRetrievalChain
    cb = cl.AsyncLangchainCallbackHandler()

    res = await chain.acall(message.content, callbacks=[cb])
    answer = res["answer"]
    source_documents = res["source_documents"]  # type: List[Document]

    # Build the elements unconditionally so source_names is always defined,
    # including when retrieval returned nothing; this makes the
    # "No sources found" branch below reachable and safe.
    text_elements = [
        # Create the text element referenced in the message
        cl.Text(content=source_doc.page_content, name=f"source_{source_idx}")
        for source_idx, source_doc in enumerate(source_documents or [])
    ]
    source_names = [text_el.name for text_el in text_elements]

    if source_names:
        answer += f"\nSources: {', '.join(source_names)}"
    else:
        answer += "\nNo sources found"

    await cl.Message(content=answer, elements=text_elements, author="Chat Legal AI").send()
chainlit.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Chat with Legal PDF
2
+
3
+ This Chainlit app was created following instructions from [this repository!](https://github.com/AI-Maker-Space/Beyond-ChatGPT)
chat_logo.jpg ADDED