Msp committed
Commit a82bdea · Parent: b563562

Upload 4 files

Files changed (3)
  1. Dockerfile +11 -0
  2. rag_engine.py +149 -0
  3. requirements.txt +6 -0
Dockerfile ADDED
@@ -0,0 +1,11 @@
+ FROM python:3.9
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+ WORKDIR $HOME/app
+ # Install dependencies first so code changes do not invalidate the pip layer.
+ COPY --chown=user ./requirements.txt $HOME/app/requirements.txt
+ RUN pip install -r requirements.txt
+ COPY --chown=user . $HOME/app
+ CMD ["chainlit", "run", "rag_engine.py", "--port", "7860"]
rag_engine.py ADDED
@@ -0,0 +1,149 @@
+ import os
+ from typing import List
+ from langchain.document_loaders import PyPDFLoader, TextLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.embeddings.openai import OpenAIEmbeddings
+ from langchain.vectorstores.pinecone import Pinecone
+ from langchain.chains import RetrievalQA
+ from langchain.chat_models import ChatOpenAI
+ from langchain.memory import ChatMessageHistory, ConversationBufferMemory
+ from langchain.docstore.document import Document
+ import pinecone
+ import chainlit as cl
+ from chainlit.types import AskFileResponse
+
+ # API keys are read from the environment rather than hardcoded in the source.
+ pinecone.init(
+     api_key=os.environ["PINECONE_API_KEY"],
+     environment="gcp-starter",
+ )
+
+ index_name = "skandhaar"
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+ embeddings = OpenAIEmbeddings()  # uses OPENAI_API_KEY from the environment
+
+ namespaces = set()
+
+ welcome_message = """Welcome to the Chainlit PDF QA demo! To get started:
+ 1. Upload a PDF or text file
+ """
+
+
+ def process_file(file: AskFileResponse):
+     import tempfile
+
+     if file.type == "text/plain":
+         Loader = TextLoader
+     elif file.type == "application/pdf":
+         Loader = PyPDFLoader
+
+     # Write the upload to disk so the loader can read it; "tmp" avoids
+     # shadowing the tempfile module (both branches wrote the same bytes).
+     with tempfile.NamedTemporaryFile(mode="wb", delete=False) as tmp:
+         tmp.write(file.content)
+
+     loader = Loader(tmp.name)
+     documents = loader.load()
+     docs = text_splitter.split_documents(documents)
+     for i, doc in enumerate(docs):
+         doc.metadata["source"] = f"source_{i}"
+     return docs
+
+
+ def get_docsearch(file: AskFileResponse):
+     docs = process_file(file)
+
+     # Save data in the user session
+     cl.user_session.set("docs", docs)
+
+     # Create a unique namespace for the file
+     namespace = str(hash(file.content))
+
+     if namespace in namespaces:
+         docsearch = Pinecone.from_existing_index(
+             index_name=index_name, embedding=embeddings, namespace=namespace
+         )
+     else:
+         docsearch = Pinecone.from_documents(
+             docs, embeddings, index_name=index_name, namespace=namespace
+         )
+         namespaces.add(namespace)
+
+     return docsearch
+
+
+ @cl.on_chat_start
+ async def start():
+     await cl.Avatar(
+         name="Chatbot",
+         url="https://avatars.githubusercontent.com/u/128686189?s=400&u=a1d1553023f8ea0921fba0debbe92a8c5f840dd9&v=4",
+     ).send()
+
+     files = None
+     while files is None:
+         files = await cl.AskFileMessage(
+             content=welcome_message,
+             accept=["text/plain", "application/pdf"],
+             max_size_mb=20,
+             timeout=180,
+             disable_human_feedback=True,
+         ).send()
+
+     for file in files:
+         msg = cl.Message(
+             content=f"Processing `{file.name}`...", disable_human_feedback=True
+         )
+         await msg.send()
+
+         # No async implementation in the Pinecone client, fall back to sync
+         docsearch = await cl.make_async(get_docsearch)(file)
+
+         # Conversation memory (note: not wired into the RetrievalQA chain below).
+         message_history = ChatMessageHistory()
+
+         memory = ConversationBufferMemory(
+             memory_key="chat_history",
+             output_key="result",
+             chat_memory=message_history,
+             return_messages=True,
+         )
+
+         chain = RetrievalQA.from_chain_type(
+             ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, streaming=True),
+             chain_type="stuff",
+             retriever=docsearch.as_retriever(),
+             return_source_documents=True,
+         )
+
+         # Let the user know that the system is ready
+         msg.content = f"`{file.name}` processed. You can now ask questions!"
+         await msg.update()
+
+         cl.user_session.set("chain", chain)
+
+
+ @cl.on_message
+ async def main(message: cl.Message):
+     chain = cl.user_session.get("chain")  # type: RetrievalQA
+     cb = cl.AsyncLangchainCallbackHandler()
+     res = await chain.acall(message.content, callbacks=[cb])
+     answer = res["result"]
+     source_documents = res["source_documents"]  # type: List[Document]
+
+     text_elements = []  # type: List[cl.Text]
+
+     if source_documents:
+         for source_idx, source_doc in enumerate(source_documents):
+             source_name = f"source_{source_idx}"
+             # Create the text element referenced in the message
+             text_elements.append(
+                 cl.Text(content=source_doc.page_content, name=source_name)
+             )
+         source_names = [text_el.name for text_el in text_elements]
+
+         if source_names:
+             answer += f"\nSources: {', '.join(source_names)}"
+         else:
+             answer += "\nNo sources found"
+
+     await cl.Message(content=answer, elements=text_elements).send()
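
The retrieval path can also be sanity-checked outside Chainlit. A minimal sketch, assuming the same environment variables are set and that documents were already indexed into skandhaar by a previous session; the script name and question string are made up for illustration and are not part of the commit:

# check_retrieval.py -- hypothetical helper, not part of this commit
from langchain.vectorstores.pinecone import Pinecone
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Importing rag_engine runs pinecone.init() and builds the shared embeddings.
from rag_engine import embeddings, index_name

docsearch = Pinecone.from_existing_index(index_name=index_name, embedding=embeddings)
chain = RetrievalQA.from_chain_type(
    ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
    return_source_documents=True,
)
res = chain({"query": "What is this document about?"})  # made-up question
print(res["result"])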
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ pypdf==3.8.1
+ pinecone-client==2.2.1
+ tiktoken==0.3.3
+ langchain
+ chainlit
+ protobuf==3.19.3
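
Note that langchain and chainlit are unpinned even though the code targets their 2023-era APIs (langchain.document_loaders, cl.Avatar, disable_human_feedback), so recent releases may not be compatible. To run the app without Docker, the same requirements and environment variables apply; the --port flag mirrors the Dockerfile CMD, and the key values are placeholders:

pip install -r requirements.txt
export PINECONE_API_KEY="<your-pinecone-key>"
export OPENAI_API_KEY="<your-openai-key>"
chainlit run rag_engine.py --port 7860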