mphycx committed on
Commit
d3c3946
1 Parent(s): 3fcca8e

Push large file

.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *tokenizer.json filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,28 @@
+ FROM python:3.9-slim-bullseye as langchain-serve-img
+
+ COPY requirements_pytorch.txt requirements_pytorch.txt
+ COPY requirements_api.txt requirements_api.txt
+ RUN pip3 install -r requirements_pytorch.txt
+ RUN pip3 install -r requirements_api.txt
+
+ COPY api.py api.py
+
+ EXPOSE 8080
+
+ ENTRYPOINT [ "lc-serve", "deploy", "local", "api.py" ]
+
+ FROM python:3.9-slim-bullseye as pdfgpt-chat-img
+
+ COPY requirements_app.txt requirements_app.txt
+ RUN pip3 install -r requirements_app.txt
+
+ WORKDIR /app
+
+ COPY intfloat /app/intfloat
+ COPY app.py app.py
+
+ EXPOSE 7860
+
+ # NOTE: curl is not included in slim-bullseye; install it first
+ # (e.g. RUN apt-get update && apt-get install -y curl) or this healthcheck will always fail
+ HEALTHCHECK CMD curl --fail http://localhost:7860/_stcore/health
+
+ ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=7860"]
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 mphycx
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
api.py ADDED
@@ -0,0 +1,336 @@
+ import gc
+ import os
+ import re
+ import shutil
+ import urllib.request
+ from pathlib import Path
+ from tempfile import NamedTemporaryFile
+
+ import fitz
+ import numpy as np
+ import openai
+ import torch
+ import torch.nn.functional as F
+ from fastapi import UploadFile
+ from lcserve import serving
+ from optimum.bettertransformer import BetterTransformer
+ from sklearn import svm
+ from sklearn.cluster import KMeans
+ from sklearn.metrics import pairwise_distances_argmin_min
+ from torch import Tensor
+ from transformers import AutoModel, AutoTokenizer
+
+ recommender = None
+
+
+ def download_pdf(url, output_path):
+     urllib.request.urlretrieve(url, output_path)
+
+
+ def preprocess(text):
+     text = text.replace("-\n", "")
+     text = text.replace("\n", " ")
+     text = re.sub(r"\s+", " ", text)
+     return text
+
+
+ def get_margin(pdf):
+     # crop 5% off each side so headers and footers don't pollute the text
+     page = pdf[0]
+     page_size = page.mediabox
+     margin_hor = page.mediabox.width * 0.05
+     margin_ver = page.mediabox.height * 0.05
+     margin_size = page_size + (margin_hor, margin_ver, -margin_hor, -margin_ver)
+     return margin_size
+
+
+ def pdf_to_text(path, start_page=1, end_page=None):
+     doc = fitz.open(path)
+     total_pages = doc.page_count
+
+     if end_page is None:
+         end_page = total_pages
+
+     text_list = []
+     margin_size = get_margin(doc)
+     for i in range(start_page - 1, end_page):
+         page = doc[i]
+         page.set_cropbox(margin_size)
+         text = page.get_text("text")
+         text = preprocess(text)
+         text_list.append(text)
+
+     doc.close()
+     return text_list
+
+
+ def text_to_chunks(texts, word_length=150, start_page=1):
+     text_toks = [t.split(" ") for t in texts]
+     chunks = []
+
+     for idx, words in enumerate(text_toks):
+         for i in range(0, len(words), word_length):
+             chunk = words[i : i + word_length]
+             if (
+                 (i + word_length) > len(words)
+                 and (len(chunk) < word_length)
+                 and (len(text_toks) != (idx + 1))
+             ):
+                 # carry a short trailing chunk over to the next page
+                 text_toks[idx + 1] = chunk + text_toks[idx + 1]
+                 continue
+             chunk = " ".join(chunk).strip()
+             chunk = f'[Page no. {idx + start_page}] "{chunk}"'
+             chunks.append(chunk)
+     return chunks
+
+
+ class SemanticSearch:
+     def __init__(self, embedding_model):
+         self.tokenizer = AutoTokenizer.from_pretrained(f"intfloat/{embedding_model}")
+         self.model = AutoModel.from_pretrained(f"intfloat/{embedding_model}")
+         self.model = BetterTransformer.transform(self.model, keep_original_model=True)
+
+         # set device
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         self.model = self.model.to(self.device)
+         self.fitted = False
+
+     def fit(self, data, batch_size=32):
+         self.data = data
+         self.embeddings = self.get_text_embedding(self.data, batch_size=batch_size)
+         self.fitted = True
+
+     def __call__(self, text, return_data=True):
+         self.inp_emb = self.get_text_embedding([text], prefix="query")
+         self.matches = self.run_svm(self.inp_emb, self.embeddings)
+
+         if return_data:
+             # return the first 5 matches; index 0 is the query itself, so skip it
+             return [self.data[i - 1] for i in self.matches[1:6]]
+         else:
+             return self.matches
+
+     def average_pool(
+         self, last_hidden_states: Tensor, attention_mask: Tensor
+     ) -> Tensor:
+         self.last_hidden = last_hidden_states.masked_fill(
+             ~attention_mask[..., None].bool(), 0.0
+         )
+         return self.last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+
+     def get_text_embedding(self, texts, prefix="passage", batch_size=32):
+         # Tokenize the input texts; E5 models expect a "query: "/"passage: " prefix
+         texts = [f"{prefix}: {text}" for text in texts]
+         batch_dict = self.tokenizer(
+             texts, max_length=512, padding=True, truncation=True, return_tensors="pt"
+         ).to(self.device)
+
+         with torch.no_grad():
+             outputs = self.model(**batch_dict)
+
+         embeddings = self.average_pool(
+             outputs.last_hidden_state, batch_dict["attention_mask"]
+         )
+
+         # Normalize embeddings
+         embeddings = F.normalize(embeddings, p=2, dim=1)
+
+         # Convert pytorch tensor to numpy array (no grad)
+         if self.device == "cuda":
+             embeddings = embeddings.detach().cpu().clone().numpy()
+         else:
+             embeddings = embeddings.detach().numpy()
+         return embeddings
+
+     def run_svm(self, query_emb, passage_emb):
+         joined_emb = np.concatenate((query_emb, passage_emb))
+
+         # label vector: mark the query as the only positive example
+         y = np.zeros(joined_emb.shape[0])
+         y[0] = 1
+
+         # train an Exemplar SVM against the passages
+         clf = svm.LinearSVC(
+             class_weight="balanced", verbose=False, max_iter=10000, tol=1e-6, C=0.1
+         )
+         clf.fit(joined_emb, y)
+
+         # infer on the original data and sort by similarity to the query
+         similarities = clf.decision_function(joined_emb)
+         sorted_ix = np.argsort(-similarities)
+         return sorted_ix
+
+     def summarize(self):
+         n_clusters = int(np.ceil(len(self.embeddings) ** 0.5))
+         # cap at 5 clusters to reserve tokens for the completion
+         n_clusters = min(n_clusters, 5)
+         kmeans = KMeans(n_clusters=n_clusters, random_state=23)
+         kmeans = kmeans.fit(self.embeddings)
+
+         # the chunk closest to each centroid represents its cluster
+         closest, _ = pairwise_distances_argmin_min(
+             kmeans.cluster_centers_, self.embeddings
+         )
+
+         # order clusters by the mean position of their chunks in the document
+         avg = []
+         for j in range(n_clusters):
+             idx = np.where(kmeans.labels_ == j)[0]
+             avg.append(np.mean(idx))
+         ordering = sorted(range(n_clusters), key=lambda k: avg[k])
+         # concat representative chunks in document order
+         summary = [self.data[i] for i in [closest[idx] for idx in ordering]]
+         return summary
+
+
+ def clear_cache():
+     global recommender
+     if "recommender" in globals():
+         del recommender
+     gc.collect()
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+
+
+ def load_recommender(path, embedding_model, rebuild_embedding, start_page=1):
+     global recommender
+     if rebuild_embedding:
+         clear_cache()
+         recommender = None
+     if recommender is None:
+         recommender = SemanticSearch(embedding_model)
+     if recommender.fitted:
+         return "Corpus Loaded."
+     else:
+         texts = pdf_to_text(path, start_page=start_page)
+         chunks = text_to_chunks(texts, start_page=start_page)
+         recommender.fit(chunks)
+         return "Corpus Loaded."
+
+
+ def generate_text(openai_key, prompt, model="gpt-3.5-turbo"):
+     openai.api_key = openai_key
+     completions = openai.ChatCompletion.create(
+         model=model,
+         messages=[{"role": "user", "content": prompt}],
+         max_tokens=512,
+         n=1,
+         stop=None,
+         temperature=0.7,
+     )
+     # pack prompt, answer, token count and model into one "###"-delimited string
+     message = f"{prompt}###{completions.choices[0].message.content}###{completions.usage.total_tokens}###{completions.model}"
+     return message
+
+
+ def generate_answer(question, gpt_model, openai_key):
+     topn_chunks = recommender(question)
+     prompt = "search results:\n\n"
+     for c in topn_chunks:
+         prompt += c + "\n\n"
+
+     prompt += (
+         "Instructions: Compose a comprehensive reply to the query using the search results given. "
+         "Cite each reference using [Page no.] notation (every result has this number at the beginning). "
+         "Citation should be done at the end of each sentence. If the search results mention multiple subjects "
+         "with the same name, create separate answers for each. Only include information found in the results and "
+         "don't add any additional information. Make sure the answer is correct and don't output false content. "
+         "If the text does not relate to the query, simply state 'Text Not Found in PDF'. Ignore outlier "
+         "search results which have nothing to do with the question. Only answer what is asked. The "
+         "answer should be short and concise. Answer step-by-step.\n\n"
+     )
+
+     prompt += f"Query: {question}"
+     answer = generate_text(openai_key, prompt, gpt_model)
+     return answer
+
+
+ def generate_summary(gpt_model, openai_key):
+     topn_chunks = recommender.summarize()
+     prompt = (
+         "Summarize the highlights of the search results and output a summary in bullet points. "
+         "Do not write anything before the bullet points. "
+         "Cite each reference using [Page no.] notation (every result has this number at the beginning). "
+         "Citation should be done at the end of each sentence. "
+         "Give a conclusion in the end. "
+         "Write the summary in the same language as the search results. "
+         "Search results:\n\n"
+     )
+     for c in topn_chunks:
+         prompt += c + "\n\n"
+     summary = generate_text(openai_key, prompt, gpt_model)
+     return summary
+
+
+ def load_openai_key() -> str:
+     key = os.environ.get("OPENAI_API_KEY")
+     if key is None:
+         raise ValueError(
+             "[ERROR]: Please pass your OPENAI_API_KEY. Get your key here: https://platform.openai.com/account/api-keys"
+         )
+     return key
+
+
+ # %%
+ @serving
+ def ask_url(
+     url: str,
+     question: str,
+     rebuild_embedding: bool,
+     embedding_model: str,
+     gpt_model: str,
+ ) -> str:
+     if rebuild_embedding:
+         # gpt_model must be forwarded: load_url needs it for the summary
+         load_url(url, embedding_model, rebuild_embedding, gpt_model)
+     openai_key = load_openai_key()
+     return generate_answer(question, gpt_model, openai_key)
+
+
+ @serving
+ async def ask_file(
+     file: UploadFile,
+     question: str,
+     rebuild_embedding: bool,
+     embedding_model: str,
+     gpt_model: str,
+ ) -> str:
+     if rebuild_embedding:
+         # load_file is a coroutine, so it must be awaited
+         await load_file(file, embedding_model, rebuild_embedding, gpt_model)
+     openai_key = load_openai_key()
+     return generate_answer(question, gpt_model, openai_key)
+
+
+ @serving
+ def load_url(
+     url: str,
+     embedding_model: str,
+     rebuild_embedding: bool,
+     gpt_model: str,
+ ) -> str:
+     download_pdf(url, "corpus.pdf")
+     notification = load_recommender("corpus.pdf", embedding_model, rebuild_embedding)
+     openai_key = load_openai_key()
+     summary = generate_summary(gpt_model, openai_key)
+     response = f"{notification}###{summary}"
+     return response
+
+
+ @serving
+ async def load_file(
+     file: UploadFile,
+     embedding_model: str,
+     rebuild_embedding: bool,
+     gpt_model: str,
+ ) -> str:
+     suffix = Path(file.filename).suffix
+     with NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+         shutil.copyfileobj(file.file, tmp)
+         tmp_path = Path(tmp.name)
+     notification = load_recommender(str(tmp_path), embedding_model, rebuild_embedding)
+     openai_key = load_openai_key()
+     summary = generate_summary(gpt_model, openai_key)
+     response = f"{notification}###{summary}"
+     return response
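
For reference, a minimal client sketch for the endpoints above, assuming the lc-serve API is already running on port 8080. The payload shape (including the "envs" key) and the "###"-delimited result mirror what app.py sends and what generate_text returns; the host, PDF URL, and API key below are placeholders.

import requests

HOST = "http://localhost:8080"  # assumes the lc-serve stage is running locally
payload = {
    "url": "https://example.com/paper.pdf",  # placeholder PDF URL
    "rebuild_embedding": True,
    "embedding_model": "multilingual-e5-base",
    "gpt_model": "gpt-3.5-turbo",
    "envs": {"OPENAI_API_KEY": "sk-..."},  # placeholder key, passed the way app.py passes it
}

# build the corpus; the result is "Corpus Loaded.###<prompt>###<summary>###<tokens>###<model>"
r = requests.post(f"{HOST}/load_url", json=payload)
print(r.json()["result"].split("###")[0])

# ask a question; the result is "<prompt>###<answer>###<tokens>###<model>"
r = requests.post(
    f"{HOST}/ask_url",
    json={**payload, "question": "What is the main finding?", "rebuild_embedding": False},
)
prompt, answer, token_used, gpt_model = r.json()["result"].split("###")
print(answer)
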
app.py ADDED
@@ -0,0 +1,409 @@
+ # %%
+ import os
+ import json
+ import urllib.parse
+ from tempfile import _TemporaryFileWrapper
+
+ import pandas as pd
+ import requests
+ import streamlit as st
+ import streamlit.components.v1 as components
+ from streamlit_chat import message
+ from streamlit_extras.add_vertical_space import add_vertical_space
+ from streamlit_extras.colored_header import colored_header
+
+ st.set_page_config(
+     layout="wide",
+     page_title="pdfGPT-chat. Ask your PDF!",
+     page_icon=":robot_face:",
+ )
+
+
+ def main():
+     @st.cache_data
+     def convert_df(df):
+         return df.to_csv(index=False).encode("utf-8")
+
+     def pdf_change():
+         st.session_state["pdf_change"] = True
+
+     def check_api(api_key):
+         return api_key.startswith("sk-") and len(api_key) == 51
+
+     def check_url(url):
+         parsed_url = urllib.parse.urlparse(url)
+         return all([parsed_url.scheme, parsed_url.netloc])
+
+     def result_to_dict(r, start):
+         result = r.json()["result"]
+         result = result.split("###")[start:]
+         keys = ["prompt", "answer", "token_used", "gpt_model"]
+         # An error in the OpenAI server also gives status_code 200;
+         # pad a bare error message so it zips cleanly with the keys
+         if len(result) == 1:
+             result.extend([result[0], 0, gpt_model])
+         return dict(zip(keys, result))
+
+     def load_pdf():
+         if file is None and len(pdf_url) == 0:
+             return st.error("Both URL and PDF are empty. Provide at least one.")
+         elif len(pdf_url) > 0:
+             if not check_url(pdf_url):
+                 return st.error("Please enter a valid URL.")
+             elif file is not None:
+                 return st.error(
+                     "Both URL and PDF are provided. Please provide only one (either URL or PDF)."
+                 )
+             # load pdf from url
+             else:
+                 r = requests.post(
+                     f"{LCSERVE_HOST}/load_url",
+                     json={
+                         "url": pdf_url,
+                         "rebuild_embedding": st.session_state["pdf_change"],
+                         "embedding_model": embedding_model,
+                         "gpt_model": gpt_model,
+                         "envs": {
+                             "OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY"),
+                         },
+                     },
+                 )
+         # load file
+         else:
+             _data = {
+                 "rebuild_embedding": st.session_state["pdf_change"],
+                 "embedding_model": embedding_model,
+                 "gpt_model": gpt_model,
+                 "envs": {
+                     "OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY"),
+                 },
+             }
+
+             r = requests.post(
+                 f"{LCSERVE_HOST}/load_file",
+                 params={"input_data": json.dumps(_data)},
+                 files={"file": file},
+             )
+         if r.status_code != 200:
+             if "error" in r.json():
+                 if "message" in r.json()["error"]:
+                     return st.error(r.json()["error"]["message"])
+             else:
+                 return str(r.json())
+         elif r.json()["result"].startswith("Corpus Loaded."):
+             st.session_state["loaded"] = True
+             st.session_state["pdf_change"] = False
+             # extract result
+             result = result_to_dict(r, 1)
+
+             # concatenate reply
+             reply_summary = "Hello there. I'm **pdfGPT-chat**.\nHere is a **summary** of your PDF:\n\n"
+             reply_summary += result["answer"]
+             reply_summary += "\n\nDo you have any **question** about your PDF?"
+
+             if len(st.session_state["past"]) == 1:
+                 st.session_state["generated"][0] = reply_summary
+             else:
+                 st.session_state["past"].append("Hi")
+                 st.session_state["generated"].append(reply_summary)
+
+             # calculate cost
+             calculate_cost(result["token_used"], result["gpt_model"])
+             return st.success("The PDF file has been loaded.")
+         else:
+             return st.info(r.json()["result"])
+
+     def generate_response(
+         lcserve_host: str,
+         url: str,
+         file: _TemporaryFileWrapper,
+         question: str,
+         openai_key: str,
+     ) -> dict:
+         if question.strip() == "":
+             # return a dict so the caller can index it uniformly
+             return {"answer": "[ERROR]: Question field is empty", "token_used": 0, "gpt_model": gpt_model}
+
+         _data = {
+             "question": question,
+             "rebuild_embedding": st.session_state["pdf_change"],
+             "embedding_model": embedding_model,
+             "gpt_model": gpt_model,
+             "envs": {
+                 "OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY"),
+             },
+         }
+
+         if url.strip() != "":
+             r = requests.post(
+                 f"{LCSERVE_HOST}/ask_url",
+                 json={"url": url, **_data},
+             )
+         else:
+             r = requests.post(
+                 f"{LCSERVE_HOST}/ask_file",
+                 params={"input_data": json.dumps(_data)},
+                 files={"file": file},
+             )
+
+         if r.status_code != 200:
+             content = r.content.decode()  # convert bytes to string
+             # use a distinct name so the outer `file` variable is not shadowed
+             with open("langchainlog.txt", "w") as log_file:
+                 log_file.write(content)
+             return {"answer": f"[ERROR]: {r.text}", "token_used": 0, "gpt_model": gpt_model}
+
+         result_dict = result_to_dict(r, 0)
+         return result_dict
+
+     def calculate_cost(token_used, gpt_model):
+         st.session_state["total_token"] += int(token_used)
+         if "gpt-3" in gpt_model:
+             current_cost = st.session_state["total_token"] * 0.002 / 1000
+         else:
+             current_cost = st.session_state["total_token"] * 0.06 / 1000
+         # total_token is already cumulative, so assign instead of adding
+         # to avoid double counting
+         st.session_state["total_cost"] = current_cost
+
+     # %%
+     # main page layout
+     header = st.container()
+     welcome_page = st.container()
+     response_container = st.container()
+     input_container = st.container()
+     cost_container = st.container()
+     load_pdf_popup = st.container()
+
+     # sidebar layout
+     input_details = st.sidebar.container()
+     preferences = st.sidebar.container()
+     chat_download = st.sidebar.container()
+     # %%
+     # instantiate session states
+     if "api_key" not in st.session_state:
+         st.session_state["api_key"] = False
+
+     if "generated" not in st.session_state:
+         st.session_state["generated"] = [
+             "Hello there. I'm pdfGPT-chat. Do you have any question about your PDF?"
+         ]
+
+     if "loaded" not in st.session_state:
+         st.session_state["loaded"] = False
+
+     if "past" not in st.session_state:
+         st.session_state["past"] = ["Hi"]
+
+     if "pdf_change" not in st.session_state:
+         st.session_state["pdf_change"] = True
+
+     if "total_cost" not in st.session_state:
+         st.session_state["total_cost"] = 0
+
+     if "total_token" not in st.session_state:
+         st.session_state["total_token"] = 0
+
+     # %%
+     # constants
+     E5_URL = "https://github.com/microsoft/unilm/tree/master/e5"
+     EMBEDDING_CHOICES = {
+         "multilingual-e5-base": "Multilingual-E5 (default)",
+         "e5-small-v2": "English-E5-small (faster)",
+     }
+     GPT_CHOICES = {
+         "gpt-3.5-turbo": "GPT-3.5-turbo (default)",
+         "gpt-4": "GPT-4 (smarter, costlier)",
+     }
+     LCSERVE_HOST = "http://localhost:8080"
+     PDFGPT_URL = "https://github.com/bhaskatripathi/pdfGPT"
+     SIGNATURE = """<style>
+     .footer {
+         position: static;
+         left: 0;
+         bottom: 0;
+         width: 100%;
+         background: rgba(0,0,0,0);
+         text-align: center;
+     }
+     </style>
+
+     <div class="footer">
+         <p style='display: block;
+             text-align: center;
+             font-size:14px;
+             color:darkgray'>Developed with ❤ by asyafiqe</p>
+     </div>
+     """
+
+     with header:
+         st.title(":page_facing_up: pdfGPT-chat")
+         with st.expander(
+             "A fork of [pdfGPT](%s) with several improvements. With pdfGPT-chat, you can chat with your PDF files using [**Microsoft E5 Multilingual Text Embeddings**](%s) and **OpenAI**."
+             % (PDFGPT_URL, E5_URL)
+         ):
+             st.markdown(
+                 "Compared to other tools, pdfGPT-chat provides a **hallucination-free** response, thanks to its superior embeddings and tailored prompt.<br />The responses generated by pdfGPT-chat include **citations** in square brackets ([]), indicating the **page numbers** where the relevant information is found.<br />This feature not only enhances the credibility of the responses but also aids in swiftly locating the pertinent information within the PDF file.",
+                 unsafe_allow_html=True,
+             )
+
+         colored_header(
+             label="",
+             description="",
+             color_name="blue-40",
+         )
+
+     with preferences:
+         colored_header(
+             label="",
+             description="",
+             color_name="blue-40",
+         )
+         st.write("**Preferences**")
+         embedding_model = st.selectbox(
+             "Embedding",
+             EMBEDDING_CHOICES.keys(),
+             help="""[Multilingual-E5](%s) supports 100 languages.
+             E5-small is much faster and suitable for a PC without a GPU."""
+             % E5_URL,
+             on_change=pdf_change,
+             format_func=lambda x: EMBEDDING_CHOICES[x],
+         )
+         gpt_model = st.selectbox(
+             "GPT Model",
+             GPT_CHOICES.keys(),
+             help="For GPT-4 you might have to join the waitlist: https://openai.com/waitlist/gpt-4-api",
+             format_func=lambda x: GPT_CHOICES[x],
+         )
+
+     # %%
+     # sidebar
+     with input_details:
+         st.title("Input details")
+         OPENAI_URL = "https://platform.openai.com/account/api-keys"
+         openai_key = st.text_input(
+             ":key: Enter your OpenAI API key here",
+             type="password",
+             help="Get your OpenAI API key [here](%s)" % OPENAI_URL,
+         )
+         if openai_key:
+             # expose the entered key to the request payloads built above
+             os.environ["OPENAI_API_KEY"] = openai_key
+         colored_header(
+             label="",
+             description="",
+             color_name="blue-40",
+         )
+
+         pdf_url = st.text_input(
+             ":globe_with_meridians: Enter PDF URL here", on_change=pdf_change
+         )
+
+         st.markdown(
+             "<h2 style='text-align: center; color: black;'>OR</h2>",
+             unsafe_allow_html=True,
+         )
+
+         file = st.file_uploader(
+             ":page_facing_up: Upload your PDF / research paper / book here",
+             type=["pdf"],
+             on_change=pdf_change,
+         )
+
+         if st.button("Load PDF"):
+             st.session_state["loaded"] = True
+             with st.spinner("Loading PDF"):
+                 with load_pdf_popup:
+                     load_pdf()
+
+     # %%
+     # main tab
+     if st.session_state["loaded"]:
+         with input_container:
+             with st.form(key="input_form", clear_on_submit=True):
+                 user_input = st.text_area("Question:", key="input", height=100)
+                 submit_button = st.form_submit_button(label="Send")
+
+             if user_input and submit_button:
+                 with st.spinner("Processing your question"):
+                     response = generate_response(
+                         LCSERVE_HOST,
+                         pdf_url,
+                         file,
+                         user_input,
+                         os.environ.get("OPENAI_API_KEY"),
+                     )
+                     st.session_state.past.append(user_input)
+                     st.session_state.generated.append(response["answer"])
+
+                     # calculate cost
+                     calculate_cost(response["token_used"], response["gpt_model"])
+
+             if not user_input and submit_button:
+                 st.error("Please write your question.")
+
+         with response_container:
+             if st.session_state["generated"]:
+                 for i in range(len(st.session_state["generated"])):
+                     message(
+                         st.session_state["past"][i], is_user=True, key=str(i) + "_user"
+                     )
+                     message(st.session_state["generated"][i], key=str(i))
+
+                 cost_container.caption(
+                     f"Estimated cost: $ {st.session_state['total_cost']:.4f}"
+                 )
+
+     else:
+         with welcome_page:
+             st.write("")
+             st.subheader(
+                 """:arrow_left: To start, please fill in the input details in the sidebar and click **Load PDF**"""
+             )
+     # %%
+     # placed at the end to include the last conversation
+     with chat_download:
+         chat_history = pd.DataFrame(
+             {
+                 "Question": st.session_state["past"],
+                 "Answer": st.session_state["generated"],
+             }
+         )
+
+         csv = convert_df(chat_history)
+
+         st.download_button(
+             label="Download chat history",
+             data=csv,
+             file_name="chat history.csv",
+             mime="text/csv",
+         )
+         add_vertical_space(2)
+         st.markdown(SIGNATURE, unsafe_allow_html=True)
+
+     # %%
+     # javascript: scroll partway down the page so the latest messages are visible
+     js = f"""
+     <script>
+     function scroll() {{
+         var textAreas = parent.document.querySelectorAll('section.main');
+         var halfwayScroll = 0.4 * textAreas[0].scrollHeight; // calculate halfway scroll position
+
+         for (let index = 0; index < textAreas.length; index++) {{
+             textAreas[index].scrollTop = halfwayScroll; // set scroll position to halfway
+         }}
+     }}
+
+     scroll(); // call the scroll function
+     </script>
+     """
+     components.html(js)
+
+     # reduce main top padding
+     st.markdown(
+         "<style>div.block-container{padding-top:1.5em;}</style>",
+         unsafe_allow_html=True,
+     )
+     # reduce sidebar top padding
+     st.markdown(
+         "<style>.css-ysnqb2.e1g8pov64 {margin-top: -90px;}</style>",
+         unsafe_allow_html=True,
+     )
+
+
+ if __name__ == "__main__":
+     main()
docker-compose.yaml ADDED
@@ -0,0 +1,15 @@
+ version: '3'
+
+ services:
+   langchain-serve:
+     build:
+       context: .
+       target: langchain-serve-img
+     ports:
+       - '8080:8080'
+   pdf-gpt:
+     build:
+       context: .
+       # target must match the stage name declared in the Dockerfile
+       target: pdfgpt-chat-img
+     ports:
+       - '7860:7860'
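
Once both services are up (for example via docker compose up --build), a quick sanity check is to poll each published port; a sketch, assuming default localhost ports. The Streamlit health path is the same one the Dockerfile HEALTHCHECK uses; probing the lc-serve port with an empty POST is only an assumption (any HTTP response, even a 4xx, shows the service is reachable).

import requests

# Streamlit UI: same health path as the Dockerfile HEALTHCHECK
print(requests.get("http://localhost:7860/_stcore/health", timeout=5).status_code)

# lc-serve API: an empty POST to a known route; a 4xx here still proves it is up
print(requests.post("http://localhost:8080/load_url", json={}, timeout=5).status_code)
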
intfloat/e5-small-v2/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "_name_or_path": "tmp/",
+   "architectures": [
+     "BertModel"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 384,
+   "initializer_range": 0.02,
+   "intermediate_size": 1536,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.29.0.dev0",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 30522
+ }
intfloat/e5-small-v2/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4790fed2919e70bff573d01cd3aede75970f219ab4c0b0aeadd0f4b98084a17d
+ size 133508397
intfloat/e5-small-v2/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
intfloat/e5-small-v2/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
intfloat/e5-small-v2/tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": true,
+   "mask_token": "[MASK]",
+   "model_max_length": 1000000000000000019884624838656,
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
intfloat/e5-small-v2/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
intfloat/multilingual-e5-base/README.md ADDED
The diff for this file is too large to render. See raw diff
 
intfloat/multilingual-e5-base/config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "_name_or_path": "tmp/",
+   "architectures": [
+     "XLMRobertaModel"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "eos_token_id": 2,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "xlm-roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "output_past": true,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.29.0.dev0",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 250002
+ }
intfloat/multilingual-e5-base/gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
intfloat/multilingual-e5-base/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f061cb7641880f52895cbacab7c4ab39b0844e2e6b73794f2798de460d9fa418
+ size 1112242989
intfloat/multilingual-e5-base/sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+ size 5069051
intfloat/multilingual-e5-base/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "bos_token": "<s>",
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "unk_token": "<unk>"
+ }
intfloat/multilingual-e5-base/tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "mask_token": {
+     "__type": "AddedToken",
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "model_max_length": 512,
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "tokenizer_class": "XLMRobertaTokenizer",
+   "unk_token": "<unk>"
+ }
requirements_api.txt ADDED
@@ -0,0 +1,7 @@
+ fastapi==0.96.0
+ langchain_serve==0.0.41
+ openai==0.27.7
+ optimum==1.8.6
+ PyMuPDF==1.22.3
+ scikit_learn==1.0.2
+ transformers==4.29.2
requirements_app.txt ADDED
@@ -0,0 +1,4 @@
+ pandas==2.0.2
+ streamlit==1.23.1
+ streamlit_chat==0.0.2.2
+ streamlit_extras==0.2.7
requirements_pytorch.txt ADDED
@@ -0,0 +1,4 @@
+ --extra-index-url https://download.pytorch.org/whl/cu117
+ torch
+ torchvision
+ torchaudio