Spaces: Build error

mphycx committed d3c3946 (parent: 3fcca8e): Push large file

Files changed:
- .gitattributes +1 -0
- Dockerfile +28 -0
- LICENSE +21 -0
- api.py +336 -0
- app.py +409 -0
- docker-compose.yaml +15 -0
- intfloat/e5-small-v2/config.json +25 -0
- intfloat/e5-small-v2/pytorch_model.bin +3 -0
- intfloat/e5-small-v2/special_tokens_map.json +7 -0
- intfloat/e5-small-v2/tokenizer.json +0 -0
- intfloat/e5-small-v2/tokenizer_config.json +15 -0
- intfloat/e5-small-v2/vocab.txt +0 -0
- intfloat/multilingual-e5-base/README.md +0 -0
- intfloat/multilingual-e5-base/config.json +28 -0
- intfloat/multilingual-e5-base/gitattributes +34 -0
- intfloat/multilingual-e5-base/pytorch_model.bin +3 -0
- intfloat/multilingual-e5-base/sentencepiece.bpe.model +3 -0
- intfloat/multilingual-e5-base/special_tokens_map.json +15 -0
- intfloat/multilingual-e5-base/tokenizer_config.json +19 -0
- requirements_api.txt +7 -0
- requirements_app.txt +4 -0
- requirements_pytorch.txt +4 -0
.gitattributes
CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*tokenizer.json filter=lfs diff=lfs merge=lfs -text
Dockerfile
ADDED
FROM python:3.9-slim-bullseye as langchain-serve-img

COPY requirements_pytorch.txt requirements_pytorch.txt
COPY requirements_api.txt requirements_api.txt
RUN pip3 install -r requirements_pytorch.txt
RUN pip3 install -r requirements_api.txt

COPY api.py api.py

EXPOSE 8080

ENTRYPOINT [ "lc-serve", "deploy", "local", "api.py" ]

FROM python:3.9-slim-bullseye as pdfgpt-chat-img

COPY requirements_app.txt requirements_app.txt
RUN pip3 install -r requirements_app.txt

# curl is not included in slim images; install it so the HEALTHCHECK below can run
RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf /var/lib/apt/lists/*

WORKDIR /app

COPY intfloat /app/intfloat
COPY app.py app.py

EXPOSE 7860

HEALTHCHECK CMD curl --fail http://localhost:7860/_stcore/health

ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=7860"]
LICENSE
ADDED
MIT License

Copyright (c) 2023 mphycx

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
api.py
ADDED
import gc
import os
import re
import shutil
import urllib.request
from pathlib import Path
from tempfile import NamedTemporaryFile

import fitz
import numpy as np
import openai
import torch
import torch.nn.functional as F
from fastapi import UploadFile
from lcserve import serving
from optimum.bettertransformer import BetterTransformer
from sklearn import svm
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from torch import Tensor
from transformers import AutoModel, AutoTokenizer

recommender = None


def download_pdf(url, output_path):
    urllib.request.urlretrieve(url, output_path)


def preprocess(text):
    text = text.replace("-\n", "")
    text = text.replace("\n", " ")
    text = re.sub(r"\s+", " ", text)
    return text


def get_margin(pdf):
    # crop 5% from every edge to drop headers, footers and page numbers
    page = pdf[0]
    page_size = page.mediabox
    margin_hor = page.mediabox.width * 0.05
    margin_ver = page.mediabox.height * 0.05
    margin_size = page_size + (margin_hor, margin_ver, -margin_hor, -margin_ver)
    return margin_size


def pdf_to_text(path, start_page=1, end_page=None):
    doc = fitz.open(path)
    total_pages = doc.page_count

    if end_page is None:
        end_page = total_pages

    text_list = []
    margin_size = get_margin(doc)
    for i in range(start_page - 1, end_page):
        page = doc[i]
        page.set_cropbox(margin_size)
        text = page.get_text("text")
        text = preprocess(text)
        text_list.append(text)

    doc.close()
    return text_list


def text_to_chunks(texts, word_length=150, start_page=1):
    text_toks = [t.split(" ") for t in texts]
    chunks = []

    for idx, words in enumerate(text_toks):
        for i in range(0, len(words), word_length):
            chunk = words[i : i + word_length]
            # carry a trailing short chunk over to the next page
            if (
                (i + word_length) > len(words)
                and (len(chunk) < word_length)
                and (len(text_toks) != (idx + 1))
            ):
                text_toks[idx + 1] = chunk + text_toks[idx + 1]
                continue
            chunk = " ".join(chunk).strip()
            chunk = f"[Page no. {idx + start_page}]" + " " + '"' + chunk + '"'
            chunks.append(chunk)
    return chunks


class SemanticSearch:
    def __init__(self, embedding_model):
        self.tokenizer = AutoTokenizer.from_pretrained(f"intfloat/{embedding_model}")
        self.model = AutoModel.from_pretrained(f"intfloat/{embedding_model}")
        self.model = BetterTransformer.transform(self.model, keep_original_model=True)

        # set device
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = self.model.to(self.device)
        self.fitted = False

    def fit(self, data, batch_size=32, n_neighbors=5):
        self.data = data
        self.embeddings = self.get_text_embedding(self.data, batch_size=batch_size)
        self.fitted = True

    def __call__(self, text, return_data=True):
        self.inp_emb = self.get_text_embedding([text], prefix="query")
        self.matches = self.run_svm(self.inp_emb, self.embeddings)

        if return_data:
            # return the first 5 matches; index 0 is the query itself, so skip it
            return [self.data[i - 1] for i in self.matches[1:6]]
        else:
            return self.matches

    def average_pool(
        self, last_hidden_states: Tensor, attention_mask: Tensor
    ) -> Tensor:
        self.last_hidden = last_hidden_states.masked_fill(
            ~attention_mask[..., None].bool(), 0.0
        )
        return self.last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

    def get_text_embedding(self, texts, prefix="passage", batch_size=32):
        # Tokenize the input texts; E5 models expect a "query: " or "passage: " prefix
        texts = [f"{prefix}: {text}" for text in texts]
        batch_dict = self.tokenizer(
            texts, max_length=512, padding=True, truncation=True, return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**batch_dict)

        embeddings = self.average_pool(
            outputs.last_hidden_state, batch_dict["attention_mask"]
        )

        # Normalize embeddings
        embeddings = F.normalize(embeddings, p=2, dim=1)

        # Convert pytorch tensor to numpy array (no grad)
        if self.device == "cuda":
            embeddings = embeddings.detach().cpu().clone().numpy()
        else:
            embeddings = embeddings.detach().numpy()
        return embeddings

    def run_svm(self, query_emb, passage_emb):
        joined_emb = np.concatenate((query_emb, passage_emb))

        # SVM labels: mark the query as the only positive example
        y = np.zeros(joined_emb.shape[0])
        y[0] = 1

        # train an Exemplar SVM
        clf = svm.LinearSVC(
            class_weight="balanced", verbose=False, max_iter=10000, tol=1e-6, C=0.1
        )
        clf.fit(joined_emb, y)

        # rank all embeddings by decision-function similarity to the query
        similarities = clf.decision_function(joined_emb)
        sorted_ix = np.argsort(-similarities)
        return sorted_ix

    def summarize(self):
        n_clusters = int(np.ceil(len(self.embeddings) ** 0.5))
        # cap at 5 clusters to reserve tokens
        n_clusters = min(n_clusters, 5)
        kmeans = KMeans(n_clusters=n_clusters, random_state=23)
        kmeans = kmeans.fit(self.embeddings)

        # order clusters by the average position of their chunks
        avg = []
        for j in range(n_clusters):
            idx = np.where(kmeans.labels_ == j)[0]
            avg.append(np.mean(idx))
        # find the chunk closest to each centroid
        closest, _ = pairwise_distances_argmin_min(
            kmeans.cluster_centers_, self.embeddings
        )
        ordering = sorted(range(n_clusters), key=lambda k: avg[k])
        # concatenate the representative chunks in document order
        summary = [self.data[i] for i in [closest[idx] for idx in ordering]]
        return summary


def clear_cache():
    global recommender
    if "recommender" in globals():
        del recommender
    gc.collect()
    if torch.cuda.is_available():
        return torch.cuda.empty_cache()


def load_recommender(path, embedding_model, rebuild_embedding, start_page=1):
    global recommender
    if rebuild_embedding:
        clear_cache()
        recommender = None
    if recommender is None:
        recommender = SemanticSearch(embedding_model)
    if recommender.fitted:
        return "Corpus Loaded."
    else:
        texts = pdf_to_text(path, start_page=start_page)
        chunks = text_to_chunks(texts, start_page=start_page)
        recommender.fit(chunks)
        return "Corpus Loaded."


def generate_text(openai_key, prompt, model="gpt-3.5-turbo"):
    openai.api_key = openai_key
    completions = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=512,
        n=1,
        stop=None,
        temperature=0.7,
    )
    # responses are "###"-delimited: prompt###answer###tokens_used###model
    message = f"{prompt}###{completions.choices[0].message.content}###{completions.usage.total_tokens}###{completions.model}"
    return message


def generate_answer(question, gpt_model, openai_key):
    topn_chunks = recommender(question)
    prompt = ""
    prompt += "search results:\n\n"
    for c in topn_chunks:
        prompt += c + "\n\n"

    prompt += (
        "Instructions: Compose a comprehensive reply to the query using the search results given. "
        "Cite each reference using [Page Number] notation (every result has this number at the beginning). "
        "Citation should be done at the end of each sentence. If the search results mention multiple subjects "
        "with the same name, create separate answers for each. Only include information found in the results and "
        "don't add any additional information. Make sure the answer is correct and don't output false content. "
        "If the text does not relate to the query, simply state 'Text Not Found in PDF'. Ignore outlier "
        "search results which have nothing to do with the question. Only answer what is asked. The "
        "answer should be short and concise. Answer step-by-step.\n\n"
    )

    prompt += f"Query: {question}"
    answer = generate_text(openai_key, prompt, gpt_model)
    return answer


def generate_summary(gpt_model, openai_key):
    topn_chunks = recommender.summarize()
    prompt = ""
    prompt += (
        "Summarize the highlights of the search results and output a summary in bullet points. "
        "Do not write anything before the bullet points. "
        "Cite each reference using [Page no.] notation (every result has this number at the beginning). "
        "Citation should be done at the end of each sentence. "
        "Give a conclusion in the end. "
        "Write the summary in the same language as the search results. "
        "Search results:\n\n"
    )
    for c in topn_chunks:
        prompt += c + "\n\n"
    summary = generate_text(openai_key, prompt, gpt_model)
    return summary


def load_openai_key() -> str:
    key = os.environ.get("OPENAI_API_KEY")
    if key is None:
        raise ValueError(
            "[ERROR]: Please pass your OPENAI_API_KEY. Get your key here: https://platform.openai.com/account/api-keys"
        )
    return key


# %%
@serving
def ask_url(
    url: str,
    question: str,
    rebuild_embedding: bool,
    embedding_model: str,
    gpt_model: str,
) -> str:
    if rebuild_embedding:
        # load_url expects gpt_model as well
        load_url(url, embedding_model, rebuild_embedding, gpt_model)
    openai_key = load_openai_key()
    return generate_answer(question, gpt_model, openai_key)


@serving
async def ask_file(
    file: UploadFile,
    question: str,
    rebuild_embedding: bool,
    embedding_model: str,
    gpt_model: str,
) -> str:
    if rebuild_embedding:
        # load_file is a coroutine and expects gpt_model, so it must be awaited
        await load_file(file, embedding_model, rebuild_embedding, gpt_model)
    openai_key = load_openai_key()
    return generate_answer(question, gpt_model, openai_key)


@serving
def load_url(
    url: str,
    embedding_model: str,
    rebuild_embedding: bool,
    gpt_model: str,
) -> str:
    download_pdf(url, "corpus.pdf")
    notification = load_recommender("corpus.pdf", embedding_model, rebuild_embedding)
    openai_key = load_openai_key()
    summary = generate_summary(gpt_model, openai_key)
    response = f"{notification}###{summary}"
    return response


@serving
async def load_file(
    file: UploadFile,
    embedding_model: str,
    rebuild_embedding: bool,
    gpt_model: str,
) -> str:
    suffix = Path(file.filename).suffix
    with NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        shutil.copyfileobj(file.file, tmp)
        tmp_path = Path(tmp.name)
    notification = load_recommender(str(tmp_path), embedding_model, rebuild_embedding)
    openai_key = load_openai_key()
    summary = generate_summary(gpt_model, openai_key)
    response = f"{notification}###{summary}"
    return response
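For reference: lc-serve exposes each @serving function above as an HTTP endpoint under its own name, which is how app.py talks to this file. A minimal client sketch follows; the host matches app.py's LCSERVE_HOST, while the sample PDF URL, question, and key are illustrative assumptions:

import requests

LCSERVE_HOST = "http://localhost:8080"  # assumed local deployment, as in app.py

payload = {
    "url": "https://example.com/paper.pdf",  # hypothetical PDF URL
    "question": "What is the main finding?",
    "rebuild_embedding": True,
    "embedding_model": "multilingual-e5-base",
    "gpt_model": "gpt-3.5-turbo",
    "envs": {"OPENAI_API_KEY": "sk-..."},  # lc-serve forwards this into the endpoint's environment
}
r = requests.post(f"{LCSERVE_HOST}/ask_url", json=payload)

# responses are "###"-delimited: prompt###answer###tokens_used###model
prompt, answer, tokens_used, model = r.json()["result"].split("###")
print(answer)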
app.py
ADDED
# %%
import os
import json
import urllib.parse
from tempfile import _TemporaryFileWrapper

import pandas as pd
import requests
import streamlit as st
from streamlit_chat import message
from streamlit_extras.add_vertical_space import add_vertical_space
from streamlit_extras.colored_header import colored_header

st.set_page_config(
    layout="wide",
    page_title="pdfGPT-chat. Ask your PDF!",
    page_icon=":robot_face:",
)


def main():
    @st.cache_data
    def convert_df(df):
        return df.to_csv(index=False).encode("utf-8")

    def pdf_change():
        st.session_state["pdf_change"] = True

    def check_api(api_key):
        # currently unused; the OpenAI key is read from the environment
        return api_key.startswith("sk-") and len(api_key) == 51

    def check_url(url):
        parsed_url = urllib.parse.urlparse(url)
        return all([parsed_url.scheme, parsed_url.netloc])

    def result_to_dict(r, start):
        result = r.json()["result"]
        result = result.split("###")[start:]
        keys = ["prompt", "answer", "token_used", "gpt_model"]
        # An error from the OpenAI server also gives status_code 200; such a
        # result has no "###" separators, so pad the missing fields
        if len(result) == 1:
            result.extend([result[0], 0, gpt_model])
        return dict(zip(keys, result))

    def load_pdf():
        if file is None and len(pdf_url) == 0:
            return st.error("Both URL and PDF are empty. Provide at least one.")
        elif len(pdf_url) > 0:
            if not check_url(pdf_url):
                return st.error("Please enter a valid URL.")
            elif file is not None:
                return st.error(
                    "Both URL and PDF are provided. Please provide only one (either URL or PDF)."
                )
            # load pdf from url
            else:
                r = requests.post(
                    f"{LCSERVE_HOST}/load_url",
                    json={
                        "url": pdf_url,
                        "rebuild_embedding": st.session_state["pdf_change"],
                        "embedding_model": embedding_model,
                        "gpt_model": gpt_model,
                        "envs": {
                            "OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY"),
                        },
                    },
                )
        # load file
        else:
            _data = {
                "rebuild_embedding": st.session_state["pdf_change"],
                "embedding_model": embedding_model,
                "gpt_model": gpt_model,
                "envs": {
                    "OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY"),
                },
            }

            r = requests.post(
                f"{LCSERVE_HOST}/load_file",
                params={"input_data": json.dumps(_data)},
                files={"file": file},
            )
        if r.status_code != 200:
            if "error" in r.json():
                if "message" in r.json()["error"]:
                    return st.error(r.json()["error"]["message"])
            else:
                return str(r.json())
        elif r.json()["result"].startswith("Corpus Loaded."):
            st.session_state["loaded"] = True
            st.session_state["pdf_change"] = False
            # extract result
            result = result_to_dict(r, 1)

            # concatenate reply
            reply_summary = (
                "Hello there. I'm **pdfGPT-chat**.\n"
                "Here is a **summary** of your PDF:\n\n"
            )
            reply_summary += result["answer"]
            reply_summary += "\n\nDo you have any **question** about your PDF?"

            if len(st.session_state["past"]) == 1:
                st.session_state["generated"][0] = reply_summary
            else:
                st.session_state["past"].append("Hi")
                st.session_state["generated"].append(reply_summary)

            # calculate cost
            calculate_cost(result["token_used"], result["gpt_model"])
            return st.success("The PDF file has been loaded.")
        else:
            return st.info(r.json()["result"])

    def generate_response(
        lcserve_host: str,
        url: str,
        file: _TemporaryFileWrapper,
        question: str,
        openai_key: str,
    ) -> dict:
        if question.strip() == "":
            return "[ERROR]: Question field is empty"

        _data = {
            "question": question,
            "rebuild_embedding": st.session_state["pdf_change"],
            "embedding_model": embedding_model,
            "gpt_model": gpt_model,
            "envs": {
                "OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY"),
            },
        }

        if url.strip() != "":
            r = requests.post(
                f"{LCSERVE_HOST}/ask_url",
                json={"url": url, **_data},
            )
        else:
            r = requests.post(
                f"{LCSERVE_HOST}/ask_file",
                params={"input_data": json.dumps(_data)},
                files={"file": file},
            )

        if r.status_code != 200:
            content = r.content.decode()  # convert bytes to string
            # log the raw error without shadowing the uploaded `file`
            with open("langchainlog.txt", "w") as log_file:
                log_file.write(content)
            return f"[ERROR]: {r.text}"

        result_dict = result_to_dict(r, 0)
        return result_dict

    def calculate_cost(token_used, gpt_model):
        st.session_state["total_token"] += int(token_used)
        # price per 1K tokens; add only this call's cost so earlier
        # tokens are not billed again on every request
        rate = 0.002 if "gpt-3" in gpt_model else 0.06
        st.session_state["total_cost"] += int(token_used) * rate / 1000

    # %%
    # main page layout
    header = st.container()
    welcome_page = st.container()
    response_container = st.container()
    input_container = st.container()
    cost_container = st.container()
    load_pdf_popup = st.container()

    # sidebar layout
    input_details = st.sidebar.container()
    preferences = st.sidebar.container()
    chat_download = st.sidebar.container()

    # %%
    # instantiate session states
    if "api_key" not in st.session_state:
        st.session_state["api_key"] = False

    if "generated" not in st.session_state:
        st.session_state["generated"] = [
            "Hello there. I'm pdfGPT-chat. Do you have any question about your PDF?"
        ]

    if "loaded" not in st.session_state:
        st.session_state["loaded"] = False

    if "past" not in st.session_state:
        st.session_state["past"] = ["Hi"]

    if "pdf_change" not in st.session_state:
        st.session_state["pdf_change"] = True

    if "total_cost" not in st.session_state:
        st.session_state["total_cost"] = 0

    if "total_token" not in st.session_state:
        st.session_state["total_token"] = 0

    # %%
    # constants
    E5_URL = "https://github.com/microsoft/unilm/tree/master/e5"
    EMBEDDING_CHOICES = {
        "multilingual-e5-base": "Multilingual-E5 (default)",
        "e5-small-v2": "English-E5-small (faster)",
    }
    GPT_CHOICES = {
        "gpt-3.5-turbo": "GPT-3.5-turbo (default)",
        "gpt-4": "GPT-4 (smarter, costlier)",
    }
    LCSERVE_HOST = "http://localhost:8080"
    PDFGPT_URL = "https://github.com/bhaskatripathi/pdfGPT"
    SIGNATURE = """<style>
    .footer {
    position: static;
    left: 0;
    bottom: 0;
    width: 100%;
    background: rgba(0,0,0,0);
    text-align: center;
    }
    </style>

    <div class="footer">
    <p style='display: block;
    text-align: center;
    font-size:14px;
    color:darkgray'>Developed with ❤ by asyafiqe</p>
    </div>
    """

    with header:
        st.title(":page_facing_up: pdfGPT-chat")
        with st.expander(
            "A fork of [pdfGPT](%s) with several improvements. With pdfGPT-chat, you can chat with your PDF files using [**Microsoft E5 Multilingual Text Embeddings**](%s) and **OpenAI**."
            % (PDFGPT_URL, E5_URL)
        ):
            st.markdown(
                "Compared to other tools, pdfGPT-chat provides a **hallucination-free** response, thanks to its superior embeddings and tailored prompt.<br />"
                "The responses generated by pdfGPT-chat include **citations** in square brackets ([]), indicating the **page numbers** where the relevant information is found.<br />"
                "This feature not only enhances the credibility of the responses but also aids in swiftly locating the pertinent information within the PDF file.",
                unsafe_allow_html=True,
            )

        colored_header(
            label="",
            description="",
            color_name="blue-40",
        )

    with preferences:
        colored_header(
            label="",
            description="",
            color_name="blue-40",
        )
        st.write("**Preferences**")
        embedding_model = st.selectbox(
            "Embedding",
            EMBEDDING_CHOICES.keys(),
            help="""[Multilingual-E5](%s) supports 100 languages.
            E5-small is much faster and suitable for a PC without GPU."""
            % E5_URL,
            on_change=pdf_change,
            format_func=lambda x: EMBEDDING_CHOICES[x],
        )
        gpt_model = st.selectbox(
            "GPT Model",
            GPT_CHOICES.keys(),
            help="For GPT-4 you might have to join the waitlist: https://openai.com/waitlist/gpt-4-api",
            format_func=lambda x: GPT_CHOICES[x],
        )

    # %%
    # sidebar
    with input_details:
        st.title("Input details")
        OPENAI_URL = "https://platform.openai.com/account/api-keys"
        openai_key = st.text_input(
            ":key: Enter your OpenAI API key here",
            type="password",
            help="Get your OpenAI API key [here](%s)" % OPENAI_URL,
        )
        colored_header(
            label="",
            description="",
            color_name="blue-40",
        )

        pdf_url = st.text_input(
            ":globe_with_meridians: Enter PDF URL here", on_change=pdf_change
        )

        st.markdown(
            "<h2 style='text-align: center; color: black;'>OR</h2>",
            unsafe_allow_html=True,
        )

        file = st.file_uploader(
            ":page_facing_up: Upload your PDF / research paper / book here",
            type=["pdf"],
            on_change=pdf_change,
        )

        if st.button("Load PDF"):
            st.session_state["loaded"] = True
            with st.spinner("Loading PDF"):
                with load_pdf_popup:
                    load_pdf()

    # %%
    # main tab
    if st.session_state["loaded"]:
        with input_container:
            with st.form(key="input_form", clear_on_submit=True):
                user_input = st.text_area("Question:", key="input", height=100)
                submit_button = st.form_submit_button(label="Send")

            if user_input and submit_button:
                with st.spinner("Processing your question"):
                    response = generate_response(
                        LCSERVE_HOST,
                        pdf_url,
                        file,
                        user_input,
                        os.environ.get("OPENAI_API_KEY"),
                    )
                    st.session_state.past.append(user_input)
                    st.session_state.generated.append(response["answer"])

                    # calculate cost
                    calculate_cost(response["token_used"], response["gpt_model"])

            if not user_input and submit_button:
                st.error("Please write your question.")

        with response_container:
            if st.session_state["generated"]:
                for i in range(len(st.session_state["generated"])):
                    message(
                        st.session_state["past"][i], is_user=True, key=str(i) + "_user"
                    )
                    message(st.session_state["generated"][i], key=str(i))

                cost_container.caption(
                    f"Estimated cost: $ {st.session_state['total_cost']:.4f}"
                )

    else:
        with welcome_page:
            st.write("")
            st.subheader(
                """:arrow_left: To start, please fill in the input details in the sidebar and click **Load PDF**"""
            )
    # %%
    # placed at the end to include the last conversation
    with chat_download:
        chat_history = pd.DataFrame(
            {
                "Question": st.session_state["past"],
                "Answer": st.session_state["generated"],
            }
        )

        csv = convert_df(chat_history)

        st.download_button(
            label="Download chat history",
            data=csv,
            file_name="chat history.csv",
            mime="text/csv",
        )
        add_vertical_space(2)
        st.markdown(SIGNATURE, unsafe_allow_html=True)

    # %%
    # javascript: scroll halfway through the page
    js = f"""
    <script>
        function scroll() {{
            var textAreas = parent.document.querySelectorAll('section.main');
            var halfwayScroll = 0.4 * textAreas[0].scrollHeight; // calculate halfway scroll position

            for (let index = 0; index < textAreas.length; index++) {{
                textAreas[index].scrollTop = halfwayScroll; // set scroll position to halfway
            }}
        }}

        scroll(); // call the scroll function
    </script>
    """
    st.components.v1.html(js)

    # reduce main top padding
    st.markdown(
        "<style>div.block-container{padding-top:1.5em;}</style>",
        unsafe_allow_html=True,
    )
    # reduce sidebar top padding
    st.markdown(
        "<style>.css-ysnqb2.e1g8pov64 {margin-top: -90px;}</style>",
        unsafe_allow_html=True,
    )


if __name__ == "__main__":
    main()
docker-compose.yaml
ADDED
version: '3'

services:
  langchain-serve:
    build:
      context: .
      target: langchain-serve-img
    ports:
      - '8080:8080'
  pdf-gpt:
    build:
      context: .
      # must match the stage name defined in the Dockerfile
      target: pdfgpt-chat-img
    ports:
      - '7860:7860'
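The two services are meant to come up together via `docker compose up`. A small smoke-test sketch, assuming both containers run on localhost with the ports mapped above (the health URL is the same one the Dockerfile HEALTHCHECK uses):

import requests

def smoke_test():
    # Streamlit's built-in health endpoint (also used by the Dockerfile HEALTHCHECK)
    app_ok = requests.get("http://localhost:7860/_stcore/health", timeout=5).ok
    # any HTTP response from port 8080 proves the lc-serve container is listening
    try:
        requests.post("http://localhost:8080/load_url", json={}, timeout=5)
        api_ok = True
    except requests.ConnectionError:
        api_ok = False
    print(f"app: {app_ok}, api: {api_ok}")

if __name__ == "__main__":
    smoke_test()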
intfloat/e5-small-v2/config.json
ADDED
{
  "_name_or_path": "tmp/",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.29.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}
intfloat/e5-small-v2/pytorch_model.bin
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:4790fed2919e70bff573d01cd3aede75970f219ab4c0b0aeadd0f4b98084a17d
size 133508397
intfloat/e5-small-v2/special_tokens_map.json
ADDED
{
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
intfloat/e5-small-v2/tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
intfloat/e5-small-v2/tokenizer_config.json
ADDED
{
  "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "do_basic_tokenize": true,
  "do_lower_case": true,
  "mask_token": "[MASK]",
  "model_max_length": 1000000000000000019884624838656,
  "never_split": null,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "unk_token": "[UNK]"
}
intfloat/e5-small-v2/vocab.txt
ADDED
The diff for this file is too large to render. See raw diff.
intfloat/multilingual-e5-base/README.md
ADDED
The diff for this file is too large to render. See raw diff.
intfloat/multilingual-e5-base/config.json
ADDED
{
  "_name_or_path": "tmp/",
  "architectures": [
    "XLMRobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.29.0.dev0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}
intfloat/multilingual-e5-base/gitattributes
ADDED
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
intfloat/multilingual-e5-base/pytorch_model.bin
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:f061cb7641880f52895cbacab7c4ab39b0844e2e6b73794f2798de460d9fa418
size 1112242989
intfloat/multilingual-e5-base/sentencepiece.bpe.model
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
size 5069051
intfloat/multilingual-e5-base/special_tokens_map.json
ADDED
{
  "bos_token": "<s>",
  "cls_token": "<s>",
  "eos_token": "</s>",
  "mask_token": {
    "content": "<mask>",
    "lstrip": true,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "unk_token": "<unk>"
}
intfloat/multilingual-e5-base/tokenizer_config.json
ADDED
{
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": true,
  "cls_token": "<s>",
  "eos_token": "</s>",
  "mask_token": {
    "__type": "AddedToken",
    "content": "<mask>",
    "lstrip": true,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "model_max_length": 512,
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "tokenizer_class": "XLMRobertaTokenizer",
  "unk_token": "<unk>"
}
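These vendored directories are why SemanticSearch can call from_pretrained(f"intfloat/{embedding_model}") without network access: when the process runs from the repo root (WORKDIR /app, which contains the copied intfloat/ directory), transformers resolves the name as a local path instead of downloading from the Hub. A minimal sketch, assuming it is run from that directory:

from transformers import AutoModel, AutoTokenizer

# resolves to the local intfloat/e5-small-v2/ directory committed above
tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-small-v2")
model = AutoModel.from_pretrained("intfloat/e5-small-v2")

# E5 models expect a "query: " or "passage: " prefix, as api.py adds
batch = tokenizer(["query: what is this pdf about?"], return_tensors="pt")
outputs = model(**batch)  # last_hidden_state is average-pooled in api.py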
requirements_api.txt
ADDED
fastapi==0.96.0
langchain_serve==0.0.41
openai==0.27.7
optimum==1.8.6
PyMuPDF==1.22.3
scikit_learn==1.0.2
transformers==4.29.2
requirements_app.txt
ADDED
pandas==2.0.2
streamlit==1.23.1
streamlit_chat==0.0.2.2
streamlit_extras==0.2.7
requirements_pytorch.txt
ADDED
--extra-index-url https://download.pytorch.org/whl/cu117
torch
torchvision
torchaudio