Émile
committed on
Commit
·
5d01d7b
1
Parent(s):
8ad5a3d
Adding logs (encrypted)
Browse files
app.py
CHANGED
@@ -3,65 +3,22 @@ from haystack.utils import Secret
|
|
3 |
from haystack.components.builders.prompt_builder import PromptBuilder
|
4 |
from haystack.components.routers import ConditionalRouter
|
5 |
from haystack import Pipeline
|
6 |
-
|
7 |
-
from haystack.components.embedders import SentenceTransformersTextEmbedder #, SentenceTransformersDocumentEmbedder
|
8 |
-
# from haystack.components.preprocessors import DocumentSplitter
|
9 |
-
# from haystack.components.converters.txt import TextFileToDocument
|
10 |
-
# from haystack.components.preprocessors import DocumentCleaner
|
11 |
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
|
12 |
from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever
|
13 |
-
|
14 |
-
|
15 |
-
# from haystack.components.retrievers import InMemoryEmbeddingRetriever
|
16 |
|
17 |
import gradio as gr
|
18 |
|
19 |
embedding_model = "Alibaba-NLP/gte-multilingual-base"
|
20 |
|
21 |
|
22 |
-
########################
|
23 |
-
####### Indexing #######
|
24 |
-
########################
|
25 |
-
|
26 |
-
# Skipped: now using Chroma
|
27 |
-
|
28 |
-
# In memory version for now
|
29 |
-
# document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")
|
30 |
-
|
31 |
-
# converter = TextFileToDocument()
|
32 |
-
|
33 |
-
# cleaner = DocumentCleaner()
|
34 |
-
|
35 |
-
# splitter = DocumentSplitter(split_by="word", split_length=200, split_overlap=100)
|
36 |
-
|
37 |
-
# embedder = SentenceTransformersDocumentEmbedder(model=embedding_model,
|
38 |
-
# trust_remote_code=True)
|
39 |
-
|
40 |
-
# writer = DocumentWriter(document_store=document_store)
|
41 |
-
|
42 |
-
# indexing = Pipeline()
|
43 |
-
|
44 |
-
# indexing.add_component("converter", converter)
|
45 |
-
# indexing.add_component("cleaner", cleaner)
|
46 |
-
# indexing.add_component("splitter", splitter)
|
47 |
-
# indexing.add_component("embedder", embedder)
|
48 |
-
# indexing.add_component("writer", writer)
|
49 |
-
|
50 |
-
# indexing.connect("converter", "cleaner")
|
51 |
-
# indexing.connect("cleaner", "splitter")
|
52 |
-
# indexing.connect("splitter", "embedder")
|
53 |
-
# indexing.connect("embedder", "writer")
|
54 |
-
|
55 |
-
# indexing.run({"sources": ["knowledge-plain.txt"]})
|
56 |
-
|
57 |
-
|
58 |
-
# Chroma version (no support for overlaps in documents)
|
59 |
-
# document_store = ChromaDocumentStore(persist_path="vstore_4012")
|
60 |
-
|
61 |
document_store = ChromaDocumentStore(
|
62 |
persist_path="vstore_4012"
|
63 |
)
|
64 |
|
|
|
65 |
##################################
|
66 |
####### Answering pipeline #######
|
67 |
##################################
|
@@ -180,6 +137,61 @@ answer_query.connect("prompt_builder2", "llm2")
|
|
180 |
answer_query.warm_up()
|
181 |
|
182 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
183 |
##########################
|
184 |
####### Gradio app #######
|
185 |
##########################
|
@@ -200,6 +212,7 @@ def chat(message, history):
|
|
200 |
answer = results["router"]["no_answer"]
|
201 |
else:
|
202 |
answer = "Sorry, a mistake occured"
|
|
|
203 |
return answer
|
204 |
|
205 |
if __name__ == "__main__":
|
|
|
3 |
from haystack.components.builders.prompt_builder import PromptBuilder
|
4 |
from haystack.components.routers import ConditionalRouter
|
5 |
from haystack import Pipeline
|
6 |
+
from haystack.components.embedders import SentenceTransformersTextEmbedder
|
|
|
|
|
|
|
|
|
7 |
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
|
8 |
from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever
|
9 |
+
import rsa
|
10 |
+
from cryptography.fernet import Fernet
|
|
|
11 |
|
12 |
import gradio as gr
|
13 |
|
14 |
embedding_model = "Alibaba-NLP/gte-multilingual-base"
|
15 |
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
# Persistent Chroma vector store, stored on disk at ./vstore_4012.
# NOTE(review): presumably populated by a separate indexing run — the
# indexing pipeline in this file is commented out; confirm the store exists
# on disk before deployment.
document_store = ChromaDocumentStore(
    persist_path="vstore_4012"
)
|
20 |
|
21 |
+
|
22 |
##################################
|
23 |
####### Answering pipeline #######
|
24 |
##################################
|
|
|
137 |
answer_query.warm_up()
|
138 |
|
139 |
|
140 |
+
##########################
####### Logging ##########
##########################

# Prompt for a privacy-scrubbing LLM pass: the model is asked to rewrite the
# transcript with sensitive fields (email, phone, user name, ...) replaced by
# descriptive markers, while echoing the BEGINNING/END OF TEXT sentinels so
# the sanitised portion can be extracted reliably afterwards (see
# hide_sensitive_info below).
prompt_template_hide_info = """You are a privacy robot that specialise in hiding sensitive information in a text.
Your help will ensure that no user information gets leaked, so you are always happy to help.
You will be given a text, and your task is to remove any sensitive information, and replacing it with a descriptive marker.
Here are a few examples, but you should not restrict yourself to only those:
If the text contains an email address, you should replace it with a marker "<email>".
If the text contains a phone number, you should replace it with a marker "<phone>".
If the text contains the name of the user, you should replace it with a marker "<name>".
Ensure you distinguish when a name, email, etc is actually that of a public figure or company and is provided by the assistant and not the user: inthis case you should not hide it, as it it not sensible information.
The rest of the text should be copied IDENTICALLY, including the punctuation and formatting, and the beginning and end of the text in capital letters. Do not add or remove any other character.

BEGINNING OF TEXT
{{ message }}
END OF TEXT

Your response:
"""

# Renders the template above from a single `message` template variable.
prompt_builder_hide_info = PromptBuilder(template=prompt_template_hide_info)

# setup_generator is defined elsewhere in this file — presumably returns an
# LLM generator component for the given model name; TODO confirm.
llm_hide_info = setup_generator("gpt-4o-mini")

# Two-component pipeline: render the scrubbing prompt, then run the LLM on it.
pipe_hide_sensitive_info = Pipeline()
pipe_hide_sensitive_info.add_component("prompt_builder_hide_info", prompt_builder_hide_info)
pipe_hide_sensitive_info.add_component("llm_hide_info", llm_hide_info)
pipe_hide_sensitive_info.connect("prompt_builder_hide_info", "llm_hide_info")
|
169 |
+
|
170 |
+
def hide_sensitive_info(message):
    """Scrub sensitive information from *message* via the LLM pipeline.

    The model is instructed to echo the transcript between the sentinels
    "BEGINNING OF TEXT" / "END OF TEXT" with PII replaced by markers.
    Up to three attempts are made; the text between the first occurrence of
    each sentinel is returned, stripped of surrounding whitespace. If no
    reply ever contains both sentinels, a placeholder error string is
    returned so nothing un-scrubbed gets logged.
    """
    start_tag = "BEGINNING OF TEXT"
    end_tag = "END OF TEXT"
    for _attempt in range(3):
        reply = pipe_hide_sensitive_info.run({"message": message})["llm_hide_info"]["replies"][0]
        if start_tag in reply and end_tag in reply:
            begin = reply.find(start_tag) + len(start_tag)
            return reply[begin:reply.find(end_tag)].strip()
    return "[Error when hiding user info, no log generated]"
|
177 |
+
|
178 |
+
# RSA public key used to wrap the per-log Fernet keys in encripted_log().
# NOTE(review): the modulus appears to be only ~512 bits, which is factorable
# with modest resources today — these "encrypted" logs should not be treated
# as confidential. Consider regenerating with a >= 2048-bit key.
publicKey = rsa.key.PublicKey(12771964615703412689771875940203228650714115641982293999355383771918953923930646807301050027636271229722536699443183546682257910554154445751549366908636779, 65537)
|
179 |
+
|
180 |
+
def encripted_log(message: str):
    """Hybrid-encrypt *message* for append-only logging.

    A fresh symmetric Fernet key is generated per call, wrapped with the
    module-level RSA public key, and both ciphertexts are returned as a
    single hex string: RSA-wrapped key first, Fernet ciphertext second.
    Only the holder of the RSA private key can recover the session key and
    hence the log entry.
    """
    session_key = Fernet.generate_key()
    wrapped_key = rsa.encrypt(session_key, publicKey)
    payload = Fernet(session_key).encrypt(message.encode())
    return wrapped_key.hex() + payload.hex()
|
185 |
+
|
186 |
+
|
187 |
+
def log_QA(question, answer):
    """Append one encrypted, PII-scrubbed Q/A exchange to log.txt.

    The transcript is first passed through hide_sensitive_info() so personal
    data never reaches disk, then hybrid-encrypted via encripted_log().
    One hex-encoded line is appended per call.
    """
    transcript = f"User: {question}\nAssistant: {answer}"
    scrubbed = hide_sensitive_info(transcript)
    with open("log.txt", "a") as log_file:
        log_file.write(encripted_log(scrubbed) + "\n")
|
193 |
+
|
194 |
+
|
195 |
##########################
|
196 |
####### Gradio app #######
|
197 |
##########################
|
|
|
212 |
answer = results["router"]["no_answer"]
|
213 |
else:
|
214 |
answer = "Sorry, a mistake occured"
|
215 |
+
log_QA(message, answer)
|
216 |
return answer
|
217 |
|
218 |
if __name__ == "__main__":
|