Afritz committed on
Commit
8f1e175
·
1 Parent(s): 17cae0c

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +168 -33
utils.py CHANGED
@@ -67,7 +67,7 @@ def make_html_source(paragraph, meta_doc, i):
67
  return f"""
68
  <div class="card" id="document-{i}">
69
  <div class="card-content">
70
- <h2>Doc {i} - {meta_doc['short_name']} - Page {meta_paragraph['page_number']}</h2>
71
  <p>{content}</p>
72
  </div>
73
  <div class="card-footer">
@@ -79,6 +79,26 @@ def make_html_source(paragraph, meta_doc, i):
79
  </div>
80
  """
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
  def preprocess_message(text: str, docs_url: dict) -> str:
84
  return re.sub(
@@ -108,7 +128,6 @@ def num_tokens_from_string(string: str, encoding_name: str) -> int:
108
  def chat(
109
  query: str,
110
  history: list,
111
- query_mode : str = 'HYDE',
112
  threshold: float = CFG_APP.THRESHOLD,
113
  k_total: int = CFG_APP.K_TOTAL,
114
  ) -> tuple:
@@ -121,25 +140,16 @@ def chat(
121
  Yields:
122
  tuple: chat gradio format, chat openai format, sources used.
123
  """
124
- if query_mode == 'Reformulation':
125
-
126
- reformulated_query = openai.ChatCompletion.create(
127
- model=CFG_APP.MODEL_NAME,
128
- messages=get_reformulation_prompt(parse_glossary(query)),
129
- temperature=0,
130
- max_tokens=CFG_APP.MAX_TOKENS_REF_QUESTION,
131
- )
132
 
133
- else :
134
-
135
- reformulated_query = openai.ChatCompletion.create(
136
- model=CFG_APP.MODEL_NAME,
137
- messages=get_hyde_prompt(parse_glossary(query)),
138
- temperature=0,
139
- max_tokens=CFG_APP.MAX_TOKENS_REF_QUESTION,
140
- )
141
 
142
  reformulated_query = reformulated_query["choices"][0]["message"]["content"]
 
143
  if len(reformulated_query.split("\n")) == 2:
144
  reformulated_query, language = reformulated_query.split("\n")
145
  language = language.split(":")[1].strip()
@@ -152,21 +162,21 @@ def chat(
152
  k_total=k_total,
153
  threshold=threshold,
154
  )
 
155
  if CFG_APP.DEBUG == True:
156
  print("Scores : \n", scores)
157
 
158
  messages = history + [{"role": "user", "content": query}]
159
 
160
- if query_mode is None or query_mode == 'HYDE' :
161
- reformulated_query = reformulated_query.split("?")[0] + '?'
162
-
163
  docs_url = defaultdict(str)
164
 
165
  if len(sources) > 0:
166
  docs_string = []
167
  docs_html = []
 
168
 
169
  num_tokens = num_tokens_from_string(CFG_APP.SOURCES_PROMPT, CFG_APP.MODEL_NAME)
 
170
 
171
  for i, data in enumerate(sources, 1):
172
  meta_doc = retrieve_doc_metadata(doc_metadata, data["meta"]["document_id"])
@@ -176,17 +186,26 @@ def chat(
176
  break
177
  num_tokens += num_tokens_doc
178
  docs_string.append(doc_content)
 
 
 
 
 
 
 
 
 
179
  docs_html.append(make_html_source(data, meta_doc, i))
180
 
181
  url_doc = f'<a href="{meta_doc["url"]}#page={data["meta"]["page_number"]}" target="_blank" class="pdf-link">'
182
  docs_url[i] = url_doc
183
 
184
- docs_string = "\n\n".join(
185
- [f"Query used for retrieval:\n{reformulated_query}"] + docs_string
186
- )
187
- docs_html = "\n\n".join(
188
- [f"Query used for retrieval:\n{reformulated_query}"] + docs_html
189
- )
190
  messages.append(
191
  {
192
  "role": "system",
@@ -219,7 +238,7 @@ def chat(
219
  {"role": "user", "content": reformulated_query},
220
  {
221
  "role": "system",
222
- "content": f"{CFG_APP.SOURCES_PROMPT}\n\n{docs_string}\n\nAnswer in {language}:",
223
  },
224
  ],
225
  temperature=0, # deterministic
@@ -239,8 +258,124 @@ def chat(
239
  yield gradio_format, messages, docs_html
240
 
241
  else:
242
- docs_string = "⚠️ No relevant passages found in this report"
243
- complete_response = "**⚠️ No relevant passages found in this report, you may want to ask a more specific question.**"
244
- messages.append({"role": "assistant", "content": complete_response})
245
- gradio_format = make_pairs([a["content"] for a in messages[1:]])
246
- yield gradio_format, messages, docs_string
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  return f"""
68
  <div class="card" id="document-{i}">
69
  <div class="card-content">
70
+ <h2>Excerpts {i} - Document {meta_doc['num_doc']} - Page {meta_paragraph['page_number']}</h2>
71
  <p>{content}</p>
72
  </div>
73
  <div class="card-footer">
 
79
  </div>
80
  """
81
 
82
+ def make_citations_source(citation_dic, query, Hyde: False):
83
+ citation_list = [f'Doc {values[0]} - {keys} (excerpts {values[1]})' for keys, values in citation_dic.items()]
84
+
85
+ html_output = '<div class="source">\n'
86
+ html_output += ' <div class="title">Sources</div>\n'
87
+ if Hyde :
88
+ html_output += f' <div>Query used for retrieval (with the HyDE technique after no response): {query}</div>\n'
89
+ else :
90
+ html_output += f' <div>Query used for retrieval: {query}</div>\n'
91
+ html_output += ' <br>\n'
92
+ html_output += ' <ul>\n'
93
+
94
+ for row in citation_list :
95
+ html_output += f'<li>{row}</li>'
96
+
97
+ html_output += ' </ul>\n'
98
+ html_output += '</div>\n'
99
+
100
+ return html_output
101
+
102
 
103
  def preprocess_message(text: str, docs_url: dict) -> str:
104
  return re.sub(
 
128
  def chat(
129
  query: str,
130
  history: list,
 
131
  threshold: float = CFG_APP.THRESHOLD,
132
  k_total: int = CFG_APP.K_TOTAL,
133
  ) -> tuple:
 
140
  Yields:
141
  tuple: chat gradio format, chat openai format, sources used.
142
  """
 
 
 
 
 
 
 
 
143
 
144
+ reformulated_query = openai.ChatCompletion.create(
145
+ model=CFG_APP.MODEL_NAME,
146
+ messages=get_reformulation_prompt(parse_glossary(query)),
147
+ temperature=0,
148
+ max_tokens=CFG_APP.MAX_TOKENS_REF_QUESTION,
149
+ )
 
 
150
 
151
  reformulated_query = reformulated_query["choices"][0]["message"]["content"]
152
+
153
  if len(reformulated_query.split("\n")) == 2:
154
  reformulated_query, language = reformulated_query.split("\n")
155
  language = language.split(":")[1].strip()
 
162
  k_total=k_total,
163
  threshold=threshold,
164
  )
165
+
166
  if CFG_APP.DEBUG == True:
167
  print("Scores : \n", scores)
168
 
169
  messages = history + [{"role": "user", "content": query}]
170
 
 
 
 
171
  docs_url = defaultdict(str)
172
 
173
  if len(sources) > 0:
174
  docs_string = []
175
  docs_html = []
176
+ citations = {}
177
 
178
  num_tokens = num_tokens_from_string(CFG_APP.SOURCES_PROMPT, CFG_APP.MODEL_NAME)
179
+ num_doc = 1
180
 
181
  for i, data in enumerate(sources, 1):
182
  meta_doc = retrieve_doc_metadata(doc_metadata, data["meta"]["document_id"])
 
186
  break
187
  num_tokens += num_tokens_doc
188
  docs_string.append(doc_content)
189
+
190
+ if meta_doc['short_name'] in citations.keys():
191
+ citations[meta_doc['short_name']][1] += f', {i}'
192
+ else :
193
+ citations[meta_doc['short_name']] = [num_doc, f'{i}']
194
+ num_doc += 1
195
+
196
+ meta_doc["num_doc"] = citations[meta_doc['short_name']][0]
197
+
198
  docs_html.append(make_html_source(data, meta_doc, i))
199
 
200
  url_doc = f'<a href="{meta_doc["url"]}#page={data["meta"]["page_number"]}" target="_blank" class="pdf-link">'
201
  docs_url[i] = url_doc
202
 
203
+ html_cit = [make_citations_source(citations, reformulated_query, Hyde=False)]
204
+
205
+ docs_string = "\n\n".join( [f"Query used for retrieval:\n{reformulated_query}"] + docs_string)
206
+
207
+ docs_html = "\n\n".join(html_cit + docs_html)
208
+
209
  messages.append(
210
  {
211
  "role": "system",
 
238
  {"role": "user", "content": reformulated_query},
239
  {
240
  "role": "system",
241
+ "content": f"{CFG_APP.SOURCES_PROMPT}\n\nVery important : Answer in {language}.\n\n{docs_string}:",
242
  },
243
  ],
244
  temperature=0, # deterministic
 
258
  yield gradio_format, messages, docs_html
259
 
260
  else:
261
+ reformulated_query = openai.ChatCompletion.create(
262
+ model=CFG_APP.MODEL_NAME,
263
+ messages=get_hyde_prompt(parse_glossary(query)),
264
+ temperature=0,
265
+ max_tokens=CFG_APP.MAX_TOKENS_REF_QUESTION,
266
+ )
267
+
268
+ reformulated_query = reformulated_query["choices"][0]["message"]["content"]
269
+
270
+ if len(reformulated_query.split("\n")) == 2:
271
+ reformulated_query, language = reformulated_query.split("\n")
272
+ language = language.split(":")[1].strip()
273
+ else:
274
+ reformulated_query = reformulated_query.split("\n")[0]
275
+ language = "English"
276
+
277
+ sources, scores = text_embedder.retrieve_faiss(
278
+ reformulated_query,
279
+ k_total=k_total,
280
+ threshold=threshold,
281
+ )
282
+
283
+ if CFG_APP.DEBUG == True:
284
+ print("Scores : \n", scores)
285
+
286
+ if len(sources) > 0 :
287
+ docs_string = []
288
+ docs_html = []
289
+ citations = {}
290
+
291
+ num_tokens = num_tokens_from_string(CFG_APP.SOURCES_PROMPT, CFG_APP.MODEL_NAME)
292
+
293
+ num_doc = 1
294
+
295
+ for i, data in enumerate(sources, 1):
296
+ meta_doc = retrieve_doc_metadata(doc_metadata, data["meta"]["document_id"])
297
+ doc_content = f"πŸ“ƒ Doc {i}: \n{data['content']}"
298
+ num_tokens_doc = num_tokens_from_string(doc_content, CFG_APP.MODEL_NAME)
299
+ if num_tokens + num_tokens_doc > CFG_APP.MAX_TOKENS_API:
300
+ break
301
+ num_tokens += num_tokens_doc
302
+ docs_string.append(doc_content)
303
+
304
+ if meta_doc['short_name'] in citations.keys():
305
+ citations[meta_doc['short_name']][1] += f', {i}'
306
+ else:
307
+ citations[meta_doc['short_name']] = [num_doc, f'{i}']
308
+ num_doc += 1
309
+
310
+ meta_doc["num_doc"] = citations[meta_doc['short_name']][0]
311
+
312
+ docs_html.append(make_html_source(data, meta_doc, i))
313
+
314
+ url_doc = f'<a href="{meta_doc["url"]}#page={data["meta"]["page_number"]}" target="_blank" class="pdf-link">'
315
+ docs_url[i] = url_doc
316
+
317
+ html_cit = [make_citations_source(citations, reformulated_query, Hyde=True)]
318
+
319
+ docs_string = "\n\n".join([f"Query used for retrieval:\n{reformulated_query}"] + docs_string)
320
+
321
+ docs_html = "\n\n".join(html_cit + docs_html)
322
+
323
+ messages.append(
324
+ {
325
+ "role": "system",
326
+ "content": f"{CFG_APP.SOURCES_PROMPT}\n\n{docs_string}\n\nAnswer in {language}:",
327
+ }
328
+ )
329
+
330
+ if CFG_APP.DEBUG == True:
331
+ print(f" πŸ‘¨β€πŸ’» question asked by the user : {query}")
332
+ print(f" πŸ•› time : {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
333
+
334
+ print(" πŸ”Œ messages sent to the API :")
335
+ api_messages = [
336
+ {"role": "system", "content": CFG_APP.INIT_PROMPT},
337
+ {"role": "user", "content": reformulated_query},
338
+ {
339
+ "role": "system",
340
+ "content": f"{CFG_APP.SOURCES_PROMPT}\n\nVery important : Answer in {language}.\n\n{docs_string}:",
341
+ },
342
+ ]
343
+ for message in api_messages:
344
+ print(
345
+ f"length : {len(message['content'])}, content : {message['content']}"
346
+ )
347
+
348
+ response = openai.ChatCompletion.create(
349
+ model=CFG_APP.MODEL_NAME,
350
+ messages=[
351
+ {"role": "system", "content": CFG_APP.INIT_PROMPT},
352
+ {"role": "user", "content": reformulated_query},
353
+ {
354
+ "role": "system",
355
+ "content": f"{CFG_APP.SOURCES_PROMPT}\n\nVery important : Answer in {language}.\n\n{docs_string}:",
356
+ },
357
+ ],
358
+ temperature=0, # deterministic
359
+ stream=True,
360
+ max_tokens=CFG_APP.MAX_TOKENS_ANSWER,
361
+ )
362
+ complete_response = ""
363
+ messages.pop()
364
+ messages.append({"role": "assistant", "content": complete_response})
365
+ for chunk in response:
366
+ chunk_message = chunk["choices"][0]["delta"].get("content")
367
+ if chunk_message:
368
+ complete_response += chunk_message
369
+ complete_response = preprocess_message(complete_response, docs_url)
370
+ messages[-1]["content"] = complete_response
371
+ gradio_format = make_pairs([a["content"] for a in messages[1:]])
372
+ yield gradio_format, messages, docs_html
373
+
374
+ else :
375
+ docs_string = "⚠️ No relevant passages found in this report"
376
+ complete_response = "**⚠️ No relevant passages found in this report, you may want to ask a more specific question.**"
377
+ messages.append({"role": "assistant", "content": complete_response})
378
+ gradio_format = make_pairs([a["content"] for a in messages[1:]])
379
+ yield gradio_format, messages, docs_string
380
+
381
+