KevinHuSh committed on
Commit
365a2ed
·
1 Parent(s): cfd888e

refine presentation parser (#110)

Browse files
api/apps/conversation_app.py CHANGED
@@ -212,14 +212,17 @@ def chat(dialog, messages, **kwargs):
212
  if "max_tokens" in gen_conf:
213
  gen_conf["max_tokens"] = min(gen_conf["max_tokens"], llm.max_tokens - used_token_count)
214
  answer = chat_mdl.chat(prompt_config["system"].format(**kwargs), msg, gen_conf)
 
215
 
216
  if knowledges:
217
- answer = retrievaler.insert_citations(answer,
218
  [ck["content_ltks"] for ck in kbinfos["chunks"]],
219
  [ck["vector"] for ck in kbinfos["chunks"]],
220
  embd_mdl,
221
  tkweight=1 - dialog.vector_similarity_weight,
222
  vtweight=dialog.vector_similarity_weight)
 
 
223
  for c in kbinfos["chunks"]:
224
  if c.get("vector"): del c["vector"]
225
  return {"answer": answer, "reference": kbinfos}
 
212
  if "max_tokens" in gen_conf:
213
  gen_conf["max_tokens"] = min(gen_conf["max_tokens"], llm.max_tokens - used_token_count)
214
  answer = chat_mdl.chat(prompt_config["system"].format(**kwargs), msg, gen_conf)
215
+ stat_logger.info("User: {}|Assistant: {}".format(msg[-1]["content"], answer))
216
 
217
  if knowledges:
218
+ answer, idx = retrievaler.insert_citations(answer,
219
  [ck["content_ltks"] for ck in kbinfos["chunks"]],
220
  [ck["vector"] for ck in kbinfos["chunks"]],
221
  embd_mdl,
222
  tkweight=1 - dialog.vector_similarity_weight,
223
  vtweight=dialog.vector_similarity_weight)
224
+ idx = set([kbinfos["chunks"][int(i)]["doc_id"] for i in idx])
225
+ kbinfos["doc_aggs"] = [d for d in kbinfos["doc_aggs"] if d["doc_id"] in idx]
226
  for c in kbinfos["chunks"]:
227
  if c.get("vector"): del c["vector"]
228
  return {"answer": answer, "reference": kbinfos}
rag/app/presentation.py CHANGED
@@ -88,20 +88,25 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
88
  res = []
89
  if re.search(r"\.pptx?$", filename, re.IGNORECASE):
90
  ppt_parser = Ppt()
91
- for txt,img in ppt_parser(filename if not binary else binary, from_page, 1000000, callback):
92
  d = copy.deepcopy(doc)
 
93
  d["image"] = img
94
- tokenize(d, txt, ppt_parser.is_english)
 
 
 
95
  res.append(d)
96
  return res
97
  elif re.search(r"\.pdf$", filename, re.IGNORECASE):
98
  pdf_parser = Pdf()
99
  for pn, (txt,img) in enumerate(pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)):
100
  d = copy.deepcopy(doc)
 
101
  d["image"] = img
102
  d["page_num_int"] = [pn+1]
103
  d["top_int"] = [0]
104
- d["position_int"].append((pn + 1, 0, img.size[0], 0, img.size[1]))
105
  tokenize(d, txt, eng)
106
  res.append(d)
107
  return res
 
88
  res = []
89
  if re.search(r"\.pptx?$", filename, re.IGNORECASE):
90
  ppt_parser = Ppt()
91
+ for pn, (txt,img) in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)):
92
  d = copy.deepcopy(doc)
93
+ pn += from_page
94
  d["image"] = img
95
+ d["page_num_int"] = [pn+1]
96
+ d["top_int"] = [0]
97
+ d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
98
+ tokenize(d, txt, eng)
99
  res.append(d)
100
  return res
101
  elif re.search(r"\.pdf$", filename, re.IGNORECASE):
102
  pdf_parser = Pdf()
103
  for pn, (txt,img) in enumerate(pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)):
104
  d = copy.deepcopy(doc)
105
+ pn += from_page
106
  d["image"] = img
107
  d["page_num_int"] = [pn+1]
108
  d["top_int"] = [0]
109
+ d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
110
  tokenize(d, txt, eng)
111
  res.append(d)
112
  return res
rag/nlp/search.py CHANGED
@@ -243,7 +243,7 @@ class Dealer:
243
  res += f" ##{c}$$"
244
  seted.add(c)
245
 
246
- return res
247
 
248
  def rerank(self, sres, query, tkweight=0.3,
249
  vtweight=0.7, cfield="content_ltks"):
@@ -290,7 +290,7 @@ class Dealer:
290
  start_idx -= 1
291
  if start_idx >= 0:
292
  continue
293
- if len(ranks["chunks"]) == page_size:
294
  if aggs:
295
  continue
296
  break
@@ -322,7 +322,7 @@ class Dealer:
322
  if dnm not in ranks["doc_aggs"]:
323
  ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
324
  ranks["doc_aggs"][dnm]["count"] += 1
325
- ranks["doc_aggs"] = []#[{"doc_name": k, "doc_id": v["doc_id"], "count": v["count"]} for k,v in sorted(ranks["doc_aggs"].items(), key=lambda x:x[1]["count"]*-1)]
326
 
327
  return ranks
328
 
 
243
  res += f" ##{c}$$"
244
  seted.add(c)
245
 
246
+ return res, seted
247
 
248
  def rerank(self, sres, query, tkweight=0.3,
249
  vtweight=0.7, cfield="content_ltks"):
 
290
  start_idx -= 1
291
  if start_idx >= 0:
292
  continue
293
+ if len(ranks["chunks"]) >= page_size:
294
  if aggs:
295
  continue
296
  break
 
322
  if dnm not in ranks["doc_aggs"]:
323
  ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
324
  ranks["doc_aggs"][dnm]["count"] += 1
325
+ ranks["doc_aggs"] = [{"doc_name": k, "doc_id": v["doc_id"], "count": v["count"]} for k,v in sorted(ranks["doc_aggs"].items(), key=lambda x:x[1]["count"]*-1)]
326
 
327
  return ranks
328