KevinHuSh committed
Commit · 365a2ed · 1 Parent(s): cfd888e

refine presentation parser (#110)

Files changed:
- api/apps/conversation_app.py +4 -1
- rag/app/presentation.py +8 -3
- rag/nlp/search.py +3 -3
api/apps/conversation_app.py
CHANGED

@@ -212,14 +212,17 @@ def chat(dialog, messages, **kwargs):
     if "max_tokens" in gen_conf:
         gen_conf["max_tokens"] = min(gen_conf["max_tokens"], llm.max_tokens - used_token_count)
     answer = chat_mdl.chat(prompt_config["system"].format(**kwargs), msg, gen_conf)
+    stat_logger.info("User: {}|Assistant: {}".format(msg[-1]["content"], answer))
 
     if knowledges:
-        answer = retrievaler.insert_citations(answer,
+        answer, idx = retrievaler.insert_citations(answer,
                                               [ck["content_ltks"] for ck in kbinfos["chunks"]],
                                               [ck["vector"] for ck in kbinfos["chunks"]],
                                               embd_mdl,
                                               tkweight=1 - dialog.vector_similarity_weight,
                                               vtweight=dialog.vector_similarity_weight)
+        idx = set([kbinfos["chunks"][int(i)]["doc_id"] for i in idx])
+        kbinfos["doc_aggs"] = [d for d in kbinfos["doc_aggs"] if d["doc_id"] in idx]
     for c in kbinfos["chunks"]:
         if c.get("vector"): del c["vector"]
     return {"answer": answer, "reference": kbinfos}
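The chat endpoint now logs each exchange and trims the reference list to documents that were actually cited: insert_citations returns the annotated answer together with the indices of the cited chunks, and those indices are mapped back to doc_ids. A minimal sketch of that filtering step; the chunk data, doc IDs, and cited index set below are invented stand-ins for real retrieval output:

# Toy retrieval output standing in for kbinfos["chunks"] / kbinfos["doc_aggs"].
chunks = [
    {"doc_id": "doc_a"},
    {"doc_id": "doc_b"},
    {"doc_id": "doc_c"},
]
doc_aggs = [
    {"doc_id": "doc_a", "doc_name": "a.pdf", "count": 2},
    {"doc_id": "doc_b", "doc_name": "b.pdf", "count": 1},
    {"doc_id": "doc_c", "doc_name": "c.pdf", "count": 1},
]
idx = {0, 2}  # chunk indices cited in the answer (invented for illustration)

# Same logic as the diff: map cited chunk indices to doc_ids, drop uncited docs.
cited = set(chunks[int(i)]["doc_id"] for i in idx)
doc_aggs = [d for d in doc_aggs if d["doc_id"] in cited]
assert [d["doc_id"] for d in doc_aggs] == ["doc_a", "doc_c"]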
rag/app/presentation.py
CHANGED

@@ -88,20 +88,25 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
     res = []
     if re.search(r"\.pptx?$", filename, re.IGNORECASE):
         ppt_parser = Ppt()
-        for txt,img in ppt_parser(filename if not binary else binary, from_page, 1000000, callback):
+        for pn, (txt,img) in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)):
             d = copy.deepcopy(doc)
+            pn += from_page
             d["image"] = img
-            tokenize(d, txt, eng)
+            d["page_num_int"] = [pn+1]
+            d["top_int"] = [0]
+            d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
+            tokenize(d, txt, eng)
             res.append(d)
         return res
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf()
         for pn, (txt,img) in enumerate(pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)):
             d = copy.deepcopy(doc)
+            pn += from_page
             d["image"] = img
             d["page_num_int"] = [pn+1]
             d["top_int"] = [0]
-            d["position_int"]
+            d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
             tokenize(d, txt, eng)
             res.append(d)
         return res
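Slides now carry the same positional metadata as PDF pages, offset by from_page so page numbers stay correct when parsing resumes mid-document. The position_int tuple reads as (page_number, left, right, top, bottom), one box covering the full slide image. A minimal sketch of how those values come together, assuming img is a Pillow image (the blank image and the concrete sizes are invented for illustration):

from PIL import Image  # assumption: slide images are Pillow images, as img.size suggests

img = Image.new("RGB", (960, 540))  # invented stand-in for a rendered slide

pn = 0           # zero-based index of the slide within this parsing run
from_page = 4    # invented offset: parsing resumed at page 4
pn += from_page  # same offset applied in the diff

# One box covering the whole slide: (page_number, left, right, top, bottom).
position_int = [(pn + 1, 0, img.size[0], 0, img.size[1])]
assert position_int == [(5, 0, 960, 0, 540)]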
rag/nlp/search.py
CHANGED

@@ -243,7 +243,7 @@ class Dealer:
             res += f" ##{c}$$"
             seted.add(c)
 
-        return res
+        return res, seted
 
     def rerank(self, sres, query, tkweight=0.3,
                vtweight=0.7, cfield="content_ltks"):

@@ -290,7 +290,7 @@
                 start_idx -= 1
                 if start_idx >= 0:
                     continue
-                if len(ranks["chunks"])
+                if len(ranks["chunks"]) >= page_size:
                     if aggs:
                         continue
                     break

@@ -322,7 +322,7 @@
             if dnm not in ranks["doc_aggs"]:
                 ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
             ranks["doc_aggs"][dnm]["count"] += 1
-        ranks["doc_aggs"] = [
+        ranks["doc_aggs"] = [{"doc_name": k, "doc_id": v["doc_id"], "count": v["count"]} for k,v in sorted(ranks["doc_aggs"].items(), key=lambda x:x[1]["count"]*-1)]
 
         return ranks
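The last hunk turns ranks["doc_aggs"] from a name-keyed dict into a list of records sorted by descending chunk count, which is the list form that conversation_app.py above filters by doc_id. A self-contained rendering of that conversion; the document names and counts are invented:

# Old shape: doc_name -> {"doc_id", "count"}.
doc_aggs = {
    "slides.pptx": {"doc_id": "d1", "count": 3},
    "notes.pdf": {"doc_id": "d2", "count": 5},
}

# Same conversion as the diff: flatten to records sorted by count, descending.
doc_aggs = [{"doc_name": k, "doc_id": v["doc_id"], "count": v["count"]}
            for k, v in sorted(doc_aggs.items(), key=lambda x: x[1]["count"] * -1)]
assert [d["doc_name"] for d in doc_aggs] == ["notes.pdf", "slides.pptx"]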