Spaces:

retopara
/

ragflow

Build error

ragflow / python /svr /parse_user_docs.py

KevinHuSh

build dialog server; add thumbnail to docinfo; (#17)

3fc700a over 1 year ago

7.94 kB

	import json, os, sys, hashlib, copy, time, random, re, logging, torch
	from os.path import dirname, realpath
	sys.path.append(dirname(realpath(__file__)) + "/../")
	from util.es_conn import HuEs
	from util.db_conn import Postgres
	from util.minio_conn import HuMinio
	from util import rmSpace, findMaxDt
	from FlagEmbedding import FlagModel
	from nlp import huchunk, huqie, search
	import base64, hashlib
	from io import BytesIO
	import pandas as pd
	from elasticsearch_dsl import Q
	from parser import (
	PdfParser,
	DocxParser,
	ExcelParser
	)
	from nlp.huchunk import (
	PdfChunker,
	DocxChunker,
	ExcelChunker,
	PptChunker,
	TextChunker
	)

	ES = HuEs("infiniflow")
	BATCH_SIZE = 64
	PG = Postgres("infiniflow", "docgpt")
	MINIO = HuMinio("infiniflow")

	PDF = PdfChunker(PdfParser())
	DOC = DocxChunker(DocxParser())
	EXC = ExcelChunker(ExcelParser())
	PPT = PptChunker()

	def chuck_doc(name, binary):
	suff = os.path.split(name)[-1].lower().split(".")[-1]
	if suff.find("pdf") >= 0: return PDF(binary)
	if suff.find("doc") >= 0: return DOC(binary)
	if re.match(r"(xlsx\|xlsm\|xltx\|xltm)", suff): return EXC(binary)
	if suff.find("ppt") >= 0: return PPT(binary)

	return TextChunker()(binary)


	def collect(comm, mod, tm):
	sql = f"""
	select
	id as kb2doc_id,
	kb_id,
	did,
	updated_at,
	is_deleted
	from kb2_doc
	where
	updated_at >= '{tm}'
	and kb_progress = 0
	and MOD(did, {comm}) = {mod}
	order by updated_at asc
	limit 1000
	"""
	kb2doc = PG.select(sql)
	if len(kb2doc) == 0:return pd.DataFrame()

	sql = """
	select
	did,
	uid,
	doc_name,
	location,
	size
	from doc_info
	where
	did in (%s)
	"""%",".join([str(i) for i in kb2doc["did"].unique()])
	docs = PG.select(sql)
	docs = docs.fillna("")
	docs = docs.join(kb2doc.set_index("did"), on="did", how="left")

	mtm = str(docs["updated_at"].max())[:19]
	print("TOTAL:", len(docs), "To: ", mtm)
	return docs


	def set_progress(kb2doc_id, prog, msg="Processing..."):
	sql = f"""
	update kb2_doc set kb_progress={prog}, kb_progress_msg='{msg}'
	where
	id={kb2doc_id}
	"""
	PG.update(sql)


	def build(row):
	if row["size"] > 256000000:
	set_progress(row["kb2doc_id"], -1, "File size exceeds( <= 256Mb )")
	return []
	res = ES.search(Q("term", doc_id=row["did"]))
	if ES.getTotal(res) > 0:
	ES.updateScriptByQuery(Q("term", doc_id=row["did"]),
	scripts="""
	if(!ctx._source.kb_id.contains('%s'))
	ctx._source.kb_id.add('%s');
	"""%(str(row["kb_id"]), str(row["kb_id"])),
	idxnm = search.index_name(row["uid"])
	)
	set_progress(row["kb2doc_id"], 1, "Done")
	return []

	random.seed(time.time())
	set_progress(row["kb2doc_id"], random.randint(0, 20)/100., "Finished preparing! Start to slice file!")
	try:
	obj = chuck_doc(row["doc_name"], MINIO.get("%s-upload"%str(row["uid"]), row["location"]))
	except Exception as e:
	if re.search("(No such file\|not found)", str(e)):
	set_progress(row["kb2doc_id"], -1, "Can not find file <%s>"%row["doc_name"])
	else:
	set_progress(row["kb2doc_id"], -1, f"Internal system error: %s"%str(e).replace("'", ""))
	return []

	print(row["doc_name"], obj)
	if not obj.text_chunks and not obj.table_chunks:
	set_progress(row["kb2doc_id"], 1, "Nothing added! Mostly, file type unsupported yet.")
	return []

	set_progress(row["kb2doc_id"], random.randint(20, 60)/100., "Finished slicing files. Start to embedding the content.")

	doc = {
	"doc_id": row["did"],
	"kb_id": [str(row["kb_id"])],
	"docnm_kwd": os.path.split(row["location"])[-1],
	"title_tks": huqie.qie(os.path.split(row["location"])[-1]),
	"updated_at": str(row["updated_at"]).replace("T", " ")[:19]
	}
	doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
	output_buffer = BytesIO()
	docs = []
	md5 = hashlib.md5()
	for txt, img in obj.text_chunks:
	d = copy.deepcopy(doc)
	md5.update((txt + str(d["doc_id"])).encode("utf-8"))
	d["_id"] = md5.hexdigest()
	d["content_ltks"] = huqie.qie(txt)
	d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
	if not img:
	docs.append(d)
	continue
	img.save(output_buffer, format='JPEG')
	MINIO.put("{}-{}".format(row["uid"], row["kb_id"]), d["_id"],
	output_buffer.getvalue())
	d["img_id"] = "{}-{}".format(row["uid"], row["kb_id"])
	docs.append(d)

	for arr, img in obj.table_chunks:
	for i, txt in enumerate(arr):
	d = copy.deepcopy(doc)
	d["content_ltks"] = huqie.qie(txt)
	md5.update((txt + str(d["doc_id"])).encode("utf-8"))
	d["_id"] = md5.hexdigest()
	if not img:
	docs.append(d)
	continue
	img.save(output_buffer, format='JPEG')
	MINIO.put("{}-{}".format(row["uid"], row["kb_id"]), d["_id"],
	output_buffer.getvalue())
	d["img_id"] = "{}-{}".format(row["uid"], row["kb_id"])
	docs.append(d)
	set_progress(row["kb2doc_id"], random.randint(60, 70)/100., "Continue embedding the content.")

	return docs


	def init_kb(row):
	idxnm = search.index_name(row["uid"])
	if ES.indexExist(idxnm): return
	return ES.createIdx(idxnm, json.load(open("conf/mapping.json", "r")))


	model = None
	def embedding(docs):
	global model
	tts = model.encode([rmSpace(d["title_tks"]) for d in docs])
	cnts = model.encode([rmSpace(d["content_ltks"]) for d in docs])
	vects = 0.1 * tts + 0.9 * cnts
	assert len(vects) == len(docs)
	for i,d in enumerate(docs):d["q_vec"] = vects[i].tolist()


	def rm_doc_from_kb(df):
	if len(df) == 0:return
	for _,r in df.iterrows():
	ES.updateScriptByQuery(Q("term", doc_id=r["did"]),
	scripts="""
	if(ctx._source.kb_id.contains('%s'))
	ctx._source.kb_id.remove(
	ctx._source.kb_id.indexOf('%s')
	);
	"""%(str(r["kb_id"]),str(r["kb_id"])),
	idxnm = search.index_name(r["uid"])
	)
	if len(df) == 0:return
	sql = """
	delete from kb2_doc where id in (%s)
	"""%",".join([str(i) for i in df["kb2doc_id"]])
	PG.update(sql)


	def main(comm, mod):
	global model
	from llm import HuEmbedding
	model = HuEmbedding()
	tm_fnm = f"res/{comm}-{mod}.tm"
	tm = findMaxDt(tm_fnm)
	rows = collect(comm, mod, tm)
	if len(rows) == 0:return

	rm_doc_from_kb(rows.loc[rows.is_deleted == True])
	rows = rows.loc[rows.is_deleted == False].reset_index(drop=True)
	if len(rows) == 0:return
	tmf = open(tm_fnm, "a+")
	for _, r in rows.iterrows():
	cks = build(r)
	if not cks:
	tmf.write(str(r["updated_at"]) + "\n")
	continue
	## TODO: exception handler
	## set_progress(r["did"], -1, "ERROR: ")
	embedding(cks)

	set_progress(r["kb2doc_id"], random.randint(70, 95)/100.,
	"Finished embedding! Start to build index!")
	init_kb(r)
	es_r = ES.bulk(cks, search.index_name(r["uid"]))
	if es_r:
	set_progress(r["kb2doc_id"], -1, "Index failure!")
	print(es_r)
	else: set_progress(r["kb2doc_id"], 1., "Done!")
	tmf.write(str(r["updated_at"]) + "\n")
	tmf.close()


	if __name__ == "__main__":
	from mpi4py import MPI
	comm = MPI.COMM_WORLD
	main(comm.Get_size(), comm.Get_rank())