import json, os, sys, hashlib, copy, time, random, re, logging, torch
from os.path import dirname, realpath
sys.path.append(dirname(realpath(__file__)) + "/../")
from util.es_conn import HuEs
from util.db_conn import Postgres
from util.minio_conn import HuMinio
from util import rmSpace, findMaxDt
from FlagEmbedding import FlagModel
from nlp import huchunk, huqie, search
import base64
from io import BytesIO
import pandas as pd
from elasticsearch_dsl import Q
from parser import (
PdfParser,
DocxParser,
ExcelParser
)
from nlp.huchunk import (
PdfChunker,
DocxChunker,
ExcelChunker,
PptChunker,
TextChunker
)
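
# Shared service connections and one chunker instance per supported file type.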
ES = HuEs("infiniflow")
BATCH_SIZE = 64
PG = Postgres("infiniflow", "docgpt")
MINIO = HuMinio("infiniflow")
PDF = PdfChunker(PdfParser())
DOC = DocxChunker(DocxParser())
EXC = ExcelChunker(ExcelParser())
PPT = PptChunker()
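
# Pick a chunker by file extension; anything unrecognized falls back to the plain-text chunker.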
def chunk_doc(name, binary):
suff = os.path.split(name)[-1].lower().split(".")[-1]
if suff.find("pdf") >= 0: return PDF(binary)
if suff.find("doc") >= 0: return DOC(binary)
if re.match(r"(xlsx|xlsm|xltx|xltm)", suff): return EXC(binary)
if suff.find("ppt") >= 0: return PPT(binary)
return TextChunker()(binary)
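
# Poll Postgres for documents whose parsing progress is still 0, sharded across
# workers by MOD(did, comm) == mod and restricted to rows updated since `tm`.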
def collect(comm, mod, tm):
sql = f"""
select
id as kb2doc_id,
kb_id,
did,
updated_at,
is_deleted
from kb2_doc
where
updated_at >= '{tm}'
and kb_progress = 0
and MOD(did, {comm}) = {mod}
order by updated_at asc
limit 1000
"""
kb2doc = PG.select(sql)
if len(kb2doc) == 0:return pd.DataFrame()
sql = """
select
did,
uid,
doc_name,
location,
size
from doc_info
where
did in (%s)
"""%",".join([str(i) for i in kb2doc["did"].unique()])
docs = PG.select(sql)
docs = docs.fillna("")
docs = docs.join(kb2doc.set_index("did"), on="did", how="left")
mtm = str(docs["updated_at"].max())[:19]
print("TOTAL:", len(docs), "To: ", mtm)
return docs
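
# Record parsing progress (a fraction in [0, 1], or -1 on failure) and a status message in Postgres.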
def set_progress(kb2doc_id, prog, msg="Processing..."):
sql = f"""
update kb2_doc set kb_progress={prog}, kb_progress_msg='{msg}'
where
id={kb2doc_id}
"""
PG.update(sql)
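
# Turn one document row into a list of ES documents: fetch the file from MinIO,
# chunk it, tokenize the text, and store chunk images back to MinIO.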
def build(row):
if row["size"] > 256000000:
set_progress(row["kb2doc_id"], -1, "File size exceeds( <= 256Mb )")
return []
res = ES.search(Q("term", doc_id=row["did"]))
if ES.getTotal(res) > 0:
        ES.updateScriptByQuery(Q("term", doc_id=row["did"]),
                               scripts="""
                               if(!ctx._source.kb_id.contains('%s'))
                                   ctx._source.kb_id.add('%s');
                               """ % (str(row["kb_id"]), str(row["kb_id"])),
                               idxnm=search.index_name(row["uid"]))
set_progress(row["kb2doc_id"], 1, "Done")
return []
random.seed(time.time())
set_progress(row["kb2doc_id"], random.randint(0, 20)/100., "Finished preparing! Start to slice file!")
try:
        obj = chunk_doc(row["doc_name"], MINIO.get("%s-upload" % str(row["uid"]), row["location"]))
except Exception as e:
if re.search("(No such file|not found)", str(e)):
set_progress(row["kb2doc_id"], -1, "Can not find file <%s>"%row["doc_name"])
else:
set_progress(row["kb2doc_id"], -1, f"Internal system error: %s"%str(e).replace("'", ""))
return []
print(row["doc_name"], obj)
if not obj.text_chunks and not obj.table_chunks:
set_progress(row["kb2doc_id"], 1, "Nothing added! Mostly, file type unsupported yet.")
return []
set_progress(row["kb2doc_id"], random.randint(20, 60)/100., "Finished slicing files. Start to embedding the content.")
doc = {
"doc_id": row["did"],
"kb_id": [str(row["kb_id"])],
"docnm_kwd": os.path.split(row["location"])[-1],
"title_tks": huqie.qie(os.path.split(row["location"])[-1]),
"updated_at": str(row["updated_at"]).replace("T", " ")[:19]
}
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
    docs = []
    md5 = hashlib.md5()
for txt, img in obj.text_chunks:
d = copy.deepcopy(doc)
md5.update((txt + str(d["doc_id"])).encode("utf-8"))
d["_id"] = md5.hexdigest()
d["content_ltks"] = huqie.qie(txt)
d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
if not img:
docs.append(d)
continue
        # Use a fresh buffer per image; reusing one buffer would accumulate bytes across chunks.
        output_buffer = BytesIO()
        img.save(output_buffer, format='JPEG')
        MINIO.put("{}-{}".format(row["uid"], row["kb_id"]), d["_id"],
                  output_buffer.getvalue())
d["img_id"] = "{}-{}".format(row["uid"], row["kb_id"])
docs.append(d)
for arr, img in obj.table_chunks:
for i, txt in enumerate(arr):
d = copy.deepcopy(doc)
d["content_ltks"] = huqie.qie(txt)
md5.update((txt + str(d["doc_id"])).encode("utf-8"))
d["_id"] = md5.hexdigest()
if not img:
docs.append(d)
continue
            # Same as above: a fresh buffer per image avoids re-uploading earlier bytes.
            output_buffer = BytesIO()
            img.save(output_buffer, format='JPEG')
            MINIO.put("{}-{}".format(row["uid"], row["kb_id"]), d["_id"],
                      output_buffer.getvalue())
d["img_id"] = "{}-{}".format(row["uid"], row["kb_id"])
docs.append(d)
set_progress(row["kb2doc_id"], random.randint(60, 70)/100., "Continue embedding the content.")
return docs
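
# Create the user's index with the shared mapping if it does not exist yet.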
def init_kb(row):
idxnm = search.index_name(row["uid"])
if ES.indexExist(idxnm): return
    with open("conf/mapping.json", "r") as f:
        return ES.createIdx(idxnm, json.load(f))
model = None
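
# `model` is loaded in main(); title and content vectors are mixed 0.1/0.9 so the
# chunk text dominates the final vector while the title still contributes.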
def embedding(docs):
global model
tts = model.encode([rmSpace(d["title_tks"]) for d in docs])
cnts = model.encode([rmSpace(d["content_ltks"]) for d in docs])
vects = 0.1 * tts + 0.9 * cnts
assert len(vects) == len(docs)
    for i, d in enumerate(docs):
        d["q_vec"] = vects[i].tolist()
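
# Detach deleted documents from their knowledge base in ES, then drop the kb2_doc rows.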
def rm_doc_from_kb(df):
    if len(df) == 0:
        return
    for _, r in df.iterrows():
        ES.updateScriptByQuery(Q("term", doc_id=r["did"]),
                               scripts="""
                               if(ctx._source.kb_id.contains('%s'))
                                   ctx._source.kb_id.remove(
                                       ctx._source.kb_id.indexOf('%s')
                                   );
                               """ % (str(r["kb_id"]), str(r["kb_id"])),
                               idxnm=search.index_name(r["uid"]))
sql = """
delete from kb2_doc where id in (%s)
"""%",".join([str(i) for i in df["kb2doc_id"]])
PG.update(sql)
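
# One pass of the worker: load the embedding model, collect pending documents,
# build and embed their chunks, and bulk-index them into Elasticsearch.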
def main(comm, mod):
global model
from llm import HuEmbedding
model = HuEmbedding()
tm_fnm = f"res/{comm}-{mod}.tm"
tm = findMaxDt(tm_fnm)
rows = collect(comm, mod, tm)
    if len(rows) == 0:
        return
rm_doc_from_kb(rows.loc[rows.is_deleted == True])
rows = rows.loc[rows.is_deleted == False].reset_index(drop=True)
    if len(rows) == 0:
        return
tmf = open(tm_fnm, "a+")
for _, r in rows.iterrows():
cks = build(r)
if not cks:
tmf.write(str(r["updated_at"]) + "\n")
continue
## TODO: exception handler
## set_progress(r["did"], -1, "ERROR: ")
embedding(cks)
set_progress(r["kb2doc_id"], random.randint(70, 95)/100.,
"Finished embedding! Start to build index!")
init_kb(r)
es_r = ES.bulk(cks, search.index_name(r["uid"]))
if es_r:
set_progress(r["kb2doc_id"], -1, "Index failure!")
print(es_r)
else: set_progress(r["kb2doc_id"], 1., "Done!")
tmf.write(str(r["updated_at"]) + "\n")
tmf.close()
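
# Shard the workload across MPI ranks: each rank only picks up documents where
# MOD(did, world_size) == rank (see collect()).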
if __name__ == "__main__":
from mpi4py import MPI
comm = MPI.COMM_WORLD
main(comm.Get_size(), comm.Get_rank())