#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
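"""Table chunker: each row of an Excel/CSV(TXT) table becomes one retrievable
chunk. Column types are inferred per column, and column headers are mapped to
typed index fields via their pinyin transliteration."""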
import copy
import re
from io import BytesIO

import numpy as np
import pandas as pd
from dateutil.parser import parse as datetime_parse
from openpyxl import load_workbook
from xpinyin import Pinyin

from api.db.services.knowledgebase_service import KnowledgebaseService
from deepdoc.parser import ExcelParser
from rag.nlp import rag_tokenizer, is_english, tokenize, find_codec


class Excel(ExcelParser):
    def __call__(self, fnm, binary=None, from_page=0,
                 to_page=10000000000, callback=None):
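        """Parse every sheet of a workbook into a pandas DataFrame.

        The first row of each sheet is taken as the header; columns whose
        header cell is empty are dropped, and rows whose width does not match
        the header are skipped and reported as failures. Returns one DataFrame
        per non-empty sheet. A minimal usage sketch (assuming a local file
        "contacts.xlsx" with a header row):

            dfs = Excel()("contacts.xlsx", callback=lambda prog, msg="": None)
        """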
        if not binary:
            wb = load_workbook(fnm)
        else:
            wb = load_workbook(BytesIO(binary))

        total = 0
        for sheetname in wb.sheetnames:
            total += len(list(wb[sheetname].rows))

        res, fails, done = [], [], 0
        rn = 0
        for sheetname in wb.sheetnames:
            ws = wb[sheetname]
            rows = list(ws.rows)
            if not rows:
                continue
            headers = [cell.value for cell in rows[0]]
            missed = set([i for i, h in enumerate(headers) if h is None])
            headers = [
                cell.value for i, cell in enumerate(rows[0])
                if i not in missed]
            if not headers:
                continue
            data = []
            for i, r in enumerate(rows[1:]):
                rn += 1
                if rn - 1 < from_page:
                    continue
                if rn - 1 >= to_page:
                    break
                row = [
                    cell.value for ii, cell in enumerate(r)
                    if ii not in missed]
                if len(row) != len(headers):
                    fails.append(str(i))
                    continue
                data.append(row)
                done += 1
            # guard: building a DataFrame from np.array([]) with named
            # columns would raise when the page range skips a whole sheet
            if not data:
                continue
            res.append(pd.DataFrame(np.array(data), columns=headers))
callback(0.3, ("Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + ( | |
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else ""))) | |
return res | |


def trans_datatime(s):
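    """Best-effort conversion of a date-like string to "YYYY-MM-DD HH:MM:SS".

    Returns None when dateutil cannot parse the input. A doctest-style
    sketch with a made-up date:

    >>> trans_datatime("2021/3/5 14:30")
    '2021-03-05 14:30:00'
    """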
    try:
        return datetime_parse(s.strip()).strftime("%Y-%m-%d %H:%M:%S")
    except Exception:
        return None


def trans_bool(s):
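    """Map multilingual truthy/falsy markers (English, Chinese, check marks)
    to "yes"/"no". Unrecognized values fall through and return None, e.g.:

    >>> trans_bool("是"), trans_bool("×"), trans_bool("maybe")
    ('yes', 'no', None)
    """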
if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$", | |
str(s).strip(), flags=re.IGNORECASE): | |
return "yes" | |
if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE): | |
return "no" | |


def column_data_type(arr):
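    """Infer a column's dominant type by majority vote over its cells, then
    coerce every cell to that type; cells that fail coercion become None.
    A sketch of the voting behavior:

    >>> column_data_type(["1", "2", "x"])
    ([1, 2, None], 'int')
    """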
    arr = list(arr)
    uni = len(set([a for a in arr if a is not None]))
    counts = {"int": 0, "float": 0, "text": 0, "datetime": 0, "bool": 0}
    trans = {t: f for f, t in
             [(int, "int"), (float, "float"), (trans_datatime, "datetime"),
              (trans_bool, "bool"), (str, "text")]}
    for a in arr:
        if a is None:
            continue
        if re.match(r"[+-]?[0-9]+(\.0+)?$", str(a).replace("%%", "")):
            counts["int"] += 1
        elif re.match(r"[+-]?[0-9.]+$", str(a).replace("%%", "")):
            counts["float"] += 1
        elif re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√|false|no|否|⍻|×)$",
                      str(a), flags=re.IGNORECASE):
            counts["bool"] += 1
        elif trans_datatime(str(a)):
            counts["datetime"] += 1
        else:
            counts["text"] += 1
    counts = sorted(counts.items(), key=lambda x: x[1] * -1)
    ty = counts[0][0]
    for i in range(len(arr)):
        if arr[i] is None:
            continue
        try:
            arr[i] = trans[ty](str(arr[i]))
        except Exception:
            arr[i] = None
    # if ty == "text":
    #     if len(arr) > 128 and uni / len(arr) < 0.1:
    #         ty = "keyword"
    return arr, ty


def chunk(filename, binary=None, from_page=0, to_page=10000000000,
          lang="Chinese", callback=None, **kwargs):
""" | |
Excel and csv(txt) format files are supported. | |
For csv or txt file, the delimiter between columns is TAB. | |
The first line must be column headers. | |
Column headers must be meaningful terms inorder to make our NLP model understanding. | |
It's good to enumerate some synonyms using slash '/' to separate, and even better to | |
enumerate values using brackets like 'gender/sex(male, female)'. | |
Here are some examples for headers: | |
1. supplier/vendor\tcolor(yellow, red, brown)\tgender/sex(male, female)\tsize(M,L,XL,XXL) | |
2. 姓名/名字\t电话/手机/微信\t最高学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA) | |
Every row in table will be treated as a chunk. | |
""" | |
if re.search(r"\.xlsx?$", filename, re.IGNORECASE): | |
callback(0.1, "Start to parse.") | |
excel_parser = Excel() | |
dfs = excel_parser( | |
filename, | |
binary, | |
from_page=from_page, | |
to_page=to_page, | |
callback=callback) | |
elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE): | |
callback(0.1, "Start to parse.") | |
txt = "" | |
if binary: | |
encoding = find_codec(binary) | |
txt = binary.decode(encoding, errors="ignore") | |
else: | |
with open(filename, "r") as f: | |
while True: | |
l = f.readline() | |
if not l: | |
break | |
txt += l | |
        lines = txt.split("\n")
        fails = []
        headers = lines[0].split(kwargs.get("delimiter", "\t"))
        rows = []
        for i, line in enumerate(lines[1:]):
            if i < from_page:
                continue
            if i >= to_page:
                break
            row = line.split(kwargs.get("delimiter", "\t"))
            if len(row) != len(headers):
                fails.append(str(i))
                continue
            rows.append(row)

        callback(0.3, ("Extract records: {}~{}".format(from_page, min(len(lines), to_page)) + (
            " {} failures at lines: {}...".format(len(fails), ",".join(fails[:3])) if fails else "")))

        dfs = [pd.DataFrame(np.array(rows), columns=headers)]
    else:
        raise NotImplementedError(
            "file type not supported yet (excel, text, csv supported)")
    res = []
    PY = Pinyin()
    fields_map = {
        "text": "_tks",
        "int": "_long",
        "keyword": "_kwd",
        "float": "_flt",
        "datetime": "_dt",
        "bool": "_kwd"}
    for df in dfs:
        for n in ["id", "_id", "index", "idx"]:
            if n in df.columns:
                del df[n]
        clmns = df.columns.values
        txts = list(copy.deepcopy(clmns))
        # strip synonym ("/...") and enumeration ("(...)", full- or
        # half-width parentheses) suffixes before transliterating to pinyin
        py_clmns = [
            PY.get_pinyins(
                re.sub(
                    r"(/.*|（[^（）]+?）|\([^()]+?\))",
                    "",
                    str(n)),
                '_')[0] for n in clmns]
        clmn_tys = []
        for j in range(len(clmns)):
            cln, ty = column_data_type(df[clmns[j]])
            clmn_tys.append(ty)
            df[clmns[j]] = cln
            if ty == "text":
                txts.extend([str(c) for c in cln if c])
        clmns_map = [(py_clmns[i].lower() + fields_map[clmn_tys[i]], str(clmns[i]).replace("_", " "))
                     for i in range(len(clmns))]
        eng = lang.lower() == "english"  # is_english(txts)
        for ii, row in df.iterrows():
            d = {
                "docnm_kwd": filename,
                "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
            }
            row_txt = []
            for j in range(len(clmns)):
                if row[clmns[j]] is None:
                    continue
                if not str(row[clmns[j]]):
                    continue
                if pd.isna(row[clmns[j]]):
                    continue
                fld = clmns_map[j][0]
                d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else rag_tokenizer.tokenize(
                    row[clmns[j]])
                row_txt.append("{}:{}".format(clmns[j], row[clmns[j]]))
            if not row_txt:
                continue
            tokenize(d, "; ".join(row_txt), eng)
            res.append(d)
        # kb_id is optional so the module can also run standalone (see __main__)
        if kwargs.get("kb_id"):
            KnowledgebaseService.update_parser_config(
                kwargs["kb_id"], {"field_map": dict(clmns_map)})
        callback(0.35, "")

    return res


if __name__ == "__main__":
    import sys

    def dummy(prog=None, msg=""):
        pass

    chunk(sys.argv[1], callback=dummy)