liuhua
liuhua
committed on
Commit
·
3a77303
1
Parent(s):
9cfd69b
Fix bugs in chunk api (#4293)
Browse files
### What problem does this PR solve?
Fix bugs in chunk api #4149
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
---------
Co-authored-by: liuhua <[email protected]>
- api/apps/chunk_app.py +1 -1
- api/apps/sdk/doc.py +38 -41
- docs/references/http_api_reference.md +10 -8
api/apps/chunk_app.py
CHANGED
|
@@ -220,7 +220,7 @@ def create():
|
|
| 220 |
e, doc = DocumentService.get_by_id(req["doc_id"])
|
| 221 |
if not e:
|
| 222 |
return get_data_error_result(message="Document not found!")
|
| 223 |
-
d["kb_id"] =
|
| 224 |
d["docnm_kwd"] = doc.name
|
| 225 |
d["title_tks"] = rag_tokenizer.tokenize(doc.name)
|
| 226 |
d["doc_id"] = doc.id
|
|
|
|
| 220 |
e, doc = DocumentService.get_by_id(req["doc_id"])
|
| 221 |
if not e:
|
| 222 |
return get_data_error_result(message="Document not found!")
|
| 223 |
+
d["kb_id"] = doc.kb_id
|
| 224 |
d["docnm_kwd"] = doc.name
|
| 225 |
d["title_tks"] = rag_tokenizer.tokenize(doc.name)
|
| 226 |
d["doc_id"] = doc.id
|
api/apps/sdk/doc.py
CHANGED
|
@@ -847,59 +847,55 @@ def list_chunks(tenant_id, dataset_id, document_id):
|
|
| 847 |
renamed_doc["run"] = run_mapping.get(str(value))
|
| 848 |
|
| 849 |
res = {"total": 0, "chunks": [], "doc": renamed_doc}
|
| 850 |
-
|
| 851 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 852 |
sres = settings.retrievaler.search(query, search.index_name(tenant_id), [dataset_id], emb_mdl=None,
|
| 853 |
highlight=True)
|
| 854 |
res["total"] = sres.total
|
| 855 |
-
sign = 0
|
| 856 |
for id in sres.ids:
|
| 857 |
d = {
|
| 858 |
"id": id,
|
| 859 |
-
"
|
| 860 |
rmSpace(sres.highlight[id])
|
| 861 |
if question and id in sres.highlight
|
| 862 |
else sres.field[id].get("content_with_weight", "")
|
| 863 |
),
|
| 864 |
-
"
|
| 865 |
"docnm_kwd": sres.field[id]["docnm_kwd"],
|
| 866 |
-
"
|
| 867 |
-
"
|
| 868 |
-
"
|
| 869 |
-
"
|
| 870 |
-
"
|
|
|
|
| 871 |
}
|
| 872 |
-
|
| 873 |
-
|
| 874 |
-
if req.get("id") == id:
|
| 875 |
-
origin_chunks.clear()
|
| 876 |
-
origin_chunks.append(d)
|
| 877 |
-
sign = 1
|
| 878 |
-
break
|
| 879 |
-
if req.get("id"):
|
| 880 |
-
if sign == 0:
|
| 881 |
-
return get_error_data_result(f"Can't find this chunk {req.get('id')}")
|
| 882 |
-
|
| 883 |
-
for chunk in origin_chunks:
|
| 884 |
-
key_mapping = {
|
| 885 |
-
"id": "id",
|
| 886 |
-
"content_with_weight": "content",
|
| 887 |
-
"doc_id": "document_id",
|
| 888 |
-
"important_kwd": "important_keywords",
|
| 889 |
-
"question_kwd": "questions",
|
| 890 |
-
"img_id": "image_id",
|
| 891 |
-
"available_int": "available",
|
| 892 |
-
}
|
| 893 |
-
renamed_chunk = {}
|
| 894 |
-
for key, value in chunk.items():
|
| 895 |
-
new_key = key_mapping.get(key, key)
|
| 896 |
-
renamed_chunk[new_key] = value
|
| 897 |
-
if renamed_chunk["available"] == 0:
|
| 898 |
-
renamed_chunk["available"] = False
|
| 899 |
-
if renamed_chunk["available"] == 1:
|
| 900 |
-
renamed_chunk["available"] = True
|
| 901 |
-
res["chunks"].append(renamed_chunk)
|
| 902 |
-
_ = Chunk(**renamed_chunk) # validate the chunk
|
| 903 |
return get_result(data=res)
|
| 904 |
|
| 905 |
|
|
@@ -1377,6 +1373,7 @@ def retrieval_test(tenant_id):
|
|
| 1377 |
"important_kwd": "important_keywords",
|
| 1378 |
"question_kwd": "questions",
|
| 1379 |
"docnm_kwd": "document_keyword",
|
|
|
|
| 1380 |
}
|
| 1381 |
rename_chunk = {}
|
| 1382 |
for key, value in chunk.items():
|
|
|
|
| 847 |
renamed_doc["run"] = run_mapping.get(str(value))
|
| 848 |
|
| 849 |
res = {"total": 0, "chunks": [], "doc": renamed_doc}
|
| 850 |
+
if req.get("id"):
|
| 851 |
+
chunk = settings.docStoreConn.get(req.get("id"), search.index_name(tenant_id), [dataset_id])
|
| 852 |
+
k = []
|
| 853 |
+
for n in chunk.keys():
|
| 854 |
+
if re.search(r"(_vec$|_sm_|_tks|_ltks)", n):
|
| 855 |
+
k.append(n)
|
| 856 |
+
for n in k:
|
| 857 |
+
del chunk[n]
|
| 858 |
+
if not chunk:
|
| 859 |
+
return get_error_data_result(f"Chunk `{req.get('id')}` not found.")
|
| 860 |
+
res['total'] = 1
|
| 861 |
+
final_chunk = {
|
| 862 |
+
"id":chunk.get("id",chunk.get("chunk_id")),
|
| 863 |
+
"content":chunk["content_with_weight"],
|
| 864 |
+
"document_id":chunk.get("doc_id",chunk.get("document_id")),
|
| 865 |
+
"docnm_kwd":chunk["docnm_kwd"],
|
| 866 |
+
"important_keywords":chunk.get("important_kwd",[]),
|
| 867 |
+
"questions":chunk.get("question_kwd",[]),
|
| 868 |
+
"dataset_id":chunk.get("kb_id",chunk.get("dataset_id")),
|
| 869 |
+
"image_id":chunk["img_id"],
|
| 870 |
+
"available":bool(chunk.get("available_int",1)),
|
| 871 |
+
"positions":chunk.get("position_int",[]),
|
| 872 |
+
}
|
| 873 |
+
res["chunks"].append(final_chunk)
|
| 874 |
+
_ = Chunk(**final_chunk)
|
| 875 |
+
|
| 876 |
+
elif settings.docStoreConn.indexExist(search.index_name(tenant_id), dataset_id):
|
| 877 |
sres = settings.retrievaler.search(query, search.index_name(tenant_id), [dataset_id], emb_mdl=None,
|
| 878 |
highlight=True)
|
| 879 |
res["total"] = sres.total
|
|
|
|
| 880 |
for id in sres.ids:
|
| 881 |
d = {
|
| 882 |
"id": id,
|
| 883 |
+
"content": (
|
| 884 |
rmSpace(sres.highlight[id])
|
| 885 |
if question and id in sres.highlight
|
| 886 |
else sres.field[id].get("content_with_weight", "")
|
| 887 |
),
|
| 888 |
+
"document_id": sres.field[id]["doc_id"],
|
| 889 |
"docnm_kwd": sres.field[id]["docnm_kwd"],
|
| 890 |
+
"important_keywords": sres.field[id].get("important_kwd", []),
|
| 891 |
+
"questions": sres.field[id].get("question_kwd", []),
|
| 892 |
+
"dataset_id": sres.field[id].get("kb_id", sres.field[id].get("dataset_id")),
|
| 893 |
+
"image_id": sres.field[id].get("img_id", ""),
|
| 894 |
+
"available": bool(sres.field[id].get("available_int", 1)),
|
| 895 |
+
"positions": sres.field[id].get("position_int",[]),
|
| 896 |
}
|
| 897 |
+
res["chunks"].append(d)
|
| 898 |
+
_ = Chunk(**d) # validate the chunk
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 899 |
return get_result(data=res)
|
| 900 |
|
| 901 |
|
|
|
|
| 1373 |
"important_kwd": "important_keywords",
|
| 1374 |
"question_kwd": "questions",
|
| 1375 |
"docnm_kwd": "document_keyword",
|
| 1376 |
+
"kb_id":"dataset_id"
|
| 1377 |
}
|
| 1378 |
rename_chunk = {}
|
| 1379 |
for key, value in chunk.items():
|
docs/references/http_api_reference.md
CHANGED
|
@@ -927,7 +927,8 @@ curl --request POST \
|
|
| 927 |
The text content of the chunk.
|
| 928 |
- `"important_keywords"`(*Body parameter*), `list[string]`
|
| 929 |
The key terms or phrases to tag with the chunk.
|
| 930 |
-
|
|
|
|
| 931 |
#### Response
|
| 932 |
|
| 933 |
Success:
|
|
@@ -937,13 +938,14 @@ Success:
|
|
| 937 |
"code": 0,
|
| 938 |
"data": {
|
| 939 |
"chunk": {
|
| 940 |
-
"content": "
|
| 941 |
-
"create_time": "2024-
|
| 942 |
-
"create_timestamp":
|
| 943 |
-
"dataset_id": "
|
| 944 |
-
"document_id": "
|
| 945 |
-
"id": "
|
| 946 |
-
"important_keywords": []
|
|
|
|
| 947 |
}
|
| 948 |
}
|
| 949 |
}
|
|
|
|
| 927 |
The text content of the chunk.
|
| 928 |
- `"important_keywords"`(*Body parameter*), `list[string]`
|
| 929 |
The key terms or phrases to tag with the chunk.
|
| 930 |
+
- `"questions"`(*Body parameter*), `list[string]`
|
| 931 |
+
If questions are given, the chunk will be embedded based on them.
|
| 932 |
#### Response
|
| 933 |
|
| 934 |
Success:
|
|
|
|
| 938 |
"code": 0,
|
| 939 |
"data": {
|
| 940 |
"chunk": {
|
| 941 |
+
"content": "who are you",
|
| 942 |
+
"create_time": "2024-12-30 16:59:55",
|
| 943 |
+
"create_timestamp": 1735549195.969164,
|
| 944 |
+
"dataset_id": "72f36e1ebdf411efb7250242ac120006",
|
| 945 |
+
"document_id": "61d68474be0111ef98dd0242ac120006",
|
| 946 |
+
"id": "12ccdc56e59837e5",
|
| 947 |
+
"important_keywords": [],
|
| 948 |
+
"questions": []
|
| 949 |
}
|
| 950 |
}
|
| 951 |
}
|