liuhua committed on
Commit eabf8a3 · 1 Parent(s): 5b9e61c

Fix issues in API (#3008)


### What problem does this PR solve?

Fix issues in API

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Co-authored-by: liuhua <[email protected]>

api/apps/sdk/dataset.py CHANGED
@@ -64,7 +64,12 @@ def create(tenant_id):
     if not req.get("embedding_model"):
         req['embedding_model'] = t.embd_id
     else:
-         if not TenantLLMService.query(tenant_id=tenant_id,model_type="embedding", llm_name=req.get("embedding_model")):
+         valid_embedding_models=["BAAI/bge-large-zh-v1.5","BAAI/bge-base-en-v1.5","BAAI/bge-large-en-v1.5","BAAI/bge-small-en-v1.5",
+                                 "BAAI/bge-small-zh-v1.5","jinaai/jina-embeddings-v2-base-en","jinaai/jina-embeddings-v2-small-en",
+                                 "nomic-ai/nomic-embed-text-v1.5","sentence-transformers/all-MiniLM-L6-v2","text-embedding-v2",
+                                 "text-embedding-v3","maidalun1020/bce-embedding-base_v1"]
+         if not TenantLLMService.query(tenant_id=tenant_id,model_type="embedding", llm_name=req.get("embedding_model"))\
+                 and req.get("embedding_model") not in valid_embedding_models:
             return get_error_data_result(f"`embedding_model` {req.get('embedding_model')} doesn't exist")
     key_mapping = {
         "chunk_num": "chunk_count",
@@ -133,6 +138,9 @@ def update(tenant_id,dataset_id):
         return get_error_data_result(
             retmsg="Can't change `tenant_id`.")
     e, kb = KnowledgebaseService.get_by_id(dataset_id)
+    if "parser_config" in req:
+        print(kb.parser_config,flush=True)
+        req["parser_config"]=kb.parser_config.update(req["parser_config"])
     if "chunk_count" in req:
         if req["chunk_count"] != kb.chunk_num:
             return get_error_data_result(
@@ -153,10 +161,15 @@ def update(tenant_id,dataset_id):
     if "embedding_model" in req:
         if kb.chunk_num != 0 and req['embedding_model'] != kb.embd_id:
             return get_error_data_result(
-                 retmsg="If `chunk_count` is not 0, `embedding_method` is not changeable.")
+                 retmsg="If `chunk_count` is not 0, `embedding_model` is not changeable.")
         if not req.get("embedding_model"):
             return get_error_data_result("`embedding_model` can't be empty")
-         if not TenantLLMService.query(tenant_id=tenant_id,model_type="embedding", llm_name=req.get("embedding_model")):
+         valid_embedding_models=["BAAI/bge-large-zh-v1.5","BAAI/bge-base-en-v1.5","BAAI/bge-large-en-v1.5","BAAI/bge-small-en-v1.5",
+                                 "BAAI/bge-small-zh-v1.5","jinaai/jina-embeddings-v2-base-en","jinaai/jina-embeddings-v2-small-en",
+                                 "nomic-ai/nomic-embed-text-v1.5","sentence-transformers/all-MiniLM-L6-v2","text-embedding-v2",
+                                 "text-embedding-v3","maidalun1020/bce-embedding-base_v1"]
+         if not TenantLLMService.query(tenant_id=tenant_id,model_type="embedding", llm_name=req.get("embedding_model"))\
+                 and req.get("embedding_model") not in valid_embedding_models:
             return get_error_data_result(f"`embedding_model` {req.get('embedding_model')} doesn't exist")
         req['embd_id'] = req.pop('embedding_model')
     if "name" in req:

api/apps/sdk/doc.py CHANGED
@@ -163,9 +163,6 @@ def update_doc(tenant_id, dataset_id, document_id):
                              doc.process_duation * -1)
     if not e:
         return get_error_data_result(retmsg="Document not found!")
-     tenant_id = DocumentService.get_tenant_id(req["id"])
-     if not tenant_id:
-         return get_error_data_result(retmsg="Tenant not found!")
     ELASTICSEARCH.deleteByQuery(
         Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))

@@ -245,14 +242,22 @@ def delete(tenant_id,dataset_id):
     if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
         return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}. ")
     req = request.json
-     if not req.get("ids"):
-         return get_error_data_result(retmsg="`ids` is required")
-     doc_ids = req["ids"]
+     if not req:
+         doc_ids=None
+     else:
+         doc_ids=req.get("ids")
+     if not doc_ids:
+         doc_list = []
+         docs=DocumentService.query(kb_id=dataset_id)
+         for doc in docs:
+             doc_list.append(doc.id)
+     else:
+         doc_list=doc_ids
     root_folder = FileService.get_root_folder(tenant_id)
     pf_id = root_folder["id"]
     FileService.init_knowledgebase_docs(pf_id, tenant_id)
     errors = ""
-     for doc_id in doc_ids:
+     for doc_id in doc_list:
         try:
             e, doc = DocumentService.get_by_id(doc_id)
             if not e:
@@ -290,8 +295,11 @@ def parse(tenant_id,dataset_id):
     if not req.get("document_ids"):
         return get_error_data_result("`document_ids` is required")
     for id in req["document_ids"]:
-         if not DocumentService.query(id=id,kb_id=dataset_id):
+         doc = DocumentService.query(id=id,kb_id=dataset_id)
+         if not doc:
             return get_error_data_result(retmsg=f"You don't own the document {id}.")
+         if doc[0].progress != 0.0:
+             return get_error_data_result("Can't stop parsing document with progress at 0 or 100")
     info = {"run": "1", "progress": 0}
     info["progress_msg"] = ""
     info["chunk_num"] = 0
@@ -349,7 +357,27 @@ def list_chunks(tenant_id,dataset_id,document_id):
         "doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True
     }
     sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
-     res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()}
+     key_mapping = {
+         "chunk_num": "chunk_count",
+         "kb_id": "dataset_id",
+         "token_num": "token_count",
+         "parser_id": "chunk_method"
+     }
+     run_mapping = {
+         "0": "UNSTART",
+         "1": "RUNNING",
+         "2": "CANCEL",
+         "3": "DONE",
+         "4": "FAIL"
+     }
+     doc=doc.to_dict()
+     renamed_doc = {}
+     for key, value in doc.items():
+         if key == "run":
+             renamed_doc["run"] = run_mapping.get(str(value))
+         new_key = key_mapping.get(key, key)
+         renamed_doc[new_key] = value
+     res = {"total": sres.total, "chunks": [], "doc": renamed_doc}
     origin_chunks = []
     sign = 0
     for id in sres.ids:
@@ -388,7 +416,7 @@ def list_chunks(tenant_id,dataset_id,document_id):
         "content_with_weight": "content",
         "doc_id": "document_id",
         "important_kwd": "important_keywords",
-         "img_id": "image_id",
+         "img_id": "image_id"
     }
     renamed_chunk = {}
     for key, value in chunk.items():

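For reference, the revised delete endpoint now treats a missing request body or an absent `ids` field as "delete every document in the dataset". A small sketch of that id-resolution step, with `DocumentService.query(kb_id=...)` stubbed out:

```python
# Sketch of the id-resolution logic used by the delete endpoint above:
# no body, or no "ids" key, means all documents in the dataset.
from typing import Callable, List, Optional

def resolve_doc_ids(req: Optional[dict],
                    list_dataset_doc_ids: Callable[[], List[str]]) -> List[str]:
    doc_ids = req.get("ids") if req else None
    if not doc_ids:
        # Fall back to every document id in the dataset.
        return list(list_dataset_doc_ids())
    return list(doc_ids)

# Example against a fake dataset containing three documents:
fake_ids = lambda: ["doc_a", "doc_b", "doc_c"]
print(resolve_doc_ids(None, fake_ids))                # ['doc_a', 'doc_b', 'doc_c']
print(resolve_doc_ids({"ids": ["doc_b"]}, fake_ids))  # ['doc_b']
```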
api/http_api_reference.md CHANGED
@@ -31,7 +31,7 @@ Creates a dataset.
   - `"language"`: `string`
   - `"embedding_model"`: `string`
   - `"permission"`: `string`
-   - `"parse_method"`: `string`
+   - `"chunk_method"`: `string`
   - `"parser_config"`: `Dataset.ParserConfig`

 #### Request example
@@ -41,11 +41,9 @@ curl --request POST \
      --url http://{address}/api/v1/dataset \
      --header 'Content-Type: application/json' \
      --header 'Authorization: Bearer {YOUR_API_KEY}' \
-      --data '
-      {
-          "name": "test",
-          "chunk_method": "naive"
-      }'
+      --data '{
+          "name": "test_1"
+      }'
 ```

 #### Request parameters
@@ -109,31 +107,32 @@ Success:
     "data": {
         "avatar": null,
         "chunk_count": 0,
-         "create_date": "Thu, 10 Oct 2024 05:57:37 GMT",
-         "create_time": 1728539857641,
+         "chunk_method": "naive",
+         "create_date": "Thu, 24 Oct 2024 09:14:07 GMT",
+         "create_time": 1729761247434,
         "created_by": "69736c5e723611efb51b0242ac120007",
         "description": null,
         "document_count": 0,
         "embedding_model": "BAAI/bge-large-zh-v1.5",
-         "id": "8d73076886cc11ef8c270242ac120006",
+         "id": "527fa74891e811ef9c650242ac120006",
         "language": "English",
         "name": "test_1",
-         "parse_method": "naive",
         "parser_config": {
-             "pages": [
-                 [
-                     1,
-                     1000000
-                 ]
-             ]
+             "chunk_token_num": 128,
+             "delimiter": "\\n!?;。;!?",
+             "html4excel": false,
+             "layout_recognize": true,
+             "raptor": {
+                 "user_raptor": false
+             }
         },
         "permission": "me",
         "similarity_threshold": 0.2,
         "status": "1",
         "tenant_id": "69736c5e723611efb51b0242ac120007",
         "token_num": 0,
-         "update_date": "Thu, 10 Oct 2024 05:57:37 GMT",
-         "update_time": 1728539857641,
+         "update_date": "Thu, 24 Oct 2024 09:14:07 GMT",
+         "update_time": 1729761247434,
         "vector_similarity_weight": 0.3
     }
 }
@@ -229,9 +228,7 @@ curl --request PUT \
      --header 'Authorization: Bearer {YOUR_API_KEY}' \
      --data '
      {
-          "name": "test",
-          "embedding_model": "BAAI/bge-zh-v1.5",
-          "chunk_method": "naive"
+          "name": "updated_dataset",
      }'
 ```

@@ -336,7 +333,7 @@ Success:
         "id": "6e211ee0723611efa10a0242ac120007",
         "language": "English",
         "name": "mysql",
-         "parse_method": "knowledge_graph",
+         "chunk_method": "knowledge_graph",
         "parser_config": {
             "chunk_token_num": 8192,
             "delimiter": "\\n!?;。;!?",
@@ -418,7 +415,30 @@ Success:

 ```json
 {
-     "code": 0
+     "code": 0,
+     "data": [
+         {
+             "chunk_method": "naive",
+             "created_by": "69736c5e723611efb51b0242ac120007",
+             "dataset_id": "527fa74891e811ef9c650242ac120006",
+             "id": "b330ec2e91ec11efbc510242ac120004",
+             "location": "1.txt",
+             "name": "1.txt",
+             "parser_config": {
+                 "chunk_token_num": 128,
+                 "delimiter": "\\n!?;。;!?",
+                 "html4excel": false,
+                 "layout_recognize": true,
+                 "raptor": {
+                     "user_raptor": false
+                 }
+             },
+             "run": "UNSTART",
+             "size": 17966,
+             "thumbnail": "",
+             "type": "doc"
+         }
+     ]
 }
 ```

@@ -552,7 +572,7 @@ curl --request GET \
 Success:

 ```text
- test_2.
+ This is a test to verify the file download functionality.
 ```

 Failure:
@@ -953,40 +973,54 @@ Success:
 {
     "code": 0,
     "data": {
-         "chunks": [],
+         "chunks": [
+             {
+                 "available_int": 1,
+                 "content": "This is a test content.",
+                 "docnm_kwd": "1.txt",
+                 "document_id": "b330ec2e91ec11efbc510242ac120004",
+                 "id": "b48c170e90f70af998485c1065490726",
+                 "image_id": "",
+                 "important_keywords": "",
+                 "positions": [
+                     ""
+                 ]
+             }
+         ],
         "doc": {
-             "chunk_num": 0,
-             "create_date": "Sun, 29 Sep 2024 03:47:29 GMT",
-             "create_time": 1727581649216,
+             "chunk_count": 1,
+             "chunk_method": "naive",
+             "create_date": "Thu, 24 Oct 2024 09:45:27 GMT",
+             "create_time": 1729763127646,
             "created_by": "69736c5e723611efb51b0242ac120007",
-             "id": "8cb781ec7e1511ef98ac0242ac120006",
-             "kb_id": "c7ee74067a2c11efb21c0242ac120006",
-             "location": "sunny_tomorrow.txt",
-             "name": "sunny_tomorrow.txt",
+             "dataset_id": "527fa74891e811ef9c650242ac120006",
+             "id": "b330ec2e91ec11efbc510242ac120004",
+             "location": "1.txt",
+             "name": "1.txt",
             "parser_config": {
-                 "pages": [
-                     [
-                         1,
-                         1000000
-                     ]
-                 ]
+                 "chunk_token_num": 128,
+                 "delimiter": "\\n!?;。;!?",
+                 "html4excel": false,
+                 "layout_recognize": true,
+                 "raptor": {
+                     "user_raptor": false
+                 }
             },
-             "parser_id": "naive",
-             "process_begin_at": "Tue, 15 Oct 2024 10:23:51 GMT",
-             "process_duation": 1435.37,
-             "progress": 0.0370833,
-             "progress_msg": "\nTask has been received.",
-             "run": "1",
-             "size": 24,
+             "process_begin_at": "Thu, 24 Oct 2024 09:56:44 GMT",
+             "process_duation": 0.54213,
+             "progress": 0.0,
+             "progress_msg": "Task dispatched...",
+             "run": "2",
+             "size": 17966,
             "source_type": "local",
             "status": "1",
-             "thumbnail": null,
-             "token_num": 0,
+             "thumbnail": "",
+             "token_count": 8,
             "type": "doc",
-             "update_date": "Tue, 15 Oct 2024 10:47:46 GMT",
-             "update_time": 1728989266371
+             "update_date": "Thu, 24 Oct 2024 11:03:15 GMT",
+             "update_time": 1729767795721
         },
-         "total": 0
+         "total": 1
     }
 }
 ```
@@ -1287,29 +1321,7 @@ curl --request POST \
      --header 'Content-Type: application/json' \
      --header 'Authorization: Bearer {YOUR_API_KEY}'
      --data '{
-          "dataset_ids": [
-              {
-                  "avatar": null,
-                  "chunk_count": 0,
-                  "description": null,
-                  "document_count": 0,
-                  "embedding_model": "",
-                  "id": "0b2cbc8c877f11ef89070242ac120005",
-                  "language": "English",
-                  "name": "Test_assistant",
-                  "parse_method": "naive",
-                  "parser_config": {
-                      "pages": [
-                          [
-                              1,
-                              1000000
-                          ]
-                      ]
-                  },
-                  "permission": "me",
-                  "tenant_id": "4fb0cd625f9311efba4a0242ac120006"
-              }
-          ],
+          "dataset_ids": ["0b2cbc8c877f11ef89070242ac120005"],
          "name":"new_chat_1"
      }'
 ```
@@ -1363,49 +1375,29 @@ Success:
     "code": 0,
     "data": {
         "avatar": "",
-         "create_date": "Fri, 11 Oct 2024 03:23:24 GMT",
-         "create_time": 1728617004635,
+         "create_date": "Thu, 24 Oct 2024 11:18:29 GMT",
+         "create_time": 1729768709023,
+         "dataset_ids": [
+             "527fa74891e811ef9c650242ac120006"
+         ],
         "description": "A helpful Assistant",
         "do_refer": "1",
-         "id": "2ca4b22e878011ef88fe0242ac120005",
-         "knowledgebases": [
-             {
-                 "avatar": null,
-                 "chunk_count": 0,
-                 "description": null,
-                 "document_count": 0,
-                 "embedding_model": "",
-                 "id": "0b2cbc8c877f11ef89070242ac120005",
-                 "language": "English",
-                 "name": "Test_assistant",
-                 "parse_method": "naive",
-                 "parser_config": {
-                     "pages": [
-                         [
-                             1,
-                             1000000
-                         ]
-                     ]
-                 },
-                 "permission": "me",
-                 "tenant_id": "4fb0cd625f9311efba4a0242ac120006"
-             }
-         ],
+         "id": "b1f2f15691f911ef81180242ac120003",
         "language": "English",
         "llm": {
             "frequency_penalty": 0.7,
             "max_tokens": 512,
-             "model_name": "deepseek-chat___OpenAI-API@OpenAI-API-Compatible",
+             "model_name": "qwen-plus@Tongyi-Qianwen",
             "presence_penalty": 0.4,
             "temperature": 0.1,
             "top_p": 0.3
         },
-         "name": "new_chat_1",
+         "name": "12234",
         "prompt": {
-             "empty_response": "Sorry! 知识库中未找到相关内容!",
+             "empty_response": "Sorry! No relevant content was found in the knowledge base!",
             "keywords_similarity_weight": 0.3,
-             "opener": "您好,我是您的助手小樱,长得可爱又善良,can I help you?",
-             "prompt": "你是一个智能助手,请总结知识库的内容来回答问题,请列举知识库中的数据详细回答。当所有知识库内容都与问题无关时,你的回答必须包括“知识库中未找到您要的答案!”这句话。回答需要考虑聊天历史。\n 以下是知识库:\n {knowledge}\n 以上是知识库。",
+             "opener": "Hi! I'm your assistant, what can I do for you?",
+             "prompt": "You are an intelligent assistant. Please summarize the content of the knowledge base to answer the question. Please list the data in the knowledge base and answer in detail. When all knowledge base content is irrelevant to the question, your answer must include the sentence \"The answer you are looking for is not found in the knowledge base!\" Answers need to consider chat history.\n Here is the knowledge base:\n {knowledge}\n The above is the knowledge base.",
             "rerank_model": "",
             "similarity_threshold": 0.2,
             "top_n": 6,
@@ -1420,8 +1412,8 @@ Success:
         "status": "1",
         "tenant_id": "69736c5e723611efb51b0242ac120007",
         "top_k": 1024,
-         "update_date": "Fri, 11 Oct 2024 03:23:24 GMT",
-         "update_time": 1728617004635
+         "update_date": "Thu, 24 Oct 2024 11:18:29 GMT",
+         "update_time": 1729768709023
     }
 }
 ```
@@ -1636,56 +1628,27 @@ Success:
     "data": [
         {
             "avatar": "",
-             "create_date": "Fri, 11 Oct 2024 03:23:24 GMT",
-             "create_time": 1728617004635,
+             "create_date": "Fri, 18 Oct 2024 06:20:06 GMT",
+             "create_time": 1729232406637,
             "description": "A helpful Assistant",
             "do_refer": "1",
-             "id": "2ca4b22e878011ef88fe0242ac120005",
-             "knowledgebases": [
-                 {
-                     "avatar": "",
-                     "chunk_num": 0,
-                     "create_date": "Fri, 11 Oct 2024 03:15:18 GMT",
-                     "create_time": 1728616518986,
-                     "created_by": "69736c5e723611efb51b0242ac120007",
-                     "description": "",
-                     "doc_num": 0,
-                     "embd_id": "BAAI/bge-large-zh-v1.5",
-                     "id": "0b2cbc8c877f11ef89070242ac120005",
-                     "language": "English",
-                     "name": "test_delete_chat",
-                     "parser_config": {
-                         "chunk_token_count": 128,
-                         "delimiter": "\n!?。;!?",
-                         "layout_recognize": true,
-                         "task_page_size": 12
-                     },
-                     "parser_id": "naive",
-                     "permission": "me",
-                     "similarity_threshold": 0.2,
-                     "status": "1",
-                     "tenant_id": "69736c5e723611efb51b0242ac120007",
-                     "token_num": 0,
-                     "update_date": "Fri, 11 Oct 2024 04:01:31 GMT",
-                     "update_time": 1728619291228,
-                     "vector_similarity_weight": 0.3
-                 }
-             ],
+             "id": "04d0d8e28d1911efa3630242ac120006",
+             "dataset_ids": ["527fa74891e811ef9c650242ac120006"],
             "language": "English",
             "llm": {
                 "frequency_penalty": 0.7,
                 "max_tokens": 512,
-                 "model_name": "deepseek-chat___OpenAI-API@OpenAI-API-Compatible",
+                 "model_name": "qwen-plus@Tongyi-Qianwen",
                 "presence_penalty": 0.4,
                 "temperature": 0.1,
                 "top_p": 0.3
             },
-             "name": "Test",
+             "name": "13243",
             "prompt": {
-                 "empty_response": "Sorry! 知识库中未找到相关内容!",
+                 "empty_response": "Sorry! No relevant content was found in the knowledge base!",
                 "keywords_similarity_weight": 0.3,
-                 "opener": "您好,我是您的助手小樱,长得可爱又善良,can I help you?",
-                 "prompt": "你是一个智能助手,请总结知识库的内容来回答问题,请列举知识库中的数据详细回答。当所有知识库内容都与问题无关时,你的回答必须包括“知识库中未找到您要的答案!”这句话。回答需要考虑聊天历史。\n 以下是知识库:\n {knowledge}\n 以上是知识库。",
+                 "opener": "Hi! I'm your assistant, what can I do for you?",
+                 "prompt": "You are an intelligent assistant. Please summarize the content of the knowledge base to answer the question. Please list the data in the knowledge base and answer in detail. When all knowledge base content is irrelevant to the question, your answer must include the sentence \"The answer you are looking for is not found in the knowledge base!\" Answers need to consider chat history.\n Here is the knowledge base:\n {knowledge}\n The above is the knowledge base.",
                 "rerank_model": "",
                 "similarity_threshold": 0.2,
                 "top_n": 6,
@@ -1700,8 +1663,8 @@ Success:
             "status": "1",
             "tenant_id": "69736c5e723611efb51b0242ac120007",
             "top_k": 1024,
-             "update_date": "Fri, 11 Oct 2024 03:47:58 GMT",
-             "update_time": 1728618478392
+             "update_date": "Fri, 18 Oct 2024 06:20:06 GMT",
+             "update_time": 1729232406638
         }
     ]
 }
@@ -2035,78 +1998,55 @@ Success:
 data: {
     "code": 0,
     "data": {
-         "answer": "您好!有什么具体的问题或者需要的帮助",
-         "reference": {},
-         "audio_binary": null,
-         "id": "31153052-7bac-4741-a513-ed07d853f29e"
-     }
- }
-
- data: {
-     "code": 0,
-     "data": {
-         "answer": "您好!有什么具体的问题或者需要的帮助可以告诉我吗?我在这里是为了帮助",
+         "answer": "I am an intelligent assistant designed to help you with your inquiries. I can provide",
         "reference": {},
         "audio_binary": null,
-         "id": "31153052-7bac-4741-a513-ed07d853f29e"
+         "id": "d8e5ebb6-6b52-4fd1-bd02-35b52ba3acaa",
+         "session_id": "e14344d08d1a11efb6210242ac120004"
     }
 }

 data: {
     "code": 0,
     "data": {
-         "answer": "您好!有什么具体的问题或者需要的帮助可以告诉我吗?我在这里是为了帮助您的。如果您有任何疑问或是需要获取",
+         "answer": "I am an intelligent assistant designed to help you with your inquiries. I can provide information, answer questions, and assist with tasks based on the knowledge available to me",
         "reference": {},
         "audio_binary": null,
-         "id": "31153052-7bac-4741-a513-ed07d853f29e"
+         "id": "d8e5ebb6-6b52-4fd1-bd02-35b52ba3acaa",
+         "session_id": "e14344d08d1a11efb6210242ac120004"
     }
 }

 data: {
     "code": 0,
     "data": {
-         "answer": "您好!有什么具体的问题或者需要的帮助可以告诉我吗?我在这里是为了帮助您的。如果您有任何疑问或是需要获取某些信息,请随时提出。",
+         "answer": "I am an intelligent assistant designed to help you with your inquiries. I can provide information, answer questions, and assist with tasks based on the knowledge available to me. How can I assist you today?",
         "reference": {},
         "audio_binary": null,
-         "id": "31153052-7bac-4741-a513-ed07d853f29e"
+         "id": "d8e5ebb6-6b52-4fd1-bd02-35b52ba3acaa",
+         "session_id": "e14344d08d1a11efb6210242ac120004"
     }
 }

 data: {
     "code": 0,
     "data": {
-         "answer": "您好!有什么具体的问题或者需要的帮助可以告诉我吗 ##0$$?我在这里是为了帮助您的。如果您有任何疑问或是需要获取某些信息,请随时提出。",
+         "answer": "I am an intelligent assistant designed to help you with your inquiries. I can provide information, answer questions, and assist with tasks based on the knowledge available to me ##0$$. How can I assist you today?",
         "reference": {
-             "total": 19,
+             "total": 8,
             "chunks": [
-                 {
-                     "chunk_id": "9d87f9d70a0d8a7565694a81fd4c5d5f",
-                     "content_ltks": "当所有知识库内容都与问题无关时 ,你的回答必须包括“知识库中未找到您要的答案!”这句话。回答需要考虑聊天历史。\r\n以下是知识库:\r\n{knowledg}\r\n以上是知识库\r\n\"\"\"\r\n 1\r\n 2\r\n 3\r\n 4\r\n 5\r\n 6\r\n总结\r\n通过上面的介绍,可以对开源的 ragflow有了一个大致的了解,与前面的有道qanyth整体流程还是比较类似的。 ",
-                     "content_with_weight": "当所有知识库内容都与问题无关时,你的回答必须包括“知识库中未找到您要的答案!”这句话。回答需要考虑聊天历史。\r\n 以下是知识库:\r\n {knowledge}\r\n 以上是知识库\r\n\"\"\"\r\n1\r\n2\r\n3\r\n4\r\n5\r\n6\r\n总结\r\n通过上面的介绍,可以对开源的 RagFlow 有了一个大致的了解,与前面的 有道 QAnything 整体流程还是比较类似的。",
-                     "doc_id": "5c5999ec7be811ef9cab0242ac120005",
-                     "docnm_kwd": "1.txt",
-                     "kb_id": "c7ee74067a2c11efb21c0242ac120006",
-                     "important_kwd": [],
-                     "img_id": "",
-                     "similarity": 0.38337178633282265,
-                     "vector_similarity": 0.3321336754679629,
-                     "term_similarity": 0.4053309767034769,
-                     "positions": [
-                         ""
-                     ]
-                 },
                 {
                     "chunk_id": "895d34de762e674b43e8613c6fb54c6d",
-                     "content_ltks": "\r\n\r\n实际内容可能会超过大模型的输入token数量,因此在调用大模型前会调用api/db/servic/dialog_service.py文件中 messag_fit_in ()根据大模型可用的 token数量进行过滤。这部分与有道的 qanyth的实现大同小异,就不额外展开了。\r\n\r\n将检索的内容,历史聊天记录以及问题构造为 prompt ,即可作为大模型的输入了 ,默认的英文prompt如下所示:\r\n\r\n\"\"\"\r\nyou are an intellig assistant. pleas summar the content of the knowledg base to answer the question. pleas list thedata in the knowledg base and answer in detail. when all knowledg base content is irrelev to the question , your answer must includ the sentenc\"the answer you are lookfor isnot found in the knowledg base!\" answer needto consid chat history.\r\n here is the knowledg base:\r\n{ knowledg}\r\nthe abov is the knowledg base.\r\n\"\"\"\r\n1\r\n 2\r\n 3\r\n 4\r\n 5\r\n 6\r\n对应的中文prompt如下所示:\r\n\r\n\"\"\"\r\n你是一个智能助手,请总结知识库的内容来回答问题,请列举知识库中的数据详细回答。 ",
-                     "content_with_weight": "\r\n\r\n实际内容可能会超过大模型的输入 token 数量,因此在调用大模型前会调用 api/db/services/dialog_service.py 文件中 message_fit_in() 根据大模型可用的 token 数量进行过滤。这部分与有道的 QAnything 的实现大同小异,就不额外展开了。\r\n\r\n将检索的内容,历史聊天记录以及问题构造为 prompt,即可作为大模型的输入了,默认的英文 prompt 如下所示:\r\n\r\n\"\"\"\r\nYou are an intelligent assistant. Please summarize the content of the knowledge base to answer the question. Please list the data in the knowledge base and answer in detail. When all knowledge base content is irrelevant to the question, your answer must include the sentence \"The answer you are looking for is not found in the knowledge base!\" Answers need to consider chat history.\r\n Here is the knowledge base:\r\n {knowledge}\r\n The above is the knowledge base.\r\n\"\"\"\r\n1\r\n2\r\n3\r\n4\r\n5\r\n6\r\n对应的中文 prompt 如下所示:\r\n\r\n\"\"\"\r\n你是一个智能助手,请总结知识库的内容来回答问题,请列举知识库中的数据详细回答。",
+                     "content_ltks": "xxxx\r\n\r\n\"\"\"\r\nyou are an intellig assistant. pleas summar the content of the knowledg base to answer the question. pleas list thedata in the knowledg base and answer in detail. when all knowledg base content is irrelev to the question , your answer must includ the sentenc\"the answer you are lookfor isnot found in the knowledg base!\" answer needto consid chat history.\r\n here is the knowledg base:\r\n{ knowledg}\r\nthe abov is the knowledg base.\r\n\"\"\"\r\n1\r\n 2\r\n 3\r\n 4\r\n 5\r\n 6\r\nxxxx ",
+                     "content_with_weight": "xxxx\r\n\r\n\"\"\"\r\nYou are an intelligent assistant. Please summarize the content of the knowledge base to answer the question. Please list the data in the knowledge base and answer in detail. When all knowledge base content is irrelevant to the question, your answer must include the sentence \"The answer you are looking for is not found in the knowledge base!\" Answers need to consider chat history.\r\n Here is the knowledge base:\r\n {knowledge}\r\n The above is the knowledge base.\r\n\"\"\"\r\n1\r\n2\r\n3\r\n4\r\n5\r\n6\r\nxxxx\r\n\r\n\"\"\"\r\nxxxx",
                     "doc_id": "5c5999ec7be811ef9cab0242ac120005",
                     "docnm_kwd": "1.txt",
                     "kb_id": "c7ee74067a2c11efb21c0242ac120006",
                     "important_kwd": [],
                     "img_id": "",
-                     "similarity": 0.2788204323926715,
-                     "vector_similarity": 0.35489427679953667,
-                     "term_similarity": 0.2462173562183008,
+                     "similarity": 0.4442746624416507,
+                     "vector_similarity": 0.3843936320913369,
+                     "term_similarity": 0.4699379611632138,
                     "positions": [
                         ""
                     ]
@@ -2116,12 +2056,13 @@ data: {
                 {
                     "doc_name": "1.txt",
                     "doc_id": "5c5999ec7be811ef9cab0242ac120005",
-                     "count": 2
+                     "count": 1
                 }
             ]
         },
-         "prompt": "你是一个智能助手,请总结知识库的内容来回答问题,请列举知识库中的数据详细回答。当所有知识库内容都与问题无关时,你的回答必须包括“知识库中未找到您要的答案!”这句话。回答需要考虑聊天历史。\n 以下是知识库:\n 当所有知识库内容都与问题无关时,你的回答必须包括“知识库中未找到您要的答案!”这句话。回答需要考虑聊天历史。\r\n 以下是知识库:\r\n {knowledge}\r\n 以上是知识库\r\n\"\"\"\r\n1\r\n2\r\n3\r\n4\r\n5\r\n6\r\n总结\r\n通过上面的介绍,可以对开源的 RagFlow 有了一个大致的了解,与前面的 有道 QAnything 整体流程还是比较类似的。\n\n------\n\n\r\n\r\n实际内容可能会超过大模型的输入 token 数量,因此在调用大模型前会调用 api/db/services/dialog_service.py 文件中 message_fit_in() 根据大模型可用的 token 数量进行过滤。这部分与有道的 QAnything 的实现大同小异,就不额外展开了。\r\n\r\n将检索的内容,历史聊天记录以及问题构造为 prompt,即可作为大模型的输入了,默认的英文 prompt 如下所示:\r\n\r\n\"\"\"\r\nYou are an intelligent assistant. Please summarize the content of the knowledge base to answer the question. Please list the data in the knowledge base and answer in detail. When all knowledge base content is irrelevant to the question, your answer must include the sentence \"The answer you are looking for is not found in the knowledge base!\" Answers need to consider chat history.\r\n Here is the knowledge base:\r\n {knowledge}\r\n The above is the knowledge base.\r\n\"\"\"\r\n1\r\n2\r\n3\r\n4\r\n5\r\n6\r\n对应的中文 prompt 如下所示:\r\n\r\n\"\"\"\r\n你是一个智能助手,请总结知识库的内容来回答问题,请列举知识库中的数据详细回答。\n 以上是知识库。\n\n### Query:\n你好,请问有什么问题需要我帮忙解答吗?\n\n### Elapsed\n - Retrieval: 9131.1 ms\n - LLM: 12802.6 ms",
-         "id": "31153052-7bac-4741-a513-ed07d853f29e"
+         "prompt": "xxxx\r\n\r\n\"\"\"\r\nYou are an intelligent assistant. Please summarize the content of the knowledge base to answer the question. Please list the data in the knowledge base and answer in detail. When all knowledge base content is irrelevant to the question, your answer must include the sentence \"The answer you are looking for is not found in the knowledge base!\" Answers need to consider chat history.\r\n Here is the knowledge base:\r\n {knowledge}\r\n The above is the knowledge base.\r\n\"\"\"\r\n1\r\n2\r\n3\r\n4\r\n5\r\n6\r\nxxxx\n\n### Query:\nwho are you,please answer me in English\n\n### Elapsed\n - Retrieval: 332.2 ms\n - LLM: 2972.1 ms",
+         "id": "d8e5ebb6-6b52-4fd1-bd02-35b52ba3acaa",
+         "session_id": "e14344d08d1a11efb6210242ac120004"
     }
 }

api/utils/api_utils.py CHANGED
@@ -337,7 +337,7 @@ def valid(permission,valid_permission,language,valid_language,chunk_method,valid

 def valid_parameter(parameter,valid_values):
     if parameter and parameter not in valid_values:
-         return get_error_data_result(f"{parameter} is not in {valid_values}")
+         return get_error_data_result(f"`{parameter}` is not in {valid_values}")

 def get_parser_config(chunk_method,parser_config):
     if parser_config:

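The only change here is cosmetic: the offending value is now wrapped in backticks in the error message. A stand-alone sketch of that behaviour, with `get_error_data_result` stubbed as a plain dict rather than the real Flask response:

```python
# Stub: the real get_error_data_result builds an API error response; a dict is
# enough to show the message formatting.
def get_error_data_result(message):
    return {"message": message}

def valid_parameter(parameter, valid_values):
    if parameter and parameter not in valid_values:
        return get_error_data_result(f"`{parameter}` is not in {valid_values}")

print(valid_parameter("naive", ["naive", "qa"]))  # None: the value is allowed
print(valid_parameter("foo", ["naive", "qa"]))    # {'message': "`foo` is not in ['naive', 'qa']"}
```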
sdk/python/ragflow/modules/document.py CHANGED
@@ -70,7 +70,7 @@ class Document(Base):
             return Chunk(self.rag,res["data"].get("chunk"))
         raise Exception(res.get("message"))

-     def delete_chunks(self,ids:List[str]):
+     def delete_chunks(self,ids:List[str] = None):
         res = self.rm(f"dataset/{self.dataset_id}/document/{self.id}/chunk",{"ids":ids})
         res = res.json()
         if res.get("code")!=0:
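With the default argument added above, `delete_chunks()` can now be called without arguments; the request body then carries `"ids": null` instead of the call failing with a missing-argument error. A tiny illustration of the resulting payload shape (plain `json.dumps`, not the SDK's own HTTP helper):

```python
import json

# Shape of the body Document.delete_chunks() sends: the "ids" key is always
# present, and with the new default it serializes as null when omitted.
def build_delete_chunks_payload(ids=None):
    return json.dumps({"ids": ids})

print(build_delete_chunks_payload(["b48c170e90f70af998485c1065490726"]))  # {"ids": ["b48c170e90f70af998485c1065490726"]}
print(build_delete_chunks_payload())                                      # {"ids": null}
```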