cecilia-uu committed on
Commit
4856f42
·
1 Parent(s): 596e145

API: Download doc api (#1354)

Browse files

### What problem does this PR solve?

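Adds the `download_document` API endpoint (`GET /<dataset_id>/documents/<document_id>`) and the matching `download_file` method in the Python SDK, together with tests.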

### Type of change


- [x] New Feature (non-breaking change which adds functionality)

api/apps/dataset_api.py CHANGED
@@ -12,15 +12,16 @@
12
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
-
16
  import os
17
  import pathlib
18
  import re
19
  import warnings
 
20
 
21
- from flask import request
22
  from flask_login import login_required, current_user
23
  from httpx import HTTPError
 
24
 
25
  from api.contants import NAME_LENGTH_LIMIT
26
  from api.db import FileType, ParserType, FileSource
@@ -283,9 +284,12 @@ def upload_documents(dataset_id):
283
  return construct_json_result(code=RetCode.DATA_ERROR, message=f"You try to upload {num_file_objs} files, "
284
  f"which exceeds the maximum number of uploading files: {MAXIMUM_OF_UPLOADING_FILES}")
285
 
 
 
 
 
 
286
  for file_obj in file_objs:
287
- # the content of the file
288
- file_content = file_obj.read()
289
  file_name = file_obj.filename
290
  # no name
291
  if not file_name:
@@ -296,15 +300,6 @@ def upload_documents(dataset_id):
296
  if 'http' in file_name:
297
  return construct_json_result(code=RetCode.ARGUMENT_ERROR, message="Remote files have not unsupported.")
298
 
299
- # the content is empty, raising a warning
300
- if file_content == b'':
301
- warnings.warn(f"[WARNING]: The file {file_name} is empty.")
302
-
303
- # no dataset
304
- exist, dataset = KnowledgebaseService.get_by_id(dataset_id)
305
- if not exist:
306
- return construct_json_result(message="Can't find this dataset", code=RetCode.DATA_ERROR)
307
-
308
  # get the root_folder
309
  root_folder = FileService.get_root_folder(current_user.id)
310
  # get the id of the root_folder
@@ -342,8 +337,14 @@ def upload_documents(dataset_id):
342
  location = filename
343
  while MINIO.obj_exist(dataset_id, location):
344
  location += "_"
 
345
  blob = file.read()
 
 
 
 
346
  MINIO.put(dataset_id, location, blob)
 
347
  doc = {
348
  "id": get_uuid(),
349
  "kb_id": dataset.id,
@@ -555,6 +556,40 @@ def is_illegal_value_for_enum(value, enum_class):
555
  return value not in enum_class.__members__.values()
556
 
557
  # ----------------------------download a file-----------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
558
 
559
  # ----------------------------start parsing-----------------------------------------------------
560
 
@@ -564,7 +599,7 @@ def is_illegal_value_for_enum(value, enum_class):
564
 
565
  # ----------------------------list the chunks of the file-----------------------------------------------------
566
 
567
- # ----------------------------delete the chunk-----------------------------------------------------
568
 
569
  # ----------------------------edit the status of the chunk-----------------------------------------------------
570
 
@@ -576,3 +611,5 @@ def is_illegal_value_for_enum(value, enum_class):
576
 
577
  # ----------------------------retrieval test-----------------------------------------------------
578
 
 
 
 
12
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
 
15
  import os
16
  import pathlib
17
  import re
18
  import warnings
19
+ from io import BytesIO
20
 
21
+ from flask import request, send_file
22
  from flask_login import login_required, current_user
23
  from httpx import HTTPError
24
+ from minio import S3Error
25
 
26
  from api.contants import NAME_LENGTH_LIMIT
27
  from api.db import FileType, ParserType, FileSource
 
284
  return construct_json_result(code=RetCode.DATA_ERROR, message=f"You try to upload {num_file_objs} files, "
285
  f"which exceeds the maximum number of uploading files: {MAXIMUM_OF_UPLOADING_FILES}")
286
 
287
+ # no dataset
288
+ exist, dataset = KnowledgebaseService.get_by_id(dataset_id)
289
+ if not exist:
290
+ return construct_json_result(message="Can't find this dataset", code=RetCode.DATA_ERROR)
291
+
292
  for file_obj in file_objs:
 
 
293
  file_name = file_obj.filename
294
  # no name
295
  if not file_name:
 
300
  if 'http' in file_name:
301
  return construct_json_result(code=RetCode.ARGUMENT_ERROR, message="Remote files have not unsupported.")
302
 
 
 
 
 
 
 
 
 
 
303
  # get the root_folder
304
  root_folder = FileService.get_root_folder(current_user.id)
305
  # get the id of the root_folder
 
337
  location = filename
338
  while MINIO.obj_exist(dataset_id, location):
339
  location += "_"
340
+
341
  blob = file.read()
342
+ # the content is empty, raising a warning
343
+ if blob == b'':
344
+ warnings.warn(f"[WARNING]: The file {filename} is empty.")
345
+
346
  MINIO.put(dataset_id, location, blob)
347
+
348
  doc = {
349
  "id": get_uuid(),
350
  "kb_id": dataset.id,
 
556
  return value not in enum_class.__members__.values()
557
 
558
  # ----------------------------download a file-----------------------------------------------------
559
@manager.route("/<dataset_id>/documents/<document_id>", methods=["GET"])
@login_required
def download_document(dataset_id, document_id):
    """Send the stored file of a document to the client as an attachment.

    Args:
        dataset_id: ID of the dataset, taken from the URL.
        document_id: ID of the document, taken from the URL.

    Returns:
        On success, the file content as an ``application/octet-stream``
        attachment named after the document. Otherwise a JSON result that
        describes the failure: unknown dataset, unknown document, document
        that is not part of the dataset, or empty file. Any unexpected
        exception is converted with ``construct_error_response``.
    """
    try:
        # Check whether there is this dataset
        exist, _ = KnowledgebaseService.get_by_id(dataset_id)
        if not exist:
            return construct_json_result(code=RetCode.DATA_ERROR,
                                         message=f"This dataset '{dataset_id}' cannot be found!")

        # Check whether there is this document
        exist, document = DocumentService.get_by_id(document_id)
        if not exist:
            return construct_json_result(message=f"This document '{document_id}' cannot be found!",
                                         code=RetCode.ARGUMENT_ERROR)

        # The document must live in the dataset named in the URL. Without
        # this check any existing dataset ID would give access to every
        # document ID. Uploads store the owning dataset ID in `kb_id`.
        if str(document.kb_id) != str(dataset_id):
            return construct_json_result(
                message=f"The document '{document_id}' does not belong to the dataset '{dataset_id}'.",
                code=RetCode.ARGUMENT_ERROR)

        # Resolve where the blob is stored. The first element is what
        # MINIO.get expects as its first argument (the same position the
        # dataset ID takes in MINIO.put during upload), not a document ID.
        bucket, location = File2DocumentService.get_minio_address(doc_id=document_id)
        blob = MINIO.get(bucket, location)
        if not blob:
            return construct_json_result(message="This file is empty.", code=RetCode.DATA_ERROR)

        # send_file needs a file-like object, so wrap the bytes in memory.
        return send_file(
            BytesIO(blob),
            as_attachment=True,
            download_name=document.name,
            mimetype='application/octet-stream'  # generic default MIME type
        )

    # Top-level boundary of the endpoint: report every failure as a response.
    except Exception as e:
        return construct_error_response(e)
593
 
594
  # ----------------------------start parsing-----------------------------------------------------
595
 
 
599
 
600
  # ----------------------------list the chunks of the file-----------------------------------------------------
601
 
602
+ # ----------------------------delete the chunk-----------------------------------------------------
603
 
604
  # ----------------------------edit the status of the chunk-----------------------------------------------------
605
 
 
611
 
612
  # ----------------------------retrieval test-----------------------------------------------------
613
 
614
+
615
+
sdk/python/ragflow/ragflow.py CHANGED
@@ -12,12 +12,12 @@
12
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
-
16
  import json
17
  import os
18
 
19
  import requests
20
 
 
21
  from api.settings import RetCode
22
 
23
 
@@ -126,7 +126,22 @@ class RAGFlow:
126
  return response.json()
127
 
128
  # ----------------------------download a file-----------------------------------------------------
129
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  # ----------------------------start parsing-----------------------------------------------------
131
 
132
  # ----------------------------stop parsing-----------------------------------------------------
@@ -144,3 +159,4 @@ class RAGFlow:
144
  # ----------------------------get a specific chunk-----------------------------------------------------
145
 
146
  # ----------------------------retrieval test-----------------------------------------------------
 
 
12
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
 
15
  import json
16
  import os
17
 
18
  import requests
19
 
20
+ from api.db.services.document_service import DocumentService
21
  from api.settings import RetCode
22
 
23
 
 
126
  return response.json()
127
 
128
  # ----------------------------download a file-----------------------------------------------------
129
def download_file(self, dataset_id, document_id):
    """Download a document and save it into the current working directory.

    Args:
        dataset_id: ID of the dataset the document belongs to.
        document_id: ID of the document to download.

    Returns:
        The decoded JSON message of the server when the request did not
        yield a file, otherwise ``{"code": RetCode.SUCCESS, "data": <bytes>}``
        where ``data`` is the raw file content that was also written to disk.
    """
    import re
    from urllib.parse import unquote

    endpoint = f"{self.dataset_url}/{dataset_id}/documents/{document_id}"
    res = requests.get(endpoint, headers=self.authorization_header)
    content = res.content  # raw bytes of the response body

    # Files are served as application/octet-stream; every other response is
    # expected to be a JSON message. Deciding on the header (instead of on
    # "does the body parse as JSON") keeps downloaded JSON files intact.
    content_type = res.headers.get("Content-Type", "")
    if not content_type.lower().startswith("application/octet-stream"):
        try:
            return json.loads(content.decode("utf-8"))
        except (UnicodeDecodeError, json.JSONDecodeError):
            # Not a JSON message after all (e.g. arbitrary binary data that
            # is not valid UTF-8): fall through and treat it as a file.
            pass

    # Take the file name from the Content-Disposition header of the
    # response, so the client never needs access to the server's database.
    # The RFC 5987 form (filename*=UTF-8''...) carries non-ASCII names and
    # is preferred over the plain ASCII fallback.
    disposition = res.headers.get("Content-Disposition", "")
    extended = re.search(r"filename\*\s*=\s*UTF-8''([^;]+)", disposition, flags=re.IGNORECASE)
    plain = re.search(r'filename\s*=\s*"?([^";]+)"?', disposition, flags=re.IGNORECASE)
    if extended:
        file_name = unquote(extended.group(1).strip())
    elif plain:
        file_name = plain.group(1).strip()
    else:
        file_name = ""

    # Never let a server-supplied name point outside the working directory.
    file_name = os.path.basename(file_name)
    if file_name in ("", ".", ".."):
        file_name = str(document_id)

    file_path = os.path.join(os.getcwd(), file_name)
    with open(file_path, "wb") as file:
        file.write(content)
    return {"code": RetCode.SUCCESS, "data": content}
145
  # ----------------------------start parsing-----------------------------------------------------
146
 
147
  # ----------------------------stop parsing-----------------------------------------------------
 
159
  # ----------------------------get a specific chunk-----------------------------------------------------
160
 
161
  # ----------------------------retrieval test-----------------------------------------------------
162
+
sdk/python/test/common.py CHANGED
@@ -1,4 +1,4 @@
1
 
2
 
3
- API_KEY = 'ImFhMmJhZmUwMmQxNzExZWZhZDdmMzA0M2Q3ZWU1MzdlIg.ZnDsIQ.u-0-_qCRU6a4WICxyAPsjaafyOo'
4
  HOST_ADDRESS = 'http://127.0.0.1:9380'
 
1
 
2
 
3
+ API_KEY = 'IjJkOGQ4ZDE2MzkyMjExZWZhYTk0MzA0M2Q3ZWU1MzdlIg.ZoUfug.RmqcYyCrlAnLtkzk6bYXiXN3eEY'
4
  HOST_ADDRESS = 'http://127.0.0.1:9380'
sdk/python/test/test_document.py CHANGED
@@ -3,7 +3,6 @@ from test_sdkbase import TestSdk
3
  from ragflow import RAGFlow
4
  import pytest
5
  from common import API_KEY, HOST_ADDRESS
6
- from api.contants import NAME_LENGTH_LIMIT
7
 
8
 
9
  class TestFile(TestSdk):
@@ -625,8 +624,76 @@ class TestFile(TestSdk):
625
  update_res = ragflow.update_file(created_res_id, doc_id, **params)
626
  assert (update_res["code"] == RetCode.DATA_ERROR and
627
  update_res["message"] == "Illegal value ? for 'template_type' field.")
 
628
  # ----------------------------download a file-----------------------------------------------------
629
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
630
  # ----------------------------start parsing-----------------------------------------------------
631
 
632
  # ----------------------------stop parsing-----------------------------------------------------
 
3
  from ragflow import RAGFlow
4
  import pytest
5
  from common import API_KEY, HOST_ADDRESS
 
6
 
7
 
8
  class TestFile(TestSdk):
 
624
  update_res = ragflow.update_file(created_res_id, doc_id, **params)
625
  assert (update_res["code"] == RetCode.DATA_ERROR and
626
  update_res["message"] == "Illegal value ? for 'template_type' field.")
627
+
628
  # ----------------------------download a file-----------------------------------------------------
629
 
630
def test_download_nonexistent_document(self):
    """
    Downloading an unknown document ID from an existing dataset is rejected.
    """
    client = RAGFlow(API_KEY, HOST_ADDRESS)
    # a real dataset, so that only the document lookup can fail
    dataset_id = client.create_dataset("test_download_nonexistent_document")["data"]["dataset_id"]

    response = client.download_file(dataset_id, "imagination")

    assert response["code"] == RetCode.ARGUMENT_ERROR
    assert response["message"] == "This document 'imagination' cannot be found!"
640
+
641
def test_download_document_in_nonexistent_dataset(self):
    """
    Downloading an existing document through an unknown dataset ID is rejected.
    """
    client = RAGFlow(API_KEY, HOST_ADDRESS)
    dataset_id = client.create_dataset("test_download_nonexistent_document")["data"]["dataset_id"]

    # upload one file so that a valid document ID exists
    upload_response = client.upload_local_file(dataset_id, ["test_data/test.txt"])
    document_id = upload_response["data"][0]["id"]

    # ask for that document under a dataset ID that does not exist
    response = client.download_file("imagination", document_id)

    assert response["code"] == RetCode.DATA_ERROR
    assert response["message"] == "This dataset 'imagination' cannot be found!"
658
+
659
def test_download_document_with_success(self):
    """
    An uploaded document is downloaded with exactly the bytes that were uploaded.
    """
    source_path = "test_data/test.txt"
    client = RAGFlow(API_KEY, HOST_ADDRESS)
    dataset_id = client.create_dataset("test_download_nonexistent_document")["data"]["dataset_id"]

    # upload the file and remember the ID of the created document
    upload_response = client.upload_local_file(dataset_id, [source_path])
    document_id = upload_response["data"][0]["id"]

    # the local file content is the expected payload of the download
    with open(source_path, "rb") as source:
        expected_bytes = source.read()

    response = client.download_file(dataset_id, document_id)

    assert response["code"] == RetCode.SUCCESS
    assert response["data"] == expected_bytes
678
+
679
def test_download_an_empty_document(self):
    """
    Downloading a document whose stored file is empty reports a data error.
    """
    client = RAGFlow(API_KEY, HOST_ADDRESS)
    dataset_id = client.create_dataset("test_download_nonexistent_document")["data"]["dataset_id"]

    # upload an empty file and remember the ID of the created document
    upload_response = client.upload_local_file(dataset_id, ["test_data/empty.txt"])
    document_id = upload_response["data"][0]["id"]

    response = client.download_file(dataset_id, document_id)

    assert response["code"] == RetCode.DATA_ERROR
    assert response["message"] == "This file is empty."
696
+
697
  # ----------------------------start parsing-----------------------------------------------------
698
 
699
  # ----------------------------stop parsing-----------------------------------------------------