dsmultimedika committed
Commit 661a3cb · Parent: 63d3882

fix: improve uploader

api/function.py CHANGED
@@ -29,21 +29,22 @@ async def data_ingestion(reference, file: UploadFile) -> Any:
         user_id="admin_book_uploaded",
     )
 
-    # # Upload to AWS
-    file_name = f"{reference['title']}"
-    aws_loader = Loader()
-
-    file_obj = file
-    aws_loader.upload_to_s3(file_obj, file_name)
-
     uploader = Uploader(reference, file)
-
-    nodes_with_metadata = await uploader.process_documents()
+    nodes_with_metadata, file_stream = await uploader.process_documents()
 
     # Build indexes using IndexManager
     index = IndexManager()
     index.build_indexes(nodes_with_metadata)
 
+
+    # # Upload to AWS
+    file_name = f"{reference['title']}"
+    aws_loader = Loader()
+
+    # file_obj = file
+    aws_loader.upload_to_s3(file_stream, file_name)
+
+
     return json.dumps(
         {"status": "success", "message": "Vector Index loaded successfully."}
     )
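The reordering above is the substance of the fix: a Starlette/FastAPI UploadFile wraps a stream that can only be drained once, so in the old order the S3 uploader's file.file.read() consumed the bytes and the parser's later await file.read() got an empty payload. Uploading the saved file_stream after indexing avoids the double read. A minimal sketch of the pitfall, using Starlette's UploadFile directly (the names and dummy bytes are illustrative, not from the repo):

    import asyncio
    from io import BytesIO

    from starlette.datastructures import UploadFile


    async def main() -> None:
        upload = UploadFile(file=BytesIO(b"%PDF-1.4 dummy"), filename="book.pdf")

        content = await upload.read()     # first consumer drains the stream
        second = await upload.read()      # a second read now yields nothing
        print(len(content), len(second))  # -> 14 0

        # A BytesIO copy gives later steps (here, the S3 upload) the full
        # payload, independent of the exhausted upload stream.
        file_stream = BytesIO(content)
        print(len(file_stream.getvalue()))  # -> 14


    asyncio.run(main())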
api/router/book.py CHANGED
@@ -84,8 +84,8 @@ async def upload_file(
 
     # Create a new Metadata object
     book_query = BookQuery(user)
-    book_query.add_book(db, title, author, category_id, year, publisher)
-    logging.info("Database Inserted")
+    # book_query.add_book(db, title, author, category_id, year, publisher)
+    # logging.info("Database Inserted")
 
     return {
         "filename": file.filename,
script/document_uploader.py CHANGED
@@ -58,7 +58,7 @@ class Uploader:
 
         # Get metadata
         # documents_with_metadata = self.metadata.apply_metadata(documents)
-        documents_with_metadata = await upload_file(self.reference, self.file)
+        documents_with_metadata, file_stream = await upload_file(self.reference, self.file)
 
         # Get Topic
         # topic_extractor = extract_topic(self.reference, self.content_table)
@@ -85,7 +85,7 @@ class Uploader:
         try:
             nodes_with_metadata = pipeline.run(documents=documents_with_metadata)
             # nodes_with_metadata = splitter.get_nodes_from_documents(documents_with_metadata)
-            return nodes_with_metadata
+            return nodes_with_metadata, file_stream
 
         except Exception as e:
             try:
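process_documents() now returns a (nodes, stream) pair instead of a bare node list, so every exit path, including the except fallback that follows this hunk, has to produce the same two-element shape; a caller that unpacks the result will fail at runtime on any path that still returns a single value. A hypothetical sketch of the new contract (not the repo's code):

    from io import BytesIO

    def process_documents_sketch(ok: bool):
        file_stream = BytesIO(b"pdf bytes")
        if ok:
            return ["node-1", "node-2"], file_stream
        # the fallback path must keep the same tuple shape
        return [], file_stream

    nodes, stream = process_documents_sketch(ok=True)
    print(len(nodes), stream.getvalue())  # -> 2 b'pdf bytes'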
service/aws_loader.py CHANGED
@@ -19,14 +19,14 @@ class Loader:
             region_name="us-west-2",
         )
 
-    def upload_to_s3(self, file, object_name, folder_name="summarizer"):
+    def upload_to_s3(self, file_stream: BytesIO, object_name, folder_name="summarizer"):
         try:
             # If folder_name is provided, prepend it to the object_name
             if folder_name:
                 object_name = f"{folder_name}/{object_name}"
 
             # Open the PDF with PyMuPDF (fitz)
-            pdf_document = fitz.open(stream=file.file.read(), filetype="pdf")
+            pdf_document = fitz.open(stream=file_stream.getvalue(), filetype="pdf")
             print("Number of pages: ", pdf_document.page_count)
             # Loop through each page of the PDF
             for page_num in range(pdf_document.page_count):
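The switch from file.file.read() to file_stream.getvalue() also sidesteps cursor position: BytesIO.getvalue() returns the entire buffer no matter how much has already been read, while read() only returns what is left past the current position, and PyMuPDF's fitz.open(stream=..., filetype="pdf") needs the complete bytes. A quick stdlib illustration:

    from io import BytesIO

    buf = BytesIO(b"%PDF-1.4 dummy")
    buf.read(8)            # an earlier consumer moved the cursor
    print(buf.read())      # -> b' dummy'          (only the remainder)
    print(buf.getvalue())  # -> b'%PDF-1.4 dummy'  (the whole payload)

Rewinding with buf.seek(0) before read() would work too; getvalue() just makes the call position-independent.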
service/reader_v3.py CHANGED
@@ -1,5 +1,6 @@
 import os
 import nest_asyncio
+from io import BytesIO
 
 from llama_parse import LlamaParse
 from llama_index.core.node_parser import SimpleNodeParser
@@ -65,22 +66,22 @@ async def upload_file(reference, file: UploadFile):
     try:
         # Read the binary content of the uploaded file once
        content = await file.read()
+
+        # Store the file content in a BytesIO stream for reuse later
+        file_stream = BytesIO(content)
+
         # Parse the journal
         parsed_documents = parse_journal(content, file.filename)
-        # Extract metadata
-        # metadata_dict = await extract_metadata(content)
-        # print("Metadata Dictionary : \n\n", metadata_dict)
 
+        # Generate metadata
         metadata_gen = Metadata(reference)
         documents_with_metadata = metadata_gen.apply_metadata(parsed_documents)
-
-        # document_with_metadata =
 
-        print("Document with Metadata : \n\n", documents_with_metadata)
-        print("Banyak documents : \n", len(documents_with_metadata))
+        print("Document with Metadata: \n\n", documents_with_metadata)
+        print("Number of documents: \n", len(documents_with_metadata))
 
-        # Return both parsed documents and metadata
-        return documents_with_metadata
+        # Return the parsed documents with metadata and the file stream
+        return documents_with_metadata, file_stream
 
     except Exception as e:
         return JSONResponse(status_code=500, content=f"Error processing file: {e}")
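This hunk sets up the read-once, fan-out pattern: the upload is drained a single time into content, the parser consumes those bytes, and the returned BytesIO hands the caller an independent, seekable copy for the later S3 upload. One trade-off worth noting: BytesIO(content) keeps a full in-memory duplicate, so peak memory roughly doubles for large PDFs. A minimal sketch (dummy bytes stand in for the real upload):

    from io import BytesIO

    content = b"%PDF-1.4 dummy"     # stands in for: await file.read()
    file_stream = BytesIO(content)  # independent, seekable copy

    assert file_stream.getvalue() == content
    file_stream.seek(0)             # rewindable, unlike the UploadFile
    print(file_stream.read(8))      # -> b'%PDF-1.4'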