dsmultimedika committed
Commit 661a3cb · Parent: 63d3882

fix: improve uploader

api/function.py CHANGED
@@ -29,21 +29,22 @@ async def data_ingestion(reference, file: UploadFile) -> Any:
         user_id="admin_book_uploaded",
     )
 
-    # # Upload to AWS
-    file_name = f"{reference['title']}"
-    aws_loader = Loader()
-
-    file_obj = file
-    aws_loader.upload_to_s3(file_obj, file_name)
-
     uploader = Uploader(reference, file)
-
-    nodes_with_metadata = await uploader.process_documents()
+    nodes_with_metadata, file_stream = await uploader.process_documents()
 
     # Build indexes using IndexManager
     index = IndexManager()
     index.build_indexes(nodes_with_metadata)
 
+
+    # # Upload to AWS
+    file_name = f"{reference['title']}"
+    aws_loader = Loader()
+
+    # file_obj = file
+    aws_loader.upload_to_s3(file_stream, file_name)
+
+
     return json.dumps(
         {"status": "success", "message": "Vector Index loaded successfully."}
     )
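The reordering above is the substance of the fix: a Starlette/FastAPI UploadFile wraps a stream that can only be drained once, so in the old order the S3 uploader's file.file.read() consumed the bytes and the parser's later await file.read() got an empty payload. Uploading the saved file_stream after indexing avoids the double read. A minimal sketch of the pitfall, using Starlette's UploadFile directly (the names and dummy bytes are illustrative, not from the repo):

    import asyncio
    from io import BytesIO

    from starlette.datastructures import UploadFile


    async def main() -> None:
        upload = UploadFile(file=BytesIO(b"%PDF-1.4 dummy"), filename="book.pdf")

        content = await upload.read()     # first consumer drains the stream
        second = await upload.read()      # a second read now yields nothing
        print(len(content), len(second))  # -> 14 0

        # A BytesIO copy gives later steps (here, the S3 upload) the full
        # payload, independent of the exhausted upload stream.
        file_stream = BytesIO(content)
        print(len(file_stream.getvalue()))  # -> 14


    asyncio.run(main())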
api/router/book.py CHANGED
@@ -84,8 +84,8 @@ async def upload_file(
 
     # Create a new Metadata object
     book_query = BookQuery(user)
-    book_query.add_book(db, title, author, category_id, year, publisher)
-    logging.info("Database Inserted")
+    # book_query.add_book(db, title, author, category_id, year, publisher)
+    # logging.info("Database Inserted")
 
     return {
         "filename": file.filename,
script/document_uploader.py CHANGED
@@ -58,7 +58,7 @@ class Uploader:
 
         # Get metadata
         # documents_with_metadata = self.metadata.apply_metadata(documents)
-        documents_with_metadata = await upload_file(self.reference, self.file)
+        documents_with_metadata, file_stream = await upload_file(self.reference, self.file)
 
         # Get Topic
         # topic_extractor = extract_topic(self.reference, self.content_table)
@@ -85,7 +85,7 @@ class Uploader:
         try:
             nodes_with_metadata = pipeline.run(documents=documents_with_metadata)
             # nodes_with_metadata = splitter.get_nodes_from_documents(documents_with_metadata)
-            return nodes_with_metadata
+            return nodes_with_metadata, file_stream
 
         except Exception as e:
             try:
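process_documents() now returns a (nodes, stream) pair instead of a bare node list, so every exit path, including the except fallback that follows this hunk, has to produce the same two-element shape; a caller that unpacks the result will fail at runtime on any path that still returns a single value. A hypothetical sketch of the new contract (not the repo's code):

    from io import BytesIO

    def process_documents_sketch(ok: bool):
        file_stream = BytesIO(b"pdf bytes")
        if ok:
            return ["node-1", "node-2"], file_stream
        # the fallback path must keep the same tuple shape
        return [], file_stream

    nodes, stream = process_documents_sketch(ok=True)
    print(len(nodes), stream.getvalue())  # -> 2 b'pdf bytes'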
service/aws_loader.py CHANGED
@@ -19,14 +19,14 @@ class Loader:
             region_name="us-west-2",
         )
 
-    def upload_to_s3(self, file, object_name, folder_name="summarizer"):
+    def upload_to_s3(self, file_stream: BytesIO, object_name, folder_name="summarizer"):
         try:
             # If folder_name is provided, prepend it to the object_name
             if folder_name:
                 object_name = f"{folder_name}/{object_name}"
 
             # Open the PDF with PyMuPDF (fitz)
-            pdf_document = fitz.open(stream=file.file.read(), filetype="pdf")
+            pdf_document = fitz.open(stream=file_stream.getvalue(), filetype="pdf")
             print("Number of pages: ", pdf_document.page_count)
             # Loop through each page of the PDF
             for page_num in range(pdf_document.page_count):
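The switch from file.file.read() to file_stream.getvalue() also sidesteps cursor position: BytesIO.getvalue() returns the entire buffer no matter how much has already been read, while read() only returns what is left past the current position, and PyMuPDF's fitz.open(stream=..., filetype="pdf") needs the complete bytes. A quick stdlib illustration:

    from io import BytesIO

    buf = BytesIO(b"%PDF-1.4 dummy")
    buf.read(8)            # an earlier consumer moved the cursor
    print(buf.read())      # -> b' dummy'          (only the remainder)
    print(buf.getvalue())  # -> b'%PDF-1.4 dummy'  (the whole payload)

Rewinding with buf.seek(0) before read() would work too; getvalue() just makes the call position-independent.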
service/reader_v3.py CHANGED
@@ -1,5 +1,6 @@
 import os
 import nest_asyncio
+from io import BytesIO
 
 from llama_parse import LlamaParse
 from llama_index.core.node_parser import SimpleNodeParser
@@ -65,22 +66,22 @@ async def upload_file(reference, file: UploadFile):
     try:
         # Read the binary content of the uploaded file once
        content = await file.read()
+
+        # Store the file content in a BytesIO stream for reuse later
+        file_stream = BytesIO(content)
+
         # Parse the journal
         parsed_documents = parse_journal(content, file.filename)
-        # Extract metadata
-        # metadata_dict = await extract_metadata(content)
-        # print("Metadata Dictionary : \n\n", metadata_dict)
 
+        # Generate metadata
         metadata_gen = Metadata(reference)
         documents_with_metadata = metadata_gen.apply_metadata(parsed_documents)
-
-        # document_with_metadata =
 
-        print("Document with Metadata : \n\n", documents_with_metadata)
-        print("Banyak documents : \n", len(documents_with_metadata))
+        print("Document with Metadata: \n\n", documents_with_metadata)
+        print("Number of documents: \n", len(documents_with_metadata))
 
-        # Return both parsed documents and metadata
-        return documents_with_metadata
+        # Return the parsed documents with metadata and the file stream
+        return documents_with_metadata, file_stream
 
     except Exception as e:
         return JSONResponse(status_code=500, content=f"Error processing file: {e}")
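This hunk sets up the read-once, fan-out pattern: the upload is drained a single time into content, the parser consumes those bytes, and the returned BytesIO hands the caller an independent, seekable copy for the later S3 upload. One trade-off worth noting: BytesIO(content) keeps a full in-memory duplicate, so peak memory roughly doubles for large PDFs. A minimal sketch (dummy bytes stand in for the real upload):

    from io import BytesIO

    content = b"%PDF-1.4 dummy"     # stands in for: await file.read()
    file_stream = BytesIO(content)  # independent, seekable copy

    assert file_stream.getvalue() == content
    file_stream.seek(0)             # rewindable, unlike the UploadFile
    print(file_stream.read(8))      # -> b'%PDF-1.4'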