Spaces:
Sleeping
Sleeping
Commit
·
661a3cb
1
Parent(s):
63d3882
fix: improve uploader
Browse files
- api/function.py +10 -9
- api/router/book.py +2 -2
- script/document_uploader.py +2 -2
- service/aws_loader.py +2 -2
- service/reader_v3.py +10 -9
api/function.py
CHANGED
@@ -29,21 +29,22 @@ async def data_ingestion(reference, file: UploadFile) -> Any:
|
|
29 |
user_id="admin_book_uploaded",
|
30 |
)
|
31 |
|
32 |
-
# # Upload to AWS
|
33 |
-
file_name = f"{reference['title']}"
|
34 |
-
aws_loader = Loader()
|
35 |
-
|
36 |
-
file_obj = file
|
37 |
-
aws_loader.upload_to_s3(file_obj, file_name)
|
38 |
-
|
39 |
uploader = Uploader(reference, file)
|
40 |
-
|
41 |
-
nodes_with_metadata = await uploader.process_documents()
|
42 |
|
43 |
# Build indexes using IndexManager
|
44 |
index = IndexManager()
|
45 |
index.build_indexes(nodes_with_metadata)
|
46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
return json.dumps(
|
48 |
{"status": "success", "message": "Vector Index loaded successfully."}
|
49 |
)
|
|
|
29 |
user_id="admin_book_uploaded",
|
30 |
)
|
31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
uploader = Uploader(reference, file)
|
33 |
+
nodes_with_metadata, file_stream = await uploader.process_documents()
|
|
|
34 |
|
35 |
# Build indexes using IndexManager
|
36 |
index = IndexManager()
|
37 |
index.build_indexes(nodes_with_metadata)
|
38 |
|
39 |
+
|
40 |
+
# # Upload to AWS
|
41 |
+
file_name = f"{reference['title']}"
|
42 |
+
aws_loader = Loader()
|
43 |
+
|
44 |
+
# file_obj = file
|
45 |
+
aws_loader.upload_to_s3(file_stream, file_name)
|
46 |
+
|
47 |
+
|
48 |
return json.dumps(
|
49 |
{"status": "success", "message": "Vector Index loaded successfully."}
|
50 |
)
|
api/router/book.py
CHANGED
@@ -84,8 +84,8 @@ async def upload_file(
|
|
84 |
|
85 |
# Create a new Metadata object
|
86 |
book_query = BookQuery(user)
|
87 |
-
book_query.add_book(db, title, author, category_id, year, publisher)
|
88 |
-
logging.info("Database Inserted")
|
89 |
|
90 |
return {
|
91 |
"filename": file.filename,
|
|
|
84 |
|
85 |
# Create a new Metadata object
|
86 |
book_query = BookQuery(user)
|
87 |
+
# book_query.add_book(db, title, author, category_id, year, publisher)
|
88 |
+
# logging.info("Database Inserted")
|
89 |
|
90 |
return {
|
91 |
"filename": file.filename,
|
script/document_uploader.py
CHANGED
@@ -58,7 +58,7 @@ class Uploader:
|
|
58 |
|
59 |
# Get metadata
|
60 |
# documents_with_metadata = self.metadata.apply_metadata(documents)
|
61 |
-
documents_with_metadata = await upload_file(self.reference, self.file)
|
62 |
|
63 |
# Get Topic
|
64 |
# topic_extractor = extract_topic(self.reference, self.content_table)
|
@@ -85,7 +85,7 @@ class Uploader:
|
|
85 |
try:
|
86 |
nodes_with_metadata = pipeline.run(documents=documents_with_metadata)
|
87 |
# nodes_with_metadata = splitter.get_nodes_from_documents(documents_with_metadata)
|
88 |
-
return nodes_with_metadata
|
89 |
|
90 |
except Exception as e:
|
91 |
try:
|
|
|
58 |
|
59 |
# Get metadata
|
60 |
# documents_with_metadata = self.metadata.apply_metadata(documents)
|
61 |
+
documents_with_metadata, file_stream = await upload_file(self.reference, self.file)
|
62 |
|
63 |
# Get Topic
|
64 |
# topic_extractor = extract_topic(self.reference, self.content_table)
|
|
|
85 |
try:
|
86 |
nodes_with_metadata = pipeline.run(documents=documents_with_metadata)
|
87 |
# nodes_with_metadata = splitter.get_nodes_from_documents(documents_with_metadata)
|
88 |
+
return nodes_with_metadata, file_stream
|
89 |
|
90 |
except Exception as e:
|
91 |
try:
|
service/aws_loader.py
CHANGED
@@ -19,14 +19,14 @@ class Loader:
|
|
19 |
region_name="us-west-2",
|
20 |
)
|
21 |
|
22 |
-
def upload_to_s3(self,
|
23 |
try:
|
24 |
# If folder_name is provided, prepend it to the object_name
|
25 |
if folder_name:
|
26 |
object_name = f"{folder_name}/{object_name}"
|
27 |
|
28 |
# Open the PDF with PyMuPDF (fitz)
|
29 |
-
pdf_document = fitz.open(stream=
|
30 |
print("Jumlah halaman : ", pdf_document.page_count)
|
31 |
# Loop through each page of the PDF
|
32 |
for page_num in range(pdf_document.page_count):
|
|
|
19 |
region_name="us-west-2",
|
20 |
)
|
21 |
|
22 |
+
def upload_to_s3(self, file_stream: BytesIO, object_name, folder_name="summarizer"):
|
23 |
try:
|
24 |
# If folder_name is provided, prepend it to the object_name
|
25 |
if folder_name:
|
26 |
object_name = f"{folder_name}/{object_name}"
|
27 |
|
28 |
# Open the PDF with PyMuPDF (fitz)
|
29 |
+
pdf_document = fitz.open(stream=file_stream.getvalue(), filetype="pdf")
|
30 |
print("Jumlah halaman : ", pdf_document.page_count)
|
31 |
# Loop through each page of the PDF
|
32 |
for page_num in range(pdf_document.page_count):
|
service/reader_v3.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import os
|
2 |
import nest_asyncio
|
|
|
3 |
|
4 |
from llama_parse import LlamaParse
|
5 |
from llama_index.core.node_parser import SimpleNodeParser
|
@@ -65,22 +66,22 @@ async def upload_file(reference, file: UploadFile):
|
|
65 |
try:
|
66 |
# Read the binary content of the uploaded file once
|
67 |
content = await file.read()
|
|
|
|
|
|
|
|
|
68 |
# Parse the journal
|
69 |
parsed_documents = parse_journal(content, file.filename)
|
70 |
-
# Extract metadata
|
71 |
-
# metadata_dict = await extract_metadata(content)
|
72 |
-
# print("Metadata Dictionary : \n\n", metadata_dict)
|
73 |
|
|
|
74 |
metadata_gen = Metadata(reference)
|
75 |
documents_with_metadata = metadata_gen.apply_metadata(parsed_documents)
|
76 |
-
|
77 |
-
# document_with_metadata =
|
78 |
|
79 |
-
print("Document with Metadata
|
80 |
-
print("
|
81 |
|
82 |
-
# Return
|
83 |
-
return documents_with_metadata
|
84 |
|
85 |
except Exception as e:
|
86 |
return JSONResponse(status_code=500, content=f"Error processing file: {e}")
|
|
|
1 |
import os
|
2 |
import nest_asyncio
|
3 |
+
from io import BytesIO
|
4 |
|
5 |
from llama_parse import LlamaParse
|
6 |
from llama_index.core.node_parser import SimpleNodeParser
|
|
|
66 |
try:
|
67 |
# Read the binary content of the uploaded file once
|
68 |
content = await file.read()
|
69 |
+
|
70 |
+
# Store the file content in a BytesIO stream for reuse later
|
71 |
+
file_stream = BytesIO(content)
|
72 |
+
|
73 |
# Parse the journal
|
74 |
parsed_documents = parse_journal(content, file.filename)
|
|
|
|
|
|
|
75 |
|
76 |
+
# Generate metadata
|
77 |
metadata_gen = Metadata(reference)
|
78 |
documents_with_metadata = metadata_gen.apply_metadata(parsed_documents)
|
|
|
|
|
79 |
|
80 |
+
print("Document with Metadata: \n\n", documents_with_metadata)
|
81 |
+
print("Number of documents: \n", len(documents_with_metadata))
|
82 |
|
83 |
+
# Return the parsed documents with metadata and the file stream
|
84 |
+
return documents_with_metadata, file_stream
|
85 |
|
86 |
except Exception as e:
|
87 |
return JSONResponse(status_code=500, content=f"Error processing file: {e}")
|