dsmultimedika committed on
Commit
4a2b28f
·
1 Parent(s): 0743bb0

Update Jurnal Reader

Browse files
core/journal_reading/extractor.py CHANGED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+
4
+
5
class Extractor:
    """Placeholder for the journal-reading extractor.

    The class currently has no state and no methods beyond a no-op
    constructor.
    """

    def __init__(self) -> None:
        """Create an extractor; there is nothing to initialise yet."""
8
+
core/journal_reading/prompt.py ADDED
File without changes
core/journal_reading/upload.py CHANGED
@@ -1,30 +1,85 @@
1
- import tempfile
2
  import os
 
 
3
  from llama_parse import LlamaParse
4
  from llama_index.core.node_parser import SimpleNodeParser
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
 
 
6
 
7
- class JournalUploader:
8
- def __init__(self):
9
- pass
10
-
11
- def parser_journal(self):
12
- if local_file_name is None:
13
- local_file_name = "downloaded_pdf_file.pdf" # Default file name
14
-
15
- try:
16
- # Create a temporary directory to store the file
17
- temp_dir = tempfile.mkdtemp()
18
- file_path = os.path.join(temp_dir, local_file_name)
19
-
20
- with open(file_path, 'wb') as temp_file:
21
- self.s3_client.download_fileobj(self.bucket_name, object_name, temp_file)
22
-
23
- documents = LlamaParse(result_type="markdown").load_data(file_path)
24
-
25
- return documents
26
-
27
- except Exception as e:
28
- # Handle specific exceptions or fallback to generic one
29
- print(f"Error reading PDF file: {e}")
30
- raise RuntimeError(f"Failed to process the uploaded file: {e}")
 
 
1
  import os
2
+ import nest_asyncio
3
+
4
  from llama_parse import LlamaParse
5
  from llama_index.core.node_parser import SimpleNodeParser
6
+ from dotenv import load_dotenv
7
+ from fastapi import UploadFile, HTTPException, File
8
+ import fitz
9
+
10
+ from script.get_metadata import Metadata
11
+
12
+ load_dotenv()
13
+ nest_asyncio.apply()
14
+
15
+
16
async def parse_journal(content: bytes, file_name: str):
    """Parse an uploaded journal into markdown documents using LlamaParse.

    Args:
        content: Raw bytes of the uploaded file.
        file_name: Name of the uploaded file; forwarded to LlamaParse as
            ``extra_info["file_name"]``.

    Returns:
        The documents produced by LlamaParse for ``content``.

    Raises:
        HTTPException: With status 400 when ``content`` is empty or when
            LlamaParse fails for any reason.
    """
    # Reject empty uploads up front instead of spending a remote parse call
    # on input that cannot succeed.
    if not content:
        raise HTTPException(
            status_code=400,
            detail="Error processing file: uploaded file is empty",
        )

    try:
        # The API key is read from the environment at call time.
        parser = LlamaParse(
            api_key=os.getenv("LLAMA_PARSE_API_KEY"),
            result_type="markdown",
            max_timeout=5000,
        )

        # Await the async entry point so the event loop is not blocked while
        # the remote parse job runs (the sync ``load_data`` would block it).
        return await parser.aload_data(
            content, extra_info={"file_name": file_name}
        )

    except Exception as e:
        # Chain the cause so the original traceback is kept for debugging.
        raise HTTPException(
            status_code=400, detail=f"Error processing file: {e}"
        ) from e
35
+
36
+
37
async def extract_metadata(content: bytes):
    """Extract document-level metadata from in-memory PDF content.

    Args:
        content: Raw bytes of a PDF file.

    Returns:
        A dict with the keys ``title``, ``author``, ``subject``,
        ``keywords``, ``creation_date`` and ``modification_date``. Any field
        that is missing or empty in the PDF is reported as ``"N/A"``.

    Raises:
        HTTPException: With status 500 when the content cannot be opened or
            its metadata cannot be read.
    """
    try:
        # Open from memory and close the document deterministically; the
        # metadata dict is read while the document is still open.
        with fitz.open(stream=content, filetype="pdf") as pdf_document:
            metadata = pdf_document.metadata or {}
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Error inputting metadata: {e}"
        ) from e

    # Output field -> PyMuPDF metadata key. PyMuPDF exposes the dates as
    # "creationDate" and "modDate"; it has no "created"/"modified" keys.
    field_sources = (
        ("title", "title"),
        ("author", "author"),
        ("subject", "subject"),
        ("keywords", "keywords"),
        ("creation_date", "creationDate"),
        ("modification_date", "modDate"),
    )

    # ``or`` (not a .get default) so present-but-empty values also fall back
    # to the placeholder.
    return {
        field: metadata.get(source_key) or "N/A"
        for field, source_key in field_sources
    }
60
+
61
+
62
async def upload_file(file: UploadFile = File(...)):
    """Parse an uploaded journal and attach its PDF metadata to the result.

    The upload is read once; the same bytes are handed to ``parse_journal``
    and ``extract_metadata``, and the resulting documents are passed through
    ``Metadata.add_metadata``.

    Args:
        file: The uploaded file.

    Returns:
        ``{"status": "SUCCESS"}`` when every step completes.

    Raises:
        HTTPException: Re-raised unchanged when a helper raises one (so its
            status code is preserved); status 500 for any other failure.
    """
    try:
        # Read the binary content of the uploaded file once
        content = await file.read()
        # Parse the journal
        parsed_documents = await parse_journal(content, file.filename)
        # Extract metadata
        metadata_dict = await extract_metadata(content)

        print("Metadata Dictionary : \n\n", metadata_dict)

        metadata_gen = Metadata(metadata_dict)
        documents_with_metadata = metadata_gen.add_metadata(
            parsed_documents, metadata_dict
        )

        print("Document with Metadata : \n\n", documents_with_metadata)
        print("Banyak documents : \n", len(documents_with_metadata))

        # Only a status flag is returned; the documents are not sent back.
        return {"status": "SUCCESS"}

    except HTTPException:
        # Keep the status code and detail chosen by the helper (e.g. the 400
        # from parsing) instead of re-wrapping it as a 500.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Error processing file: {e}"
        ) from e