Spaces:
Sleeping
Sleeping
import os | |
import nest_asyncio | |
from llama_parse import LlamaParse | |
from llama_index.core.node_parser import SimpleNodeParser | |
from dotenv import load_dotenv | |
from fastapi import UploadFile, HTTPException, File | |
import fitz | |
from script.get_metadata import Metadata | |
load_dotenv() | |
nest_asyncio.apply() | |
async def parse_journal(content: bytes, file_name: str):
    """Parse an uploaded journal into markdown documents using LlamaParse.

    Args:
        content: Raw bytes of the uploaded file.
        file_name: Name of the uploaded file; forwarded to LlamaParse as
            ``extra_info["file_name"]`` together with the in-memory bytes.

    Returns:
        The documents returned by LlamaParse for this file.

    Raises:
        HTTPException: Status 400 if creating the parser or parsing the
            content fails for any reason.
    """
    try:
        # The API key is read from the environment (populated by load_dotenv).
        parser = LlamaParse(
            api_key=os.getenv("LLAMA_PARSE_API_KEY"),
            result_type="markdown",
            max_timeout=5000,
        )
        # Await the async API instead of the synchronous load_data(): a
        # blocking call inside a coroutine stalls the event loop for every
        # other request while the remote parse is running.
        return await parser.aload_data(
            content, extra_info={"file_name": file_name}
        )
    except Exception as e:
        raise HTTPException(
            status_code=400, detail=f"Error processing file: {e}"
        ) from e
async def extract_metadata(content: bytes):
    """Extract basic document metadata from in-memory PDF bytes.

    Args:
        content: Raw bytes of a PDF file.

    Returns:
        A dict with the keys ``title``, ``author``, ``subject``,
        ``keywords``, ``creation_date`` and ``modification_date``. Any field
        that is missing or empty in the PDF is reported as ``"N/A"``.

    Raises:
        HTTPException: Status 500 if the content cannot be opened or read.
    """
    try:
        # Open from memory; the context manager closes the document even if
        # reading the metadata fails, so the handle is not leaked.
        with fitz.open(stream=content, filetype="pdf") as pdf_document:
            metadata = pdf_document.metadata or {}

        # PyMuPDF names the date fields "creationDate" and "modDate".
        # `or "N/A"` also covers fields that are present but empty/None,
        # which a plain .get(key, default) would pass through unchanged.
        return {
            "title": metadata.get("title") or "N/A",
            "author": metadata.get("author") or "N/A",
            "subject": metadata.get("subject") or "N/A",
            "keywords": metadata.get("keywords") or "N/A",
            "creation_date": metadata.get("creationDate") or "N/A",
            "modification_date": metadata.get("modDate") or "N/A",
        }
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Error inputting metadata: {e}"
        ) from e
async def upload_file(file: UploadFile = File(...)):
    """Parse an uploaded file and attach its PDF metadata to the result.

    The file is read once; the same bytes are passed to ``parse_journal``
    and ``extract_metadata``. The extracted metadata is then combined with
    the parsed documents via ``Metadata.add_metadata``.

    Args:
        file: The uploaded file.

    Returns:
        ``{"status": "SUCCESS"}`` when every step completes. The documents
        themselves are only printed, not returned.

    Raises:
        HTTPException: Propagated unchanged from ``parse_journal`` (400) or
            ``extract_metadata`` (500); status 500 for any other failure.
    """
    try:
        # Read the binary content of the uploaded file once.
        content = await file.read()

        parsed_documents = await parse_journal(content, file.filename)

        metadata_dict = await extract_metadata(content)
        print("Metadata Dictionary : \n\n", metadata_dict)

        metadata_gen = Metadata(metadata_dict)
        documents_with_metadata = metadata_gen.add_metadata(
            parsed_documents, metadata_dict
        )
        print("Document with Metadata : \n\n", documents_with_metadata)
        print("Banyak documents : \n", len(documents_with_metadata))

        # Only a status flag is returned to the caller.
        return {"status": "SUCCESS"}
    except HTTPException:
        # Keep the status code and detail chosen by the helper instead of
        # re-wrapping it as a generic 500.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Error processing file: {e}"
        ) from e