import os
from io import BytesIO

import fitz  # PyMuPDF
import nest_asyncio
from dotenv import load_dotenv
from fastapi import UploadFile, File
from fastapi.responses import JSONResponse
from llama_index.core.node_parser import SimpleNodeParser
from llama_parse import LlamaParse

from script.get_metadata import Metadata

load_dotenv()
nest_asyncio.apply()


def parse_journal(content: bytes, file_name: str):
    """Parse the journal using LlamaParse."""
    try:
        # Initialize the parser (vendor multimodal options left disabled)
        parser = LlamaParse(
            api_key=os.getenv("LLAMA_PARSE_API_KEY"),
            result_type="markdown",
            # use_vendor_multimodal_model=True,
            # vendor_multimodal_model_name="openai-gpt-4o-mini",
        )

        # Load and process the document; LlamaParse uses the file name in
        # extra_info to infer the file type when it is handed raw bytes
        llama_parse_documents = parser.load_data(
            content, extra_info={"file_name": file_name}
        )
        return llama_parse_documents
    except Exception as e:
        return JSONResponse(status_code=400, content=f"Error processing file: {e}")


async def extract_metadata(content: bytes):
    """Extract metadata from the PDF content."""
    try:
        # Open the binary content with PyMuPDF; filetype tells it the format
        pdf_document = fitz.open(stream=content, filetype="pdf")

        # PyMuPDF exposes document metadata as a plain dict
        metadata = pdf_document.metadata

        # Build the metadata dictionary, falling back to "N/A" for fields
        # PyMuPDF leaves empty (its keys are "creationDate" and "modDate")
        metadata_dict = {
            "title": metadata.get("title") or "N/A",
            "author": metadata.get("author") or "N/A",
            "subject": metadata.get("subject") or "N/A",
            "keywords": metadata.get("keywords") or "N/A",
            "creation_date": metadata.get("creationDate") or "N/A",
            "modification_date": metadata.get("modDate") or "N/A",
        }
        return metadata_dict
    except Exception as e:
        return JSONResponse(status_code=500, content=f"Error extracting metadata: {e}")
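

# Standalone check (illustrative sketch, not part of the original module):
# `extract_metadata` is not invoked by `upload_file` below, but it can be
# exercised directly with raw PDF bytes. "sample.pdf" is a hypothetical path.
#
#   import asyncio
#   with open("sample.pdf", "rb") as f:
#       print(asyncio.run(extract_metadata(f.read())))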


async def upload_file(reference, file: UploadFile):
    try:
        # Read the binary content of the uploaded file once
        content = await file.read()

        # Keep the raw bytes in a BytesIO stream so they can be reused later
        file_stream = BytesIO(content)

        # Parse the journal
        parsed_documents = parse_journal(content, file.filename)

        # Attach reference metadata to the parsed documents
        metadata_gen = Metadata(reference)
        documents_with_metadata = metadata_gen.apply_metadata(parsed_documents)
        print("Document with Metadata: \n\n", documents_with_metadata)
        print("Number of documents: \n", len(documents_with_metadata))

        # Return the parsed documents with metadata and the file stream
        return documents_with_metadata, file_stream
    except Exception as e:
        return JSONResponse(status_code=500, content=f"Error processing file: {e}")
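

# --- Usage sketch ---
# A minimal, assumed wiring of `upload_file` into a FastAPI route. The route
# path and the empty `reference` payload are hypothetical placeholders; in the
# real app, `reference` presumably carries the citation data that `Metadata`
# expects.
from fastapi import FastAPI

app = FastAPI()


@app.post("/upload")  # hypothetical route path
async def upload_endpoint(file: UploadFile = File(...)):
    reference = {}  # placeholder; supply real reference data here
    result = await upload_file(reference, file)
    if isinstance(result, JSONResponse):
        # upload_file returns a JSONResponse when something went wrong
        return result
    documents_with_metadata, _file_stream = result
    return {"num_documents": len(documents_with_metadata)}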