# NOTE(review): removed non-code page-scrape residue (web UI header, commit
# hashes, and a line-number gutter) that preceded the module and made the
# file syntactically invalid Python.
import os
import nest_asyncio
from io import BytesIO
from llama_parse import LlamaParse
from llama_index.core.node_parser import SimpleNodeParser
from dotenv import load_dotenv
from fastapi import UploadFile, File
from fastapi.responses import JSONResponse
import fitz
from script.get_metadata import Metadata
load_dotenv()
nest_asyncio.apply()
def parse_journal(content: bytes, file_name: str):
    """Parse raw journal bytes with LlamaParse and return the parsed documents.

    Args:
        content: Raw bytes of the uploaded journal file.
        file_name: Original filename, forwarded to the parser as extra info.

    Returns:
        The list of documents produced by LlamaParse, or a JSONResponse
        with status 400 if parsing raises.
    """
    try:
        # Configure the parser to emit markdown; the API key comes from the
        # environment (loaded via dotenv at import time).
        journal_parser = LlamaParse(
            api_key=os.getenv("LLAMA_PARSE_API_KEY"),
            result_type="markdown",
            # use_vendor_multimodal_model=True,
            # vendor_multimodal_model_name="openai-gpt-4o-mini",
        )
        # Hand the raw bytes straight to the parser along with the filename.
        return journal_parser.load_data(
            content, extra_info={"file_name": file_name}
        )
    except Exception as e:
        # NOTE(review): returning a JSONResponse from a non-endpoint helper means
        # callers receive it in place of the document list — verify callers handle it.
        return JSONResponse(status_code=400, content=f"Error processing file: {e}")
async def extract_metadata(content: bytes):
    """Extract document metadata from raw PDF bytes.

    Args:
        content: Raw bytes of a PDF file.

    Returns:
        dict: title/author/subject/keywords and creation/modification dates,
        each defaulting to "N/A" when the PDF does not provide the key.
        On failure, a JSONResponse with status 500 is returned instead.
    """
    try:
        # Open the binary content with PyMuPDF; "pdf" pins the input format.
        pdf_document = fitz.open("pdf", content)
        try:
            metadata = pdf_document.metadata
        finally:
            # Always release the document handle (the original leaked it).
            pdf_document.close()
        # BUG FIX: PyMuPDF exposes the dates under "creationDate" and
        # "modDate" — the previous keys "created"/"modified" never exist,
        # so both fields were always "N/A".
        return {
            "title": metadata.get("title", "N/A"),
            "author": metadata.get("author", "N/A"),
            "subject": metadata.get("subject", "N/A"),
            "keywords": metadata.get("keywords", "N/A"),
            "creation_date": metadata.get("creationDate", "N/A"),
            "modification_date": metadata.get("modDate", "N/A"),
        }
    except Exception as e:
        return JSONResponse(status_code=500, content=f"Error inputting metadata: {e}")
async def upload_file(reference, file: UploadFile):
    """Read an uploaded file, parse it, and attach reference metadata.

    Args:
        reference: Reference data handed to the Metadata generator.
        file: The FastAPI upload to process.

    Returns:
        Tuple of (documents with metadata, BytesIO copy of the upload),
        or a JSONResponse with status 500 if any step raises.
    """
    try:
        # Consume the upload exactly once.
        raw_bytes = await file.read()
        # Keep an in-memory copy so the content can be re-read downstream.
        stream_copy = BytesIO(raw_bytes)
        # Parse the journal content.
        parsed_documents = parse_journal(raw_bytes, file.filename)
        # Enrich the parsed documents with metadata derived from `reference`.
        documents_with_metadata = Metadata(reference).apply_metadata(parsed_documents)
        print("Document with Metadata: \n\n", documents_with_metadata)
        print("Number of documents: \n", len(documents_with_metadata))
        # Hand back the enriched documents together with the reusable stream.
        return documents_with_metadata, stream_copy
    except Exception as e:
        return JSONResponse(status_code=500, content=f"Error processing file: {e}")