# Bot_Development/service/reader_v3.py
import os
import nest_asyncio
from io import BytesIO

from llama_parse import LlamaParse
from llama_index.core.node_parser import SimpleNodeParser
from dotenv import load_dotenv
from fastapi import UploadFile, File
from fastapi.responses import JSONResponse

import fitz  # PyMuPDF

from script.get_metadata import Metadata

load_dotenv()
nest_asyncio.apply()

def parse_journal(content: bytes, file_name: str):
    """Parse the journal using LlamaParse."""
    try:
        # Initialize the parser
        parser = LlamaParse(
            api_key=os.getenv("LLAMA_PARSE_API_KEY"),
            result_type="markdown",
            # use_vendor_multimodal_model=True,
            # vendor_multimodal_model_name="openai-gpt-4o-mini",
        )

        # Load and process the document
        llama_parse_documents = parser.load_data(
            content, extra_info={"file_name": file_name}
        )
        return llama_parse_documents

    except Exception as e:
        return JSONResponse(status_code=400, content=f"Error processing file: {e}")
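
# Usage sketch (illustrative, not part of the original module): parse_journal
# expects the raw bytes of the file plus its original file name, e.g.
#
#     with open("paper.pdf", "rb") as f:   # "paper.pdf" is a placeholder path
#         docs = parse_journal(f.read(), "paper.pdf")
#
# On success it returns a list of LlamaParse Document objects; on failure it
# returns a JSONResponse describing the error.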

async def extract_metadata(content: bytes):
    """Extract metadata from the PDF content."""
    try:
        # Open the binary content with PyMuPDF
        pdf_document = fitz.open("pdf", content)  # "pdf" specifies the format

        # Extract metadata, then release the document handle
        metadata = pdf_document.metadata
        pdf_document.close()

        # Prepare metadata dictionary with default values for missing fields.
        # PyMuPDF uses the keys "creationDate" and "modDate" and returns empty
        # strings for absent fields, so fall back to "N/A" explicitly.
        metadata_dict = {
            "title": metadata.get("title") or "N/A",
            "author": metadata.get("author") or "N/A",
            "subject": metadata.get("subject") or "N/A",
            "keywords": metadata.get("keywords") or "N/A",
            "creation_date": metadata.get("creationDate") or "N/A",
            "modification_date": metadata.get("modDate") or "N/A",
        }
        return metadata_dict

    except Exception as e:
        return JSONResponse(status_code=500, content=f"Error extracting metadata: {e}")
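
# Usage sketch (illustrative): extract_metadata is a coroutine and must be
# awaited from an async context, e.g.
#
#     meta = await extract_metadata(pdf_bytes)   # pdf_bytes: raw PDF content
#     # -> {"title": ..., "author": ..., "subject": ..., "keywords": ...,
#     #     "creation_date": ..., "modification_date": ...}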

async def upload_file(reference, file: UploadFile):
    try:
        # Read the binary content of the uploaded file once
        content = await file.read()

        # Store the file content in a BytesIO stream for reuse later
        file_stream = BytesIO(content)

        # Parse the journal
        parsed_documents = parse_journal(content, file.filename)

        # parse_journal returns a JSONResponse on failure; propagate it instead
        # of passing it on to the metadata step
        if isinstance(parsed_documents, JSONResponse):
            return parsed_documents

        # Generate metadata
        metadata_gen = Metadata(reference)
        documents_with_metadata = metadata_gen.apply_metadata(parsed_documents)

        print("Document with Metadata: \n\n", documents_with_metadata)
        print("Number of documents: \n", len(documents_with_metadata))

        # Return the parsed documents with metadata and the file stream
        return documents_with_metadata, file_stream

    except Exception as e:
        return JSONResponse(status_code=500, content=f"Error processing file: {e}")
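
# Wiring sketch (assumptions, not part of the original module): upload_file is
# meant to be called from a FastAPI route. The router, URL path, and the shape
# of `reference` below are hypothetical and shown for illustration only.
#
#     from fastapi import APIRouter, UploadFile
#
#     router = APIRouter()
#
#     @router.post("/upload")
#     async def upload_endpoint(file: UploadFile):
#         reference = {"source": file.filename}   # hypothetical reference payload
#         result = await upload_file(reference, file)
#         if isinstance(result, JSONResponse):    # error path
#             return result
#         documents, file_stream = result
#         return {"num_documents": len(documents)}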