beyondrag / preprocess_document.py
giulio98's picture
Update app.py
b5ac9e4
raw
history blame
1.22 kB
from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType
# Import required classes for building a custom converter
from docling.document_converter import DocumentConverter, PdfFormatOption, InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
import spaces
@spaces.GPU
def convert_to_markdown(file_objs, url, do_ocr: bool, do_table_structure: bool) -> str:
    """Convert a PDF to Markdown with a custom-configured Docling converter.

    Args:
        file_objs: Source handed to ``DoclingLoader`` as ``file_path``. Used
            whenever it is not ``None``; presumably a local path (or list of
            paths) coming from an upload widget -- confirm against the caller.
        url: Fallback source used only when ``file_objs`` is ``None``.
        do_ocr: Stored on ``PdfPipelineOptions.do_ocr``.
        do_table_structure: Stored on ``PdfPipelineOptions.do_table_structure``.

    Returns:
        The ``page_content`` of the first document produced by the loader.

    Raises:
        ValueError: If the selected source is missing or empty (e.g. no file
            was given and the URL is ``None`` or an empty string).
    """
    # An explicitly provided file always takes precedence over the URL.
    file_path = file_objs if file_objs is not None else url

    # Fail fast with a clear message instead of letting None / "" / [] travel
    # into the loader, where it surfaces as an opaque conversion error or as
    # an IndexError on the final ``docs[0]``.
    if not file_path:
        raise ValueError(
            "No input document: provide either a file or a non-empty URL."
        )

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = do_ocr
    pipeline_options.do_table_structure = do_table_structure

    pdf_format_options = PdfFormatOption(
        pipeline_options=pipeline_options,
        backend=PyPdfiumDocumentBackend,
    )

    # Only PDF input is allowed; the options above apply to that format.
    doc_converter = DocumentConverter(
        allowed_formats=[InputFormat.PDF],
        format_options={
            InputFormat.PDF: pdf_format_options,
        },
    )

    # Pass the custom converter to the DoclingLoader.
    loader = DoclingLoader(
        file_path=file_path,
        export_type=ExportType.MARKDOWN,
        converter=doc_converter,
    )
    docs = loader.load()

    # NOTE: only the first loaded document is returned; any further
    # documents produced by the loader are ignored.
    return docs[0].page_content