|
from langchain_docling import DoclingLoader |
|
from langchain_docling.loader import ExportType |
|
|
|
|
|
from docling.document_converter import DocumentConverter, PdfFormatOption, InputFormat |
|
from docling.datamodel.pipeline_options import PdfPipelineOptions |
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend |
|
import spaces |
|
|
|
@spaces.GPU
def convert_to_markdown(file_objs, url, do_ocr, do_table_structure):
    """Convert PDF input (uploaded file(s) or a URL) to Markdown via Docling.

    Args:
        file_objs: Path, or iterable of paths, of the uploaded file(s). Used
            in preference to ``url`` whenever it is non-empty.
        url: Source to convert when no file is supplied.
        do_ocr: Value assigned to ``PdfPipelineOptions.do_ocr``.
        do_table_structure: Value assigned to
            ``PdfPipelineOptions.do_table_structure``.

    Returns:
        The Markdown text of the loaded document. When several documents are
        loaded (e.g. multiple files), their Markdown is joined with a blank
        line instead of silently dropping all but the first.

    Raises:
        ValueError: If neither ``file_objs`` nor ``url`` is provided.
    """
    # Truthiness (not `is not None`) so that an empty upload list or an empty
    # string falls back to the URL instead of being handed to the loader.
    source = file_objs or url
    if not source:
        raise ValueError("Provide a file or a URL to convert.")

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = do_ocr
    pipeline_options.do_table_structure = do_table_structure

    # Restrict the converter to PDF and parse with the pypdfium2 backend.
    doc_converter = DocumentConverter(
        allowed_formats=[InputFormat.PDF],
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
                backend=PyPdfiumDocumentBackend,
            ),
        },
    )

    loader = DoclingLoader(
        file_path=source,
        export_type=ExportType.MARKDOWN,
        converter=doc_converter,
    )
    docs = loader.load()

    # Identical to `docs[0].page_content` for a single document, but keeps
    # every document when more than one source was converted.
    return "\n\n".join(doc.page_content for doc in docs)
|
|