File size: 1,223 Bytes
b5ac9e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType

# Import required classes for building a custom converter
from docling.document_converter import DocumentConverter, PdfFormatOption, InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
import spaces

@spaces.GPU
def convert_to_markdown(file_objs, url, do_ocr, do_table_structure) -> str:
    """Convert PDF input to Markdown using a custom Docling converter.

    The input is taken from ``file_objs`` when it is non-empty, otherwise
    from ``url``. A ``DocumentConverter`` restricted to PDF input is built
    with the requested pipeline options and the pypdfium2 backend, then
    handed to ``DoclingLoader`` with Markdown export.

    Args:
        file_objs: Source passed to the loader as ``file_path``.
            NOTE(review): presumably a path or list of paths from an
            upload widget -- confirm against the caller.
        url: Fallback source used when ``file_objs`` is ``None`` or empty.
        do_ocr: Value assigned to ``PdfPipelineOptions.do_ocr``.
        do_table_structure: Value assigned to
            ``PdfPipelineOptions.do_table_structure``.

    Returns:
        The Markdown content of every loaded document, separated by a
        blank line. For a single document this is exactly its
        ``page_content``; if the loader yields nothing, an empty string.

    Raises:
        ValueError: If neither ``file_objs`` nor ``url`` is provided.
    """
    # Fall back to the URL when no file was supplied (None or empty).
    source = file_objs if file_objs else url
    if not source:
        raise ValueError("Provide either a file or a URL to convert.")

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = do_ocr
    pipeline_options.do_table_structure = do_table_structure

    doc_converter = DocumentConverter(
        allowed_formats=[InputFormat.PDF],
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
                backend=PyPdfiumDocumentBackend,
            ),
        },
    )

    # Pass the custom converter to the DoclingLoader.
    loader = DoclingLoader(
        file_path=source,
        export_type=ExportType.MARKDOWN,
        converter=doc_converter,
    )
    docs = loader.load()

    # Join every document instead of indexing docs[0]: this avoids an
    # IndexError on an empty result and keeps the output of all inputs
    # when more than one source was given.
    return "\n\n".join(doc.page_content for doc in docs)