Spaces:
Running
on
Zero
Running
on
Zero
File size: 1,223 Bytes
b5ac9e4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType
# Import required classes for building a custom converter
from docling.document_converter import DocumentConverter, PdfFormatOption, InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
import spaces
@spaces.GPU
def convert_to_markdown(
    file_objs, url, do_ocr: bool, do_table_structure: bool
) -> str:
    """Convert PDF input to Markdown with a custom Docling converter.

    Args:
        file_objs: Preferred input, forwarded unchanged to ``DoclingLoader``
            as ``file_path`` (a single source or an iterable of sources).
            Used whenever it is non-empty.
        url: Fallback source, used when ``file_objs`` is ``None`` or empty.
        do_ocr: Value assigned to ``PdfPipelineOptions.do_ocr``.
        do_table_structure: Value assigned to
            ``PdfPipelineOptions.do_table_structure``.

    Returns:
        The Markdown export of every loaded document. When several inputs
        are converted, their Markdown is joined with a blank line, so a
        single input yields exactly that document's Markdown.

    Raises:
        ValueError: If neither ``file_objs`` nor ``url`` provides a source.
    """
    # An empty upload list or empty string must not shadow a usable URL.
    source = file_objs if file_objs else url
    if not source:
        raise ValueError("Provide either a file or a URL to convert.")

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = do_ocr
    pipeline_options.do_table_structure = do_table_structure

    # Only PDF input is accepted, parsed through the pypdfium2 backend.
    doc_converter = DocumentConverter(
        allowed_formats=[InputFormat.PDF],
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
                backend=PyPdfiumDocumentBackend,
            ),
        },
    )

    # Pass the custom converter to the DoclingLoader.
    loader = DoclingLoader(
        file_path=source,
        export_type=ExportType.MARKDOWN,
        converter=doc_converter,
    )

    # The loader yields one document per input in MARKDOWN mode; keep all of
    # them rather than only the first so multi-file input is not truncated.
    return "\n\n".join(doc.page_content for doc in loader.load())
|