Spaces:

giulio98
/

beyondrag

Runtime error

beyondrag / preprocess_document.py

Update app.py

b5ac9e4 4 months ago

1.22 kB

	from langchain_docling import DoclingLoader
	from langchain_docling.loader import ExportType

	# Import required classes for building a custom converter
	from docling.document_converter import DocumentConverter, PdfFormatOption, InputFormat
	from docling.datamodel.pipeline_options import PdfPipelineOptions
	from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
	import spaces

	@spaces.GPU
	def convert_to_markdown(file_objs, url, do_ocr, do_table_structure):
	    """Convert a PDF, given as an upload or as a URL, to Markdown text.

	    Args:
	        file_objs: Primary document source. Used whenever it is not
	            ``None``; it is handed to ``DoclingLoader`` as ``file_path``.
	        url: Fallback document source, used only when ``file_objs`` is
	            ``None``.
	        do_ocr: Value assigned to ``PdfPipelineOptions.do_ocr``.
	        do_table_structure: Value assigned to
	            ``PdfPipelineOptions.do_table_structure``.

	    Returns:
	        The ``page_content`` of the first document produced by the loader.

	    Raises:
	        ValueError: If no usable source was supplied (the selected source
	            is ``None``, an empty string, or an empty list/tuple).
	    """
	    # An uploaded file takes precedence over the URL.
	    source = file_objs if file_objs is not None else url

	    # Fail early with a clear message instead of letting a missing source
	    # surface as an obscure error inside the loader, or as an IndexError on
	    # the final ``docs[0]`` lookup.
	    if source is None or (isinstance(source, (str, list, tuple)) and not source):
	        raise ValueError(
	            "No document to convert: provide an uploaded file or a URL."
	        )

	    pipeline_options = PdfPipelineOptions()
	    pipeline_options.do_ocr = do_ocr
	    pipeline_options.do_table_structure = do_table_structure

	    # The converter only accepts PDF input and parses it with the pypdfium2
	    # backend, configured by the pipeline options above.
	    pdf_format_options = PdfFormatOption(
	        pipeline_options=pipeline_options,
	        backend=PyPdfiumDocumentBackend,
	    )
	    doc_converter = DocumentConverter(
	        allowed_formats=[InputFormat.PDF],
	        format_options={InputFormat.PDF: pdf_format_options},
	    )

	    # Pass the custom converter to the DoclingLoader.
	    loader = DoclingLoader(
	        file_path=source,
	        export_type=ExportType.MARKDOWN,
	        converter=doc_converter,
	    )
	    docs = loader.load()

	    # NOTE(review): only the first loaded document is returned; if
	    # ``file_objs`` can ever hold several files, the remaining documents
	    # are dropped here -- confirm against the caller.
	    return docs[0].page_content