|
from loguru import logger |
|
|
|
from magic_pdf.libs.drop_reason import DropReason |
|
|
|
|
|
def get_data_source(jso: dict): |
|
data_source = jso.get("data_source") |
|
if data_source is None: |
|
data_source = jso.get("file_source") |
|
return data_source |
|
|
|
|
|
def get_data_type(jso: dict): |
|
data_type = jso.get("data_type") |
|
if data_type is None: |
|
data_type = jso.get("file_type") |
|
return data_type |
|
|
|
|
|
def get_bookid(jso: dict): |
|
book_id = jso.get("bookid") |
|
if book_id is None: |
|
book_id = jso.get("original_file_id") |
|
return book_id |
|
|
|
|
|
def exception_handler(jso: dict, e): |
|
logger.exception(e) |
|
jso["_need_drop"] = True |
|
jso["_drop_reason"] = DropReason.Exception |
|
jso["_exception"] = f"ERROR: {e}" |
|
return jso |
|
|
|
|
|
def get_bookname(jso: dict): |
|
data_source = get_data_source(jso) |
|
file_id = jso.get("file_id") |
|
book_name = f"{data_source}/{file_id}" |
|
return book_name |
|
|
|
|
|
def spark_json_extractor(jso: dict) -> dict: |
|
|
|
""" |
|
从json中提取数据,返回一个dict |
|
""" |
|
|
|
return { |
|
"_pdf_type": jso["_pdf_type"], |
|
"model_list": jso["doc_layout_result"], |
|
} |
|
|