|
""" |
|
这里实现2个click命令: |
|
第一个: |
|
接收一个完整的s3路径,例如:s3://llm-pdf-text/pdf_ebook_and_paper/pre-clean-mm-markdown/v014/part-660420b490be-000008.jsonl?bytes=0,81350 |
|
1)根据~/magic-pdf.json里的ak,sk等,构造s3cliReader读取到这个jsonl的对应行,返回json对象。 |
|
2)根据Json对象里的pdf的s3路径获取到他的ak,sk,endpoint,构造出s3cliReader用来读取pdf |
|
3)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalImageWriter,用来保存截图 |
|
4)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件 |
|
|
|
最后把以上步骤准备好的对象传入真正的解析API |
|
|
|
第二个: |
|
接收1)pdf的本地路径。2)模型json文件(可选)。然后: |
|
1)根据~/magic-pdf.json读取到本地保存图片、md等临时目录的位置,构造出LocalImageWriter,用来保存截图 |
|
2)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件 |
|
3)根据约定,根据pdf本地路径,推导出pdf模型的json,并读入 |
|
|
|
|
|
效果: |
|
python magicpdf.py json-command --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350 |
|
python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json 或者 python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf
|
""" |
|
|
|
import copy
import csv
import json as json_parse
import os
import sys
from pathlib import Path

import click
from loguru import logger

from magic_pdf.libs.version import __version__
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
from magic_pdf.libs.path_utils import (
    parse_s3path,
    parse_s3_range_params,
    remove_non_official_s3_args,
)
from magic_pdf.libs.config_reader import (
    get_local_dir,
    get_s3_config,
)
from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
import magic_pdf.model as model_config
|
|
|
# Parse strategies accepted by the --method option of every sub-command.
parse_pdf_methods = click.Choice(choices=["ocr", "txt", "auto"])
|
|
|
|
|
def prepare_env(pdf_file_name, method):
    """Create the local output directories for one PDF / parse-method pair.

    The output root is ``<local dir>/magic-pdf/<pdf_file_name>/<method>``,
    where ``<local dir>`` is whatever ``get_local_dir()`` returns, and images
    go into an ``images`` sub-directory of that root.

    Returns:
        A ``(image_dir, md_dir)`` tuple of path strings. Both directories
        exist when the function returns.
    """
    output_root = os.path.join(get_local_dir(), "magic-pdf", pdf_file_name, method)
    images_root = os.path.join(str(output_root), "images")
    for directory in (images_root, output_root):
        os.makedirs(directory, exist_ok=True)
    return images_root, output_root
|
|
|
|
|
def write_to_csv(csv_file_path, csv_data):
    """Append ``csv_data`` as a single row to the CSV file at ``csv_file_path``.

    The file is opened in append mode as UTF-8 (so it is created when it does
    not exist yet). An info message is logged after the file has been closed.
    """
    with open(csv_file_path, mode="a", newline="", encoding="utf-8") as out_file:
        csv.writer(out_file).writerow(csv_data)
    logger.info(f"数据已成功追加到 '{csv_file_path}'")
|
|
|
|
|
def do_parse(
    pdf_file_name,
    pdf_bytes,
    model_list,
    parse_method,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=True,
    f_make_md_mode=MakeMode.MM_MD,
):
    """Parse one PDF and write the requested artifacts to the local output dir.

    Args:
        pdf_file_name: Base name used for the output directory and as the
            prefix of every generated file.
        pdf_bytes: Raw bytes of the PDF.
        model_list: Model output for the PDF. When it is empty the built-in
            model is run instead, provided
            ``model_config.__use_inside_model__`` is truthy.
        parse_method: ``"auto"``, ``"txt"`` or ``"ocr"``.
        f_draw_span_bbox: Call ``draw_span_bbox`` on the parse result.
        f_draw_layout_bbox: Call ``draw_layout_bbox`` on the parse result.
        f_dump_md: Write ``<pdf_file_name>.md``.
        f_dump_middle_json: Write ``<pdf_file_name>_middle.json``.
        f_dump_model_json: Write ``<pdf_file_name>_model.json``.
        f_dump_orig_pdf: Write ``<pdf_file_name>_origin.pdf``.
        f_dump_content_list: Write ``<pdf_file_name>_content_list.json``.
        f_make_md_mode: ``MakeMode`` value forwarded to ``pipe_mk_markdown``.

    Raises:
        SystemExit: With status 1 when ``parse_method`` is unknown, or when
            ``model_list`` is empty and the built-in model is disabled.
    """
    # Independent copy used for the *_model.json dump.
    # NOTE(review): presumably taken because the pipes may mutate
    # ``model_list`` in place - confirm against the pipe implementations.
    orig_model_list = copy.deepcopy(model_list)

    local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method)
    logger.info(f"local output dir is {local_md_dir}")
    image_writer = DiskReaderWriter(local_image_dir)
    md_writer = DiskReaderWriter(local_md_dir)
    # Only the directory *name* is handed to the markdown / content-list builders.
    image_dir = str(os.path.basename(local_image_dir))

    if parse_method == "auto":
        jso_useful_key = {"_pdf_type": "", "model_list": model_list}
        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
    elif parse_method == "txt":
        pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
    elif parse_method == "ocr":
        pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
    else:
        logger.error("unknown parse method")
        # sys.exit rather than the site-provided exit(): the latter is an
        # interactive helper that is missing under ``python -S`` / frozen builds.
        sys.exit(1)

    pipe.pipe_classify()

    # Without usable model data, fall back to the built-in model.
    if len(model_list) == 0:
        if not model_config.__use_inside_model__:
            logger.error("need model list input")
            sys.exit(1)
        pipe.pipe_analyze()
        orig_model_list = copy.deepcopy(pipe.model_list)

    pipe.pipe_parse()
    pdf_info = pipe.pdf_mid_data["pdf_info"]
    if f_draw_layout_bbox:
        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
    if f_draw_span_bbox:
        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)

    def dump_text(content, suffix):
        # Write a text artifact named "<pdf_file_name><suffix>" into the output dir.
        md_writer.write(
            content=content,
            path=f"{pdf_file_name}{suffix}",
            mode=AbsReaderWriter.MODE_TXT,
        )

    def dump_json(obj, suffix):
        # Serialize ``obj`` as pretty-printed, non-ASCII-escaped JSON and write it.
        dump_text(json_parse.dumps(obj, ensure_ascii=False, indent=4), suffix)

    # The markdown is built even when it is not dumped, exactly as before.
    md_content = pipe.pipe_mk_markdown(
        image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode
    )
    if f_dump_md:
        dump_text(md_content, ".md")

    if f_dump_middle_json:
        dump_json(pipe.pdf_mid_data, "_middle.json")

    if f_dump_model_json:
        dump_json(orig_model_list, "_model.json")

    if f_dump_orig_pdf:
        md_writer.write(
            content=pdf_bytes,
            path=f"{pdf_file_name}_origin.pdf",
            mode=AbsReaderWriter.MODE_BIN,
        )

    # Likewise built unconditionally, as in the original flow.
    content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
    if f_dump_content_list:
        dump_json(content_list, "_content_list.json")
|
|
|
|
|
@click.group()
@click.version_option(__version__, "--version", "-v", help="显示版本信息")
@click.help_option("--help", "-h", help="显示帮助信息")
def cli():
    # Root command group: it only carries the shared --version/-v and
    # --help/-h options; the sub-commands registered on it do the work.
    # Documented with a comment on purpose - click would print a docstring
    # as the group's help text.
    return None
|
|
|
|
|
def _read_s3_path(s3path):
    """Read an S3 object, or a byte range of it, and return the raw bytes.

    Credentials and endpoint are looked up per bucket via ``get_s3_config``.
    When ``parse_s3_range_params`` yields exactly two values they are treated
    as ``(offset, length)`` and converted to an inclusive end offset;
    otherwise the read starts at byte 0 with no end limit.
    """
    bucket, _key = parse_s3path(s3path)
    s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
    object_path = remove_non_official_s3_args(s3path)
    s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, "auto", object_path)

    range_params = parse_s3_range_params(s3path)
    if range_params is None or len(range_params) != 2:
        byte_start, byte_end = 0, None
    else:
        byte_start, length = int(range_params[0]), int(range_params[1])
        byte_end = byte_start + length - 1
    return s3_rw.read_jsonl(
        object_path,
        byte_start,
        byte_end,
        AbsReaderWriter.MODE_BIN,
    )


def _parse_jsonl_record(jso, method):
    """Download the PDF referenced by one JSONL record from S3 and parse it.

    The PDF location is taken from ``jso["file_location"]``, falling back to
    ``jso["path"]``; the model output is read from ``jso["doc_layout_result"]``.

    Raises:
        ValueError: If the record carries neither location key.
    """
    s3_file_path = jso.get("file_location")
    if s3_file_path is None:
        s3_file_path = jso.get("path")
    if s3_file_path is None:
        raise ValueError("record has neither 'file_location' nor 'path'")

    pdf_file_name = Path(s3_file_path).stem
    pdf_data = _read_s3_path(s3_file_path)
    do_parse(
        pdf_file_name,
        pdf_data,
        jso["doc_layout_result"],
        method,
    )


@cli.command()
@click.option("--json", type=str, help="输入一个S3路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
@click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试")
@click.option("--model_mode", type=click.STRING, default="full",
              help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
def json_command(json, method, inside_model, model_mode):
    # Parse the PDF described by a single JSONL record stored on S3.
    # ``json`` is an s3:// path, optionally with a byte-range suffix that
    # selects the record. (Comment instead of docstring: click would print a
    # docstring as the command's help text.)
    model_config.__use_inside_model__ = inside_model
    model_config.__model_mode__ = model_mode

    # --json is not a required option, so ``json`` is None when it is omitted;
    # treat that like any other non-S3 value instead of crashing on None.
    if not json or not json.startswith("s3://"):
        logger.error("usage: magic-pdf json-command --json s3://some_bucket/some_path")
        sys.exit(1)

    jso = json_parse.loads(_read_s3_path(json).decode("utf-8"))
    _parse_jsonl_record(jso, method)


@cli.command()
@click.option("--local_json", type=str, help="输入一个本地jsonl路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
@click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试")
@click.option("--model_mode", type=click.STRING, default="full",
              help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
def local_json_command(local_json, method, inside_model, model_mode):
    # Parse every PDF referenced by a local JSONL file, one record per line;
    # the PDFs themselves are still fetched from S3. (Comment instead of
    # docstring: click would print a docstring as the command's help text.)
    model_config.__use_inside_model__ = inside_model
    model_config.__model_mode__ = model_mode

    # --local_json is not a required option; fail with a message rather than
    # letting open(None) raise a TypeError.
    if not local_json:
        logger.error("Error: Missing option '--local_json'.")
        sys.exit(1)

    with open(local_json, "r", encoding="utf-8") as f:
        for json_line in f:
            # Blank lines (e.g. a trailing newline) are not records.
            if not json_line.strip():
                continue
            _parse_jsonl_record(json_parse.loads(json_line), method)
|
|
|
|
|
@cli.command()
@click.option(
    "--pdf", type=click.Path(exists=True), required=True,
    help='pdf 文件路径, 支持单个文件或文件列表, 文件列表需要以".list"结尾, 一行一个pdf文件路径')
@click.option("--model", type=click.Path(exists=True), help="模型的路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
@click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试")
@click.option("--model_mode", type=click.STRING, default="full",
              help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
def pdf_command(pdf, model, method, inside_model, model_mode):
    # Parse a local PDF, or every PDF listed (one path per line) in a file
    # whose name ends with ".list". ``model`` optionally points at the model
    # JSON; when omitted, "<pdf path without extension>.json" is tried.
    # (Comment instead of docstring: click would print a docstring as the
    # command's help text.)
    model_config.__use_inside_model__ = inside_model
    model_config.__model_mode__ = model_mode

    def read_fn(path):
        # Read a local file as bytes through the project's disk reader.
        disk_rw = DiskReaderWriter(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)

    def get_model_json(model_path, doc_path):
        # Return the model JSON text for ``doc_path``. With no explicit model
        # path, the sibling "<stem>.json" is used; if that file does not
        # exist, "[]" is returned (do_parse treats an empty list as "no model
        # data supplied").
        if model_path is None:
            file_name_without_extension, extension = os.path.splitext(doc_path)
            if extension != ".pdf":
                # Still an Exception subclass, so parse_doc's handler catches it.
                raise ValueError("pdf_path input error")
            model_path = file_name_without_extension + ".json"
            if not os.path.exists(model_path):
                logger.warning(
                    f"not found json {model_path} existed"
                )
                return "[]"
        return read_fn(model_path).decode("utf-8")

    def parse_doc(doc_path):
        # Best effort per document: a failure is logged and does not abort the
        # remaining documents of a ".list" batch.
        try:
            file_name = str(Path(doc_path).stem)
            pdf_data = read_fn(doc_path)
            jso = json_parse.loads(get_model_json(model, doc_path))

            do_parse(
                file_name,
                pdf_data,
                jso,
                method,
            )
        except Exception as e:
            logger.exception(e)

    if not pdf:
        # click enforces --pdf (required=True); this guard only matters when
        # the callback is invoked directly.
        logger.error("Error: Missing argument '--pdf'.")
        sys.exit("Error: Missing argument '--pdf'.")

    if pdf.endswith(".list"):
        # Batch input: one PDF path per line.
        # NOTE(review): the list file is decoded with the platform default
        # encoding, as before - confirm whether it should be forced to UTF-8.
        with open(pdf, "r") as f:
            for line in f:
                doc_path = line.strip()
                # Skip blank lines instead of trying to parse an empty path.
                if doc_path:
                    parse_doc(doc_path)
    else:
        # Single-document input.
        parse_doc(pdf)
|
|
|
|
|
if __name__ == "__main__":
    # Example invocation:
    #   python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
    cli()
|
|