Spaces:
Sleeping
Sleeping
File size: 4,161 Bytes
e94100d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import os
from pathlib import Path
import re
import shutil
import tempfile
import uuid
import aspose.words as aw
import pymupdf4llm
from project_settings import project_path
from toolbox.to_markdown.base_to_markdown import BaseToMarkdown
def get_args():
    """
    Parse the command-line options of this demo script.

    The only option is ``--filename``: the path of the PDF to convert.
    When omitted it falls back to a sample paper under ``project_path``.

    :return: the ``argparse.Namespace`` produced by ``parse_args()``.
    """
    # Other sample inputs that were used as the default during development:
    #   project_path / "data/files/pdf/临时救助工作应知应会知识.pdf"
    #   project_path / "data/unstructured_eval/pdf/麦肯锡2023年AI现状_生成式AI的爆发之年.pdf"
    default_pdf = project_path / "data/files/pdf/2024.naacl-long.35.pdf"

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--filename", type=str, default=default_pdf.as_posix())
    return arg_parser.parse_args()
@BaseToMarkdown.register("pymupdf4llm")
class PyMuPdf2Llm(BaseToMarkdown):
    """
    PDF-to-Markdown converter backed by ``pymupdf4llm``.

    Images are not supported: only the Markdown text is exported.
    https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/
    """

    def __init__(self, filename: str, image_folder: str = "media"):
        """
        :param filename: path of the PDF file to convert; forwarded to the
            base class (presumably stored as ``self.filename`` - see
            ``BaseToMarkdown``).
        :param image_folder: accepted so the signature matches the sibling
            converter in this module; unused here because this converter
            does not export images.
        """
        super().__init__(filename)

    def save_to_zip(self, output_dir: str):
        """
        Convert the PDF to Markdown and pack the result into a zip archive.

        The Markdown is written to ``<uuid>.md`` inside a fresh directory
        named ``<uuid>`` under the system temp directory, that directory is
        zipped with ``self.zip_directory`` and then removed.

        :param output_dir: directory in which the zip archive is created.
        :return: path of the archive, ``<output_dir>/<uuid>.zip``.
        """
        basename = str(uuid.uuid4())
        temp_dir = Path(tempfile.gettempdir()) / basename
        temp_dir.mkdir(parents=True, exist_ok=False)

        # Everything after the mkdir is guarded so the working directory is
        # removed even when conversion, writing or zipping raises.
        try:
            md_file = temp_dir / f"{basename}.md"

            # pdf to md
            md_text = pymupdf4llm.to_markdown(self.filename)
            md_file.write_text(md_text, encoding="utf-8")

            # zip
            output_zip_file = os.path.join(output_dir, f"{basename}.zip")
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            shutil.rmtree(temp_dir)
        return output_zip_file
@BaseToMarkdown.register("aspose_words")
class AsposeWordsPdf2Md(BaseToMarkdown):
    """
    PDF-to-Markdown converter backed by Aspose.Words.

    The document is loaded with ``aw.Document`` and saved with a ``.md``
    file name; image files found next to the saved Markdown are collected
    into ``image_folder`` and the Markdown image links are rewritten to match.

    https://pypi.org/project/aspose-words/
    https://products.aspose.com/words/python-net/
    https://products.aspose.com/words/python-net/merge/pdf-to-markdown/
    """

    def __init__(self, filename: str, image_folder: str = "media"):
        """
        :param filename: path of the PDF file to convert.
        :param image_folder: name of the sub-folder (inside the archive)
            that receives the image files.
        """
        super().__init__(filename)
        self.doc = aw.Document(self.filename)
        self.image_folder = image_folder

    def save_to_zip(self, output_dir: str):
        """
        Convert the document to Markdown plus images and zip the result.

        Work happens in a fresh directory named ``<uuid>`` under the system
        temp directory; it is removed afterwards, also on failure.

        :param output_dir: directory in which the zip archive is created.
        :return: path of the archive, ``<output_dir>/<uuid>.zip``.
        """
        basename = str(uuid.uuid4())
        temp_dir = Path(tempfile.gettempdir()) / basename
        temp_dir.mkdir(parents=True, exist_ok=False)

        # Guarded so the working directory never leaks when a step raises.
        try:
            md_file = temp_dir / f"{basename}.md"
            media_dir = temp_dir / self.image_folder
            media_dir.mkdir(parents=True, exist_ok=False)

            # pdf to md
            self.doc.save(md_file.as_posix())

            # images: gather every image file lying beside the Markdown file
            for pattern in ["*.jpeg", "*.jpg", "*.png", "*.gif", "*.bmp", "*.tiff"]:
                # Materialise the matches first: files are moved out of the
                # directory that is being listed.
                for image_file in list(temp_dir.glob(pattern)):
                    shutil.move(
                        src=image_file.as_posix(),
                        dst=media_dir.as_posix(),
                    )

            # md image convert: point the links at the media folder
            md_text = md_file.read_text(encoding="utf-8")
            md_text = self.convert_image_to_media_dir(md_text, image_folder=self.image_folder)
            md_file.write_text(md_text, encoding="utf-8")

            # zip
            output_zip_file = os.path.join(output_dir, f"{basename}.zip")
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            shutil.rmtree(temp_dir)
        return output_zip_file

    def convert_image_to_media_dir(self,
                                   markdown_text: str,
                                   image_folder: str = "media",
                                   ):
        """
        Rewrite Markdown image links so they point into ``image_folder``.

        ``![alt](pic.png)`` becomes ``![alt](media/pic.png)``. The alt text is
        kept, and the path always uses forward slashes so the link is valid
        Markdown on every platform. Targets that carry a scheme
        (``http://...``, ``data:...``) are left untouched because such
        images are not files that were moved into the media folder.

        :param markdown_text: Markdown source to process.
        :param image_folder: folder name to prepend to each image path.
        :return: the Markdown text with rewritten image links.
        """
        # group 1: alt text, group 2: link target
        pattern1 = r'\!\[(.*?)\]\((.+?)\)'

        def replace(match):
            alt_text = match.group(1)
            relative_path = match.group(2)
            if re.match(r'[a-zA-Z][a-zA-Z0-9+.-]*:', relative_path):
                # URL / data URI (or drive-qualified path): keep as written.
                return match.group(0)
            relative_path = Path(image_folder, relative_path).as_posix()
            return f"![{alt_text}]({relative_path})"

        markdown_text = re.sub(pattern1, replace, markdown_text)
        return markdown_text
def main():
    """
    Demo entry point: convert the PDF named on the command line.

    The resulting zip archive is written to the current directory and its
    path is printed.
    """
    cli_args = get_args()

    converter = PyMuPdf2Llm(cli_args.filename)
    # converter = AsposeWordsPdf2Md(cli_args.filename)

    print(converter.save_to_zip(output_dir="."))


if __name__ == "__main__":
    main()
|