|
import argparse
import tempfile
import time
import logging
from collections import defaultdict

from tqdm import tqdm
import pypdfium2 as pdfium

from marker.convert import convert_single_pdf
from marker.logger import configure_logging
from marker.models import load_all_models
from marker.benchmark.scoring import score_text
from marker.pdf.extract_text import naive_get_text
import json
import os
import subprocess
import shutil
from tabulate import tabulate
import torch

configure_logging()
logger = logging.getLogger(__name__)
|
|
def start_memory_profiling():
    torch.cuda.memory._record_memory_history(
        max_entries=100000
    )
|
|
def stop_memory_profiling(memory_file):
    try:
        torch.cuda.memory._dump_snapshot(memory_file)
    except Exception as e:
        logger.error(f"Failed to capture memory snapshot: {e}")

    # Stop recording memory history.
    torch.cuda.memory._record_memory_history(enabled=None)
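
# The snapshot files written by stop_memory_profiling() are standard PyTorch CUDA memory
# snapshots; they can usually be inspected by loading them into PyTorch's memory_viz
# viewer at https://pytorch.org/memory_viz.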
|
|
|
|
|
def nougat_prediction(pdf_filename, batch_size=1):
    out_dir = tempfile.mkdtemp()
    subprocess.run(["nougat", pdf_filename, "-o", out_dir, "--no-skipping", "--recompute", "--batchsize", str(batch_size)], check=True)
    md_file = os.listdir(out_dir)[0]
    with open(os.path.join(out_dir, md_file), "r") as f:
        data = f.read()
    shutil.rmtree(out_dir)
    return data
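
# Note: nougat_prediction() shells out to the `nougat` command-line tool, so the nougat
# OCR package must be installed separately and its executable available on PATH.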
|
|
|
|
|
def main():
    parser = argparse.ArgumentParser(description="Benchmark PDF to MD conversion. Needs source PDFs and a reference folder with the correct markdown.")
    parser.add_argument("in_folder", help="Input folder with PDF files")
    parser.add_argument("reference_folder", help="Reference folder with reference markdown files")
    parser.add_argument("out_file", help="Output filename")
    parser.add_argument("--nougat", action="store_true", help="Run nougat and compare", default=False)

    parser.add_argument("--marker_batch_multiplier", type=int, default=1, help="Batch size multiplier to use for marker when making predictions.")
    parser.add_argument("--nougat_batch_size", type=int, default=1, help="Batch size to use for nougat when making predictions.")
    parser.add_argument("--md_out_path", type=str, default=None, help="Output path for generated markdown files")
    parser.add_argument("--profile_memory", action="store_true", help="Profile memory usage", default=False)

    args = parser.parse_args()
|
|
    methods = ["marker"]
    if args.nougat:
        methods.append("nougat")

    if args.profile_memory:
        start_memory_profiling()

    model_lst = load_all_models()

    if args.profile_memory:
        stop_memory_profiling("model_load.pickle")
|
|
    scores = defaultdict(dict)
    benchmark_files = os.listdir(args.in_folder)
    benchmark_files = [b for b in benchmark_files if b.endswith(".pdf")]
    times = defaultdict(dict)
    pages = defaultdict(int)
|
|
    for idx, fname in tqdm(enumerate(benchmark_files)):
        md_filename = fname.rsplit(".", 1)[0] + ".md"

        reference_filename = os.path.join(args.reference_folder, md_filename)
        with open(reference_filename, "r", encoding="utf-8") as f:
            reference = f.read()

        pdf_filename = os.path.join(args.in_folder, fname)
        doc = pdfium.PdfDocument(pdf_filename)
        pages[fname] = len(doc)

        for method in methods:
            start = time.time()
            if method == "marker":
                if args.profile_memory:
                    start_memory_profiling()
                full_text, _, out_meta = convert_single_pdf(pdf_filename, model_lst, batch_multiplier=args.marker_batch_multiplier)
                if args.profile_memory:
                    stop_memory_profiling(f"marker_memory_{idx}.pickle")
            elif method == "nougat":
                full_text = nougat_prediction(pdf_filename, batch_size=args.nougat_batch_size)
            elif method == "naive":
                full_text = naive_get_text(doc)
            else:
                raise ValueError(f"Unknown method {method}")

            times[method][fname] = time.time() - start

            score = score_text(full_text, reference)
            scores[method][fname] = score

            if args.md_out_path:
                md_out_filename = f"{method}_{md_filename}"
                with open(os.path.join(args.md_out_path, md_out_filename), "w+") as f:
                    f.write(full_text)
|
|
    total_pages = sum(pages.values())
    with open(args.out_file, "w+") as f:
        write_data = defaultdict(dict)
        for method in methods:
            total_time = sum(times[method].values())
            file_stats = {
                fname: {
                    "time": times[method][fname],
                    "score": scores[method][fname],
                    "pages": pages[fname]
                }
                for fname in benchmark_files
            }
            write_data[method] = {
                "files": file_stats,
                "avg_score": sum(scores[method].values()) / len(scores[method]),
                "time_per_page": total_time / total_pages,
                "time_per_doc": total_time / len(scores[method])
            }

        json.dump(write_data, f, indent=4)
|
|
    summary_table = []
    score_table = []
    score_headers = benchmark_files
    for method in methods:
        summary_table.append([method, write_data[method]["avg_score"], write_data[method]["time_per_page"], write_data[method]["time_per_doc"]])
        score_table.append([method, *[write_data[method]["files"][h]["score"] for h in score_headers]])

    print(tabulate(summary_table, headers=["Method", "Average Score", "Time per page", "Time per document"]))
    print("")
    print("Scores by file")
    print(tabulate(score_table, headers=["Method", *score_headers]))
|
|
if __name__ == "__main__":
    main()
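
# Example invocation (the script name and paths below are illustrative, not fixed):
#   python benchmark.py /path/to/pdfs /path/to/reference_markdown report.json --nougat --md_out_path out_md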
|
|
|
|