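"""Benchmark PDF to markdown conversion.

Runs marker (and optionally nougat) over a folder of source PDFs, scores each
result against reference markdown files, and writes per-file and aggregate
timing/score statistics to a JSON report.

Example invocation (arguments as defined in main() below):

    python benchmark.py <in_folder> <reference_folder> <out_file> [--nougat]
"""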
import argparse
import json
import logging
import os
import shutil
import subprocess
import tempfile
import time
from collections import defaultdict

import pypdfium2 as pdfium
import torch
from tabulate import tabulate
from tqdm import tqdm

from marker.benchmark.scoring import score_text
from marker.convert import convert_single_pdf
from marker.logger import configure_logging
from marker.models import load_all_models
from marker.pdf.extract_text import naive_get_text

configure_logging()
# Module-level logger, used by stop_memory_profiling below
logger = logging.getLogger(__name__)


def start_memory_profiling():
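    """Start recording CUDA memory allocation history so a snapshot can be dumped later."""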
    torch.cuda.memory._record_memory_history(
        max_entries=100000
    )


def stop_memory_profiling(memory_file):
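    """Dump the recorded CUDA memory snapshot to memory_file, then stop recording."""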
    try:
        torch.cuda.memory._dump_snapshot(memory_file)
    except Exception as e:
        logger.error(f"Failed to capture memory snapshot {e}")

    # Stop recording memory snapshot history.
    torch.cuda.memory._record_memory_history(enabled=None)


def nougat_prediction(pdf_filename, batch_size=1):
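    """Run the nougat CLI on a single PDF and return the generated markdown as a string."""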
    out_dir = tempfile.mkdtemp()
    subprocess.run(["nougat", pdf_filename, "-o", out_dir, "--no-skipping", "--recompute", "--batchsize", str(batch_size)], check=True)
    md_file = os.listdir(out_dir)[0]
    with open(os.path.join(out_dir, md_file), "r") as f:
        data = f.read()
    shutil.rmtree(out_dir)
    return data


def main():
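    """Parse arguments, run each conversion method over the benchmark PDFs, and write a score/timing report."""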
    parser = argparse.ArgumentParser(description="Benchmark PDF to MD conversion. Needs source PDFs and a reference folder with the correct markdown.")
    parser.add_argument("in_folder", help="Input folder with PDF files")
    parser.add_argument("reference_folder", help="Reference folder with reference markdown files")
    parser.add_argument("out_file", help="Output filename")
    parser.add_argument("--nougat", action="store_true", help="Run nougat and compare", default=False)
    # Nougat batch size 1 uses about as much VRAM as default marker settings
    parser.add_argument("--marker_batch_multiplier", type=int, default=1, help="Batch size multiplier to use for marker when making predictions.")
    parser.add_argument("--nougat_batch_size", type=int, default=1, help="Batch size to use for nougat when making predictions.")
    parser.add_argument("--md_out_path", type=str, default=None, help="Output path for generated markdown files")
    parser.add_argument("--profile_memory", action="store_true", help="Profile memory usage", default=False)
    args = parser.parse_args()

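    # marker is always benchmarked; nougat is opt-in via --nougat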
    methods = ["marker"]
    if args.nougat:
        methods.append("nougat")

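    # Optionally record CUDA memory history while the models are loaded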
    if args.profile_memory:
        start_memory_profiling()
    model_lst = load_all_models()
    if args.profile_memory:
        stop_memory_profiling("model_load.pickle")

    scores = defaultdict(dict)
    benchmark_files = os.listdir(args.in_folder)
    benchmark_files = [b for b in benchmark_files if b.endswith(".pdf")]
    times = defaultdict(dict)
    pages = defaultdict(int)

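    # Convert each benchmark PDF with every method and score it against the reference markdown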
    for idx, fname in tqdm(enumerate(benchmark_files)):
        md_filename = fname.rsplit(".", 1)[0] + ".md"
        reference_filename = os.path.join(args.reference_folder, md_filename)
        with open(reference_filename, "r", encoding="utf-8") as f:
            reference = f.read()

        pdf_filename = os.path.join(args.in_folder, fname)
        doc = pdfium.PdfDocument(pdf_filename)
        pages[fname] = len(doc)

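        # Time each method separately on this document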
        for method in methods:
            start = time.time()
            if method == "marker":
                if args.profile_memory:
                    start_memory_profiling()
                full_text, _, out_meta = convert_single_pdf(pdf_filename, model_lst, batch_multiplier=args.marker_batch_multiplier)
                if args.profile_memory:
                    stop_memory_profiling(f"marker_memory_{idx}.pickle")
            elif method == "nougat":
                full_text = nougat_prediction(pdf_filename, batch_size=args.nougat_batch_size)
            elif method == "naive":
                full_text = naive_get_text(doc)
            else:
                raise ValueError(f"Unknown method {method}")
            times[method][fname] = time.time() - start

            score = score_text(full_text, reference)
            scores[method][fname] = score

            if args.md_out_path:
                md_out_filename = f"{method}_{md_filename}"
                with open(os.path.join(args.md_out_path, md_out_filename), "w+") as f:
                    f.write(full_text)

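    # Aggregate timings and scores across all files into a JSON report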
    total_pages = sum(pages.values())
    with open(args.out_file, "w+") as f:
        write_data = defaultdict(dict)
        for method in methods:
            total_time = sum(times[method].values())
            file_stats = {
                fname: {
                    "time": times[method][fname],
                    "score": scores[method][fname],
                    "pages": pages[fname]
                }
                for fname in benchmark_files
            }
            write_data[method] = {
                "files": file_stats,
                "avg_score": sum(scores[method].values()) / len(scores[method]),
                "time_per_page": total_time / total_pages,
                "time_per_doc": total_time / len(scores[method])
            }
        json.dump(write_data, f, indent=4)

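    # Print a summary table and the per-file scores to stdout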
    summary_table = []
    score_table = []
    score_headers = benchmark_files
    for method in methods:
        summary_table.append([method, write_data[method]["avg_score"], write_data[method]["time_per_page"], write_data[method]["time_per_doc"]])
        score_table.append([method, *[write_data[method]["files"][h]["score"] for h in score_headers]])

    print(tabulate(summary_table, headers=["Method", "Average Score", "Time per page", "Time per document"]))
    print("")
    print("Scores by file")
    print(tabulate(score_table, headers=["Method", *score_headers]))


if __name__ == "__main__":
    main()