def is_code_linelen(lines, thresh=80):
    """Heuristically decide whether a group of lines is code from its line density.

    Counts the regex word characters (``\\w``) found in each line's
    ``prelim_text`` and divides the total by the number of line breaks
    between the lines.

    Args:
        lines: Sized collection of objects that expose a ``prelim_text`` string.
        thresh: Upper bound (exclusive) on word characters per line break.

    Returns:
        True when the word-characters-per-break ratio is strictly below
        ``thresh``; False when it is not, or when the lines contain no word
        characters at all.
    """
    word_char_total = 0
    for entry in lines:
        word_char_total += len(re.findall(r'\w', entry.prelim_text))

    # N lines are separated by N - 1 breaks; clamp to 1 so that an empty or
    # single-line group never divides by zero.
    break_count = max(len(lines) - 1, 1)

    # Nothing but whitespace/punctuation: there is no density to measure.
    if word_char_total == 0:
        return False

    return (word_char_total / break_count) < thresh