# NOTE: The lines originally here were web-page residue from a file viewer
# ("Spaces: Running", "File size: 10,010 Bytes", revision "54f5afe", and a
# 1-222 line-number gutter); they are not part of the Python module below.
"""Convert any document to Markdown."""
import re
from copy import deepcopy
from pathlib import Path
from typing import Any
import mdformat
import numpy as np
from pdftext.extraction import dictionary_output
from sklearn.cluster import KMeans
def parsed_pdf_to_markdown(pages: list[dict[str, Any]]) -> list[str]:  # noqa: C901, PLR0915
    """Convert a PDF parsed with pdftext to Markdown.

    The conversion runs five passes: annotate heading levels from font sizes, annotate
    bold/italic emphasis, drop lines that only contain a page number, render each page to
    Markdown, and finally merge headings that were split across consecutive lines.

    Parameters
    ----------
    pages
        One dict per page. The code reads ``page["blocks"][i]["lines"][j]["spans"][k]`` and,
        per span, the keys ``"text"``, ``"font"`` (with ``"size"``, ``"name"``, ``"weight"``),
        ``"rotation"`` and ``"bbox"``. The input is deep-copied and never mutated.

    Returns
    -------
    list[str]
        One Markdown string per input page, in the same order.
    """

    def add_heading_level_metadata(pages: list[dict[str, Any]]) -> list[dict[str, Any]]:  # noqa: C901
        """Add heading level metadata to a PDF parsed with pdftext.

        Every span and line gets ``["md"]["heading_level"]``: 1-6 for <h1>-<h6>, 7 for
        paragraph text, and 8 for text smaller than the paragraph font size.
        """

        def extract_font_size(span: dict[str, Any]) -> float:
            """Extract the font size from a text span."""
            font_size: float = 1.0
            if span["font"]["size"] > 1:  # A value of 1 appears to mean "unknown" in pdftext.
                font_size = span["font"]["size"]
            elif digit_sequences := re.findall(r"\d+", span["font"]["name"] or ""):
                # Fall back to the last number embedded in the font name.
                font_size = float(digit_sequences[-1])
            elif "\n" not in span["text"]:  # Occasionally a span can contain a newline character.
                # Fall back to the extent of the bounding box across the text direction.
                if round(span["rotation"]) in (0.0, 180.0, -180.0):
                    font_size = span["bbox"][3] - span["bbox"][1]
                elif round(span["rotation"]) in (90.0, -90.0, 270.0, -270.0):
                    font_size = span["bbox"][2] - span["bbox"][0]
            return font_size

        def quantize(font_size: Any) -> Any:
            """Round a font size (scalar or array) to the nearest half point."""
            return np.round(font_size * 2) / 2

        # Copy the pages.
        pages = deepcopy(pages)
        # Extract an array of all font sizes used by the text spans.
        font_sizes = quantize(
            np.asarray(
                [
                    extract_font_size(span)
                    for page in pages
                    for block in page["blocks"]
                    for line in block["lines"]
                    for span in line["spans"]
                ],
                dtype=float,
            )
        )
        unique_font_sizes, counts = np.unique(font_sizes, return_counts=True)
        if unique_font_sizes.size == 0:
            # No text spans at all: nothing to classify, and np.max would fail on an empty array.
            mode_font_size = 1.0
            heading_font_sizes = np.empty(0)
        else:
            # Determine the paragraph font size as the mode font size. Counts of tiny fonts are
            # temporarily negated so that they can never win the argmax.
            tiny = unique_font_sizes < min(5, np.max(unique_font_sizes))
            counts[tiny] = -counts[tiny]
            mode = np.argmax(counts)
            counts[tiny] = -counts[tiny]
            mode_font_size = unique_font_sizes[mode]
            # Determine (at most) 6 heading font sizes by clustering font sizes larger than the
            # mode.
            heading_font_sizes = unique_font_sizes[mode + 1 :]
            if len(heading_font_sizes) > 0:
                heading_counts = counts[mode + 1 :]
                kmeans = KMeans(n_clusters=min(6, len(heading_font_sizes)), random_state=42)
                kmeans.fit(heading_font_sizes[:, np.newaxis], sample_weight=heading_counts)
                heading_font_sizes = np.sort(np.ravel(kmeans.cluster_centers_))[::-1]
        # Add heading level information to the text spans and lines.
        for page in pages:
            for block in page["blocks"]:
                for line in block["lines"]:
                    line.setdefault("md", {})
                    # Characters per level. 0-5: <h1>-<h6>, 6: <p>, 7: <small>
                    heading_level = np.zeros(8)
                    for span in line["spans"]:
                        span.setdefault("md", {})
                        # Quantize exactly like the sizes the mode was computed from; comparing
                        # the raw size would classify e.g. 10.2pt text as larger than a 10.0pt
                        # mode.
                        span_font_size = quantize(extract_font_size(span))
                        if span_font_size < mode_font_size:
                            idx = 7
                        elif span_font_size == mode_font_size:
                            idx = 6
                        else:
                            # A quantized size above the mode is one of the clustered sizes, so
                            # heading_font_sizes is non-empty here.
                            idx = int(np.argmin(np.abs(heading_font_sizes - span_font_size)))
                        span["md"]["heading_level"] = idx + 1
                        heading_level[idx] += len(span["text"])
                    # The line takes the level that covers the most characters.
                    line["md"]["heading_level"] = int(np.argmax(heading_level)) + 1
        return pages

    def add_emphasis_metadata(pages: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Add emphasis metadata such as bold and italic to a PDF parsed with pdftext."""
        # Copy the pages.
        pages = deepcopy(pages)
        # Add emphasis metadata to the text spans.
        for page in pages:
            for block in page["blocks"]:
                for line in block["lines"]:
                    line.setdefault("md", {})
                    for span in line["spans"]:
                        span.setdefault("md", {})
                        span["md"]["bold"] = span["font"]["weight"] > 500  # noqa: PLR2004
                        span["md"]["italic"] = "ital" in (span["font"]["name"] or "").lower()
                    # A line is bold/italic when all of its non-whitespace spans are.
                    visible_spans = [span for span in line["spans"] if span["text"].strip()]
                    line["md"]["bold"] = all(span["md"]["bold"] for span in visible_spans)
                    line["md"]["italic"] = all(span["md"]["italic"] for span in visible_spans)
        return pages

    def strip_page_numbers(pages: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Strip page numbers from a PDF parsed with pdftext."""
        # Copy the pages.
        pages = deepcopy(pages)
        # Remove lines that only contain a page number.
        for page in pages:
            for block in page["blocks"]:
                block["lines"] = [
                    line
                    for line in block["lines"]
                    if not re.match(
                        r"^\s*[#0]*\d+\s*$", "".join(span["text"] for span in line["spans"])
                    )
                ]
        return pages

    def emphasize(text: str, marker: str) -> str:
        """Wrap the non-whitespace core of `text` in `marker`, keeping whitespace outside.

        Emphasis delimiters that touch whitespace on the inside (``**bold **``) are not parsed
        as emphasis by CommonMark, so the surrounding whitespace must stay outside the markers.
        Whitespace-only text is returned unchanged.
        """
        core = text.strip()
        if not core:
            return text
        leading = text[: len(text) - len(text.lstrip())]
        trailing = text[len(text.rstrip()) :]
        return f"{leading}{marker}{core}{marker}{trailing}"

    def convert_to_markdown(pages: list[dict[str, Any]]) -> list[str]:  # noqa: C901, PLR0912
        """Convert a list of pages to Markdown."""
        pages_md = []
        for page in pages:
            page_md = ""
            for block in page["blocks"]:
                block_text = ""
                for line in block["lines"]:
                    line_bold = line["md"]["bold"]
                    line_italic = line["md"]["italic"]
                    # Build the line text and style the spans. Emphasis shared by the whole line
                    # is applied once to the line below instead of to every span.
                    line_text = ""
                    for span in line["spans"]:
                        span_bold = span["md"]["bold"]
                        span_italic = span["md"]["italic"]
                        if not line_bold and not line_italic and span_bold and span_italic:
                            line_text += emphasize(span["text"], "***")
                        elif not line_bold and span_bold:
                            line_text += emphasize(span["text"], "**")
                        elif not line_italic and span_italic:
                            line_text += emphasize(span["text"], "*")
                        else:
                            line_text += span["text"]
                    # Add emphasis to the line (if it's not a heading or whitespace).
                    line_text = line_text.rstrip()
                    line_is_whitespace = not line_text.strip()
                    line_is_heading = line["md"]["heading_level"] <= 6  # noqa: PLR2004
                    if not line_is_heading and not line_is_whitespace:
                        if line_bold and line_italic:
                            line_text = emphasize(line_text, "***")
                        elif line_bold:
                            line_text = emphasize(line_text, "**")
                        elif line_italic:
                            line_text = emphasize(line_text, "*")
                    # Set the heading level.
                    if line_is_heading and not line_is_whitespace:
                        line_text = f"{'#' * line['md']['heading_level']} {line_text}"
                    line_text += "\n"
                    block_text += line_text
                # Separate blocks with a blank line.
                block_text = block_text.rstrip() + "\n\n"
                page_md += block_text
            pages_md.append(page_md.strip())
        return pages_md

    def merge_split_headings(pages: list[str]) -> list[str]:
        """Merge headings that are split across lines."""

        def _merge_split_headings(match: re.Match[str]) -> str:
            # Remove only the leading ATX marker: stripping "#" from both ends would also eat a
            # trailing "#" that belongs to the title (e.g. "C#"). Blank lines between the
            # matched headings are skipped so they do not add extra spaces.
            titles = [
                re.sub(r"^#+[ \t]*", "", line).strip() for line in match.group().splitlines()
            ]
            return f"{match.group(1)} {' '.join(title for title in titles if title)}\n\n"

        # Two or more consecutive headings of the same level (`\1`) are treated as one heading.
        pages_md = [
            re.sub(
                r"^(#+)[ \t]+[^\n]+\n+(?:^\1[ \t]+[^\n]+\n+)+",
                _merge_split_headings,
                page,
                flags=re.MULTILINE,
            )
            for page in pages
        ]
        return pages_md

    # Add heading level metadata.
    pages = add_heading_level_metadata(pages)
    # Add emphasis metadata.
    pages = add_emphasis_metadata(pages)
    # Strip page numbers.
    pages = strip_page_numbers(pages)
    # Convert the pages to Markdown.
    pages_md = convert_to_markdown(pages)
    # Merge headings that are split across lines.
    pages_md = merge_split_headings(pages_md)
    return pages_md
def document_to_markdown(doc_path: Path) -> str:
    """Convert any document to GitHub Flavored Markdown.

    PDF files (matched case-insensitively on the file extension) are parsed with pdftext and
    rendered with `parsed_pdf_to_markdown`. Every other file is converted with pandoc; if
    pandoc raises a `RuntimeError`, the file's text is read as-is. The result is normalised
    with mdformat.

    Parameters
    ----------
    doc_path
        Path to the document to convert.

    Returns
    -------
    str
        The document as Markdown.

    Raises
    ------
    ImportError
        If a non-PDF document is given and `pypandoc` is not installed.
    """
    # Convert the file's content to GitHub Flavored Markdown.
    if doc_path.suffix.lower() == ".pdf":  # Also accept ".PDF", ".Pdf", ...
        # Parse the PDF with pdftext and convert it to Markdown.
        pages = dictionary_output(doc_path, sort=True, keep_chars=False)
        doc = "\n\n".join(parsed_pdf_to_markdown(pages))
    else:
        # Use pandoc for everything else.
        try:
            import pypandoc
        except ImportError as error:
            error_message = (
                "To convert files to Markdown with pandoc, please install the `pandoc` extra."
            )
            raise ImportError(error_message) from error
        try:
            doc = pypandoc.convert_file(doc_path, to="gfm")
        except RuntimeError:
            # File format not supported, fall back to reading the text.
            doc = doc_path.read_text()
    # Improve Markdown quality.
    doc = mdformat.text(doc)
    return doc
|