Spaces:
Running
Running
"""Convert any document to Markdown.""" | |
import re | |
from copy import deepcopy | |
from pathlib import Path | |
from typing import Any | |
import mdformat | |
import numpy as np | |
from pdftext.extraction import dictionary_output | |
from sklearn.cluster import KMeans | |
def parsed_pdf_to_markdown(pages: list[dict[str, Any]]) -> list[str]:  # noqa: C901, PLR0915
    """Convert a PDF parsed with pdftext to Markdown.

    The conversion runs five passes: heading-level annotation (derived from font sizes),
    bold/italic annotation, page-number removal, Markdown rendering, and merging of headings
    that were split across consecutive lines.

    Args:
        pages: The parsed pages. Each page is traversed as
            ``page["blocks"][i]["lines"][j]["spans"][k]``; every span must provide ``text`` and
            ``font`` (with ``size``, ``name`` and ``weight``), and ``rotation`` and ``bbox`` are
            read when the font size has to be estimated. The input is deep-copied, not modified.

    Returns:
        One Markdown string per input page.
    """

    def add_heading_level_metadata(pages: list[dict[str, Any]]) -> list[dict[str, Any]]:  # noqa: C901
        """Add heading level metadata to a PDF parsed with pdftext.

        Every span and every line receives ``md["heading_level"]``: 1-6 for <h1>-<h6>, 7 for
        paragraph text and 8 for text smaller than paragraph text.
        """

        def extract_font_size(span: dict[str, Any]) -> float:
            """Extract the font size from a text span, or 1.0 if it cannot be determined."""
            font_size: float = 1.0
            if span["font"]["size"] > 1:  # A value of 1 appears to mean "unknown" in pdftext.
                font_size = span["font"]["size"]
            elif digit_sequences := re.findall(r"\d+", span["font"]["name"] or ""):
                # Fall back to the last number embedded in the font name.
                font_size = float(digit_sequences[-1])
            elif "\n" not in span["text"]:  # Occasionally a span can contain a newline character.
                # Fall back to the bounding box extent that matches the span's rotation.
                if round(span["rotation"]) in (0.0, 180.0, -180.0):
                    font_size = span["bbox"][3] - span["bbox"][1]
                elif round(span["rotation"]) in (90.0, -90.0, 270.0, -270.0):
                    font_size = span["bbox"][2] - span["bbox"][0]
            return font_size

        def quantize(font_size: Any) -> Any:
            """Round a font size (scalar or sequence) to a multiple of 0.5."""
            return np.round(np.asarray(font_size, dtype=float) * 2) / 2

        # Copy the pages.
        pages = deepcopy(pages)
        # Extract an array of all (quantized) font sizes used by the text spans.
        font_sizes = quantize(
            [
                extract_font_size(span)
                for page in pages
                for block in page["blocks"]
                for line in block["lines"]
                for span in line["spans"]
            ]
        )
        mode_font_size = 0.0
        heading_font_sizes = np.empty(0)
        # A document without any text span has no font statistics to compute; skipping this step
        # avoids reducing over an empty array.
        if font_sizes.size > 0:
            unique_font_sizes, counts = np.unique(font_sizes, return_counts=True)
            # Determine the paragraph font size as the mode font size. Tiny font sizes get a
            # negated count so that they can only win if nothing else is available.
            tiny = unique_font_sizes < min(5, np.max(unique_font_sizes))
            mode = int(np.argmax(np.where(tiny, -counts, counts)))
            mode_font_size = unique_font_sizes[mode]
            # Determine (at most) 6 heading font sizes by clustering font sizes larger than the mode.
            heading_font_sizes = unique_font_sizes[mode + 1 :]
            if len(heading_font_sizes) > 0:
                heading_counts = counts[mode + 1 :]
                kmeans = KMeans(n_clusters=min(6, len(heading_font_sizes)), random_state=42)
                kmeans.fit(heading_font_sizes[:, np.newaxis], sample_weight=heading_counts)
                # Largest cluster centre first, so that index 0 corresponds to <h1>.
                heading_font_sizes = np.sort(np.ravel(kmeans.cluster_centers_))[::-1]
        # Add heading level information to the text spans and lines.
        for page in pages:
            for block in page["blocks"]:
                for line in block["lines"]:
                    line.setdefault("md", {})
                    # Number of characters per level; 0-5: <h1>-<h6>, 6: <p>, 7: <small>
                    heading_level = np.zeros(8)
                    for span in line["spans"]:
                        span.setdefault("md", {})
                        # Quantize exactly like the statistics above. Comparing the raw size
                        # against the quantized mode would misclassify e.g. 10.1pt text as a
                        # heading when the mode is 10.0pt.
                        span_font_size = quantize(extract_font_size(span))
                        if span_font_size < mode_font_size:
                            idx = 7
                        elif span_font_size == mode_font_size:
                            idx = 6
                        else:
                            idx = int(np.argmin(np.abs(heading_font_sizes - span_font_size)))
                        span["md"]["heading_level"] = idx + 1
                        heading_level[idx] += len(span["text"])
                    # The line takes the level that covers most of its characters.
                    line["md"]["heading_level"] = int(np.argmax(heading_level)) + 1
        return pages

    def add_emphasis_metadata(pages: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Add emphasis metadata such as bold and italic to a PDF parsed with pdftext.

        Every span receives ``md["bold"]`` and ``md["italic"]``. A line is bold (italic) when all
        of its non-whitespace spans are bold (italic).
        """
        # Copy the pages.
        pages = deepcopy(pages)
        # Add emphasis metadata to the text spans.
        for page in pages:
            for block in page["blocks"]:
                for line in block["lines"]:
                    line.setdefault("md", {})
                    for span in line["spans"]:
                        span.setdefault("md", {})
                        span["md"]["bold"] = span["font"]["weight"] > 500  # noqa: PLR2004
                        span["md"]["italic"] = "ital" in (span["font"]["name"] or "").lower()
                    visible_spans = [span for span in line["spans"] if span["text"].strip()]
                    line["md"]["bold"] = all(span["md"]["bold"] for span in visible_spans)
                    line["md"]["italic"] = all(span["md"]["italic"] for span in visible_spans)
        return pages

    def strip_page_numbers(pages: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Strip page numbers from a PDF parsed with pdftext."""
        page_number = re.compile(r"^\s*[#0]*\d+\s*$")
        # Copy the pages.
        pages = deepcopy(pages)
        # Remove lines that only contain a page number.
        for page in pages:
            for block in page["blocks"]:
                block["lines"] = [
                    line
                    for line in block["lines"]
                    if not page_number.match("".join(span["text"] for span in line["spans"]))
                ]
        return pages

    def convert_to_markdown(pages: list[dict[str, Any]]) -> list[str]:  # noqa: C901, PLR0912
        """Convert a list of pages to Markdown."""

        def emphasize(text: str, marker: str) -> str:
            """Wrap the non-whitespace core of the text in the given emphasis marker.

            Surrounding whitespace stays outside the markers, because an emphasis delimiter
            that touches whitespace on its inner side is not rendered as emphasis.
            """
            core = text.strip()
            if not core:
                return text
            leading = text[: len(text) - len(text.lstrip())]
            trailing = text[len(text.rstrip()) :]
            return f"{leading}{marker}{core}{marker}{trailing}"

        pages_md = []
        for page in pages:
            blocks_md = []
            for block in page["blocks"]:
                lines_md = []
                for line in block["lines"]:
                    line_bold = line["md"]["bold"]
                    line_italic = line["md"]["italic"]
                    # Build the line text and style the spans. A span is only styled
                    # individually when the style does not already apply to the whole line.
                    line_text = ""
                    for span in line["spans"]:
                        span_bold = span["md"]["bold"]
                        span_italic = span["md"]["italic"]
                        if not line_bold and not line_italic and span_bold and span_italic:
                            line_text += emphasize(span["text"], "***")
                        elif not line_bold and span_bold:
                            line_text += emphasize(span["text"], "**")
                        elif not line_italic and span_italic:
                            line_text += emphasize(span["text"], "*")
                        else:
                            line_text += span["text"]
                    # Add emphasis to the line (if it's not a heading or whitespace).
                    line_text = line_text.rstrip()
                    line_is_whitespace = not line_text.strip()
                    line_is_heading = line["md"]["heading_level"] <= 6  # noqa: PLR2004
                    if not line_is_heading and not line_is_whitespace:
                        if line_bold and line_italic:
                            line_text = emphasize(line_text, "***")
                        elif line_bold:
                            line_text = emphasize(line_text, "**")
                        elif line_italic:
                            line_text = emphasize(line_text, "*")
                    # Set the heading level.
                    if line_is_heading and not line_is_whitespace:
                        line_text = f"{'#' * line['md']['heading_level']} {line_text}"
                    lines_md.append(line_text + "\n")
                # Separate blocks by exactly one blank line.
                blocks_md.append("".join(lines_md).rstrip() + "\n\n")
            pages_md.append("".join(blocks_md).strip())
        return pages_md

    def merge_split_headings(pages: list[str]) -> list[str]:
        """Merge headings that are split across lines."""
        # Two or more consecutive ATX headings of the same level.
        split_heading = re.compile(
            r"^(#+)[ \t]+[^\n]+\n+(?:^\1[ \t]+[^\n]+\n+)+",
            flags=re.MULTILINE,
        )

        def _merge_split_headings(match: re.Match[str]) -> str:
            marker = match.group(1)
            # Every non-empty line of the match starts with the marker. Only that prefix is
            # removed, and the blank lines between the headings are skipped so that they do not
            # end up as extra spaces in the merged heading.
            parts = (line[len(marker) :].strip() for line in match.group().split("\n"))
            return f"{marker} {' '.join(part for part in parts if part)}\n\n"

        return [split_heading.sub(_merge_split_headings, page) for page in pages]

    # Add heading level metadata.
    pages = add_heading_level_metadata(pages)
    # Add emphasis metadata.
    pages = add_emphasis_metadata(pages)
    # Strip page numbers.
    pages = strip_page_numbers(pages)
    # Convert the pages to Markdown.
    pages_md = convert_to_markdown(pages)
    # Merge headings that are split across lines.
    pages_md = merge_split_headings(pages_md)
    return pages_md
def document_to_markdown(doc_path: Path) -> str:
    """Convert any document to GitHub Flavored Markdown.

    PDF files are parsed with pdftext and rendered by ``parsed_pdf_to_markdown``. Every other
    file is converted with pypandoc; if pandoc raises a ``RuntimeError``, the file is read as
    plain text instead. The result is normalised with mdformat in both cases.

    Args:
        doc_path: Path to the document to convert.

    Returns:
        The document as a single Markdown string.

    Raises:
        ImportError: If a non-PDF document is given and pypandoc cannot be imported.
    """
    # Convert the file's content to GitHub Flavored Markdown.
    # The suffix is compared case-insensitively so that e.g. "REPORT.PDF" is also parsed as a PDF
    # instead of being handed to pandoc and then read as text.
    if doc_path.suffix.lower() == ".pdf":
        # Parse the PDF with pdftext and convert it to Markdown.
        pages = dictionary_output(doc_path, sort=True, keep_chars=False)
        doc = "\n\n".join(parsed_pdf_to_markdown(pages))
    else:
        try:
            # Use pandoc for everything else. The import is local because pypandoc is optional.
            import pypandoc

            doc = pypandoc.convert_file(doc_path, to="gfm")
        except ImportError as error:
            error_message = (
                "To convert files to Markdown with pandoc, please install the `pandoc` extra."
            )
            raise ImportError(error_message) from error
        except RuntimeError:
            # File format not supported, fall back to reading the text.
            doc = doc_path.read_text()
    # Improve Markdown quality.
    doc = mdformat.text(doc)
    return doc