# rag_lite/src/raglite/_markdown.py
# Author: EL GHAFRAOUI AYOUB
# Commit: 54f5afe
"""Convert any document to Markdown."""
import re
from copy import deepcopy
from pathlib import Path
from typing import Any
import mdformat
import numpy as np
from pdftext.extraction import dictionary_output
from sklearn.cluster import KMeans
def parsed_pdf_to_markdown(pages: list[dict[str, Any]]) -> list[str]: # noqa: C901, PLR0915
    """Convert a PDF parsed with pdftext to Markdown.

    The input is the list of page dictionaries produced by pdftext's
    `dictionary_output` (each page contains "blocks", which contain "lines",
    which contain text "spans"). The result is one Markdown string per page.
    The conversion runs in four passes: heading level detection, emphasis
    detection, page number stripping, and finally rendering to Markdown.
    """
    def add_heading_level_metadata(pages: list[dict[str, Any]]) -> list[dict[str, Any]]: # noqa: C901
        """Add heading level metadata to a PDF parsed with pdftext.

        Each span and line gets an `md.heading_level` value in 1..8:
        1-6 map to <h1>-<h6>, 7 to paragraph text <p>, 8 to <small>.
        """
        def extract_font_size(span: dict[str, Any]) -> float:
            """Extract the font size from a text span, with fallbacks."""
            font_size: float = 1.0
            if span["font"]["size"] > 1: # A value of 1 appears to mean "unknown" in pdftext.
                font_size = span["font"]["size"]
            elif digit_sequences := re.findall(r"\d+", span["font"]["name"] or ""):
                # Fall back to the last number embedded in the font name (e.g. "Arial-12").
                font_size = float(digit_sequences[-1])
            elif "\n" not in span["text"]: # Occasionally a span can contain a newline character.
                # Last resort: approximate the size with the bounding box height,
                # or its width when the text is rotated by ±90°.
                if round(span["rotation"]) in (0.0, 180.0, -180.0):
                    font_size = span["bbox"][3] - span["bbox"][1]
                elif round(span["rotation"]) in (90.0, -90.0, 270.0, -270.0):
                    font_size = span["bbox"][2] - span["bbox"][0]
            return font_size
        # Copy the pages so the caller's structure is not mutated.
        pages = deepcopy(pages)
        # Extract an array of all font sizes used by the text spans.
        font_sizes = np.asarray(
            [
                extract_font_size(span)
                for page in pages
                for block in page["blocks"]
                for line in block["lines"]
                for span in line["spans"]
            ]
        )
        # Quantise font sizes to the nearest half point before counting unique values.
        font_sizes = np.round(font_sizes * 2) / 2
        unique_font_sizes, counts = np.unique(font_sizes, return_counts=True)
        # Determine the paragraph font size as the mode font size.
        # Sizes under 5 points (unless every size is tiny) are temporarily negated
        # so they cannot win the argmax; the counts are restored right after.
        tiny = unique_font_sizes < min(5, np.max(unique_font_sizes))
        counts[tiny] = -counts[tiny]
        mode = np.argmax(counts)
        counts[tiny] = -counts[tiny]
        mode_font_size = unique_font_sizes[mode]
        # Determine (at most) 6 heading font sizes by clustering font sizes larger than the mode.
        heading_font_sizes = unique_font_sizes[mode + 1 :]
        if len(heading_font_sizes) > 0:
            heading_counts = counts[mode + 1 :]
            kmeans = KMeans(n_clusters=min(6, len(heading_font_sizes)), random_state=42)
            # Weight each candidate size by how often it occurs in the document.
            kmeans.fit(heading_font_sizes[:, np.newaxis], sample_weight=heading_counts)
            # Cluster centres sorted from largest (<h1>) to smallest (<h6>).
            heading_font_sizes = np.sort(np.ravel(kmeans.cluster_centers_))[::-1]
        # Add heading level information to the text spans and lines.
        for page in pages:
            for block in page["blocks"]:
                for line in block["lines"]:
                    if "md" not in line:
                        line["md"] = {}
                    # Character-weighted votes per level.
                    heading_level = np.zeros(8) # 0-5: <h1>-<h6>, 6: <p>, 7: <small>
                    for span in line["spans"]:
                        if "md" not in span:
                            span["md"] = {}
                        span_font_size = extract_font_size(span)
                        if span_font_size < mode_font_size:
                            idx = 7  # Smaller than paragraph text -> <small>.
                        elif span_font_size == mode_font_size:
                            idx = 6  # Paragraph text.
                        else:
                            # Assign to the nearest heading cluster centre.
                            # NOTE(review): the span size here is unrounded while the mode was
                            # computed on half-point-rounded sizes; if no unique size exceeds
                            # the mode, `heading_font_sizes` is empty and this argmin would
                            # raise — TODO confirm this cannot occur in practice.
                            idx = np.argmin(np.abs(heading_font_sizes - span_font_size)) # type: ignore[assignment]
                        span["md"]["heading_level"] = idx + 1
                        # Longer spans get proportionally more voting weight.
                        heading_level[idx] += len(span["text"])
                    # The line's level is the level with the most characters.
                    line["md"]["heading_level"] = np.argmax(heading_level) + 1
        return pages
    def add_emphasis_metadata(pages: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Add emphasis metadata such as bold and italic to a PDF parsed with pdftext."""
        # Copy the pages so the caller's structure is not mutated.
        pages = deepcopy(pages)
        # Add emphasis metadata to the text spans.
        for page in pages:
            for block in page["blocks"]:
                for line in block["lines"]:
                    if "md" not in line:
                        line["md"] = {}
                    for span in line["spans"]:
                        if "md" not in span:
                            span["md"] = {}
                        # Font weights above 500 are treated as bold; italic is detected
                        # from the font name (matches "Italic" and "ital" variants).
                        span["md"]["bold"] = span["font"]["weight"] > 500 # noqa: PLR2004
                        span["md"]["italic"] = "ital" in (span["font"]["name"] or "").lower()
                    # A line is bold/italic only if all of its non-whitespace spans are.
                    line["md"]["bold"] = all(
                        span["md"]["bold"] for span in line["spans"] if span["text"].strip()
                    )
                    line["md"]["italic"] = all(
                        span["md"]["italic"] for span in line["spans"] if span["text"].strip()
                    )
        return pages
    def strip_page_numbers(pages: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Strip page numbers from a PDF parsed with pdftext."""
        # Copy the pages so the caller's structure is not mutated.
        pages = deepcopy(pages)
        # Remove lines whose concatenated spans are only a number, optionally
        # prefixed with '#' or leading zeros and surrounded by whitespace.
        for page in pages:
            for block in page["blocks"]:
                block["lines"] = [
                    line
                    for line in block["lines"]
                    if not re.match(
                        r"^\s*[#0]*\d+\s*$", "".join(span["text"] for span in line["spans"])
                    )
                ]
        return pages
    def convert_to_markdown(pages: list[dict[str, Any]]) -> list[str]: # noqa: C901, PLR0912
        """Convert a list of pages to Markdown using the metadata added above."""
        pages_md = []
        for page in pages:
            page_md = ""
            for block in page["blocks"]:
                block_text = ""
                for line in block["lines"]:
                    # Build the line text and style the spans.
                    line_text = ""
                    for span in line["spans"]:
                        # Only emphasise a span individually when the whole line
                        # isn't already emphasised the same way.
                        if (
                            not line["md"]["bold"]
                            and not line["md"]["italic"]
                            and span["md"]["bold"]
                            and span["md"]["italic"]
                        ):
                            line_text += f"***{span['text']}***"
                        elif not line["md"]["bold"] and span["md"]["bold"]:
                            line_text += f"**{span['text']}**"
                        elif not line["md"]["italic"] and span["md"]["italic"]:
                            line_text += f"*{span['text']}*"
                        else:
                            line_text += span["text"]
                    # Add emphasis to the line (if it's not a heading or whitespace).
                    line_text = line_text.rstrip()
                    line_is_whitespace = not line_text.strip()
                    # Levels 1-6 are headings; 7 is <p>, 8 is <small>.
                    line_is_heading = line["md"]["heading_level"] <= 6 # noqa: PLR2004
                    if not line_is_heading and not line_is_whitespace:
                        if line["md"]["bold"] and line["md"]["italic"]:
                            line_text = f"***{line_text}***"
                        elif line["md"]["bold"]:
                            line_text = f"**{line_text}**"
                        elif line["md"]["italic"]:
                            line_text = f"*{line_text}*"
                    # Set the heading level by prefixing the matching number of '#'s.
                    if line_is_heading and not line_is_whitespace:
                        line_text = f"{'#' * line['md']['heading_level']} {line_text}"
                    line_text += "\n"
                    block_text += line_text
                # Separate blocks with a blank line.
                block_text = block_text.rstrip() + "\n\n"
                page_md += block_text
            pages_md.append(page_md.strip())
        return pages_md
    def merge_split_headings(pages: list[str]) -> list[str]:
        """Merge headings that are split across lines."""
        def _merge_split_headings(match: re.Match[str]) -> str:
            # Strip the ATX '#' markers and whitespace from each matched line,
            # then re-emit a single heading at the original level.
            atx_headings = [line.strip("# ").strip() for line in match.group().splitlines()]
            return f"{match.group(1)} {' '.join(atx_headings)}\n\n"
        # Match two or more consecutive headings of the same level (backreference \1).
        pages_md = [
            re.sub(
                r"^(#+)[ \t]+[^\n]+\n+(?:^\1[ \t]+[^\n]+\n+)+",
                _merge_split_headings,
                page,
                flags=re.MULTILINE,
            )
            for page in pages
        ]
        return pages_md
    # Add heading level metadata.
    pages = add_heading_level_metadata(pages)
    # Add emphasis metadata.
    pages = add_emphasis_metadata(pages)
    # Strip page numbers.
    pages = strip_page_numbers(pages)
    # Convert the pages to Markdown.
    pages_md = convert_to_markdown(pages)
    # Merge headings that are split across lines.
    pages_md = merge_split_headings(pages_md)
    return pages_md
def document_to_markdown(doc_path: Path) -> str:
    """Convert any document to GitHub Flavored Markdown.

    PDFs are parsed with pdftext and rendered page by page; every other file
    type is handed to pandoc, falling back to the raw file text when pandoc
    does not support the format. The result is normalised with mdformat.
    """
    if doc_path.suffix == ".pdf":
        # Parse the PDF with pdftext and join the per-page Markdown output.
        parsed_pages = dictionary_output(doc_path, sort=True, keep_chars=False)
        doc = "\n\n".join(parsed_pdf_to_markdown(parsed_pages))
    else:
        try:
            # Pandoc handles every non-PDF format; imported lazily since it's optional.
            import pypandoc
            doc = pypandoc.convert_file(doc_path, to="gfm")
        except ImportError as error:
            error_message = (
                "To convert files to Markdown with pandoc, please install the `pandoc` extra."
            )
            raise ImportError(error_message) from error
        except RuntimeError:
            # File format not supported, fall back to reading the text.
            doc = doc_path.read_text()
    # Normalise the Markdown before returning it.
    return mdformat.text(doc)