File size: 10,010 Bytes
54f5afe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
"""Convert any document to Markdown."""

import re
from copy import deepcopy
from pathlib import Path
from typing import Any

import mdformat
import numpy as np
from pdftext.extraction import dictionary_output
from sklearn.cluster import KMeans


def parsed_pdf_to_markdown(pages: list[dict[str, Any]]) -> list[str]:  # noqa: C901, PLR0915
    """Convert a PDF parsed with pdftext to Markdown.

    The conversion runs five steps in order: infer heading levels from font sizes, detect bold
    and italic text, drop lines that only contain a page number, render every page to Markdown,
    and finally merge headings that were split across consecutive lines.

    Parameters
    ----------
    pages
        The parsed pages. Each page is read as ``page["blocks"][i]["lines"][j]["spans"][k]``.
        Every span must provide ``"text"`` and ``"font"`` (with ``"size"``, ``"name"`` and
        ``"weight"``). ``"rotation"`` and ``"bbox"`` are only read when the font size has to be
        estimated from the span's bounding box.

    Returns
    -------
    list[str]
        One Markdown string per page, in input order. The input pages are not modified.
    """

    def add_heading_level_metadata(pages: list[dict[str, Any]]) -> list[dict[str, Any]]:  # noqa: C901
        """Add heading level metadata to the spans and lines of the given pages (in place).

        The stored level is 1-6 for <h1>-<h6>, 7 for paragraph text, and 8 for text that is
        smaller than paragraph text.
        """

        def extract_font_size(span: dict[str, Any]) -> float:
            """Extract the font size from a text span."""
            font_size: float = 1.0
            if span["font"]["size"] > 1:  # A value of 1 appears to mean "unknown" in pdftext.
                font_size = span["font"]["size"]
            elif digit_sequences := re.findall(r"\d+", span["font"]["name"] or ""):
                font_size = float(digit_sequences[-1])
            elif "\n" not in span["text"]:  # Occasionally a span can contain a newline character.
                # Fall back to the extent of the bounding box along the text's "up" direction.
                if round(span["rotation"]) in (0.0, 180.0, -180.0):
                    font_size = span["bbox"][3] - span["bbox"][1]
                elif round(span["rotation"]) in (90.0, -90.0, 270.0, -270.0):
                    font_size = span["bbox"][2] - span["bbox"][0]
            return font_size

        def quantize(font_size: Any) -> Any:
            """Round a font size (scalar or array) to the nearest half point."""
            return np.round(font_size * 2) / 2

        # Extract an array of all (quantized) font sizes used by the text spans.
        font_sizes = quantize(
            np.asarray(
                [
                    extract_font_size(span)
                    for page in pages
                    for block in page["blocks"]
                    for line in block["lines"]
                    for span in line["spans"]
                ],
                dtype=float,
            )
        )
        # Without any spans there is nothing to classify; the defaults below are never read
        # in that case, but they keep the loop further down well-defined.
        mode_font_size = 1.0
        heading_font_sizes = np.empty(0)
        if font_sizes.size > 0:
            unique_font_sizes, counts = np.unique(font_sizes, return_counts=True)
            # Determine the paragraph font size as the mode font size. Tiny font sizes are
            # excluded from the mode by temporarily negating their counts.
            tiny = unique_font_sizes < min(5, np.max(unique_font_sizes))
            counts[tiny] = -counts[tiny]
            mode = np.argmax(counts)
            counts[tiny] = -counts[tiny]
            mode_font_size = unique_font_sizes[mode]
            # Determine (at most) 6 heading font sizes by clustering font sizes larger than the
            # mode. `np.unique` returns sorted values, so everything after the mode is larger.
            heading_font_sizes = unique_font_sizes[mode + 1 :]
            if len(heading_font_sizes) > 0:
                heading_counts = counts[mode + 1 :]
                kmeans = KMeans(n_clusters=min(6, len(heading_font_sizes)), random_state=42)
                kmeans.fit(heading_font_sizes[:, np.newaxis], sample_weight=heading_counts)
                # Sort descending so that index 0 is the largest font size, i.e. <h1>.
                heading_font_sizes = np.sort(np.ravel(kmeans.cluster_centers_))[::-1]
        # Add heading level information to the text spans and lines.
        for page in pages:
            for block in page["blocks"]:
                for line in block["lines"]:
                    line.setdefault("md", {})
                    heading_level = np.zeros(8)  # 0-5: <h1>-<h6>, 6: <p>, 7: <small>
                    for span in line["spans"]:
                        span.setdefault("md", {})
                        # Quantize exactly like the sizes the mode was computed from. Comparing
                        # the raw size would classify e.g. 10.2 as larger than a mode of 10.0,
                        # and fail outright when there are no heading font sizes to match.
                        span_font_size = quantize(extract_font_size(span))
                        if span_font_size < mode_font_size:
                            idx = 7
                        elif span_font_size == mode_font_size:
                            idx = 6
                        else:
                            idx = int(np.argmin(np.abs(heading_font_sizes - span_font_size)))
                        span["md"]["heading_level"] = idx + 1
                        # Weigh each level by the number of characters rendered at that level.
                        heading_level[idx] += len(span["text"])
                    line["md"]["heading_level"] = int(np.argmax(heading_level)) + 1
        return pages

    def add_emphasis_metadata(pages: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Add bold and italic metadata to the spans and lines of the given pages (in place)."""
        for page in pages:
            for block in page["blocks"]:
                for line in block["lines"]:
                    line.setdefault("md", {})
                    for span in line["spans"]:
                        span.setdefault("md", {})
                        span["md"]["bold"] = span["font"]["weight"] > 500  # noqa: PLR2004
                        span["md"]["italic"] = "ital" in (span["font"]["name"] or "").lower()
                    # A line is emphasized when all of its non-whitespace spans are.
                    line["md"]["bold"] = all(
                        span["md"]["bold"] for span in line["spans"] if span["text"].strip()
                    )
                    line["md"]["italic"] = all(
                        span["md"]["italic"] for span in line["spans"] if span["text"].strip()
                    )
        return pages

    def strip_page_numbers(pages: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Remove lines that only contain a page number from the given pages (in place)."""
        for page in pages:
            for block in page["blocks"]:
                block["lines"] = [
                    line
                    for line in block["lines"]
                    if not re.match(
                        r"^\s*[#0]*\d+\s*$", "".join(span["text"] for span in line["spans"])
                    )
                ]
        return pages

    def convert_to_markdown(pages: list[dict[str, Any]]) -> list[str]:  # noqa: C901, PLR0912
        """Convert a list of pages with heading and emphasis metadata to Markdown."""

        def emphasize(text: str, marker: str) -> str:
            """Wrap the non-whitespace core of the text in the given emphasis marker.

            Markdown only recognises emphasis when the markers touch the text, so surrounding
            whitespace is kept outside of the markers and whitespace-only text is left as is.
            """
            core = text.strip()
            if not marker or not core:
                return text
            leading = text[: len(text) - len(text.lstrip())]
            trailing = text[len(text.rstrip()) :]
            return f"{leading}{marker}{core}{marker}{trailing}"

        def emphasis_marker(*, bold: bool, italic: bool) -> str:
            """Return the Markdown emphasis marker for the given style."""
            return ("**" if bold else "") + ("*" if italic else "")

        pages_md = []
        for page in pages:
            block_texts = []
            for block in page["blocks"]:
                line_texts = []
                for line in block["lines"]:
                    line_bold, line_italic = line["md"]["bold"], line["md"]["italic"]
                    # Build the line text and style the spans. A span is only styled with the
                    # emphasis that the line as a whole does not already carry.
                    line_text = "".join(
                        emphasize(
                            span["text"],
                            emphasis_marker(
                                bold=span["md"]["bold"] and not line_bold,
                                italic=span["md"]["italic"] and not line_italic,
                            ),
                        )
                        for span in line["spans"]
                    ).rstrip()
                    line_is_whitespace = not line_text.strip()
                    line_is_heading = line["md"]["heading_level"] <= 6  # noqa: PLR2004
                    if not line_is_whitespace:
                        if line_is_heading:
                            # Set the heading level.
                            line_text = f"{'#' * line['md']['heading_level']} {line_text}"
                        else:
                            # Add emphasis to the line (headings are not emphasized).
                            line_text = emphasize(
                                line_text, emphasis_marker(bold=line_bold, italic=line_italic)
                            )
                    line_texts.append(line_text + "\n")
                block_texts.append("".join(line_texts).rstrip() + "\n\n")
            pages_md.append("".join(block_texts).strip())
        return pages_md

    def merge_split_headings(pages: list[str]) -> list[str]:
        """Merge consecutive headings of the same level into a single heading."""

        def _merge_split_headings(match: re.Match[str]) -> str:
            # Only remove the leading ATX marker so that heading text such as "C#" is kept
            # intact, and skip the blank separator lines so that they don't add extra spaces.
            parts = [
                re.sub(r"^#+[ \t]+", "", line).strip() for line in match.group().splitlines()
            ]
            return f"{match.group(1)} {' '.join(part for part in parts if part)}\n\n"

        pages_md = [
            re.sub(
                r"^(#+)[ \t]+[^\n]+\n+(?:^\1[ \t]+[^\n]+\n+)+",
                _merge_split_headings,
                page,
                flags=re.MULTILINE,
            )
            for page in pages
        ]
        return pages_md

    # Work on a private copy so that the caller's pages are left untouched.
    pages = deepcopy(pages)
    # Add heading level metadata.
    pages = add_heading_level_metadata(pages)
    # Add emphasis metadata.
    pages = add_emphasis_metadata(pages)
    # Strip page numbers.
    pages = strip_page_numbers(pages)
    # Convert the pages to Markdown.
    pages_md = convert_to_markdown(pages)
    # Merge headings that are split across lines.
    pages_md = merge_split_headings(pages_md)
    return pages_md


def document_to_markdown(doc_path: Path) -> str:
    """Convert any document to GitHub Flavored Markdown.

    PDF files are parsed with pdftext and converted with `parsed_pdf_to_markdown`. All other
    files are converted with pandoc (through pypandoc). If pypandoc raises a `RuntimeError`,
    which is treated here as "file format not supported", the file is read as plain text
    instead. The result is always normalised with mdformat.

    Parameters
    ----------
    doc_path
        The path to the document to convert.

    Returns
    -------
    str
        The document as Markdown.

    Raises
    ------
    ImportError
        If the document is not a PDF and pypandoc is not installed.
    """
    # Compare the suffix case-insensitively so that e.g. "REPORT.PDF" is also parsed as a PDF
    # instead of being read as text by the fallback below.
    if doc_path.suffix.lower() == ".pdf":
        # Parse the PDF with pdftext and convert it to Markdown.
        pages = dictionary_output(doc_path, sort=True, keep_chars=False)
        doc = "\n\n".join(parsed_pdf_to_markdown(pages))
    else:
        # Use pandoc for everything else. Only the import is guarded here so that the install
        # hint is raised for a missing package and not for an unrelated conversion failure.
        try:
            import pypandoc
        except ImportError as error:
            error_message = (
                "To convert files to Markdown with pandoc, please install the `pandoc` extra."
            )
            raise ImportError(error_message) from error
        try:
            doc = pypandoc.convert_file(doc_path, to="gfm")
        except RuntimeError:
            # File format not supported, fall back to reading the text.
            doc = doc_path.read_text()
    # Improve Markdown quality.
    doc = mdformat.text(doc)
    return doc