"""Sentence splitter."""

import re

import spacy
from markdown_it import MarkdownIt
from spacy.language import Language


@Language.component("_mark_additional_sentence_boundaries")
def _mark_additional_sentence_boundaries(doc: spacy.tokens.Doc) -> spacy.tokens.Doc:
    """Mark additional sentence boundaries in Markdown documents."""

    def get_markdown_heading_indexes(text: str) -> list[tuple[int, int]]:
        """Get the character indexes of the headings in a Markdown document."""
        md = MarkdownIt()
        tokens = md.parse(text)
        headings = []
        lines = text.splitlines(keepends=True)
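        # Build prefix sums so that char_idx[i] is the character offset at which line i starts.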
        char_idx = [0]
        for line in lines:
            char_idx.append(char_idx[-1] + len(line))
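        # Each heading_open token's map attribute holds the heading's [start_line, end_line)
        # span; convert those line numbers to character offsets via the prefix sums above.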
        for token in tokens:
            if token.type == "heading_open":
                start_line, end_line = token.map  # type: ignore[misc]
                heading_start = char_idx[start_line]
                heading_end = char_idx[end_line]
                headings.append((heading_start, heading_end))
        return headings

    headings = get_markdown_heading_indexes(doc.text)
    for heading_start, heading_end in headings:
        # Mark the first token at or after the heading's start as a sentence start.
        for token in doc:
            if heading_start <= token.idx:
                token.is_sent_start = True
                break
        # Mark the first token after the heading's end as a sentence start.
        for token in doc:
            if heading_end <= token.idx:
                token.is_sent_start = True
                break
    return doc


def split_sentences(doc: str, max_len: int | None = None) -> list[str]:
    """Split a document into sentences, optionally capping each at max_len characters."""
    # Split sentences with spaCy.
    try:
        nlp = spacy.load("xx_sent_ud_sm")
    except OSError as error:
        error_message = "Please install `xx_sent_ud_sm` with `pip install https://github.com/explosion/spacy-models/releases/download/xx_sent_ud_sm-3.7.0/xx_sent_ud_sm-3.7.0-py3-none-any.whl`."
        raise ImportError(error_message) from error
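    # Run the Markdown-aware boundary marker ahead of spaCy's senter component so that
    # heading edges are already flagged as sentence starts when segmentation happens.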
    nlp.add_pipe("_mark_additional_sentence_boundaries", before="senter")
    sentences = [sent.text_with_ws for sent in nlp(doc).sents if sent.text.strip()]
    # Apply additional splits on paragraph and sentence boundaries because spaCy's
    # sentence segmentation is not perfect.
    if max_len is not None:
        for pattern in (r"(?<=\n\n)", r"(?<=\.\s)"):
            sentences = [
                part
                for sent in sentences
                for part in ([sent] if len(sent) <= max_len else re.split(pattern, sent))
            ]
        # Recursively bisect sentences that are still too long (this assumes max_len >= 1;
        # with max_len == 0 a one-character sentence would never shrink).
        while any(len(sentence) > max_len for sentence in sentences):
            sentences = [
                part
                for sent in sentences
                for part in (
                    [sent]
                    if len(sent) <= max_len
                    else [sent[: len(sent) // 2], sent[len(sent) // 2 :]]
                )
            ]
    return sentences
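

# A minimal usage sketch (assumes the `xx_sent_ud_sm` model is installed; the example
# document and `max_len` value below are illustrative, not part of the module's API):
if __name__ == "__main__":
    markdown_doc = "# Heading\n\nFirst sentence. Second sentence, which runs a little longer."
    for sentence in split_sentences(markdown_doc, max_len=48):
        print(repr(sentence))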