"""Sentence splitter.""" | |
import re | |
import spacy | |
from markdown_it import MarkdownIt | |
from spacy.language import Language | |


@Language.component("_mark_additional_sentence_boundaries")
def _mark_additional_sentence_boundaries(doc: spacy.tokens.Doc) -> spacy.tokens.Doc:
    """Mark additional sentence boundaries in Markdown documents."""

    def get_markdown_heading_indexes(doc: str) -> list[tuple[int, int]]:
        """Get the character indexes of the headings in a Markdown document."""
        md = MarkdownIt()
        tokens = md.parse(doc)
        headings = []
        lines = doc.splitlines(keepends=True)
        # Cumulative character offset at the start of each line.
        char_idx = [0]
        for line in lines:
            char_idx.append(char_idx[-1] + len(line))
        for token in tokens:
            if token.type == "heading_open":
                start_line, end_line = token.map  # type: ignore[misc]
                heading_start = char_idx[start_line]
                heading_end = char_idx[end_line]
                headings.append((heading_start, heading_end))
        return headings

    headings = get_markdown_heading_indexes(doc.text)
    for heading_start, heading_end in headings:
        # Mark the start of a heading as a new sentence.
        for token in doc:
            if heading_start <= token.idx:
                token.is_sent_start = True
                break
        # Mark the end of a heading as a new sentence.
        for token in doc:
            if heading_end <= token.idx:
                token.is_sent_start = True
                break
    return doc


def split_sentences(doc: str, max_len: int | None = None) -> list[str]:
    """Split a document into sentences."""
    # Split sentences with spaCy.
    try:
        nlp = spacy.load("xx_sent_ud_sm")
    except OSError as error:
        error_message = (
            "Please install `xx_sent_ud_sm` with `pip install"
            " https://github.com/explosion/spacy-models/releases/download/xx_sent_ud_sm-3.7.0/xx_sent_ud_sm-3.7.0-py3-none-any.whl`."
        )
        raise ImportError(error_message) from error
    nlp.add_pipe("_mark_additional_sentence_boundaries", before="senter")
    sentences = [sent.text_with_ws for sent in nlp(doc).sents if sent.text.strip()]
    # Apply additional splits on paragraphs and sentences because spaCy's splitting is not perfect.
    if max_len is not None:
        for pattern in (r"(?<=\n\n)", r"(?<=\.\s)"):
            sentences = [
                part
                for sent in sentences
                for part in ([sent] if len(sent) <= max_len else re.split(pattern, sent))
            ]
    # Recursively split long sentences in the middle if they are still too long.
    if max_len is not None:
        while any(len(sentence) > max_len for sentence in sentences):
            sentences = [
                part
                for sent in sentences
                for part in (
                    [sent]
                    if len(sent) <= max_len
                    else [sent[: len(sent) // 2], sent[len(sent) // 2 :]]
                )
            ]
    return sentences
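

# Illustrative usage sketch (not part of the original module): split a short
# Markdown document into sentences of at most 100 characters. Assumes the
# `xx_sent_ud_sm` model is installed as described in the ImportError above.
if __name__ == "__main__":
    example_doc = "# Heading\n\nFirst sentence. Second sentence, which is a bit longer.\n"
    for sentence in split_sentences(example_doc, max_len=100):
        print(repr(sentence))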