"""Sentence splitter."""
import re
import spacy
from markdown_it import MarkdownIt
from spacy.language import Language
@Language.component("_mark_additional_sentence_boundaries")
def _mark_additional_sentence_boundaries(doc: spacy.tokens.Doc) -> spacy.tokens.Doc:
"""Mark additional sentence boundaries in Markdown documents."""
def get_markdown_heading_indexes(doc: str) -> list[tuple[int, int]]:
"""Get the indexes of the headings in a Markdown document."""
md = MarkdownIt()
tokens = md.parse(doc)
headings = []
lines = doc.splitlines(keepends=True)
char_idx = [0]
for line in lines:
char_idx.append(char_idx[-1] + len(line))
for token in tokens:
if token.type == "heading_open":
start_line, end_line = token.map # type: ignore[misc]
heading_start = char_idx[start_line]
heading_end = char_idx[end_line]
headings.append((heading_start, heading_end))
return headings
headings = get_markdown_heading_indexes(doc.text)
for heading_start, heading_end in headings:
# Mark the start of a heading as a new sentence.
for token in doc:
if heading_start <= token.idx:
token.is_sent_start = True
break
# Mark the end of a heading as a new sentence.
for token in doc:
if heading_end <= token.idx:
token.is_sent_start = True
break
return doc
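# A minimal illustration of the component above (hypothetical input): for the
# Markdown document "# Title\n\nBody text.", `get_markdown_heading_indexes`
# reports the heading span (0, 8), and the first tokens at or after character
# offsets 0 and 8 are forced to start new sentences, so the heading and the
# body text can never be merged into a single sentence.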
def split_sentences(doc: str, max_len: int | None = None) -> list[str]:
    """Split a document into sentences, each at most `max_len` characters long if given."""
    # Split sentences with spaCy, with additional boundaries at Markdown headings.
    try:
        nlp = spacy.load("xx_sent_ud_sm")
    except OSError as error:
        error_message = "Please install `xx_sent_ud_sm` with `pip install https://github.com/explosion/spacy-models/releases/download/xx_sent_ud_sm-3.7.0/xx_sent_ud_sm-3.7.0-py3-none-any.whl`."
        raise ImportError(error_message) from error
    nlp.add_pipe("_mark_additional_sentence_boundaries", before="senter")
    sentences = [sent.text_with_ws for sent in nlp(doc).sents if sent.text.strip()]
    if max_len is not None:
        # Because spaCy's splitting is not perfect, further split any sentence that
        # exceeds max_len: first on paragraph breaks, then after sentence-final periods.
        for pattern in (r"(?<=\n\n)", r"(?<=\.\s)"):
            sentences = [
                part
                for sent in sentences
                for part in ([sent] if len(sent) <= max_len else re.split(pattern, sent))
            ]
        # Iteratively halve sentences that are still too long until all fit in max_len.
        while any(len(sentence) > max_len for sentence in sentences):
            sentences = [
                part
                for sent in sentences
                for part in (
                    [sent]
                    if len(sent) <= max_len
                    else [sent[: len(sent) // 2], sent[len(sent) // 2 :]]
                )
            ]
    return sentences
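# Usage sketch (hypothetical example document; assumes the `xx_sent_ud_sm`
# model is installed as described in the error message above):
if __name__ == "__main__":
    example_doc = "# Lorem ipsum\n\nLorem ipsum dolor sit amet. Consectetur adipiscing elit.\n"
    for sentence in split_sentences(example_doc, max_len=48):
        print(repr(sentence))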