Spaces:
Running
Running
"""Test RAGLite's chunk splitting functionality.""" | |
import numpy as np | |
import pytest | |
from raglite._split_chunks import split_chunks | |
def test_edge_cases(sentences: list[str]) -> None:
    """Check the structural invariants of `split_chunks` output for edge-case inputs.

    `sentences` is supplied by the test harness (presumably a fixture or a
    parametrization defined outside this view -- confirm). Every sentence gets
    a constant float16 embedding of ones, so only the structure of the result
    is verified, not its content.
    """
    embedding_dim = 768
    num_sentences = len(sentences)
    sentence_embeddings = np.ones((num_sentences, embedding_dim)).astype(np.float16)

    chunks, chunk_embeddings = split_chunks(
        sentences, sentence_embeddings, sentence_window_size=3, max_size=1440
    )

    # Both results must be plain lists.
    assert isinstance(chunks, list)
    assert isinstance(chunk_embeddings, list)

    # One embedding group per chunk; for empty input exactly one group is expected.
    expected_groups = len(chunks) if sentences else 1
    assert len(chunk_embeddings) == expected_groups

    for chunk in chunks:
        assert isinstance(chunk, str)

    # Each group is an array with the input dtype and embedding width, and the
    # groups together account for every input sentence embedding row.
    total_rows = 0
    for group in chunk_embeddings:
        assert isinstance(group, np.ndarray)
        assert group.dtype == sentence_embeddings.dtype
        total_rows += group.shape[0]
        assert group.shape[1] == sentence_embeddings.shape[1]
    assert total_rows == sentence_embeddings.shape[0]
def test_long_sentence(sentences: list[str]) -> None:
    """Test that chunking fails on sentences that are too long.

    `sentences` is supplied by the test harness (presumably a fixture or a
    parametrization defined outside this view -- confirm). The test expects
    `split_chunks` to raise a `ValueError` with the "larger than chunk
    max_size" message when called with `max_size=1440`.
    """
    sentence_embeddings = np.ones((len(sentences), 768)).astype(np.float16)
    # `match` is a regular expression applied with `re.search`; the trailing
    # period is escaped so it matches a literal "." rather than any character.
    with pytest.raises(
        ValueError, match=r"Sentence with length larger than chunk max_size detected\."
    ):
        _ = split_chunks(sentences, sentence_embeddings, sentence_window_size=3, max_size=1440)