import multiprocessing
import pathlib
import random
from time import sleep

import pytest
import requests

from .fixtures import upsert_random_points, create_collection, random_dense_vector
from .utils import *

COLLECTION_NAME = "test_collection"


# Test resharding.
#
# On a static collection, this performs resharding up and down a few times and
# asserts the shard and point counts are correct.
#
# More specifically, this starts at 1 shard, reshards up 3 times to 4 shards,
# and then reshards down 3 times back to 1 shard.
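#
# With replication_factor=3 on a 3-node cluster every peer holds a replica of
# every shard, so the per-peer local point sum must equal the total point count
# at every step.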
@pytest.mark.skip(reason="moving resharding driver to external service")
def test_resharding(tmp_path: pathlib.Path):
assert_project_root()
num_points = 1000
    # Prevent optimizers from messing with point counts
    env = {
"QDRANT__STORAGE__OPTIMIZERS__INDEXING_THRESHOLD_KB": "0",
}
peer_api_uris, _peer_dirs, _bootstrap_uri = start_cluster(tmp_path, 3, None, extra_env=env)
# Create collection, insert points
create_collection(peer_api_uris[0], shard_number=1, replication_factor=3)
wait_collection_exists_and_active_on_all_peers(
collection_name=COLLECTION_NAME,
peer_api_uris=peer_api_uris,
)
upsert_random_points(peer_api_uris[0], num_points)
sleep(1)
# Assert node shard and point sum count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, 1)
assert check_collection_local_shards_point_count(uri, COLLECTION_NAME, num_points)
# We cannot reshard down now, because we only have one shard
r = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"start_resharding": {
"direction": "down"
}
})
assert r.status_code == 400
assert r.json()["status"]["error"] == "Bad request: cannot remove shard 0 by resharding down, it is the last shard"
# Reshard up 3 times in sequence
for shard_count in range(2, 5):
# Start resharding
r = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"start_resharding": {
"direction": "up"
}
})
assert_http_ok(r)
# Wait for resharding operation to start and stop
wait_for_collection_resharding_operations_count(peer_api_uris[0], COLLECTION_NAME, 1)
for uri in peer_api_uris:
wait_for_collection_resharding_operations_count(uri, COLLECTION_NAME, 0)
# Assert node shard and point sum count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, shard_count)
assert check_collection_local_shards_point_count(uri, COLLECTION_NAME, num_points)
sleep(1)
# Match all points on all nodes exactly
data = []
for uri in peer_api_uris:
r = requests.post(
f"{uri}/collections/{COLLECTION_NAME}/points/scroll", json={
"limit": 999999999,
"with_vectors": True,
"with_payload": True,
}
)
assert_http_ok(r)
data.append(r.json()["result"])
check_data_consistency(data)
# Reshard down 3 times in sequence
for shard_count in range(3, 0, -1):
# Start resharding
r = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"start_resharding": {
"direction": "down"
}
})
assert_http_ok(r)
# Wait for resharding operation to start and stop
wait_for_collection_resharding_operations_count(peer_api_uris[0], COLLECTION_NAME, 1)
for uri in peer_api_uris:
wait_for_collection_resharding_operations_count(uri, COLLECTION_NAME, 0)
# Assert node shard and point sum count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, shard_count)
assert check_collection_local_shards_point_count(uri, COLLECTION_NAME, num_points)
sleep(1)
# Match all points on all nodes exactly
data = []
for uri in peer_api_uris:
r = requests.post(
f"{uri}/collections/{COLLECTION_NAME}/points/scroll", json={
"limit": 999999999,
"with_vectors": True,
"with_payload": True,
}
)
assert_http_ok(r)
data.append(r.json()["result"])
check_data_consistency(data)


# Test resharding shard balancing.
#
# Sets up a 3-node cluster and a collection with 1 shard and 2 replicas.
# Performs resharding 5 times and asserts the shard replicas are evenly
# balanced across all nodes.
#
# In this case the replicas are balanced across the second and third nodes. The
# first node holds all shards because we explicitly set it as the resharding
# target every time.
@pytest.mark.skip(reason="moving resharding driver to external service")
def test_resharding_balance(tmp_path: pathlib.Path):
assert_project_root()
num_points = 100
    # Prevent optimizers from messing with point counts
    env = {
"QDRANT__STORAGE__OPTIMIZERS__INDEXING_THRESHOLD_KB": "0",
}
peer_api_uris, _peer_dirs, _bootstrap_uri = start_cluster(tmp_path, 3, None, extra_env=env)
first_peer_id = get_cluster_info(peer_api_uris[0])['peer_id']
# Create collection, insert points
create_collection(peer_api_uris[0], shard_number=1, replication_factor=2)
wait_collection_exists_and_active_on_all_peers(
collection_name=COLLECTION_NAME,
peer_api_uris=peer_api_uris,
)
upsert_random_points(peer_api_uris[0], num_points)
sleep(1)
# This test assumes we have a replica for every shard on the first node
# If that is not the case, move the replica there now
if get_collection_local_shards_count(peer_api_uris[0], COLLECTION_NAME) == 0:
second_peer_id = get_cluster_info(peer_api_uris[1])['peer_id']
r = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"move_shard": {
"shard_id": 0,
"from_peer_id": second_peer_id,
"to_peer_id": first_peer_id,
"method": "stream_records",
}
})
assert_http_ok(r)
wait_for_collection_shard_transfers_count(peer_api_uris[0], COLLECTION_NAME, 0)
assert check_collection_local_shards_count(peer_api_uris[0], COLLECTION_NAME, 1)
# Assert node point count
for uri in peer_api_uris:
assert get_collection_point_count(uri, COLLECTION_NAME, exact=True) == num_points
# Reshard 5 times in sequence
for _shard_count in range(5):
# Start resharding
r = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"start_resharding": {
"direction": "up",
"peer_id": first_peer_id
}
})
assert_http_ok(r)
# Wait for resharding operation to start and stop
wait_for_collection_resharding_operations_count(peer_api_uris[0], COLLECTION_NAME, 1)
for uri in peer_api_uris:
wait_for_collection_resharding_operations_count(uri, COLLECTION_NAME, 0)
# Point count across cluster must be stable
for uri in peer_api_uris:
assert get_collection_point_count(uri, COLLECTION_NAME, exact=False) == num_points
# We must end up with:
# - 6 shards on first node, it was the resharding target
# - 3 shards on the other two nodes, 6 replicas balanced over 2 nodes
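    # (1 initial shard + 5 reshards = 6 shards; with replication_factor=2 that
    # is 12 replicas in total: 6 on the first node, leaving 6 to split 3/3
    # across the other two nodes.)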
assert check_collection_local_shards_count(peer_api_uris[0], COLLECTION_NAME, 6)
for uri in peer_api_uris[1:]:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, 3)


# Test resharding with concurrent updates.
#
# This performs resharding a few times while sending point updates to all peers
# concurrently. At the end of the whole process it asserts the expected point
# count.
#
# The concurrent update tasks consist of:
# - 3 processes upserting new points on all peers
# - 1 process updating existing points on the first peer
# - 2 processes deleting points on the first two peers
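#
# The ID ranges below are chosen so the tasks never collide: upserts start at
# 10000/20000/30000 (far above the base points 0..999), updates rewrite ids
# 0..499, and the two delete ranges (500..532 and 533..565) start right after
# the updated ids.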
@pytest.mark.skip(reason="moving resharding driver to external service")
def test_resharding_concurrent_updates(tmp_path: pathlib.Path):
assert_project_root()
num_points = 1000
num_inserts = 1000
num_updates = 500
num_deletes = 33
    # Prevent optimizers from messing with point counts
    env = {
"QDRANT__STORAGE__OPTIMIZERS__INDEXING_THRESHOLD_KB": "0",
}
peer_api_uris, _peer_dirs, _bootstrap_uri = start_cluster(tmp_path, 3, None, extra_env=env)
# Create collection, insert points
create_collection(peer_api_uris[0], shard_number=1, replication_factor=3)
wait_collection_exists_and_active_on_all_peers(
collection_name=COLLECTION_NAME,
peer_api_uris=peer_api_uris,
)
upsert_random_points(peer_api_uris[0], num_points)
sleep(1)
# Assert node shard and point sum count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, 1)
assert check_collection_local_shards_point_count(uri, COLLECTION_NAME, num_points)
# During resharding, keep pushing updates into the collection
update_tasks = [
# Upsert new points on all peers
run_in_background(upsert_points_throttled, peer_api_uris[0], COLLECTION_NAME, start=10000, end=10000 + num_inserts),
run_in_background(upsert_points_throttled, peer_api_uris[1], COLLECTION_NAME, start=20000, end=20000 + num_inserts),
run_in_background(upsert_points_throttled, peer_api_uris[2], COLLECTION_NAME, start=30000, end=30000 + num_inserts),
# Update existing points on the first peer
run_in_background(upsert_points_throttled, peer_api_uris[0], COLLECTION_NAME, start=0, end=num_updates),
# Delete points on the first two peers, don't overlap with updates
run_in_background(delete_points_throttled, peer_api_uris[0], COLLECTION_NAME, start=num_updates, end=num_updates + num_deletes),
run_in_background(delete_points_throttled, peer_api_uris[1], COLLECTION_NAME, start=num_updates + num_deletes, end=num_updates + num_deletes * 2),
]
# Reshard 3 times in sequence
for shard_count in range(2, 5):
# Start resharding
r = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"start_resharding": {
"direction": "up"
}
})
assert_http_ok(r)
# Wait for resharding operation to start and stop
wait_for_collection_resharding_operations_count(peer_api_uris[0], COLLECTION_NAME, 1)
for uri in peer_api_uris:
wait_for_collection_resharding_operations_count(uri, COLLECTION_NAME, 0, wait_for_timeout=120)
# Assert node shard count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, shard_count)
# Wait for background updates to finish
for task in update_tasks:
while task.is_alive():
pass
# Assert node shard and point sum count
# Expects base points + 3x upserts - 2x deletes
expected_points = num_points + num_inserts * 3 - num_deletes * 2
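    # With the values above: 1000 + 3 * 1000 - 2 * 33 = 3934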
for uri in peer_api_uris:
assert check_collection_local_shards_point_count(uri, COLLECTION_NAME, expected_points)
sleep(1)
# Match all points on all nodes exactly
    # Note: due to concurrent updates on all peers this check may fail, but I've
    # not seen it happen yet. Once it does, we probably want to remove this check.
data = []
for uri in peer_api_uris:
r = requests.post(
f"{uri}/collections/{COLLECTION_NAME}/points/scroll", json={
"limit": 999999999,
"with_vectors": True,
"with_payload": True,
}
)
assert_http_ok(r)
data.append(r.json()["result"])
check_data_consistency(data)


# Test point count during resharding.
#
# On a static collection, this performs resharding a few times and asserts the
# exact point count remains stable on all peers during the whole process.
@pytest.mark.skip(reason="moving resharding driver to external service")
def test_resharding_stable_point_count(tmp_path: pathlib.Path):
assert_project_root()
num_points = 1000
    # Prevent optimizers from messing with point counts
    env = {
"QDRANT__STORAGE__OPTIMIZERS__INDEXING_THRESHOLD_KB": "0",
}
peer_api_uris, _peer_dirs, _bootstrap_uri = start_cluster(tmp_path, 3, None, extra_env=env)
# Create collection, insert points
create_collection(peer_api_uris[0], shard_number=1, replication_factor=3)
wait_collection_exists_and_active_on_all_peers(
collection_name=COLLECTION_NAME,
peer_api_uris=peer_api_uris,
)
upsert_random_points(peer_api_uris[0], num_points)
sleep(1)
# Assert node shard and point sum count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, 1)
assert check_collection_local_shards_point_count(uri, COLLECTION_NAME, num_points)
# Reshard 3 times in sequence
for shard_count in range(2, 5):
# Start resharding
r = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"start_resharding": {
"direction": "up"
}
})
assert_http_ok(r)
# Wait for resharding to start
wait_for_collection_resharding_operations_count(peer_api_uris[0], COLLECTION_NAME, 1)
# Continuously assert point count on all peers, must be stable
# Stop once all peers reported completed resharding
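        # The exact count must stay fixed; the non-exact count is only a
        # cardinality estimate and may presumably drift while points exist in
        # both the old and the new shards, hence the generous bounds below.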
while True:
for uri in peer_api_uris:
assert get_collection_point_count(uri, COLLECTION_NAME, exact=True) == num_points
                cardinality_count = get_collection_point_count(uri, COLLECTION_NAME, exact=False)
                assert num_points / 2 <= cardinality_count < num_points * 2
all_completed = True
for uri in peer_api_uris:
if not check_collection_resharding_operations_count(uri, COLLECTION_NAME, 0):
all_completed = False
break
if all_completed:
break
# Assert node shard count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, shard_count)
sleep(1)
# Match all points on all nodes exactly
data = []
for uri in peer_api_uris:
r = requests.post(
f"{uri}/collections/{COLLECTION_NAME}/points/scroll", json={
"limit": 999999999,
"with_vectors": True,
"with_payload": True,
}
)
assert_http_ok(r)
data.append(r.json()["result"])
check_data_consistency(data)


# Test point count during resharding and indexing.
#
# On a static collection, this performs resharding and indexing a few times and
# asserts the exact point count remains stable on all peers during the whole
# process.
@pytest.mark.skip(reason="moving resharding driver to external service")
def test_resharding_indexing_stable_point_count(tmp_path: pathlib.Path):
assert_project_root()
num_points = 1000
# Configure optimizers to index right away with a low vector count
    env = {
"QDRANT__STORAGE__OPTIMIZERS__INDEXING_THRESHOLD_KB": "1",
}
peer_api_uris, _peer_dirs, _bootstrap_uri = start_cluster(tmp_path, 3, None, extra_env=env)
# Create collection, insert points
create_collection(peer_api_uris[0], shard_number=1, replication_factor=3)
wait_collection_exists_and_active_on_all_peers(
collection_name=COLLECTION_NAME,
peer_api_uris=peer_api_uris,
)
upsert_random_points(peer_api_uris[0], num_points)
sleep(1)
# Assert node shard and exact point count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, 1)
assert get_collection_point_count(uri, COLLECTION_NAME, exact=True) == num_points
# Reshard 3 times in sequence
for shard_count in range(2, 5):
# Start resharding
r = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"start_resharding": {
"direction": "up"
}
})
assert_http_ok(r)
# Wait for resharding operation to start and stop
wait_for_collection_resharding_operations_count(peer_api_uris[0], COLLECTION_NAME, 1)
# Continuously assert exact point count on all peers, must be stable
# Stop once all peers reported completed resharding
while True:
for uri in peer_api_uris:
assert get_collection_point_count(uri, COLLECTION_NAME, exact=True) == num_points
all_completed = True
for uri in peer_api_uris:
if not check_collection_resharding_operations_count(uri, COLLECTION_NAME, 0):
all_completed = False
break
if all_completed:
break
# Assert node shard count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, shard_count)
# Wait for optimizations to complete
for uri in peer_api_uris:
wait_collection_green(uri, COLLECTION_NAME)
# Assert exact point count one more time
for uri in peer_api_uris:
assert get_collection_point_count(uri, COLLECTION_NAME, exact=True) == num_points
# Match all points on all nodes exactly
data = []
for uri in peer_api_uris:
r = requests.post(
f"{uri}/collections/{COLLECTION_NAME}/points/scroll", json={
"limit": 999999999,
"with_vectors": True,
"with_payload": True,
}
)
assert_http_ok(r)
data.append(r.json()["result"])
check_data_consistency(data)


# Test point scroll stability during resharding.
#
# On a static collection, this performs resharding a few times and asserts
# scrolling remains stable on all peers during the whole process.
@pytest.mark.skip(reason="moving resharding driver to external service")
def test_resharding_stable_scroll(tmp_path: pathlib.Path):
assert_project_root()
num_points = 1000
scroll_limit = 25
    # Prevent optimizers from messing with point counts
    env = {
"QDRANT__STORAGE__OPTIMIZERS__INDEXING_THRESHOLD_KB": "0",
}
peer_api_uris, _peer_dirs, _bootstrap_uri = start_cluster(tmp_path, 3, None, extra_env=env)
# Create collection, insert points
create_collection(peer_api_uris[0], shard_number=1, replication_factor=3)
wait_collection_exists_and_active_on_all_peers(
collection_name=COLLECTION_NAME,
peer_api_uris=peer_api_uris,
)
upsert_random_points(peer_api_uris[0], num_points)
sleep(1)
# Assert node shard and point sum count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, 1)
assert check_collection_local_shards_point_count(uri, COLLECTION_NAME, num_points)
# Match scroll sample of points on all nodes exactly
data = []
for uri in peer_api_uris:
r = requests.post(
f"{uri}/collections/{COLLECTION_NAME}/points/scroll", json={
"limit": scroll_limit,
"with_vectors": True,
"with_payload": False,
}
)
assert_http_ok(r)
data.append(r.json()["result"])
check_data_consistency(data)
# Reshard 3 times in sequence
for shard_count in range(2, 5):
# Start resharding
r = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"start_resharding": {
"direction": "up"
}
})
assert_http_ok(r)
# Wait for resharding to start
wait_for_collection_resharding_operations_count(peer_api_uris[0], COLLECTION_NAME, 1)
# Continuously assert point scroll samples on all peers, must be stable
# Stop once all peers reported completed resharding
while True:
# Match scroll sample of points on all nodes exactly
data = []
for uri in peer_api_uris:
r = requests.post(
f"{uri}/collections/{COLLECTION_NAME}/points/scroll", json={
"limit": scroll_limit,
"with_vectors": True,
"with_payload": False,
}
)
assert_http_ok(r)
data.append(r.json()["result"])
check_data_consistency(data)
all_completed = True
for uri in peer_api_uris:
if not check_collection_resharding_operations_count(uri, COLLECTION_NAME, 0):
all_completed = False
break
if all_completed:
break
# Assert node shard count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, shard_count)


# Test point query stability during resharding.
#
# On a static collection, this performs resharding a few times and asserts
# query results remain stable on all peers during the whole process.
@pytest.mark.skip(reason="moving resharding driver to external service")
def test_resharding_stable_query(tmp_path: pathlib.Path):
assert_project_root()
num_points = 1000
query_limit = 10
    # Prevent optimizers from messing with point counts
    env = {
"QDRANT__STORAGE__OPTIMIZERS__INDEXING_THRESHOLD_KB": "0",
}
peer_api_uris, _peer_dirs, _bootstrap_uri = start_cluster(tmp_path, 3, None, extra_env=env)
# Create collection, insert points
create_collection(peer_api_uris[0], shard_number=1, replication_factor=3)
wait_collection_exists_and_active_on_all_peers(
collection_name=COLLECTION_NAME,
peer_api_uris=peer_api_uris,
)
upsert_random_points(peer_api_uris[0], num_points)
sleep(1)
# Assert node shard and point sum count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, 1)
assert check_collection_local_shards_point_count(uri, COLLECTION_NAME, num_points)
# Match search sample of points on all nodes exactly
data = []
search_vector = random_dense_vector()
for uri in peer_api_uris:
r = requests.post(
f"{uri}/collections/{COLLECTION_NAME}/points/query", json={
"vector": search_vector,
"limit": query_limit,
}
)
assert_http_ok(r)
data.append(r.json()["result"])
check_query_consistency(data)
# Reshard 3 times in sequence
for shard_count in range(2, 5):
# Start resharding
r = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"start_resharding": {
"direction": "up"
}
})
assert_http_ok(r)
# Wait for resharding to start
wait_for_collection_resharding_operations_count(peer_api_uris[0], COLLECTION_NAME, 1)
# Continuously assert point search samples on all peers, must be stable
# Stop once all peers reported completed resharding
while True:
# Match search sample of points on all nodes exactly
data = []
search_vector = random_dense_vector()
for uri in peer_api_uris:
r = requests.post(
f"{uri}/collections/{COLLECTION_NAME}/points/query", json={
"vector": search_vector,
"limit": query_limit,
}
)
assert_http_ok(r)
data.append(r.json()["result"])
check_query_consistency(data)
all_completed = True
for uri in peer_api_uris:
if not check_collection_resharding_operations_count(uri, COLLECTION_NAME, 0):
all_completed = False
break
if all_completed:
break
# Assert node shard count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, shard_count)


# Test resharding resumption on restart at various stages.
#
# On a static collection, this performs resharding. It kills and restarts the
# driving peer at various stages. On restart, it should finish resharding as if
# nothing happened.
@pytest.mark.skip(reason="moving resharding driver to external service")
def test_resharding_resume_on_restart(tmp_path: pathlib.Path):
assert_project_root()
num_points = 2500
# Stages at which we interrupt and restart resharding
# We'd like to interrupt at other stages too, but they are too quick for this test to catch them
interrupt_stages = ["migrate points", "replicate"]
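    # The remaining stages seen elsewhere in this file ('commit write hash
    # ring', 'propagate deletes', 'finalize') fall into that too-quick category.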
    # Prevent optimizers from messing with point counts
    env = {
"QDRANT__STORAGE__OPTIMIZERS__INDEXING_THRESHOLD_KB": "0",
}
peer_api_uris, peer_dirs, bootstrap_uri = start_cluster(tmp_path, 3, None, extra_env=env)
first_peer_process = processes.pop(0)
first_peer_id = get_cluster_info(peer_api_uris[0])['peer_id']
# Create collection, insert points
create_collection(peer_api_uris[0], shard_number=1, replication_factor=3)
wait_collection_exists_and_active_on_all_peers(
collection_name=COLLECTION_NAME,
peer_api_uris=peer_api_uris,
)
upsert_random_points(peer_api_uris[0], num_points)
sleep(1)
# Assert node shard and point sum count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, 1)
assert check_collection_local_shards_point_count(uri, COLLECTION_NAME, num_points)
# Start resharding
r = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"start_resharding": {
"direction": "up",
"peer_id": first_peer_id
}
})
assert_http_ok(r)
# Interrupt the resharding node once at each stage
for stage in interrupt_stages:
# Wait for resharding operation to start and migrate points
wait_for_collection_resharding_operations_count(peer_api_uris[0], COLLECTION_NAME, 1)
wait_for_collection_resharding_operation_stage(peer_api_uris[0], COLLECTION_NAME, stage)
# Kill and restart first peer
first_peer_process.kill()
sleep(1)
peer_api_uris[0] = start_peer(peer_dirs[0], "peer_0_restarted.log", bootstrap_uri, extra_env=env)
first_peer_process = processes.pop()
wait_for_peer_online(peer_api_uris[0], "/")
# Wait for resharding operation to start and stop
wait_for_collection_resharding_operations_count(peer_api_uris[0], COLLECTION_NAME, 1)
for uri in peer_api_uris:
wait_for_collection_resharding_operations_count(uri, COLLECTION_NAME, 0)
# Assert node shard and point sum count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, 2)
assert get_collection_point_count(uri, COLLECTION_NAME, exact=True) == num_points
sleep(1)
# Match all points on all nodes exactly
data = []
for uri in peer_api_uris:
r = requests.post(
f"{uri}/collections/{COLLECTION_NAME}/points/scroll", json={
"limit": 999999999,
"with_vectors": True,
"with_payload": True,
}
)
assert_http_ok(r)
data.append(r.json()["result"])
check_data_consistency(data)


# Test that resharding can be aborted (before it reaches the `WriteHashRingCommitted` stage)
@pytest.mark.skip(reason="seems like a deadlock is sometimes possible during explicit (?) abort, so the test is disabled until deadlock is fixed, to reduce flakiness")
def test_resharding_abort(tmp_path: pathlib.Path):
peer_api_uris, peer_ids = bootstrap_resharding(tmp_path)
# Abort resharding
resp = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"abort_resharding": {}
}
)
assert_http_ok(resp)
# Wait for resharding to abort
wait_for_resharding_to_finish(peer_api_uris, 3)


# Test that resharding *can't* be aborted once it has reached the `WriteHashRingCommitted` stage
@pytest.mark.skip(reason="flaky")
def test_resharding_try_abort_after_write_hash_ring_committed(tmp_path: pathlib.Path):
peer_api_uris, peer_ids = bootstrap_resharding(tmp_path)
    # Wait for resharding to reach the `commit write hash ring` stage or later
wait_for_one_of_resharding_operation_stages(
peer_api_uris[0],
[
'commit write hash ring',
'propagate deletes',
'finalize',
],
wait_for_interval=0.125,
)
# Try to abort resharding
resp = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"abort_resharding": {}
}
)
assert resp.status_code == 400
# Wait for resharding to finish successfully
wait_for_resharding_to_finish(peer_api_uris, 4)


# Test that resharding is automatically aborted when the collection is deleted
@pytest.mark.skip(reason="flaky")
def test_resharding_abort_on_delete_collection(tmp_path: pathlib.Path):
peer_api_uris, peer_ids = bootstrap_resharding(tmp_path)
# Delete collection
resp = requests.delete(f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}")
assert_http_ok(resp)
# TODO: Check... *something*? What? 🤔


# Test that resharding is automatically aborted when a custom shard key is deleted
@pytest.mark.skip(reason="flaky")
def test_resharding_abort_on_delete_shard_key(tmp_path: pathlib.Path):
peer_api_uris, peer_ids = bootstrap_resharding(
tmp_path,
shard_keys=["custom_shard_key_1", "custom_shard_key_2"],
resharding_shard_key="custom_shard_key_2",
)
# Delete shard key
resp = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/shards/delete", json={
"shard_key": "custom_shard_key_2",
}
)
assert_http_ok(resp)
# Wait for resharding to abort (!?)
wait_for_resharding_to_finish(peer_api_uris, 3)


# Test that resharding is automatically aborted when we force-remove the resharding peer
@pytest.mark.skip(reason="flaky")
def test_resharding_abort_on_remove_peer(tmp_path: pathlib.Path):
    # Place the resharding shard on the *last* peer for this test, so that the
    # first peer is still available after we remove the *resharding* peer...
peer_api_uris, peer_ids = bootstrap_resharding(tmp_path, replication_peer_idx=-1)
# Remove peer
resp = requests.delete(f"{peer_api_uris[0]}/cluster/peer/{peer_ids[-1]}?force=true")
assert_http_ok(resp)
# Wait for resharding to abort
wait_for_resharding_to_finish(peer_api_uris[:-1], 3)


# Test that resharding is automatically restarted when we force-remove a peer
# that receives a *replica* of the new shard during replication
@pytest.mark.skip(reason="flaky")
def test_resharding_restart_on_remove_peer_during_replicate(tmp_path: pathlib.Path):
peer_api_uris, peer_ids = bootstrap_resharding(tmp_path)
# Wait for `stream_records` shard transfer (during `replicate` resharding stage)
info = wait_for_resharding_shard_transfer_info(peer_api_uris[0], 'replicate', 'stream_records')
    # Select peer to remove
    peer_to_remove = info['to']
    # Remove peer
    resp = requests.delete(f"{peer_api_uris[0]}/cluster/peer/{peer_to_remove}?force=true")
assert_http_ok(resp)
# Wait for resharding to restart
wait_for_collection_resharding_operation_stage(peer_api_uris[0], COLLECTION_NAME, 'migrate points')
    # Select peers that weren't removed
    valid_peer_uris = [
        uri
        for peer_id, uri in zip(peer_ids, peer_api_uris)
        if peer_id != peer_to_remove
    ]
# Wait for resharding to finish successfully
wait_for_resharding_to_finish(valid_peer_uris, 4)


# Test that the new shard *can't* be removed during resharding (before it has been replicated at least once)
@pytest.mark.skip(reason="flaky")
def test_resharding_try_abort_on_remove_shard_before_replicate(tmp_path: pathlib.Path):
peer_api_uris, peer_ids = bootstrap_resharding(tmp_path)
# Try to remove new shard (before it has been replicated at least once)
resp = requests.post(f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"drop_replica": {
"peer_id": peer_ids[0],
"shard_id": 3,
}
})
assert resp.status_code == 400


# Test that resharding is automatically restarted when we remove the *source*
# replica of the new shard during replication
@pytest.mark.skip(reason="removing transfer source shard is broken at the moment (in general, not just for resharding)")
def test_resharding_restart_on_remove_src_shard_during_replicate(tmp_path: pathlib.Path):
resharding_restart_on_remove_shard_during_replicate(tmp_path, 'from')


# Test that resharding is automatically restarted when we remove the *destination*
# replica of the new shard during replication
@pytest.mark.skip(reason="resharding detects removing destination shard of replication shard transfer as successful transfer")
def test_resharding_restart_on_remove_dst_shard_during_replicate(tmp_path: pathlib.Path):
resharding_restart_on_remove_shard_during_replicate(tmp_path, 'to')


def resharding_restart_on_remove_shard_during_replicate(tmp_path: pathlib.Path, shard_to_remove: str):
peer_api_uris, peer_ids = bootstrap_resharding(tmp_path)
# Wait for `stream_records` shard transfer (during `replicate` resharding stage)
info = wait_for_resharding_shard_transfer_info(peer_api_uris[0], 'replicate', 'stream_records')
# Remove replica of the new shard
resp = requests.post(f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"drop_replica": {
"peer_id": info[shard_to_remove],
"shard_id": 3,
}
})
assert_http_ok(resp)
# Wait for resharding to restart and finish successfully
wait_for_collection_resharding_operation_stage(peer_api_uris[0], COLLECTION_NAME, 'migrate points')
wait_for_resharding_to_finish(peer_api_uris, 4)


def bootstrap_resharding(
    tmp_path: pathlib.Path,
    shard_keys: list[str] | str | None = None,
    shard_number: int = 3,
    replication_factor: int = 2,
    replication_peer_idx: int = 0,
    resharding_shard_key: str | None = None,
):
    peer_api_uris, peer_ids = bootstrap_cluster(
        tmp_path,
        shard_keys=shard_keys,
        shard_number=shard_number,
        replication_factor=replication_factor,
    )
# Start resharding
resp = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"start_resharding": {
"direction": "up",
"peer_id": peer_ids[replication_peer_idx],
"shard_key": resharding_shard_key,
}
})
assert_http_ok(resp)
# Wait for resharding to start
wait_for_collection_resharding_operations_count(peer_api_uris[0], COLLECTION_NAME, 1)
return (peer_api_uris, peer_ids)


def bootstrap_cluster(
    tmp_path: pathlib.Path,
    shard_keys: list[str] | str | None = None,
    shard_number: int = 3,
    replication_factor: int = 2,
) -> tuple[list[str], list[int]]:
assert_project_root()
num_points = 10000
    # Prevent optimizers from messing with point counts
    env = {
"QDRANT__STORAGE__OPTIMIZERS__INDEXING_THRESHOLD_KB": "0",
}
peer_api_uris, _peer_dirs, _bootstrap_uri = start_cluster(tmp_path, 3, None, extra_env=env)
peer_ids = []
for peer_uri in peer_api_uris:
peer_ids.append(get_cluster_info(peer_uri)['peer_id'])
# Create collection
create_collection(
peer_api_uris[0],
COLLECTION_NAME,
shard_number,
replication_factor,
sharding_method='auto' if shard_keys is None else 'custom',
)
wait_collection_exists_and_active_on_all_peers(
collection_name=COLLECTION_NAME,
peer_api_uris=peer_api_uris,
)
# Create custom shard keys (if required), and upload points to collection
    if not isinstance(shard_keys, list):
        shard_keys = [shard_keys]
for shard_key in shard_keys:
# Create custom shard key (if required)
if shard_key is not None:
resp = requests.put(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/shards", json={
"shard_key": shard_key,
"shards_number": shard_number,
"replication_factor": replication_factor,
}
)
assert_http_ok(resp)
# Upsert points to collection
upsert_random_points(peer_api_uris[0], num_points, shard_key=shard_key)
sleep(1)
return (peer_api_uris, peer_ids)


def wait_for_one_of_resharding_operation_stages(peer_uri: str, expected_stages: list[str], **kwargs):
def resharding_operation_stages():
requests.post(f"{peer_uri}/collections/{COLLECTION_NAME}/points/scroll")
info = get_collection_cluster_info(peer_uri, COLLECTION_NAME)
if 'resharding_operations' not in info:
return False
for resharding in info['resharding_operations']:
            if 'comment' not in resharding:
continue
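            # The comment field appears to be formatted as "<stage>: <details>";
            # keep only the stage name for matching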
stage, *_ = resharding['comment'].split(':', maxsplit=1)
if stage in expected_stages:
return True
return False
wait_for(resharding_operation_stages, **kwargs)


def wait_for_resharding_shard_transfer_info(peer_uri: str, expected_stage: str | None, expected_method: str):
if expected_stage is not None:
wait_for_collection_resharding_operation_stage(peer_uri, COLLECTION_NAME, expected_stage)
wait_for_collection_shard_transfer_method(peer_uri, COLLECTION_NAME, expected_method)
info = get_collection_cluster_info(peer_uri, COLLECTION_NAME)
return info['shard_transfers'][0]


def wait_for_resharding_to_finish(peer_uris: list[str], expected_shard_number: int):
# Wait for resharding to finish
for peer_uri in peer_uris:
wait_for_collection_resharding_operations_count(
peer_uri,
COLLECTION_NAME,
0,
wait_for_timeout=60,
)
# Check number of shards in the collection
for peer_uri in peer_uris:
resp = get_collection_cluster_info(peer_uri, COLLECTION_NAME)
assert resp['shard_count'] == expected_shard_number


def run_in_background(run, *args, **kwargs):
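    # Spawn `run` in a separate process; the caller is responsible for joining
    # the returned process (see the update tasks in
    # test_resharding_concurrent_updates)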
p = multiprocessing.Process(target=run, args=args, kwargs=kwargs)
p.start()
return p


def upsert_points_throttled(peer_url, collection_name, start=0, end=None):
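    # Upsert points in small batches, pausing briefly between batches, so that
    # updates keep flowing for the whole duration of a resharding operation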
batch_size = 2
offset = start
while True:
count = min(end - offset, batch_size) if end is not None else batch_size
if count <= 0:
return
upsert_random_points(peer_url, count, collection_name, offset=offset)
offset += count
sleep(random.uniform(0.01, 0.05))


def delete_points_throttled(peer_url, collection_name, start=0, end=None):
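    # Delete points in small batches, pausing briefly between batches,
    # mirroring upsert_points_throttled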
batch_size = 2
offset = start
while True:
count = min(end - offset, batch_size) if end is not None else batch_size
if count <= 0:
return
r = requests.post(
f"{peer_url}/collections/{collection_name}/points/delete?wait=true", json={
"points": list(range(offset, offset + count)),
}
)
assert_http_ok(r)
offset += count
sleep(random.uniform(0.04, 0.06))


def check_data_consistency(data):
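    # Compare the scroll results of consecutive peers pairwise; on a mismatch,
    # print the (possibly truncated) difference in point ids before failing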
    assert len(data) > 1
for i in range(len(data) - 1):
j = i + 1
data_i = data[i]["points"]
data_j = data[j]["points"]
if data_i != data_j:
            ids_i = set(x["id"] for x in data_i)
            ids_j = set(x["id"] for x in data_j)
            diff = ids_i ^ ids_j
if len(diff) < 100:
print(f"Diff between {i} and {j}: {diff}")
else:
print(f"Diff len between {i} and {j}: {len(diff)}")
assert False, "Data on all nodes should be consistent"


def check_query_consistency(data):
    assert len(data) > 1
for i in range(len(data) - 1):
j = i + 1
data_i = data[i]["points"]
data_j = data[j]["points"]
for item in data_i:
if "version" in item:
del item["version"]
for item in data_j:
if "version" in item:
del item["version"]
if data_i != data_j:
ids_i = set(x["id"] for x in data_i)
ids_j = set(x["id"] for x in data_j)
            diff = ids_i ^ ids_j
if len(diff) < 100:
print(f"Diff between {i} and {j}: {diff}")
else:
print(f"Diff len between {i} and {j}: {len(diff)}")
assert False, "Query results on all nodes should be consistent"