import multiprocessing
import pathlib
import random
from time import sleep

import pytest
import requests

from .fixtures import upsert_random_points, create_collection, random_dense_vector
from .utils import *

COLLECTION_NAME = "test_collection"


# Test resharding.
#
# On a static collection, this performs resharding up and down a few times and
# asserts the shard and point counts are correct.
#
# More specifically, this starts at 1 shard, reshards up 3 times to 4 shards,
# and then reshards down 3 times back to 1 shard.
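#
# With replication_factor=3 on a 3-node cluster every peer holds a replica of
# every shard, so the per-peer local point sum must equal the total point count
# at every step.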
@pytest.mark.skip(reason="moving resharding driver to external service")
def test_resharding(tmp_path: pathlib.Path):
assert_project_root()
num_points = 1000
    # Prevent optimizers from messing with point counts
    env = {
"QDRANT__STORAGE__OPTIMIZERS__INDEXING_THRESHOLD_KB": "0",
}
peer_api_uris, _peer_dirs, _bootstrap_uri = start_cluster(tmp_path, 3, None, extra_env=env)
# Create collection, insert points
create_collection(peer_api_uris[0], shard_number=1, replication_factor=3)
wait_collection_exists_and_active_on_all_peers(
collection_name=COLLECTION_NAME,
peer_api_uris=peer_api_uris,
)
upsert_random_points(peer_api_uris[0], num_points)
sleep(1)
# Assert node shard and point sum count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, 1)
assert check_collection_local_shards_point_count(uri, COLLECTION_NAME, num_points)
# We cannot reshard down now, because we only have one shard
r = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"start_resharding": {
"direction": "down"
}
})
assert r.status_code == 400
assert r.json()["status"]["error"] == "Bad request: cannot remove shard 0 by resharding down, it is the last shard"
# Reshard up 3 times in sequence
for shard_count in range(2, 5):
# Start resharding
r = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"start_resharding": {
"direction": "up"
}
})
assert_http_ok(r)
# Wait for resharding operation to start and stop
wait_for_collection_resharding_operations_count(peer_api_uris[0], COLLECTION_NAME, 1)
for uri in peer_api_uris:
wait_for_collection_resharding_operations_count(uri, COLLECTION_NAME, 0)
# Assert node shard and point sum count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, shard_count)
assert check_collection_local_shards_point_count(uri, COLLECTION_NAME, num_points)
sleep(1)
# Match all points on all nodes exactly
data = []
for uri in peer_api_uris:
r = requests.post(
f"{uri}/collections/{COLLECTION_NAME}/points/scroll", json={
"limit": 999999999,
"with_vectors": True,
"with_payload": True,
}
)
assert_http_ok(r)
data.append(r.json()["result"])
check_data_consistency(data)
# Reshard down 3 times in sequence
for shard_count in range(3, 0, -1):
# Start resharding
r = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"start_resharding": {
"direction": "down"
}
})
assert_http_ok(r)
# Wait for resharding operation to start and stop
wait_for_collection_resharding_operations_count(peer_api_uris[0], COLLECTION_NAME, 1)
for uri in peer_api_uris:
wait_for_collection_resharding_operations_count(uri, COLLECTION_NAME, 0)
# Assert node shard and point sum count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, shard_count)
assert check_collection_local_shards_point_count(uri, COLLECTION_NAME, num_points)
sleep(1)
# Match all points on all nodes exactly
data = []
for uri in peer_api_uris:
r = requests.post(
f"{uri}/collections/{COLLECTION_NAME}/points/scroll", json={
"limit": 999999999,
"with_vectors": True,
"with_payload": True,
}
)
assert_http_ok(r)
data.append(r.json()["result"])
check_data_consistency(data)


# Test resharding shard balancing.
#
# Sets up a 3-node cluster and a collection with 1 shard and 2 replicas.
# Performs resharding 5 times and asserts the shard replicas are evenly
# balanced across all nodes.
#
# In this case the replicas are balanced across the second and third nodes. The
# first node holds all shards because we explicitly set it as the resharding
# target every time.
@pytest.mark.skip(reason="moving resharding driver to external service")
def test_resharding_balance(tmp_path: pathlib.Path):
assert_project_root()
num_points = 100
    # Prevent optimizers from messing with point counts
    env = {
"QDRANT__STORAGE__OPTIMIZERS__INDEXING_THRESHOLD_KB": "0",
}
peer_api_uris, _peer_dirs, _bootstrap_uri = start_cluster(tmp_path, 3, None, extra_env=env)
first_peer_id = get_cluster_info(peer_api_uris[0])['peer_id']
# Create collection, insert points
create_collection(peer_api_uris[0], shard_number=1, replication_factor=2)
wait_collection_exists_and_active_on_all_peers(
collection_name=COLLECTION_NAME,
peer_api_uris=peer_api_uris,
)
upsert_random_points(peer_api_uris[0], num_points)
sleep(1)
# This test assumes we have a replica for every shard on the first node
# If that is not the case, move the replica there now
if get_collection_local_shards_count(peer_api_uris[0], COLLECTION_NAME) == 0:
second_peer_id = get_cluster_info(peer_api_uris[1])['peer_id']
r = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"move_shard": {
"shard_id": 0,
"from_peer_id": second_peer_id,
"to_peer_id": first_peer_id,
"method": "stream_records",
}
})
assert_http_ok(r)
wait_for_collection_shard_transfers_count(peer_api_uris[0], COLLECTION_NAME, 0)
assert check_collection_local_shards_count(peer_api_uris[0], COLLECTION_NAME, 1)
# Assert node point count
for uri in peer_api_uris:
assert get_collection_point_count(uri, COLLECTION_NAME, exact=True) == num_points
# Reshard 5 times in sequence
for _shard_count in range(5):
# Start resharding
r = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"start_resharding": {
"direction": "up",
"peer_id": first_peer_id
}
})
assert_http_ok(r)
# Wait for resharding operation to start and stop
wait_for_collection_resharding_operations_count(peer_api_uris[0], COLLECTION_NAME, 1)
for uri in peer_api_uris:
wait_for_collection_resharding_operations_count(uri, COLLECTION_NAME, 0)
# Point count across cluster must be stable
for uri in peer_api_uris:
assert get_collection_point_count(uri, COLLECTION_NAME, exact=False) == num_points
# We must end up with:
# - 6 shards on first node, it was the resharding target
# - 3 shards on the other two nodes, 6 replicas balanced over 2 nodes
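    # (1 initial shard + 5 reshards = 6 shards; with replication_factor=2 that
    # is 12 replicas in total: 6 on the first node, leaving 6 to split 3/3
    # across the other two nodes.)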
assert check_collection_local_shards_count(peer_api_uris[0], COLLECTION_NAME, 6)
for uri in peer_api_uris[1:]:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, 3)


# Test resharding with concurrent updates.
#
# This performs resharding a few times while sending point updates to all peers
# concurrently. At the end of the whole process it asserts the expected point
# count.
#
# The concurrent update tasks consist of:
# - 3 processes upserting new points on all peers
# - 1 process updating existing points on the first peer
# - 2 processes deleting points on the first two peers
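#
# The ID ranges below are chosen so the tasks never collide: upserts start at
# 10000/20000/30000 (far above the base points 0..999), updates rewrite ids
# 0..499, and the two delete ranges (500..532 and 533..565) start right after
# the updated ids.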
@pytest.mark.skip(reason="moving resharding driver to external service")
def test_resharding_concurrent_updates(tmp_path: pathlib.Path):
assert_project_root()
num_points = 1000
num_inserts = 1000
num_updates = 500
num_deletes = 33
    # Prevent optimizers from messing with point counts
    env = {
"QDRANT__STORAGE__OPTIMIZERS__INDEXING_THRESHOLD_KB": "0",
}
peer_api_uris, _peer_dirs, _bootstrap_uri = start_cluster(tmp_path, 3, None, extra_env=env)
# Create collection, insert points
create_collection(peer_api_uris[0], shard_number=1, replication_factor=3)
wait_collection_exists_and_active_on_all_peers(
collection_name=COLLECTION_NAME,
peer_api_uris=peer_api_uris,
)
upsert_random_points(peer_api_uris[0], num_points)
sleep(1)
# Assert node shard and point sum count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, 1)
assert check_collection_local_shards_point_count(uri, COLLECTION_NAME, num_points)
# During resharding, keep pushing updates into the collection
update_tasks = [
# Upsert new points on all peers
run_in_background(upsert_points_throttled, peer_api_uris[0], COLLECTION_NAME, start=10000, end=10000 + num_inserts),
run_in_background(upsert_points_throttled, peer_api_uris[1], COLLECTION_NAME, start=20000, end=20000 + num_inserts),
run_in_background(upsert_points_throttled, peer_api_uris[2], COLLECTION_NAME, start=30000, end=30000 + num_inserts),
# Update existing points on the first peer
run_in_background(upsert_points_throttled, peer_api_uris[0], COLLECTION_NAME, start=0, end=num_updates),
# Delete points on the first two peers, don't overlap with updates
run_in_background(delete_points_throttled, peer_api_uris[0], COLLECTION_NAME, start=num_updates, end=num_updates + num_deletes),
run_in_background(delete_points_throttled, peer_api_uris[1], COLLECTION_NAME, start=num_updates + num_deletes, end=num_updates + num_deletes * 2),
]
# Reshard 3 times in sequence
for shard_count in range(2, 5):
# Start resharding
r = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"start_resharding": {
"direction": "up"
}
})
assert_http_ok(r)
# Wait for resharding operation to start and stop
wait_for_collection_resharding_operations_count(peer_api_uris[0], COLLECTION_NAME, 1)
for uri in peer_api_uris:
wait_for_collection_resharding_operations_count(uri, COLLECTION_NAME, 0, wait_for_timeout=120)
# Assert node shard count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, shard_count)
# Wait for background updates to finish
for task in update_tasks:
while task.is_alive():
pass
# Assert node shard and point sum count
# Expects base points + 3x upserts - 2x deletes
expected_points = num_points + num_inserts * 3 - num_deletes * 2
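    # With the values above: 1000 + 3 * 1000 - 2 * 33 = 3934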
for uri in peer_api_uris:
assert check_collection_local_shards_point_count(uri, COLLECTION_NAME, expected_points)
sleep(1)
# Match all points on all nodes exactly
    # Note: due to concurrent updates on all peers this check may fail, but I've
    # not seen it happen yet. Once it does, we probably want to remove this check.
data = []
for uri in peer_api_uris:
r = requests.post(
f"{uri}/collections/{COLLECTION_NAME}/points/scroll", json={
"limit": 999999999,
"with_vectors": True,
"with_payload": True,
}
)
assert_http_ok(r)
data.append(r.json()["result"])
check_data_consistency(data)


# Test point count during resharding.
#
# On a static collection, this performs resharding a few times and asserts the
# exact point count remains stable on all peers during the whole process.
@pytest.mark.skip(reason="moving resharding driver to external service")
def test_resharding_stable_point_count(tmp_path: pathlib.Path):
assert_project_root()
num_points = 1000
    # Prevent optimizers from messing with point counts
    env = {
"QDRANT__STORAGE__OPTIMIZERS__INDEXING_THRESHOLD_KB": "0",
}
peer_api_uris, _peer_dirs, _bootstrap_uri = start_cluster(tmp_path, 3, None, extra_env=env)
# Create collection, insert points
create_collection(peer_api_uris[0], shard_number=1, replication_factor=3)
wait_collection_exists_and_active_on_all_peers(
collection_name=COLLECTION_NAME,
peer_api_uris=peer_api_uris,
)
upsert_random_points(peer_api_uris[0], num_points)
sleep(1)
# Assert node shard and point sum count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, 1)
assert check_collection_local_shards_point_count(uri, COLLECTION_NAME, num_points)
# Reshard 3 times in sequence
for shard_count in range(2, 5):
# Start resharding
r = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"start_resharding": {
"direction": "up"
}
})
assert_http_ok(r)
# Wait for resharding to start
wait_for_collection_resharding_operations_count(peer_api_uris[0], COLLECTION_NAME, 1)
# Continuously assert point count on all peers, must be stable
# Stop once all peers reported completed resharding
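        # The exact count must stay fixed; the non-exact count is only a
        # cardinality estimate and may presumably drift while points exist in
        # both the old and the new shards, hence the generous bounds below.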
while True:
for uri in peer_api_uris:
assert get_collection_point_count(uri, COLLECTION_NAME, exact=True) == num_points
                cardinality_count = get_collection_point_count(uri, COLLECTION_NAME, exact=False)
                assert num_points / 2 <= cardinality_count < num_points * 2
all_completed = True
for uri in peer_api_uris:
if not check_collection_resharding_operations_count(uri, COLLECTION_NAME, 0):
all_completed = False
break
if all_completed:
break
# Assert node shard count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, shard_count)
sleep(1)
# Match all points on all nodes exactly
data = []
for uri in peer_api_uris:
r = requests.post(
f"{uri}/collections/{COLLECTION_NAME}/points/scroll", json={
"limit": 999999999,
"with_vectors": True,
"with_payload": True,
}
)
assert_http_ok(r)
data.append(r.json()["result"])
check_data_consistency(data)


# Test point count during resharding and indexing.
#
# On a static collection, this performs resharding and indexing a few times and
# asserts the exact point count remains stable on all peers during the whole
# process.
@pytest.mark.skip(reason="moving resharding driver to external service")
def test_resharding_indexing_stable_point_count(tmp_path: pathlib.Path):
assert_project_root()
num_points = 1000
# Configure optimizers to index right away with a low vector count
    env = {
"QDRANT__STORAGE__OPTIMIZERS__INDEXING_THRESHOLD_KB": "1",
}
peer_api_uris, _peer_dirs, _bootstrap_uri = start_cluster(tmp_path, 3, None, extra_env=env)
# Create collection, insert points
create_collection(peer_api_uris[0], shard_number=1, replication_factor=3)
wait_collection_exists_and_active_on_all_peers(
collection_name=COLLECTION_NAME,
peer_api_uris=peer_api_uris,
)
upsert_random_points(peer_api_uris[0], num_points)
sleep(1)
# Assert node shard and exact point count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, 1)
assert get_collection_point_count(uri, COLLECTION_NAME, exact=True) == num_points
# Reshard 3 times in sequence
for shard_count in range(2, 5):
# Start resharding
r = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"start_resharding": {
"direction": "up"
}
})
assert_http_ok(r)
# Wait for resharding operation to start and stop
wait_for_collection_resharding_operations_count(peer_api_uris[0], COLLECTION_NAME, 1)
# Continuously assert exact point count on all peers, must be stable
# Stop once all peers reported completed resharding
while True:
for uri in peer_api_uris:
assert get_collection_point_count(uri, COLLECTION_NAME, exact=True) == num_points
all_completed = True
for uri in peer_api_uris:
if not check_collection_resharding_operations_count(uri, COLLECTION_NAME, 0):
all_completed = False
break
if all_completed:
break
# Assert node shard count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, shard_count)
# Wait for optimizations to complete
for uri in peer_api_uris:
wait_collection_green(uri, COLLECTION_NAME)
# Assert exact point count one more time
for uri in peer_api_uris:
assert get_collection_point_count(uri, COLLECTION_NAME, exact=True) == num_points
# Match all points on all nodes exactly
data = []
for uri in peer_api_uris:
r = requests.post(
f"{uri}/collections/{COLLECTION_NAME}/points/scroll", json={
"limit": 999999999,
"with_vectors": True,
"with_payload": True,
}
)
assert_http_ok(r)
data.append(r.json()["result"])
check_data_consistency(data)


# Test point scroll stability during resharding.
#
# On a static collection, this performs resharding a few times and asserts
# scrolling remains stable on all peers during the whole process.
@pytest.mark.skip(reason="moving resharding driver to external service")
def test_resharding_stable_scroll(tmp_path: pathlib.Path):
assert_project_root()
num_points = 1000
scroll_limit = 25
    # Prevent optimizers from messing with point counts
    env = {
"QDRANT__STORAGE__OPTIMIZERS__INDEXING_THRESHOLD_KB": "0",
}
peer_api_uris, _peer_dirs, _bootstrap_uri = start_cluster(tmp_path, 3, None, extra_env=env)
# Create collection, insert points
create_collection(peer_api_uris[0], shard_number=1, replication_factor=3)
wait_collection_exists_and_active_on_all_peers(
collection_name=COLLECTION_NAME,
peer_api_uris=peer_api_uris,
)
upsert_random_points(peer_api_uris[0], num_points)
sleep(1)
# Assert node shard and point sum count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, 1)
assert check_collection_local_shards_point_count(uri, COLLECTION_NAME, num_points)
# Match scroll sample of points on all nodes exactly
data = []
for uri in peer_api_uris:
r = requests.post(
f"{uri}/collections/{COLLECTION_NAME}/points/scroll", json={
"limit": scroll_limit,
"with_vectors": True,
"with_payload": False,
}
)
assert_http_ok(r)
data.append(r.json()["result"])
check_data_consistency(data)
# Reshard 3 times in sequence
for shard_count in range(2, 5):
# Start resharding
r = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"start_resharding": {
"direction": "up"
}
})
assert_http_ok(r)
# Wait for resharding to start
wait_for_collection_resharding_operations_count(peer_api_uris[0], COLLECTION_NAME, 1)
# Continuously assert point scroll samples on all peers, must be stable
# Stop once all peers reported completed resharding
while True:
# Match scroll sample of points on all nodes exactly
data = []
for uri in peer_api_uris:
r = requests.post(
f"{uri}/collections/{COLLECTION_NAME}/points/scroll", json={
"limit": scroll_limit,
"with_vectors": True,
"with_payload": False,
}
)
assert_http_ok(r)
data.append(r.json()["result"])
check_data_consistency(data)
all_completed = True
for uri in peer_api_uris:
if not check_collection_resharding_operations_count(uri, COLLECTION_NAME, 0):
all_completed = False
break
if all_completed:
break
# Assert node shard count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, shard_count)


# Test point query stability during resharding.
#
# On a static collection, this performs resharding a few times and asserts
# query results remain stable on all peers during the whole process.
@pytest.mark.skip(reason="moving resharding driver to external service")
def test_resharding_stable_query(tmp_path: pathlib.Path):
assert_project_root()
num_points = 1000
query_limit = 10
    # Prevent optimizers from messing with point counts
    env = {
"QDRANT__STORAGE__OPTIMIZERS__INDEXING_THRESHOLD_KB": "0",
}
peer_api_uris, _peer_dirs, _bootstrap_uri = start_cluster(tmp_path, 3, None, extra_env=env)
# Create collection, insert points
create_collection(peer_api_uris[0], shard_number=1, replication_factor=3)
wait_collection_exists_and_active_on_all_peers(
collection_name=COLLECTION_NAME,
peer_api_uris=peer_api_uris,
)
upsert_random_points(peer_api_uris[0], num_points)
sleep(1)
# Assert node shard and point sum count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, 1)
assert check_collection_local_shards_point_count(uri, COLLECTION_NAME, num_points)
# Match search sample of points on all nodes exactly
data = []
search_vector = random_dense_vector()
for uri in peer_api_uris:
r = requests.post(
f"{uri}/collections/{COLLECTION_NAME}/points/query", json={
"vector": search_vector,
"limit": query_limit,
}
)
assert_http_ok(r)
data.append(r.json()["result"])
check_query_consistency(data)
# Reshard 3 times in sequence
for shard_count in range(2, 5):
# Start resharding
r = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"start_resharding": {
"direction": "up"
}
})
assert_http_ok(r)
# Wait for resharding to start
wait_for_collection_resharding_operations_count(peer_api_uris[0], COLLECTION_NAME, 1)
# Continuously assert point search samples on all peers, must be stable
# Stop once all peers reported completed resharding
while True:
# Match search sample of points on all nodes exactly
data = []
search_vector = random_dense_vector()
for uri in peer_api_uris:
r = requests.post(
f"{uri}/collections/{COLLECTION_NAME}/points/query", json={
"vector": search_vector,
"limit": query_limit,
}
)
assert_http_ok(r)
data.append(r.json()["result"])
check_query_consistency(data)
all_completed = True
for uri in peer_api_uris:
if not check_collection_resharding_operations_count(uri, COLLECTION_NAME, 0):
all_completed = False
break
if all_completed:
break
# Assert node shard count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, shard_count)


# Test resharding resumption on restart at various stages.
#
# On a static collection, this performs resharding. It kills and restarts the
# driving peer at various stages. On restart, it should finish resharding as if
# nothing happened.
@pytest.mark.skip(reason="moving resharding driver to external service")
def test_resharding_resume_on_restart(tmp_path: pathlib.Path):
assert_project_root()
num_points = 2500
# Stages at which we interrupt and restart resharding
# We'd like to interrupt at other stages too, but they are too quick for this test to catch them
interrupt_stages = ["migrate points", "replicate"]
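    # The remaining stages seen elsewhere in this file ('commit write hash
    # ring', 'propagate deletes', 'finalize') fall into that too-quick category.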
    # Prevent optimizers from messing with point counts
    env = {
"QDRANT__STORAGE__OPTIMIZERS__INDEXING_THRESHOLD_KB": "0",
}
peer_api_uris, peer_dirs, bootstrap_uri = start_cluster(tmp_path, 3, None, extra_env=env)
first_peer_process = processes.pop(0)
first_peer_id = get_cluster_info(peer_api_uris[0])['peer_id']
# Create collection, insert points
create_collection(peer_api_uris[0], shard_number=1, replication_factor=3)
wait_collection_exists_and_active_on_all_peers(
collection_name=COLLECTION_NAME,
peer_api_uris=peer_api_uris,
)
upsert_random_points(peer_api_uris[0], num_points)
sleep(1)
# Assert node shard and point sum count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, 1)
assert check_collection_local_shards_point_count(uri, COLLECTION_NAME, num_points)
# Start resharding
r = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"start_resharding": {
"direction": "up",
"peer_id": first_peer_id
}
})
assert_http_ok(r)
# Interrupt the resharding node once at each stage
for stage in interrupt_stages:
# Wait for resharding operation to start and migrate points
wait_for_collection_resharding_operations_count(peer_api_uris[0], COLLECTION_NAME, 1)
wait_for_collection_resharding_operation_stage(peer_api_uris[0], COLLECTION_NAME, stage)
# Kill and restart first peer
first_peer_process.kill()
sleep(1)
peer_api_uris[0] = start_peer(peer_dirs[0], "peer_0_restarted.log", bootstrap_uri, extra_env=env)
first_peer_process = processes.pop()
wait_for_peer_online(peer_api_uris[0], "/")
# Wait for resharding operation to start and stop
wait_for_collection_resharding_operations_count(peer_api_uris[0], COLLECTION_NAME, 1)
for uri in peer_api_uris:
wait_for_collection_resharding_operations_count(uri, COLLECTION_NAME, 0)
# Assert node shard and point sum count
for uri in peer_api_uris:
assert check_collection_local_shards_count(uri, COLLECTION_NAME, 2)
assert get_collection_point_count(uri, COLLECTION_NAME, exact=True) == num_points
sleep(1)
# Match all points on all nodes exactly
data = []
for uri in peer_api_uris:
r = requests.post(
f"{uri}/collections/{COLLECTION_NAME}/points/scroll", json={
"limit": 999999999,
"with_vectors": True,
"with_payload": True,
}
)
assert_http_ok(r)
data.append(r.json()["result"])
check_data_consistency(data)


# Test that resharding can be aborted (before it reaches the `WriteHashRingCommitted` stage)
@pytest.mark.skip(reason="seems like a deadlock is sometimes possible during explicit (?) abort, so the test is disabled until deadlock is fixed, to reduce flakiness")
def test_resharding_abort(tmp_path: pathlib.Path):
peer_api_uris, peer_ids = bootstrap_resharding(tmp_path)
# Abort resharding
resp = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"abort_resharding": {}
}
)
assert_http_ok(resp)
# Wait for resharding to abort
wait_for_resharding_to_finish(peer_api_uris, 3)


# Test that resharding *can't* be aborted once it has reached the `WriteHashRingCommitted` stage
@pytest.mark.skip(reason="flaky")
def test_resharding_try_abort_after_write_hash_ring_committed(tmp_path: pathlib.Path):
peer_api_uris, peer_ids = bootstrap_resharding(tmp_path)
    # Wait for resharding to reach the `commit write hash ring` stage or later
wait_for_one_of_resharding_operation_stages(
peer_api_uris[0],
[
'commit write hash ring',
'propagate deletes',
'finalize',
],
wait_for_interval=0.125,
)
# Try to abort resharding
resp = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"abort_resharding": {}
}
)
assert resp.status_code == 400
# Wait for resharding to finish successfully
wait_for_resharding_to_finish(peer_api_uris, 4)


# Test that resharding is automatically aborted when the collection is deleted
@pytest.mark.skip(reason="flaky")
def test_resharding_abort_on_delete_collection(tmp_path: pathlib.Path):
peer_api_uris, peer_ids = bootstrap_resharding(tmp_path)
# Delete collection
resp = requests.delete(f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}")
assert_http_ok(resp)
# TODO: Check... *something*? What? 🤔


# Test that resharding is automatically aborted when a custom shard key is deleted
@pytest.mark.skip(reason="flaky")
def test_resharding_abort_on_delete_shard_key(tmp_path: pathlib.Path):
peer_api_uris, peer_ids = bootstrap_resharding(
tmp_path,
shard_keys=["custom_shard_key_1", "custom_shard_key_2"],
resharding_shard_key="custom_shard_key_2",
)
# Delete shard key
resp = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/shards/delete", json={
"shard_key": "custom_shard_key_2",
}
)
assert_http_ok(resp)
# Wait for resharding to abort (!?)
wait_for_resharding_to_finish(peer_api_uris, 3)


# Test that resharding is automatically aborted when we force-remove the resharding peer
@pytest.mark.skip(reason="flaky")
def test_resharding_abort_on_remove_peer(tmp_path: pathlib.Path):
    # Place the resharding shard on the *last* peer for this test, so that the
    # first peer is still available after we remove the *resharding* peer...
peer_api_uris, peer_ids = bootstrap_resharding(tmp_path, replication_peer_idx=-1)
# Remove peer
resp = requests.delete(f"{peer_api_uris[0]}/cluster/peer/{peer_ids[-1]}?force=true")
assert_http_ok(resp)
# Wait for resharding to abort
wait_for_resharding_to_finish(peer_api_uris[:-1], 3)


# Test that resharding is automatically restarted when we force-remove a peer
# that receives a *replica* of the new shard during replication
@pytest.mark.skip(reason="flaky")
def test_resharding_restart_on_remove_peer_during_replicate(tmp_path: pathlib.Path):
peer_api_uris, peer_ids = bootstrap_resharding(tmp_path)
# Wait for `stream_records` shard transfer (during `replicate` resharding stage)
info = wait_for_resharding_shard_transfer_info(peer_api_uris[0], 'replicate', 'stream_records')
    # Select peer to remove
    peer_to_remove = info['to']
    # Remove peer
    resp = requests.delete(f"{peer_api_uris[0]}/cluster/peer/{peer_to_remove}?force=true")
assert_http_ok(resp)
# Wait for resharding to restart
wait_for_collection_resharding_operation_stage(peer_api_uris[0], COLLECTION_NAME, 'migrate points')
    # Select peers that weren't removed
    valid_peer_uris = [
        uri
        for peer_id, uri in zip(peer_ids, peer_api_uris)
        if peer_id != peer_to_remove
    ]
# Wait for resharding to finish successfully
wait_for_resharding_to_finish(valid_peer_uris, 4)


# Test that the new shard *can't* be removed during resharding (before it has been replicated at least once)
@pytest.mark.skip(reason="flaky")
def test_resharding_try_abort_on_remove_shard_before_replicate(tmp_path: pathlib.Path):
peer_api_uris, peer_ids = bootstrap_resharding(tmp_path)
# Try to remove new shard (before it has been replicated at least once)
resp = requests.post(f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"drop_replica": {
"peer_id": peer_ids[0],
"shard_id": 3,
}
})
assert resp.status_code == 400


# Test that resharding is automatically restarted when we remove the *source*
# replica of the new shard during replication
@pytest.mark.skip(reason="removing transfer source shard is broken at the moment (in general, not just for resharding)")
def test_resharding_restart_on_remove_src_shard_during_replicate(tmp_path: pathlib.Path):
resharding_restart_on_remove_shard_during_replicate(tmp_path, 'from')


# Test that resharding is automatically restarted when we remove the *destination*
# replica of the new shard during replication
@pytest.mark.skip(reason="resharding detects removing destination shard of replication shard transfer as successful transfer")
def test_resharding_restart_on_remove_dst_shard_during_replicate(tmp_path: pathlib.Path):
resharding_restart_on_remove_shard_during_replicate(tmp_path, 'to')


def resharding_restart_on_remove_shard_during_replicate(tmp_path: pathlib.Path, shard_to_remove: str):
peer_api_uris, peer_ids = bootstrap_resharding(tmp_path)
# Wait for `stream_records` shard transfer (during `replicate` resharding stage)
info = wait_for_resharding_shard_transfer_info(peer_api_uris[0], 'replicate', 'stream_records')
# Remove replica of the new shard
resp = requests.post(f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"drop_replica": {
"peer_id": info[shard_to_remove],
"shard_id": 3,
}
})
assert_http_ok(resp)
# Wait for resharding to restart and finish successfully
wait_for_collection_resharding_operation_stage(peer_api_uris[0], COLLECTION_NAME, 'migrate points')
wait_for_resharding_to_finish(peer_api_uris, 4)


def bootstrap_resharding(
    tmp_path: pathlib.Path,
    shard_keys: list[str] | str | None = None,
    shard_number: int = 3,
    replication_factor: int = 2,
    replication_peer_idx: int = 0,
    resharding_shard_key: str | None = None,
):
    peer_api_uris, peer_ids = bootstrap_cluster(
        tmp_path,
        shard_keys=shard_keys,
        shard_number=shard_number,
        replication_factor=replication_factor,
    )
# Start resharding
resp = requests.post(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/cluster", json={
"start_resharding": {
"direction": "up",
"peer_id": peer_ids[replication_peer_idx],
"shard_key": resharding_shard_key,
}
})
assert_http_ok(resp)
# Wait for resharding to start
wait_for_collection_resharding_operations_count(peer_api_uris[0], COLLECTION_NAME, 1)
return (peer_api_uris, peer_ids)


def bootstrap_cluster(
    tmp_path: pathlib.Path,
    shard_keys: list[str] | str | None = None,
    shard_number: int = 3,
    replication_factor: int = 2,
) -> tuple[list[str], list[int]]:
assert_project_root()
num_points = 10000
    # Prevent optimizers from messing with point counts
    env = {
"QDRANT__STORAGE__OPTIMIZERS__INDEXING_THRESHOLD_KB": "0",
}
peer_api_uris, _peer_dirs, _bootstrap_uri = start_cluster(tmp_path, 3, None, extra_env=env)
peer_ids = []
for peer_uri in peer_api_uris:
peer_ids.append(get_cluster_info(peer_uri)['peer_id'])
# Create collection
create_collection(
peer_api_uris[0],
COLLECTION_NAME,
shard_number,
replication_factor,
sharding_method='auto' if shard_keys is None else 'custom',
)
wait_collection_exists_and_active_on_all_peers(
collection_name=COLLECTION_NAME,
peer_api_uris=peer_api_uris,
)
# Create custom shard keys (if required), and upload points to collection
    if not isinstance(shard_keys, list):
        shard_keys = [shard_keys]
for shard_key in shard_keys:
# Create custom shard key (if required)
if shard_key is not None:
resp = requests.put(
f"{peer_api_uris[0]}/collections/{COLLECTION_NAME}/shards", json={
"shard_key": shard_key,
"shards_number": shard_number,
"replication_factor": replication_factor,
}
)
assert_http_ok(resp)
# Upsert points to collection
upsert_random_points(peer_api_uris[0], num_points, shard_key=shard_key)
sleep(1)
return (peer_api_uris, peer_ids)


def wait_for_one_of_resharding_operation_stages(peer_uri: str, expected_stages: list[str], **kwargs):
def resharding_operation_stages():
requests.post(f"{peer_uri}/collections/{COLLECTION_NAME}/points/scroll")
info = get_collection_cluster_info(peer_uri, COLLECTION_NAME)
if 'resharding_operations' not in info:
return False
for resharding in info['resharding_operations']:
            if 'comment' not in resharding:
continue
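            # The comment field appears to be formatted as "<stage>: <details>";
            # keep only the stage name for matching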
stage, *_ = resharding['comment'].split(':', maxsplit=1)
if stage in expected_stages:
return True
return False
wait_for(resharding_operation_stages, **kwargs)


def wait_for_resharding_shard_transfer_info(peer_uri: str, expected_stage: str | None, expected_method: str):
if expected_stage is not None:
wait_for_collection_resharding_operation_stage(peer_uri, COLLECTION_NAME, expected_stage)
wait_for_collection_shard_transfer_method(peer_uri, COLLECTION_NAME, expected_method)
info = get_collection_cluster_info(peer_uri, COLLECTION_NAME)
return info['shard_transfers'][0]


def wait_for_resharding_to_finish(peer_uris: list[str], expected_shard_number: int):
# Wait for resharding to finish
for peer_uri in peer_uris:
wait_for_collection_resharding_operations_count(
peer_uri,
COLLECTION_NAME,
0,
wait_for_timeout=60,
)
# Check number of shards in the collection
for peer_uri in peer_uris:
resp = get_collection_cluster_info(peer_uri, COLLECTION_NAME)
assert resp['shard_count'] == expected_shard_number


def run_in_background(run, *args, **kwargs):
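    # Spawn `run` in a separate process; the caller is responsible for joining
    # the returned process (see the update tasks in
    # test_resharding_concurrent_updates)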
p = multiprocessing.Process(target=run, args=args, kwargs=kwargs)
p.start()
return p


def upsert_points_throttled(peer_url, collection_name, start=0, end=None):
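    # Upsert points in small batches, pausing briefly between batches, so that
    # updates keep flowing for the whole duration of a resharding operation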
batch_size = 2
offset = start
while True:
count = min(end - offset, batch_size) if end is not None else batch_size
if count <= 0:
return
upsert_random_points(peer_url, count, collection_name, offset=offset)
offset += count
sleep(random.uniform(0.01, 0.05))


def delete_points_throttled(peer_url, collection_name, start=0, end=None):
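    # Delete points in small batches, pausing briefly between batches,
    # mirroring upsert_points_throttled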
batch_size = 2
offset = start
while True:
count = min(end - offset, batch_size) if end is not None else batch_size
if count <= 0:
return
r = requests.post(
f"{peer_url}/collections/{collection_name}/points/delete?wait=true", json={
"points": list(range(offset, offset + count)),
}
)
assert_http_ok(r)
offset += count
sleep(random.uniform(0.04, 0.06))


def check_data_consistency(data):
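    # Compare the scroll results of consecutive peers pairwise; on a mismatch,
    # print the (possibly truncated) difference in point ids before failing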
    assert len(data) > 1
for i in range(len(data) - 1):
j = i + 1
data_i = data[i]["points"]
data_j = data[j]["points"]
if data_i != data_j:
            ids_i = set(x["id"] for x in data_i)
            ids_j = set(x["id"] for x in data_j)
            diff = ids_i ^ ids_j
if len(diff) < 100:
print(f"Diff between {i} and {j}: {diff}")
else:
print(f"Diff len between {i} and {j}: {len(diff)}")
assert False, "Data on all nodes should be consistent"


def check_query_consistency(data):
    assert len(data) > 1
for i in range(len(data) - 1):
j = i + 1
data_i = data[i]["points"]
data_j = data[j]["points"]
for item in data_i:
if "version" in item:
del item["version"]
for item in data_j:
if "version" in item:
del item["version"]
if data_i != data_j:
ids_i = set(x["id"] for x in data_i)
ids_j = set(x["id"] for x in data_j)
            diff = ids_i ^ ids_j
if len(diff) < 100:
print(f"Diff between {i} and {j}: {diff}")
else:
print(f"Diff len between {i} and {j}: {len(diff)}")
assert False, "Query results on all nodes should be consistent"