colibri.qdrant / tests /consensus_tests /test_cluster_rejoin.py
Gouzi Mohaled
Ajout du dossier tests
3932407
import io
import pathlib
import shutil
from time import sleep
from typing import Any
from consensus_tests.fixtures import create_collection, upsert_random_points, drop_collection
import requests
from .utils import *
# Number of peers passed to `start_cluster` in `test_rejoin_cluster`
N_PEERS = 3
# Replication factor passed to `create_collection` in `test_rejoin_cluster`
N_REPLICA = 2
# Shard number passed to `create_collection` in `test_rejoin_cluster`
N_SHARDS = 3
def test_rejoin_cluster(tmp_path: pathlib.Path):
    """
    This test checks that a peer, which was down while the rest of the cluster kept applying
    consensus operations, recovers a collection it has never seen once it is restarted.
    - start cluster, create `test_collection` and upsert points
    - kill last peer and assert that some replicas are not active
    - repeatedly drop/re-create `test_collection` to accumulate Raft log entries
    - create `test_collection2` while last peer is still down
    - restart last peer, keep dropping/re-creating `test_collection`
    - wait until all replicas of `test_collection2` are active, as reported both by first peer
      and by restarted peer
    """

    collection = "test_collection"
    new_collection = "test_collection2"

    assert_project_root()

    # Start cluster
    peer_api_uris, peer_dirs, bootstrap_uri = start_cluster(tmp_path, N_PEERS, port_seed=10000)
    first_uri = peer_api_uris[0]

    create_collection(first_uri, shard_number=N_SHARDS, replication_factor=N_REPLICA)
    wait_collection_exists_and_active_on_all_peers(collection_name=collection, peer_api_uris=peer_api_uris)
    upsert_random_points(first_uri, 100)

    # Stop last node
    processes.pop().kill()

    # Validate upsert works with the dead node
    upsert_random_points(first_uri, 100)

    # Assert that there are dead replicas
    wait_for_some_replicas_not_active(first_uri, collection)

    # Repeatedly drop, re-create collection and add data to it to accumulate Raft log entries
    for i in range(2):
        print(f"creating collection {i}")

        # Drop test_collection
        drop_collection(first_uri, collection, timeout=5)

        # Re-create test_collection
        create_collection(first_uri, shard_number=N_SHARDS, replication_factor=N_REPLICA, timeout=3)

        # Collection might not be ready yet, we don't care
        upsert_random_points(first_uri, 100)

        print(f"before recovery end {i}")
        res = requests.get(f"{peer_api_uris[1]}/collections")
        print(res.json())

    # Create new collection unknown to the dead node
    create_collection(
        first_uri,
        new_collection,
        shard_number=N_SHARDS,
        replication_factor=N_REPLICA,
        timeout=3,
    )

    # Restart last node
    new_url = start_peer(peer_dirs[-1], "peer_0_restarted.log", bootstrap_uri, port=20000)
    peer_api_uris[-1] = new_url

    # Wait for restarted node to be up and ready
    wait_all_peers_up([new_url])

    # Repeatedly drop, re-create collection and add data to it to accumulate Raft log entries
    for i in range(5):
        print(f"after recovery start {i}")

        # Drop test_collection
        drop_collection(first_uri, collection, timeout=5)

        # Re-create test_collection
        create_collection(first_uri, shard_number=N_SHARDS, replication_factor=N_REPLICA, timeout=3)

        # Upsert errors are tolerated here (`fail_on_error=False`)
        upsert_random_points(first_uri, 500, fail_on_error=False)

        print(f"after recovery end {i}")
        res = requests.get(f"{new_url}/collections")
        print(res.json())

    wait_for_all_replicas_active(first_uri, new_collection)

    # Assert that the restarted node has recovered the new collection
    wait_for_all_replicas_active(new_url, new_collection)
def test_rejoin_origin_from_wal(tmp_path: pathlib.Path):
    """
    This test checks that origin peer (first peer of the cluster) commits its own peer ID to consensus.

    - remove origin peer from cluster
    - modify second peer's `raft_state.json`, so that it does *not* provide origin peer ID and URL
      when bootstrapping new peer
    - add new peer to the cluster (bootstrapping from second peer), and check that it has valid
      state after it syncs with consensus
    - if new peer has valid state at the end of the test, it means it received correct origin peer
      ID and URL from consensus
    """

    # Overwrite `first_voter` peer with second peer's own ID.
    # Returning the (truthy) state makes `rejoin_cluster_test` write it back to `raft_state.json`.
    def overwrite_first_voter(state: dict[str, Any], _: Any) -> dict[str, Any]:
        state["first_voter"] = state["this_peer_id"]
        return state

    rejoin_cluster_test(tmp_path, start_cluster, overwrite_first_voter)
def test_rejoin_origin_from_state(tmp_path: pathlib.Path):
    """
    This test checks that Qdrant persists origin peer ID (`first_voter` field in `raft_state.json`)
    and propagates fake origin peer URL when bootstrapping new peer.

    - start cluster using *preconfigured* origin peer that does *not* have origin peer ID and URL
      committed to consensus
    - remove origin peer from cluster
    - assert that second peer's `raft_state.json` contains valid origin peer ID
    - add new peer to the cluster (bootstrapping from second peer), and check that it has valid
      state after it syncs with consensus
    - if new peer has valid state at the end of the test, it means it received correct origin peer
      ID and (fake) URL from second peer during bootstrap
    """

    # Assert origin peer ID is persisted as `first_voter`.
    # Returns `None`, so `rejoin_cluster_test` leaves `raft_state.json` untouched.
    def assert_first_voter(state: dict[str, Any], origin_peer_id: int) -> None:
        assert state["first_voter"] == origin_peer_id

    rejoin_cluster_test(tmp_path, start_preconfigured_cluster, assert_first_voter)
@pytest.mark.skip("this test simulates and asserts past, incorrect behavior")
def test_rejoin_no_origin(tmp_path: pathlib.Path):
    """
    This test checks that `rejoin_cluster_test` is sufficient to reproduce "missing origin peer" bug.

    It simulates *earlier* behavior of Qdrant (bypassing all fixes to commit/persist/recover origin
    peer ID/URL), and then checks that new peer added to such cluster has *invalid* state.

    This test is disabled by default, but it's useful to "test the tests" and reproduce original bug.
    """

    # Overwrite `first_voter` peer with a peer ID that is not the origin peer.
    # Returning the (truthy) state makes `rejoin_cluster_test` write it back to `raft_state.json`.
    def overwrite_first_voter(state: dict[str, Any], _: Any) -> dict[str, Any]:
        state["first_voter"] = 1337
        return state

    # With the bug reproduced, new peer is expected to observe only 2 remote shards instead of 3
    rejoin_cluster_test(tmp_path, start_preconfigured_cluster, overwrite_first_voter, expected_shards=2)
def test_rejoin_recover_origin(tmp_path: pathlib.Path):
    """
    This test checks that Qdrant recovers origin peer ID from WAL, if origin peer was not yet
    removed from the cluster.

    - start preconfigured cluster and kill second peer
    - delete `first_voter` from second peer's `raft_state.json` and restart second peer
    - assert that `first_voter` is present in `raft_state.json` again and equals origin peer ID
    - remove origin peer from cluster, add new peer (bootstrapping from second peer), and check
      that new peer observes expected number of remote shards
    """

    collection = "test_collection"
    peers = 3
    shards = 3

    # Start cluster
    peer_uris, peer_dirs, bootstrap_uri = start_preconfigured_cluster(tmp_path, peers)

    # Get origin peer ID
    origin_peer_id = get_cluster_info(peer_uris[0])["peer_id"]

    # Wait a few seconds for consensus to catch up
    sleep(5)

    # Kill second peer
    second_peer = processes.pop(1)
    second_peer.kill()

    raft_state_path = f"{peer_dirs[1]}/storage/raft_state.json"

    # Remove `first_voter` from `raft_state.json`
    with open(raft_state_path, "r+") as file:
        state = json.load(file)
        del state["first_voter"]

        # Rewind and truncate, so that the file is fully replaced with the new state
        file.seek(0, io.SEEK_SET)
        file.truncate()
        json.dump(state, file)

    # Restart second peer with the same URI and ports
    second_peer_uri, bootstrap_uri = start_first_peer(peer_dirs[1], "peer_0_1_restarted.log", second_peer.p2p_port)
    wait_for_peer_online(second_peer_uri)

    # Assert second peer recovered `first_voter` from WAL
    with open(raft_state_path, "r") as file:
        state = json.load(file)

    assert state["first_voter"] == origin_peer_id

    # Create collection, move all shards from first peer, remove first peer from cluster
    create_collection(peer_uris[0], collection, shards, 1)
    move_all_shards_from_peer(peer_uris[0], collection)
    remove_peer(peer_uris[0])
    processes.pop(0).kill()

    # Wait a few seconds for new leader
    sleep(5)

    # Add new peer to cluster
    new_peer_uri, _ = add_new_peer(tmp_path, peers, bootstrap_uri, collection)

    # Assert that new peer observe expected number of remote shards
    info = get_collection_cluster_info(new_peer_uri, collection)
    assert len(info["remote_shards"]) == shards
def rejoin_cluster_test(
    tmp_path: pathlib.Path,
    start_cluster: Callable[[pathlib.Path, int], tuple[list[str], list[pathlib.Path], str]],
    raft_state: Callable[[dict[str, Any], int], Any | None],
    collection: str = "test_collection",
    peers: int = 3,
    shards: int = 3,
    expected_shards: int = 3,
):
    """
    Parameterized test body, that tests adding new peer after origin peer was removed from the cluster.

    See: <https://github.com/qdrant/qdrant/issues/5138>

    Parameters:
    - `tmp_path`: base directory for peer folders
    - `start_cluster`: callable that starts the cluster and returns peer URIs, peer directories
      and bootstrap URI
    - `raft_state`: callback that receives second peer's parsed `raft_state.json` and origin
      peer ID; if it returns a truthy value, that value is written back as the new
      `raft_state.json`, otherwise the file is left untouched
    - `collection`: name of the collection created during the test
    - `peers`: number of peers to start (also used as index of the newly added peer)
    - `shards`: shard number of the created collection
    - `expected_shards`: number of remote shards new peer is expected to observe
    """

    # Start cluster
    peer_uris, peer_dirs, bootstrap_uri = start_cluster(tmp_path, peers)

    # Get origin peer ID
    origin_peer_id = get_cluster_info(peer_uris[0])["peer_id"]

    # Create collection, move all shards from first peer, remove first peer from cluster
    create_collection(peer_uris[0], collection, shards, 1)
    move_all_shards_from_peer(peer_uris[0], collection)
    remove_peer(peer_uris[0])
    processes.pop(0).kill()

    # Generally, we could use *any* (second/third/random/last/etc) peer to bootstrap new peer from,
    # but using second peer allows to (trivially) catch a single additional corner case in how we
    # initialize consensus state when bootstrapping new peer.

    # Kill second peer (it is at index 0 now, because first peer was popped above)
    second_peer = processes.pop(0)
    second_peer.kill()

    # Check/modify second peer `raft_state.json`
    with open(f"{peer_dirs[1]}/storage/raft_state.json", "r+") as file:
        state = json.load(file)

        if new_state := raft_state(state, origin_peer_id):
            # Rewind and truncate, so that the file is fully replaced with the new state
            file.seek(0, io.SEEK_SET)
            file.truncate()
            json.dump(new_state, file)

    # Restart second peer with the same URI and ports
    second_peer_uri, bootstrap_uri = start_first_peer(peer_dirs[1], "peer_0_1_restarted.log", second_peer.p2p_port)
    wait_for_peer_online(second_peer_uri)

    # Add new peer to cluster
    new_peer_uri, _ = add_new_peer(tmp_path, peers, bootstrap_uri, collection)

    # Assert that new peer observe expected number of remote shards
    info = get_collection_cluster_info(new_peer_uri, collection)
    assert len(info["remote_shards"]) == expected_shards
def start_preconfigured_cluster(
    tmp_path: pathlib.Path,
    peers: int = 3,
) -> tuple[list[str], list[pathlib.Path], str]:
    """
    Starts a cluster whose first peer uses Raft state and WAL copied from
    `tests/consensus_tests/test_cluster_rejoin_data`.

    Returns peer URIs, peer directories and bootstrap URI of the first peer.
    """

    assert_project_root()

    # Collect peer URIs
    peer_uris = []

    # Create peer directories
    peer_dirs = make_peer_folders(tmp_path, peers)

    # Copy first peer Raft state and WAL from `test_cluster_rejoin_data`.
    #
    # It's just an "empty" peer, but its peer ID is *not* committed into WAL. We can use this peer to
    # test that first peer ID is correctly recovered/propagated, even when it's not committed into WAL.
    shutil.copytree("tests/consensus_tests/test_cluster_rejoin_data", f"{peer_dirs[0]}/storage")

    # Modify peer URI in Raft state to prevent URI change on startup 🙄
    p2p_port = get_port()
    # NOTE(review): these two values are never read, but the `get_port` calls are kept because
    # they may have side effects (e.g. reserving ports) - confirm against `get_port` before removing
    grpc_port = get_port()
    http_port = get_port()

    with open(f"{peer_dirs[0]}/storage/raft_state.json", "r+") as file:
        state = json.load(file)
        state["peer_address_by_id"][str(state["this_peer_id"])] = f"http://127.0.0.1:{p2p_port}"

        # Rewind and truncate, so that the file is fully replaced with the new state
        file.seek(0, io.SEEK_SET)
        file.truncate()
        json.dump(state, file)

    # Start first peer
    first_peer_uri, bootstrap_uri = start_first_peer(peer_dirs[0], "peer_0_0.log", p2p_port)
    peer_uris.append(first_peer_uri)

    wait_for_peer_online(first_peer_uri)

    # Bootstrap other peers
    for peer_idx in range(1, peers):
        peer_uri = start_peer(peer_dirs[peer_idx], f"peer_0_{peer_idx}.log", bootstrap_uri)
        peer_uris.append(peer_uri)

    wait_all_peers_up(peer_uris)

    return peer_uris, peer_dirs, bootstrap_uri
def move_all_shards_from_peer(peer_uri: str, collection: str = "test_collection") -> tuple[int, int]:
    """
    Moves all local shards of `collection` from peer at `peer_uri` to another peer in the cluster.

    Target peer is the first peer listed in cluster info whose ID differs from the ID of the peer
    at `peer_uri`. Returns `(current_peer_id, other_peer_id)` once no shard transfers remain.
    """

    # Find peer to move shards to
    cluster_info = get_cluster_info(peer_uri)
    current_peer_id = cluster_info["peer_id"]

    # Peer IDs are keys of `peers`, and are converted with `int` before comparison
    other_peer_id = next(
        (int(peer_id) for peer_id in cluster_info["peers"] if int(peer_id) != current_peer_id),
        None,
    )

    # Compare against `None` explicitly, so that peer ID `0` is not treated as "no peer found"
    assert other_peer_id is not None, "no other peer to move shards to"

    # Move all shards from current peer to the other peer
    collection_info = get_collection_cluster_info(peer_uri, collection)

    for shard in collection_info["local_shards"]:
        resp = requests.post(f"{peer_uri}/collections/{collection}/cluster", json={
            "move_shard": {
                "from_peer_id": current_peer_id,
                "to_peer_id": other_peer_id,
                "shard_id": shard["shard_id"],
            }
        })

        assert_http_ok(resp)

    # Wait until all transfers finished
    wait_for_collection_shard_transfers_count(peer_uri, collection, 0)

    return current_peer_id, other_peer_id
def remove_peer(peer_uri: str, peer_id: int | None = None) -> None:
    """
    Removes peer `peer_id` from the cluster by sending `DELETE /cluster/peer/{peer_id}` to `peer_uri`.

    If `peer_id` is `None`, the peer at `peer_uri` removes itself: its own ID is read from
    cluster info first.
    """

    if peer_id is None:
        peer_id = get_cluster_info(peer_uri)["peer_id"]

    resp = requests.delete(f"{peer_uri}/cluster/peer/{peer_id}")
    assert_http_ok(resp)
def add_new_peer(tmp_path: pathlib.Path, peer_idx: int, bootstrap_uri: str, collection: str | None = None):
    """
    Creates a folder for peer `peer_idx` under `tmp_path`, starts the peer bootstrapping it from
    `bootstrap_uri`, and waits until the peer is online.

    If `collection` is provided, additionally waits for that collection on the new peer.
    Returns `(peer_uri, peer_dir)` of the new peer.
    """

    peer_dir = make_peer_folder(tmp_path, peer_idx)
    peer_uri = start_peer(peer_dir, f"peer_0_{peer_idx}.log", bootstrap_uri)

    wait_for_peer_online(peer_uri)

    if collection is not None:
        # Only the new peer is checked, not the rest of the cluster
        wait_collection_on_all_peers(collection, [peer_uri])

    return peer_uri, peer_dir