import io
import json
import pathlib
import shutil
from time import sleep
from typing import Any, Callable

import pytest
import requests

from consensus_tests.fixtures import create_collection, upsert_random_points, drop_collection
from .utils import *
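
# `from .utils import *` is assumed to provide the cluster helpers used below (e.g. `processes`,
# `start_cluster`, `start_peer`, `start_first_peer`, `get_cluster_info`,
# `get_collection_cluster_info`, `wait_for_peer_online`, `assert_http_ok`, ...).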

N_PEERS = 3
N_REPLICA = 2
N_SHARDS = 3
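
# The tests below read and rewrite `raft_state.json` on disk directly. For orientation, the file
# is assumed to look roughly like this (an illustrative sketch; only the fields touched by these
# tests are shown, and the exact schema may differ between Qdrant versions):
#
#     {
#         "this_peer_id": 1234567890,
#         "first_voter": 1234567890,
#         "peer_address_by_id": {
#             "1234567890": "http://127.0.0.1:20001"
#         },
#         ...
#     }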


def test_rejoin_cluster(tmp_path: pathlib.Path):
    assert_project_root()
    # Start cluster
    peer_api_uris, peer_dirs, bootstrap_uri = start_cluster(tmp_path, N_PEERS, port_seed=10000)

    create_collection(peer_api_uris[0], shard_number=N_SHARDS, replication_factor=N_REPLICA)
    wait_collection_exists_and_active_on_all_peers(collection_name="test_collection", peer_api_uris=peer_api_uris)
    upsert_random_points(peer_api_uris[0], 100)

    # Stop last node
    p = processes.pop()
    p.kill()

    # Validate upsert works with the dead node
    upsert_random_points(peer_api_uris[0], 100)

    # Assert that there are dead replicas
    wait_for_some_replicas_not_active(peer_api_uris[0], "test_collection")

    # Repeatedly drop, re-create collection and add data to it to accumulate Raft log entries
    for i in range(0, 2):
        print(f"creating collection {i}")
        # Drop test_collection
        drop_collection(peer_api_uris[0], "test_collection", timeout=5)
        # Re-create test_collection
        create_collection(peer_api_uris[0], shard_number=N_SHARDS, replication_factor=N_REPLICA, timeout=3)
        # Collection might not be ready yet, we don't care
        upsert_random_points(peer_api_uris[0], 100)
        print(f"before recovery end {i}")
        res = requests.get(f"{peer_api_uris[1]}/collections")
        print(res.json())

    # Create new collection unknown to the dead node
    create_collection(
        peer_api_uris[0],
        "test_collection2",
        shard_number=N_SHARDS,
        replication_factor=N_REPLICA,
        timeout=3
    )

    # Restart last node
    new_url = start_peer(peer_dirs[-1], "peer_0_restarted.log", bootstrap_uri, port=20000)

    peer_api_uris[-1] = new_url

    # Wait for restarted node to be up and ready
    wait_all_peers_up([new_url])

    # Repeatedly drop, re-create collection and add data to it to accumulate Raft log entries
    for i in range(0, 5):
        print(f"after recovery start {i}")
        # Drop test_collection
        drop_collection(peer_api_uris[0], "test_collection", timeout=5)
        # Re-create test_collection
        create_collection(peer_api_uris[0], shard_number=N_SHARDS, replication_factor=N_REPLICA, timeout=3)
        upsert_random_points(peer_api_uris[0], 500, fail_on_error=False)
        print(f"after recovery end {i}")
        res = requests.get(f"{new_url}/collections")
        print(res.json())

    wait_for_all_replicas_active(peer_api_uris[0], "test_collection2")
    # Assert that the restarted node has recovered the new collection
    wait_for_all_replicas_active(new_url, "test_collection2")


def test_rejoin_origin_from_wal(tmp_path: pathlib.Path):
    """
    This test checks that origin peer (first peer of the cluster) commits its own peer ID to consensus.

    - remove origin peer from cluster
    - modify second peer's `raft_state.json`, so that it does *not* provide origin peer ID and URL
      when bootstrapping new peer
    - add new peer to the cluster (bootstrapping from second peer), and check that it has valid
      state after it syncs with consensus
    - if new peer has valid state at the end of the test, it means it received correct origin peer
      ID and URL from consensus
    """

    # Overwrite `first_voter` peer
    def overwrite_first_voter(state: dict[str, Any], _: Any):
        state["first_voter"] = state["this_peer_id"]
        return state

    rejoin_cluster_test(tmp_path, start_cluster, overwrite_first_voter)


def test_rejoin_origin_from_state(tmp_path: pathlib.Path):
    """
    This test checks that Qdrant persists the origin peer ID (the `first_voter` field in
    `raft_state.json`) and propagates a fake origin peer URL when bootstrapping a new peer.

    - start the cluster using a *preconfigured* origin peer that does *not* have its origin peer
      ID and URL committed to consensus
    - remove the origin peer from the cluster
    - assert that the second peer's `raft_state.json` contains the origin peer ID
    - add a new peer to the cluster (bootstrapping from the second peer), and check that it has
      valid state after it syncs with consensus
    - if the new peer has valid state at the end of the test, it means it received the correct
      origin peer ID and (fake) URL from the second peer during bootstrap
    """

    # Assert origin peer ID is persisted as `first_voter`
    def assert_first_voter(state: dict[str, Any], origin_peer_id: int):
        assert state["first_voter"] == origin_peer_id

    rejoin_cluster_test(tmp_path, start_preconfigured_cluster, assert_first_voter)


@pytest.mark.skip("this test simulates and asserts past, incorrect behavior")
def test_rejoin_no_origin(tmp_path: pathlib.Path):
    """
    This test checks that `rejoin_cluster_test` is sufficient to reproduce the "missing origin peer" bug.

    It simulates the *earlier* behavior of Qdrant (bypassing all fixes to commit/persist/recover
    the origin peer ID/URL), and then checks that a new peer added to such a cluster has *invalid*
    state.

    This test is disabled by default, but it's useful to "test the tests" and reproduce the
    original bug.
    """

    # Overwrite `first_voter` peer
    def overwrite_first_voter(state: dict[str, Any], _: Any):
        state["first_voter"] = 1337
        return state

    rejoin_cluster_test(tmp_path, start_preconfigured_cluster, overwrite_first_voter, expected_shards=2)


def test_rejoin_recover_origin(tmp_path: pathlib.Path):
    """
    This test checks that Qdrant recovers origin peer ID from WAL, if origin peer was not yet
    removed from the cluster.
    """

    collection = "test_collection"
    peers = 3
    shards = 3

    # Start cluster
    peer_uris, peer_dirs, bootstrap_uri = start_preconfigured_cluster(tmp_path, peers)

    # Get origin peer ID
    origin_peer_id = get_cluster_info(peer_uris[0])["peer_id"]

    # Wait a few seconds for consensus to catch up
    sleep(5)

    # Kill second peer
    second_peer = processes.pop(1)
    second_peer.kill()

    # Remove `first_voter` from `raft_state.json`
    with open(f"{peer_dirs[1]}/storage/raft_state.json", "r+") as file:
        state = json.load(file)

        del state["first_voter"]

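        # Rewrite the file in place: rewind to the start and truncate, so no stale trailing bytes
        # remain if the new JSON is shorter than the old contents.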
        file.seek(0, io.SEEK_SET)
        file.truncate()
        json.dump(state, file)

    # Restart second peer with the same URI and ports
    second_peer_uri, bootstrap_uri = start_first_peer(peer_dirs[1], "peer_0_1_restarted.log", second_peer.p2p_port)
    wait_for_peer_online(second_peer_uri)

    # Assert second peer recovered `first_voter` from WAL
    with open(f"{peer_dirs[1]}/storage/raft_state.json", "r") as file:
        state = json.load(file)
        assert state["first_voter"] == origin_peer_id

    # Create collection, move all shards from first peer, remove first peer from cluster
    create_collection(peer_uris[0], collection, shards, 1)
    move_all_shards_from_peer(peer_uris[0], collection)
    remove_peer(peer_uris[0])
    processes.pop(0).kill()

    # Wait a few seconds for new leader
    sleep(5)

    # Add new peer to cluster
    new_peer_uri, new_peer_dir = add_new_peer(tmp_path, peers, bootstrap_uri, collection)

    # Assert that the new peer observes the expected number of remote shards
    info = get_collection_cluster_info(new_peer_uri, collection)
    assert len(info["remote_shards"]) == shards


def rejoin_cluster_test(
    tmp_path: pathlib.Path,
    start_cluster: Callable[[pathlib.Path, int], tuple[list[str], list[pathlib.Path], str]],
    raft_state: Callable[[dict[str, Any], int], Any | None],
    collection: str = "test_collection",
    peers: int = 3,
    shards: int = 3,
    expected_shards: int = 3,
):
    """
    Parameterized test body, that tests adding new peer after origin peer was removed from the cluster.
    See: <https://github.com/qdrant/qdrant/issues/5138>
    """

    # Start cluster
    peer_uris, peer_dirs, bootstrap_uri = start_cluster(tmp_path, peers)

    # Get origin peer ID
    origin_peer_id = get_cluster_info(peer_uris[0])["peer_id"]

    # Create collection, move all shards from first peer, remove first peer from cluster
    create_collection(peer_uris[0], collection, shards, 1)
    move_all_shards_from_peer(peer_uris[0], collection)
    remove_peer(peer_uris[0])
    processes.pop(0).kill()

    # Generally, we could use *any* (second/third/random/last/etc) peer to bootstrap the new peer
    # from, but using the second peer allows us to (trivially) catch a single additional corner
    # case in how we initialize consensus state when bootstrapping a new peer.

    # Kill second peer
    second_peer = processes.pop(0)
    second_peer.kill()

    # Check/modify the second peer's `raft_state.json`
    with open(f"{peer_dirs[1]}/storage/raft_state.json", "r+") as file:
        state = json.load(file)

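        # The `raft_state` callback may simply assert on the loaded state (and return `None`), or
        # return a modified state, which is written back to disk before the peer is restarted.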
        if new_state := raft_state(state, origin_peer_id):
            file.seek(0, io.SEEK_SET)
            file.truncate()
            json.dump(new_state, file)

    # Restart second peer with the same URI and ports
    second_peer_uri, bootstrap_uri = start_first_peer(peer_dirs[1], "peer_0_1_restarted.log", second_peer.p2p_port)
    wait_for_peer_online(second_peer_uri)

    # Add new peer to cluster
    new_peer_uri, new_peer_dir = add_new_peer(tmp_path, peers, bootstrap_uri, collection)

    # Assert that the new peer observes the expected number of remote shards
    info = get_collection_cluster_info(new_peer_uri, collection)
    assert len(info["remote_shards"]) == expected_shards


def start_preconfigured_cluster(tmp_path: pathlib.Path, peers: int = 3):
    assert_project_root()

    # Collect peer URIs
    peer_uris = []

    # Create peer directories
    peer_dirs = make_peer_folders(tmp_path, peers)

    # Copy the first peer's Raft state and WAL from `test_cluster_rejoin_data`.
    #
    # It's just an "empty" peer, but its peer ID is *not* committed into the WAL. We can use this
    # peer to test that the first peer's ID is correctly recovered/propagated, even when it's not
    # committed into the WAL.
    shutil.copytree("tests/consensus_tests/test_cluster_rejoin_data", f"{peer_dirs[0]}/storage")

    # Modify peer URI in Raft state to prevent URI change on startup 🙄
    p2p_port = get_port()
    grpc_port = get_port()
    http_port = get_port()

    with open(f"{peer_dirs[0]}/storage/raft_state.json", "r+") as file:
        state = json.load(file)

        state["peer_address_by_id"][str(state["this_peer_id"])] = f"http://127.0.0.1:{p2p_port}"

        file.seek(0, io.SEEK_SET)
        file.truncate()
        json.dump(state, file)

    # Start first peer
    first_peer_uri, bootstrap_uri = start_first_peer(peer_dirs[0], "peer_0_0.log", p2p_port)
    peer_uris.append(first_peer_uri)

    wait_for_peer_online(first_peer_uri)

    # Bootstrap other peers
    for peer_idx in range(1, peers):
        peer_uri = start_peer(peer_dirs[peer_idx], f"peer_0_{peer_idx}.log", bootstrap_uri)
        peer_uris.append(peer_uri)

    wait_all_peers_up(peer_uris)

    return peer_uris, peer_dirs, bootstrap_uri


def move_all_shards_from_peer(peer_uri: str, collection: str = "test_collection") -> tuple[int, int]:
    """
    Moves all shards from peer at `peer_uri` to another (random) peer in the cluster.
    """

    # Find peer to move shards to
    info = get_cluster_info(peer_uri)

    current_peer_id = info["peer_id"]
    other_peer_id = None

    for peer_id, info in info["peers"].items():
        peer_id = int(peer_id)

        if peer_id != current_peer_id:
            other_peer_id = peer_id
            break

    assert other_peer_id

    # Move all local shards from this peer to the selected peer
    info = get_collection_cluster_info(peer_uri, collection)

    for shard in info["local_shards"]:
        resp = requests.post(f"{peer_uri}/collections/{collection}/cluster", json={
            "move_shard": {
                "from_peer_id": current_peer_id,
                "to_peer_id": other_peer_id,
                "shard_id": shard["shard_id"],
            }
        })

        assert_http_ok(resp)

    # Wait until all transfers have finished
    wait_for_collection_shard_transfers_count(peer_uri, collection, 0)

    return current_peer_id, other_peer_id


def remove_peer(peer_uri: str, peer_id: int | None = None):
    if peer_id is None:
        info = get_cluster_info(peer_uri)
        peer_id = info["peer_id"]

    resp = requests.delete(f"{peer_uri}/cluster/peer/{peer_id}")
    assert_http_ok(resp)


def add_new_peer(tmp_path: pathlib.Path, peer_idx: int, bootstrap_uri: str, collection: str | None = None):
    peer_dir = make_peer_folder(tmp_path, peer_idx)
    peer_uri = start_peer(peer_dir, f"peer_0_{peer_idx}.log", bootstrap_uri)

    wait_for_peer_online(peer_uri)

    if collection is not None:
        wait_collection_on_all_peers(collection, [peer_uri])

    return peer_uri, peer_dir
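

# NOTE: this module is assumed to be run with pytest from the project root (which
# `assert_project_root()` checks above), e.g. something along the lines of:
#
#     pytest tests/consensus_tests/test_cluster_rejoin.py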