use std::fmt::Debug;
use std::sync::Arc;
use parking_lot::{Mutex as ParkingMutex, MutexGuard as ParkingMutexGuard};
use thiserror::Error;
use tokio::sync::Mutex;
use crate::operations::{ClockTag, OperationWithClockTag};
use crate::shards::local_shard::clock_map::{ClockMap, RecoveryPoint};
use crate::wal::SerdeWal;
pub type LockedWal = Arc<ParkingMutex<SerdeWal<OperationWithClockTag>>>;
/// A WAL that is recoverable, with operations having clock tags and a corresponding clock map.
pub struct RecoverableWal {
pub(super) wal: LockedWal,
/// Map of all highest seen clocks for each peer and clock ID.
pub(super) newest_clocks: Arc<Mutex<ClockMap>>,
/// Map of all clocks and ticks that are cut off.
///
/// Clock ticks equal to those in this map are still recoverable, while clock ticks below those
/// in this map are not.
///
/// This means two things:
/// - this WAL has at least all these clock versions
/// - (so if we advance these clocks, we have to advance `newest_clocks` as well)
/// - this WAL cannot resolve any delta below any of these clocks
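///
/// For example (illustrative): with a cutoff at tick 5 for some peer and clock, a recovery
/// point at tick 5 or above for that clock can still be resolved, while a recovery point at
/// tick 4 fails with [`WalDeltaError::Cutoff`].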
pub(super) oldest_clocks: Arc<Mutex<ClockMap>>,
}
impl RecoverableWal {
pub fn new(
wal: LockedWal,
highest_clocks: Arc<Mutex<ClockMap>>,
cutoff_clocks: Arc<Mutex<ClockMap>>,
) -> Self {
Self {
wal,
newest_clocks: highest_clocks,
oldest_clocks: cutoff_clocks,
}
}
// TODO: More meaningful method name and documentation
//
/// Write a record to the WAL, guaranteeing durability.
///
/// On success, this returns the WAL record number of the written operation along with a WAL
/// lock guard.
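///
/// # Example
///
/// A minimal sketch of intended usage, assuming an already constructed `RecoverableWal`
/// named `wal` and a prepared operation (see the tests below for full setups):
///
/// ```rust,ignore
/// let mut operation = OperationWithClockTag::new(bare_operation, Some(clock_tag));
/// // The WAL lock is held until the returned guard is dropped. The clock tag may have
/// // been corrected to a newer tick, so callers advance their clocks from it afterwards.
/// let (op_num, _wal_guard) = wal.lock_and_write(&mut operation).await?;
/// ```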
#[must_use = "returned record number and WAL lock must be used carefully"]
pub async fn lock_and_write<'a>(
&'a self,
operation: &mut OperationWithClockTag,
) -> crate::wal::Result<(u64, ParkingMutexGuard<'a, SerdeWal<OperationWithClockTag>>)> {
// Update last seen clock map and correct clock tag if necessary
if let Some(clock_tag) = &mut operation.clock_tag {
let operation_accepted = self
.newest_clocks
.lock()
.await
.advance_clock_and_correct_tag(clock_tag);
if !operation_accepted {
return Err(crate::wal::WalError::ClockRejected);
}
}
// Write operation to WAL
let mut wal_lock = self.wal.lock();
wal_lock.write(operation).map(|op_num| (op_num, wal_lock))
}
/// Update the cutoff clock map based on the given recovery point
///
/// This can only increase clock ticks in the cutoff clock map. If there already are higher
/// clock ticks, they're kept.
///
/// It updates the highest seen clocks along with it.
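///
/// For example (a sketch mirroring the recovery tests below): after a full transfer from a
/// source node, the receiving WAL raises its cutoff to the last seen clocks of the source:
///
/// ```rust,ignore
/// b_wal.update_cutoff(&a_wal.recovery_point().await).await;
/// ```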
pub async fn update_cutoff(&self, cutoff: &RecoveryPoint) {
// Lock highest and cutoff maps separately to avoid deadlocks
{
let mut newest_clocks = self.newest_clocks.lock().await;
for clock_tag in cutoff.iter_as_clock_tags() {
newest_clocks.advance_clock(clock_tag);
}
}
{
let mut oldest_clocks = self.oldest_clocks.lock().await;
for clock_tag in cutoff.iter_as_clock_tags() {
oldest_clocks.advance_clock(clock_tag);
}
}
}
/// Get a recovery point for this WAL.
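///
/// The recovery point is derived from `newest_clocks` and can be sent to another node, so
/// that it can resolve a WAL delta for us (see `resolve_wal_delta`).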
pub async fn recovery_point(&self) -> RecoveryPoint {
self.newest_clocks.lock().await.to_recovery_point()
}
pub async fn resolve_wal_delta(
&self,
recovery_point: RecoveryPoint,
) -> Result<Option<u64>, WalDeltaError> {
let newest_clocks = self.recovery_point().await;
let oldest_clocks = self.oldest_clocks.lock().await.to_recovery_point();
resolve_wal_delta(
self.wal
.lock()
.read_all(true)
.map(|(op_num, op)| (op_num, op.clock_tag)),
recovery_point,
newest_clocks,
oldest_clocks,
)
}
pub fn wal_version(&self) -> Result<Option<u64>, WalDeltaError> {
let wal = self.wal.lock();
if wal.is_empty() {
Ok(None)
} else {
Ok(Some(wal.last_index()))
}
}
/// Append records to this WAL from `other`, starting at operation `append_from` in `other`.
#[cfg(test)]
pub async fn append_from(&self, other: &Self, append_from: u64) -> crate::wal::Result<()> {
let mut operations = other
.wal
.lock()
.read(append_from)
.map(|(_, op)| op)
.collect::<Vec<_>>();
for update in operations.iter_mut() {
let (_, _) = self.lock_and_write(update).await?;
}
Ok(())
}
}
/// Resolve the WAL delta for the given `recovery_point`
///
/// The local WAL `operations`, `newest_clocks` and `oldest_clocks` are required to resolve
/// the delta. These should come from the node that is the source of recovery, likely the
/// current one. The WAL operations are used to resolve the diff. The `newest_clocks` are
/// used to extend the given recovery point with clocks the failed node does not know about.
/// The `oldest_clocks` are used as the lower bound for WAL delta resolution.
///
/// The delta can be sent over to the node the recovery point is from, restoring its WAL and
/// making it consistent with the current shard.
///
/// On success, an option holding a WAL record number is returned.
/// If `Some` - the remote WAL can be recovered by sending the local WAL from that record number.
/// If `None` - the remote WAL is already equal, and we don't have to send any records.
/// If `Err` - no delta can be resolved.
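///
/// # Worked example
///
/// Illustrative, mirroring `test_cannot_resolve_delta_over_untagged_record` below. The WAL
/// holds three tagged records for peer 1, clock 0:
///
/// ```text
/// WAL records (op_num, clock_tick): (0, 1), (1, 2), (2, 3)
/// recovery_point = { 1:0: 2 }  ->  Ok(Some(2)): resend from op_num 2 (the op with tick 3)
/// recovery_point = { 1:0: 3 }  ->  Ok(None):    WALs already match, nothing to resend
/// ```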
fn resolve_wal_delta(
operations: impl DoubleEndedIterator<Item = (u64, Option<ClockTag>)>,
mut recovery_point: RecoveryPoint,
mut newest_clocks: RecoveryPoint,
mut oldest_clocks: RecoveryPoint,
) -> Result<Option<u64>, WalDeltaError> {
// If recovery point is empty, we cannot do a diff transfer
if recovery_point.is_empty() {
return Err(WalDeltaError::Empty);
}
// If the recovery point has clocks our current node does not know about
// we're missing essential records and cannot resolve a WAL delta
if recovery_point.has_clocks_not_in(&newest_clocks) {
return Err(WalDeltaError::UnknownClocks);
}
// If our current node has any lower clock than the recovery point specifies,
// we're missing essential records and cannot resolve a WAL delta
if recovery_point.has_any_newer_clocks_than(&newest_clocks) {
return Err(WalDeltaError::HigherThanCurrent);
}
// From this point, increase all clocks by one
// We must do that so we can specify clock tick 0 as needing everything from that clock
recovery_point.increase_all_clocks_by(1);
newest_clocks.increase_all_clocks_by(1);
oldest_clocks.increase_all_clocks_by(1);
// Extend clock map with missing clocks this node knows about
// Ensure the recovering node gets records for a clock it might not have seen yet
recovery_point.initialize_clocks_missing_from(&newest_clocks);
// Remove clocks that are equal to those on this node; we don't have to transfer records for them
// TODO: do we want to remove higher clocks too, as the recovery node already has all data?
recovery_point.remove_clocks_equal_to(&newest_clocks);
// Recovery point may not be below our cutoff point
if recovery_point.has_any_older_clocks_than(&oldest_clocks) {
return Err(WalDeltaError::Cutoff);
}
// If there are no clocks left, the WALs match up, so we do not recover anything
if recovery_point.is_empty() {
return Ok(None);
}
// Scroll back over the WAL and find a record that covers all clocks, allowing delta resolution
// Drain satisfied clocks from the recovery point until we have nothing left
log::trace!("Resolving WAL delta for: {recovery_point}");
let mut last_op_num = None;
for (op_num, clock_tag) in operations.rev() {
// We cannot resolve a delta if we have untagged records
let Some(clock_tag) = clock_tag else {
return Err(WalDeltaError::UntaggedRecords);
};
// Keep scrolling until we have no clocks left
let removed_equal = recovery_point.remove_clock_if_newer_or_equal_to_tag(clock_tag);
if recovery_point.is_empty() {
// If we only removed newer clocks, delta-ing from the previous record is good enough
let recover_from = if removed_equal {
Some(op_num)
} else {
last_op_num
};
return Ok(recover_from);
}
last_op_num.replace(op_num);
}
Err(WalDeltaError::NotFound)
}
#[derive(Error, Debug, Clone, PartialEq, Eq)]
#[error("cannot resolve WAL delta: {0}")]
pub enum WalDeltaError {
#[error("recovery point has no clocks to resolve delta for")]
Empty,
#[error("recovery point requests clocks this WAL does not know about")]
UnknownClocks,
#[error("recovery point requests higher clocks this WAL has")]
HigherThanCurrent,
#[error("some recovery point clocks are below the cutoff point in our WAL")]
Cutoff,
#[error("WAL delta cannot include records without clock tags")]
UntaggedRecords,
#[error("cannot find slice of WAL records that satisfies the recovery point")]
NotFound,
}
#[cfg(test)]
mod tests {
use std::collections::{HashMap, HashSet, VecDeque};
use std::ops::Range;
use std::sync::Arc;
use parking_lot::Mutex as ParkingMutex;
use rand::rngs::StdRng;
use rand::seq::SliceRandom;
use rand::{Rng, SeedableRng};
use rstest::rstest;
use segment::data_types::vectors::VectorStructInternal;
use tempfile::{Builder, TempDir};
use wal::WalOptions;
use super::*;
use crate::operations::point_ops::{
PointInsertOperationsInternal, PointOperations, PointStructPersisted,
};
use crate::operations::{ClockTag, CollectionUpdateOperations, OperationWithClockTag};
use crate::shards::local_shard::clock_map::{ClockMap, RecoveryPoint};
use crate::shards::replica_set::clock_set::ClockSet;
use crate::wal::SerdeWal;
fn fixture_empty_wal() -> (RecoverableWal, TempDir) {
let dir = Builder::new().prefix("wal_test").tempdir().unwrap();
let options = WalOptions {
segment_capacity: 1024 * 1024,
segment_queue_len: 0,
};
let wal = SerdeWal::new(dir.path().to_str().unwrap(), options).unwrap();
(
RecoverableWal::new(
Arc::new(ParkingMutex::new(wal)),
Arc::new(Mutex::new(ClockMap::default())),
Arc::new(Mutex::new(ClockMap::default())),
),
dir,
)
}
fn mock_operation(id: u64) -> CollectionUpdateOperations {
CollectionUpdateOperations::PointOperation(PointOperations::UpsertPoints(
PointInsertOperationsInternal::PointsList(vec![PointStructPersisted {
id: id.into(),
vector: VectorStructInternal::from(vec![1.0, 2.0, 3.0]).into(),
payload: None,
}]),
))
}
/// Test WAL delta resolution with just one missed operation on node C.
///
/// See: <https://www.notion.so/qdrant/Testing-suite-4e28a978ec05476080ff26ed07757def?pvs=4>
#[tokio::test]
async fn test_resolve_wal_delta_one_operation() {
// Create WALs for peer A, B and C
let (a_wal, _a_wal_dir) = fixture_empty_wal();
let (b_wal, _b_wal_dir) = fixture_empty_wal();
let (c_wal, _c_wal_dir) = fixture_empty_wal();
// Create clock set for peer A, start first clock from 1
let mut a_clock_set = ClockSet::new();
a_clock_set.get_clock().advance_to(0);
// Create operation on peer A
let mut a_clock_0 = a_clock_set.get_clock();
let clock_tick = a_clock_0.tick_once();
let clock_tag = ClockTag::new(1, a_clock_0.id(), clock_tick);
let bare_operation = mock_operation(1);
let operation = OperationWithClockTag::new(bare_operation, Some(clock_tag));
// Write operation to peer A, B and C, and advance clocks
let mut a_operation = operation.clone();
let mut b_operation = operation.clone();
let mut c_operation = operation.clone();
let (_, _) = a_wal.lock_and_write(&mut a_operation).await.unwrap();
let (_, _) = b_wal.lock_and_write(&mut b_operation).await.unwrap();
let (_, _) = c_wal.lock_and_write(&mut c_operation).await.unwrap();
a_clock_0.advance_to(a_operation.clock_tag.unwrap().clock_tick);
a_clock_0.advance_to(b_operation.clock_tag.unwrap().clock_tick);
a_clock_0.advance_to(c_operation.clock_tag.unwrap().clock_tick);
drop(a_clock_0);
// Create operation on peer A
let mut a_clock_0 = a_clock_set.get_clock();
let clock_tick = a_clock_0.tick_once();
let clock_tag = ClockTag::new(1, a_clock_0.id(), clock_tick);
let bare_operation = mock_operation(2);
let operation = OperationWithClockTag::new(bare_operation, Some(clock_tag));
// Write operation to peer A and B, not C, and advance clocks
let mut a_operation = operation.clone();
let mut b_operation = operation.clone();
let (_, _) = a_wal.lock_and_write(&mut a_operation).await.unwrap();
let (_, _) = b_wal.lock_and_write(&mut b_operation).await.unwrap();
a_clock_0.advance_to(a_operation.clock_tag.unwrap().clock_tick);
a_clock_0.advance_to(b_operation.clock_tag.unwrap().clock_tick);
drop(a_clock_0);
let c_recovery_point = c_wal.recovery_point().await;
// Resolve delta on node A for node C, assert correctness
let delta_from = a_wal
.resolve_wal_delta(c_recovery_point.clone())
.await
.unwrap()
.unwrap();
assert_eq!(delta_from, 1);
// Resolve delta on node B for node C, assert correctness
let delta_from = b_wal
.resolve_wal_delta(c_recovery_point.clone())
.await
.unwrap()
.unwrap();
assert_eq!(delta_from, 1);
// Diff should have 1 operation, as C missed just one
assert_eq!(b_wal.wal.lock().read(delta_from).count(), 1);
// Recover WAL on node C by writing delta from node B to it
c_wal.append_from(&b_wal, delta_from).await.unwrap();
// WALs should match up perfectly now
a_wal
.wal
.lock()
.read(0)
.zip(b_wal.wal.lock().read(0))
.zip(c_wal.wal.lock().read(0))
.for_each(|((a, b), c)| {
assert_eq!(a, b);
assert_eq!(b, c);
});
assert_wal_ordering_property(&a_wal, false).await;
assert_wal_ordering_property(&b_wal, false).await;
assert_wal_ordering_property(&c_wal, false).await;
}
/// Test WAL delta resolution when there are gaps in the WAL on all machines.
///
/// We normally do not expect this situation. But it's good to support it if it happens
/// unexpectedly.
///
/// See: <https://www.notion.so/qdrant/Testing-suite-4e28a978ec05476080ff26ed07757def?pvs=4>
#[rstest]
#[tokio::test]
async fn test_resolve_wal_delta_with_gaps(#[values(true, false)] with_gap: bool) {
const N: usize = 5;
const GAP_SIZE: usize = 10;
// Create WALs for peer A, B and C
let (a_wal, _a_wal_dir) = fixture_empty_wal();
let (b_wal, _b_wal_dir) = fixture_empty_wal();
let (c_wal, _c_wal_dir) = fixture_empty_wal();
// Create clock set for peer A, start first clock from 1
let mut a_clock_set = ClockSet::new();
a_clock_set.get_clock().advance_to(0);
// Create N operations on peer A
for n in 0..N {
let mut a_clock_0 = a_clock_set.get_clock();
let clock_tick = a_clock_0.tick_once();
let clock_tag = ClockTag::new(1, a_clock_0.id(), clock_tick);
let bare_operation = mock_operation((1 + n) as u64);
let operation = OperationWithClockTag::new(bare_operation, Some(clock_tag));
// Write operation to peer A, B and C and advance clocks
let mut a_operation = operation.clone();
let mut b_operation = operation.clone();
let mut c_operation = operation.clone();
let (_, _) = a_wal.lock_and_write(&mut a_operation).await.unwrap();
let (_, _) = b_wal.lock_and_write(&mut b_operation).await.unwrap();
let (_, _) = c_wal.lock_and_write(&mut c_operation).await.unwrap();
a_clock_0.advance_to(a_operation.clock_tag.unwrap().clock_tick);
a_clock_0.advance_to(b_operation.clock_tag.unwrap().clock_tick);
a_clock_0.advance_to(c_operation.clock_tag.unwrap().clock_tick);
}
// Introduce a gap in the clocks on A
if with_gap {
for _ in 0..GAP_SIZE {
let mut a_clock_0 = a_clock_set.get_clock();
let clock_tick = a_clock_0.tick_once();
a_clock_0.advance_to(clock_tick);
}
}
// Create N operations on peer A, which are missed on node C
for n in 0..N {
let mut a_clock_0 = a_clock_set.get_clock();
let clock_tick = a_clock_0.tick_once();
let clock_tag = ClockTag::new(1, a_clock_0.id(), clock_tick);
let bare_operation = mock_operation((1 + N + n) as u64);
let operation = OperationWithClockTag::new(bare_operation, Some(clock_tag));
// Write operation to peer A and B and advance clocks
let mut a_operation = operation.clone();
let mut b_operation = operation.clone();
let (_, _) = a_wal.lock_and_write(&mut a_operation).await.unwrap();
let (_, _) = b_wal.lock_and_write(&mut b_operation).await.unwrap();
a_clock_0.advance_to(a_operation.clock_tag.unwrap().clock_tick);
a_clock_0.advance_to(b_operation.clock_tag.unwrap().clock_tick);
}
let c_recovery_point = c_wal.recovery_point().await;
// Resolve delta on node A for node C, assert correctness
let delta_from = a_wal
.resolve_wal_delta(c_recovery_point.clone())
.await
.unwrap()
.unwrap();
assert_eq!(delta_from, N as u64);
// Resolve delta on node B for node C, assert correctness
let delta_from = b_wal
.resolve_wal_delta(c_recovery_point.clone())
.await
.unwrap()
.unwrap();
assert_eq!(delta_from, N as u64);
// Diff should have N operations, as C missed just N of them
assert_eq!(b_wal.wal.lock().read(delta_from).count(), N);
// Recover WAL on node C by writing delta from node B to it
c_wal.append_from(&b_wal, delta_from).await.unwrap();
// WALs should match up perfectly now
a_wal
.wal
.lock()
.read(0)
.zip(b_wal.wal.lock().read(0))
.zip(c_wal.wal.lock().read(0))
.for_each(|((a, b), c)| {
assert_eq!(a, b);
assert_eq!(b, c);
});
assert_wal_ordering_property(&a_wal, true).await;
assert_wal_ordering_property(&b_wal, true).await;
assert_wal_ordering_property(&c_wal, true).await;
}
/// Test WAL delta resolution with many missed operations on node C.
///
/// See: <https://www.notion.so/qdrant/Testing-suite-4e28a978ec05476080ff26ed07757def?pvs=4>
#[tokio::test]
async fn test_resolve_wal_delta_many_operations() {
const N: usize = 5;
const M: usize = 25;
// Create WALs for peer A, B and C
let (a_wal, _a_wal_dir) = fixture_empty_wal();
let (b_wal, _b_wal_dir) = fixture_empty_wal();
let (c_wal, _c_wal_dir) = fixture_empty_wal();
// Create clock set for peer A, start first clock from 1
let mut a_clock_set = ClockSet::new();
a_clock_set.get_clock().advance_to(0);
// Create N operations on peer A
for i in 0..N {
let mut a_clock_0 = a_clock_set.get_clock();
let clock_tick = a_clock_0.tick_once();
let clock_tag = ClockTag::new(1, a_clock_0.id(), clock_tick);
let bare_operation = mock_operation(i as u64);
let operation = OperationWithClockTag::new(bare_operation, Some(clock_tag));
// Write operations to peer A, B and C, and advance clocks
let mut a_operation = operation.clone();
let mut b_operation = operation.clone();
let mut c_operation = operation.clone();
let (_, _) = a_wal.lock_and_write(&mut a_operation).await.unwrap();
let (_, _) = b_wal.lock_and_write(&mut b_operation).await.unwrap();
let (_, _) = c_wal.lock_and_write(&mut c_operation).await.unwrap();
a_clock_0.advance_to(a_operation.clock_tag.unwrap().clock_tick);
a_clock_0.advance_to(b_operation.clock_tag.unwrap().clock_tick);
a_clock_0.advance_to(c_operation.clock_tag.unwrap().clock_tick);
}
// Create M operations on peer A, which are missed on node C
for i in N..N + M {
let mut a_clock_0 = a_clock_set.get_clock();
let clock_tick = a_clock_0.tick_once();
let clock_tag = ClockTag::new(1, a_clock_0.id(), clock_tick);
let bare_operation = mock_operation(i as u64);
let operation = OperationWithClockTag::new(bare_operation, Some(clock_tag));
// Write operations to peer A and B, not C, and advance clocks
let mut a_operation = operation.clone();
let mut b_operation = operation.clone();
let (_, _) = a_wal.lock_and_write(&mut a_operation).await.unwrap();
let (_, _) = b_wal.lock_and_write(&mut b_operation).await.unwrap();
a_clock_0.advance_to(a_operation.clock_tag.unwrap().clock_tick);
a_clock_0.advance_to(b_operation.clock_tag.unwrap().clock_tick);
}
let c_recovery_point = c_wal.recovery_point().await;
// Resolve delta on node A for node C, assert correctness
let delta_from = a_wal
.resolve_wal_delta(c_recovery_point.clone())
.await
.unwrap()
.unwrap();
assert_eq!(delta_from, N as u64);
// Resolve delta on node B for node C, assert correctness
let delta_from = b_wal
.resolve_wal_delta(c_recovery_point)
.await
.unwrap()
.unwrap();
assert_eq!(delta_from, N as u64);
// Diff should have M operations, as node C missed M operations
assert_eq!(b_wal.wal.lock().read(delta_from).count(), M);
// Recover WAL on node C by writing delta from node B to it
c_wal.append_from(&b_wal, delta_from).await.unwrap();
// WALs should match up perfectly now
a_wal
.wal
.lock()
.read(0)
.zip(b_wal.wal.lock().read(0))
.zip(c_wal.wal.lock().read(0))
.for_each(|((a, b), c)| {
assert_eq!(a, b);
assert_eq!(b, c);
});
assert_wal_ordering_property(&a_wal, false).await;
assert_wal_ordering_property(&b_wal, false).await;
assert_wal_ordering_property(&c_wal, false).await;
}
/// Test WAL delta resolution with many intermixed operations on node C. Intermixed as in,
/// from multiple nodes.
///
/// See: <https://www.notion.so/qdrant/Testing-suite-4e28a978ec05476080ff26ed07757def?pvs=4>
#[tokio::test]
async fn test_resolve_wal_delta_many_intermixed_operations() {
const N: usize = 3;
const M: usize = 50;
// Create WALs for peer A, B and C
let (a_wal, _a_wal_dir) = fixture_empty_wal();
let (b_wal, _b_wal_dir) = fixture_empty_wal();
let (c_wal, _c_wal_dir) = fixture_empty_wal();
// Create clock sets for peer A and B
let mut a_clock_set = ClockSet::new();
let mut b_clock_set = ClockSet::new();
// Create N operations on peer A
for i in 0..N {
let mut a_clock_0 = a_clock_set.get_clock();
a_clock_0.advance_to(0);
let clock_tick = a_clock_0.tick_once();
let clock_tag = ClockTag::new(1, a_clock_0.id(), clock_tick);
let bare_operation = mock_operation(i as u64);
let operation = OperationWithClockTag::new(bare_operation, Some(clock_tag));
// Write operations to peer A, B and C, and advance clocks
let mut a_operation = operation.clone();
let mut b_operation = operation.clone();
let mut c_operation = operation.clone();
let (_, _) = a_wal.lock_and_write(&mut a_operation).await.unwrap();
let (_, _) = b_wal.lock_and_write(&mut b_operation).await.unwrap();
let (_, _) = c_wal.lock_and_write(&mut c_operation).await.unwrap();
a_clock_0.advance_to(a_operation.clock_tag.unwrap().clock_tick);
a_clock_0.advance_to(b_operation.clock_tag.unwrap().clock_tick);
a_clock_0.advance_to(c_operation.clock_tag.unwrap().clock_tick);
}
// Create M operations on peers A and B, which are missed on node C
for i in N..N + M {
let is_node_a = i % 3 == 0;
let peer_id = if is_node_a { 1 } else { 2 };
let mut clock = if is_node_a {
a_clock_set.get_clock()
} else {
b_clock_set.get_clock()
};
clock.advance_to(0);
let clock_tick = clock.tick_once();
let clock_tag = ClockTag::new(peer_id, clock.id(), clock_tick);
let bare_operation = mock_operation(i as u64);
let operation = OperationWithClockTag::new(bare_operation, Some(clock_tag));
// Write operations to peer A and B, not C, and advance clocks
let mut a_operation = operation.clone();
let mut b_operation = operation.clone();
let (_, _) = a_wal.lock_and_write(&mut a_operation).await.unwrap();
let (_, _) = b_wal.lock_and_write(&mut b_operation).await.unwrap();
clock.advance_to(a_operation.clock_tag.unwrap().clock_tick);
clock.advance_to(b_operation.clock_tag.unwrap().clock_tick);
}
let c_recovery_point = c_wal.recovery_point().await;
// Resolve delta on node A for node C, assert correctness
let delta_from = a_wal
.resolve_wal_delta(c_recovery_point.clone())
.await
.unwrap()
.unwrap();
assert_eq!(delta_from, N as u64);
// Resolve delta on node B for node C, assert correctness
let delta_from = b_wal
.resolve_wal_delta(c_recovery_point)
.await
.unwrap()
.unwrap();
assert_eq!(delta_from, N as u64);
// Diff should have M operations, as node C missed M operations
assert_eq!(b_wal.wal.lock().read(delta_from).count(), M);
// Recover WAL on node C by writing delta from node B to it
c_wal.append_from(&b_wal, delta_from).await.unwrap();
// WALs should match up perfectly now
a_wal
.wal
.lock()
.read(0)
.zip(b_wal.wal.lock().read(0))
.zip(c_wal.wal.lock().read(0))
.for_each(|((a, b), c)| {
assert_eq!(a, b);
assert_eq!(b, c);
});
assert_wal_ordering_property(&a_wal, false).await;
assert_wal_ordering_property(&b_wal, false).await;
assert_wal_ordering_property(&c_wal, false).await;
}
/// Test WAL delta resolution with operations in a different order on node A and B.
///
/// See: <https://www.notion.so/qdrant/Testing-suite-4e28a978ec05476080ff26ed07757def?pvs=4>
#[tokio::test]
async fn test_resolve_wal_delta_unordered_operations() {
// Create WALs for peer A, B and C
let (a_wal, _a_wal_dir) = fixture_empty_wal();
let (b_wal, _b_wal_dir) = fixture_empty_wal();
let (c_wal, _c_wal_dir) = fixture_empty_wal();
// Create clock sets for peer A and B, start first clocks from 1
let mut a_clock_set = ClockSet::new();
let mut b_clock_set = ClockSet::new();
a_clock_set.get_clock().advance_to(0);
b_clock_set.get_clock().advance_to(0);
// Create operation on peer A
let mut a_clock_0 = a_clock_set.get_clock();
let clock_tick = a_clock_0.tick_once();
let clock_tag = ClockTag::new(1, a_clock_0.id(), clock_tick);
let bare_operation = mock_operation(1);
let operation = OperationWithClockTag::new(bare_operation, Some(clock_tag));
// Write operation to peer A, B and C, and advance clocks
let mut a_operation = operation.clone();
let mut b_operation = operation.clone();
let mut c_operation = operation.clone();
let (_, _) = a_wal.lock_and_write(&mut a_operation).await.unwrap();
let (_, _) = b_wal.lock_and_write(&mut b_operation).await.unwrap();
let (_, _) = c_wal.lock_and_write(&mut c_operation).await.unwrap();
a_clock_0.advance_to(a_operation.clock_tag.unwrap().clock_tick);
a_clock_0.advance_to(b_operation.clock_tag.unwrap().clock_tick);
a_clock_0.advance_to(c_operation.clock_tag.unwrap().clock_tick);
drop(a_clock_0);
// Create operations on nodes A and B
let mut a_clock_0 = a_clock_set.get_clock();
let mut b_clock_0 = b_clock_set.get_clock();
let a_clock_tick = a_clock_0.tick_once();
let b_clock_tick = b_clock_0.tick_once();
let a_clock_tag = ClockTag::new(1, a_clock_0.id(), a_clock_tick);
let b_clock_tag = ClockTag::new(2, b_clock_0.id(), b_clock_tick);
let bare_operation_1 = mock_operation(2);
let bare_operation_2 = mock_operation(3);
let operation_1 = OperationWithClockTag::new(bare_operation_1, Some(a_clock_tag));
let operation_2 = OperationWithClockTag::new(bare_operation_2, Some(b_clock_tag));
// Write operations to nodes A and B in different order, but not to node C
let mut a_operation_1 = operation_1.clone();
let mut a_operation_2 = operation_2.clone();
let mut b_operation_1 = operation_1.clone();
let mut b_operation_2 = operation_2.clone();
let (_, _) = a_wal.lock_and_write(&mut a_operation_1).await.unwrap();
let (_, _) = a_wal.lock_and_write(&mut a_operation_2).await.unwrap();
let (_, _) = b_wal.lock_and_write(&mut b_operation_2).await.unwrap();
let (_, _) = b_wal.lock_and_write(&mut b_operation_1).await.unwrap();
a_clock_0.advance_to(a_operation_1.clock_tag.unwrap().clock_tick);
a_clock_0.advance_to(a_operation_2.clock_tag.unwrap().clock_tick);
b_clock_0.advance_to(b_operation_2.clock_tag.unwrap().clock_tick);
b_clock_0.advance_to(b_operation_1.clock_tag.unwrap().clock_tick);
drop(a_clock_0);
drop(b_clock_0);
let c_recovery_point = c_wal.recovery_point().await;
// Resolve delta on node A for node C, assert correctness
let delta_from = a_wal
.resolve_wal_delta(c_recovery_point.clone())
.await
.unwrap()
.unwrap();
assert_eq!(delta_from, 1);
// Resolve delta on node B for node C, assert correctness
let delta_from = b_wal
.resolve_wal_delta(c_recovery_point.clone())
.await
.unwrap()
.unwrap();
assert_eq!(delta_from, 1);
// Diff should have 2 operations on both nodes
assert_eq!(a_wal.wal.lock().read(delta_from).count(), 2);
assert_eq!(b_wal.wal.lock().read(delta_from).count(), 2);
// Recover WAL on node C by writing delta from node B to it
c_wal.append_from(&b_wal, delta_from).await.unwrap();
// WAL on node B and C will match, A is in different order
assert!(!a_wal
.wal
.lock()
.read(0)
.zip(c_wal.wal.lock().read(0))
.all(|(a, c)| a == c));
assert!(b_wal
.wal
.lock()
.read(0)
.zip(c_wal.wal.lock().read(0))
.all(|(b, c)| b == c));
// All WALs should have 3 operations
assert_eq!(a_wal.wal.lock().read(0).count(), 3);
assert_eq!(b_wal.wal.lock().read(0).count(), 3);
assert_eq!(c_wal.wal.lock().read(0).count(), 3);
// All WALs must have operations for point 1, 2 and 3
let get_point = |op| match op {
OperationWithClockTag {
operation:
CollectionUpdateOperations::PointOperation(PointOperations::UpsertPoints(
PointInsertOperationsInternal::PointsList(points),
)),
..
} => points[0].clone(),
_ => unreachable!(),
};
let a_wal_point_ids = a_wal
.wal
.lock()
.read(0)
.map(|(_, op)| get_point(op).id)
.collect::<HashSet<_>>();
let b_wal_point_ids = b_wal
.wal
.lock()
.read(0)
.map(|(_, op)| get_point(op).id)
.collect::<HashSet<_>>();
let c_wal_point_ids = c_wal
.wal
.lock()
.read(0)
.map(|(_, op)| get_point(op).id)
.collect::<HashSet<_>>();
(1..=3).for_each(|i| {
assert!(a_wal_point_ids.contains(&i.into()));
assert!(b_wal_point_ids.contains(&i.into()));
assert!(c_wal_point_ids.contains(&i.into()));
});
assert_wal_ordering_property(&a_wal, false).await;
assert_wal_ordering_property(&b_wal, false).await;
assert_wal_ordering_property(&c_wal, false).await;
}
#[tokio::test]
async fn test_recover_from_previously_recovered_with_forward_proxy() {
// Consider a situation
// Steps:
//
// 1. We initialize 2 operations on A and B, that are successfully written to both from C
// 2. Operation 3 is written to A, but not B
// 2.1. Operation 30 is written to A, but not B (Second channel)
// 3. Operation 4,5 from D is written to both A and B
// 4. Now B is reported as failed and we need to recover it from A
// 5. During recovery, we send untagged operations from A to B (full transfer) + node C sends an Update
// 6. After recovered (need to check consistency of A and B), node D sends an Update to A and B
// 7. Now we want to recover newly created node E from B, it expectedly fails, because of cutoff point
// 8. Try to recover A from E (expect no diff, as both have same data)
// 9. Insert new operation to B but not A to make sure diff resolution starts working after full recovery
// Recover
// ┌───┬───────►┌───┐
// │ A │ │ B │
// └─▲─┴───────►└─▲─┘
// │ Forward │
// │ │
// │ │
// │ ┌───┐ │
// └───┤ C ├────┘
// Update └───┘ Failed Update
//
//
//
// ┌───┐
// ┌───┤ D ├────┐
// Update│ └───┘ │Update
// │ │
// │ │
// ┌─▼─┐ ┌─▼─┐
// │ A │ │ B │
// └───┘ └───┘
//
//
// (Almost Empty)
// ┌───┐Recover ┌───┐
// │ B ├───────►│ E │
// └───┘ └───┘
//
//
// (Identical)
// ┌───┐Recover ┌───┐
// │ E ├───────►│ A │
// └───┘ └───┘
let (a_wal, _a_wal_dir) = fixture_empty_wal();
let (b_wal, _b_wal_dir) = fixture_empty_wal();
let (e_wal, _e_wal_dir) = fixture_empty_wal();
let mut c_clock_set = ClockSet::new();
let mut d_clock_set = ClockSet::new();
let node_c_peer_id = 1;
let node_d_peer_id = 2;
let op1: CollectionUpdateOperations = mock_operation(1);
// Initial normal operation, written to both A and B, and additionally to E, as we will need it later
{
// Node C is sending updates to A and B
let mut c_clock_0 = c_clock_set.get_clock();
c_clock_0.advance_to(0);
let clock_tick = c_clock_0.tick_once();
let clock_tag = ClockTag::new(node_c_peer_id, c_clock_0.id(), clock_tick);
let operation_with_clock = OperationWithClockTag::new(op1, Some(clock_tag));
let mut operation_a = operation_with_clock.clone();
let mut operation_b = operation_with_clock.clone();
let mut operation_e = operation_with_clock.clone();
let (_, _) = a_wal.lock_and_write(&mut operation_a).await.unwrap();
let (_, _) = b_wal.lock_and_write(&mut operation_b).await.unwrap();
let (_, _) = e_wal.lock_and_write(&mut operation_e).await.unwrap();
c_clock_0.advance_to(operation_a.clock_tag.unwrap().clock_tick);
c_clock_0.advance_to(operation_b.clock_tag.unwrap().clock_tick);
c_clock_0.advance_to(operation_e.clock_tag.unwrap().clock_tick);
}
let op2: CollectionUpdateOperations = mock_operation(2);
// Initial normal operation, written to both
{
// Node C is sending updates to A and B
let mut c_clock_0 = c_clock_set.get_clock();
c_clock_0.advance_to(0);
let clock_tick = c_clock_0.tick_once();
let clock_tag = ClockTag::new(node_c_peer_id, c_clock_0.id(), clock_tick);
let operation_with_clock = OperationWithClockTag::new(op2, Some(clock_tag));
let mut operation_a = operation_with_clock.clone();
let mut operation_b = operation_with_clock.clone();
let (_, _) = a_wal.lock_and_write(&mut operation_a).await.unwrap();
let (_, _) = b_wal.lock_and_write(&mut operation_b).await.unwrap();
c_clock_0.advance_to(operation_a.clock_tag.unwrap().clock_tick);
c_clock_0.advance_to(operation_b.clock_tag.unwrap().clock_tick);
}
let op3: CollectionUpdateOperations = mock_operation(3);
let op30: CollectionUpdateOperations = mock_operation(30);
// Next operation gets written to A, but not B
{
// Node C is sending updates to A and B
let mut c_clock_0 = c_clock_set.get_clock();
let mut c_clock_1 = c_clock_set.get_clock();
c_clock_0.advance_to(0);
c_clock_1.advance_to(0);
{
// First parallel operation
let clock_tick = c_clock_0.tick_once();
let clock_tag = ClockTag::new(node_c_peer_id, c_clock_0.id(), clock_tick);
let operation_with_clock = OperationWithClockTag::new(op3, Some(clock_tag));
let mut operation_a = operation_with_clock.clone();
let (_, _) = a_wal.lock_and_write(&mut operation_a).await.unwrap();
c_clock_0.advance_to(operation_a.clock_tag.unwrap().clock_tick);
}
{
// Second parallel operation
let clock_tick = c_clock_1.tick_once();
let clock_tag = ClockTag::new(node_c_peer_id, c_clock_1.id(), clock_tick);
let operation_with_clock = OperationWithClockTag::new(op30, Some(clock_tag));
let mut operation_a = operation_with_clock.clone();
let (_, _) = a_wal.lock_and_write(&mut operation_a).await.unwrap();
c_clock_1.advance_to(operation_a.clock_tag.unwrap().clock_tick);
}
}
let op4: CollectionUpdateOperations = mock_operation(4);
// Node D sends an update to both A and B, both successfully written
{
let mut d_clock_0 = d_clock_set.get_clock();
d_clock_0.advance_to(0);
let clock_tick = d_clock_0.tick_once();
let clock_tag = ClockTag::new(node_d_peer_id, d_clock_0.id(), clock_tick);
let operation_with_clock = OperationWithClockTag::new(op4, Some(clock_tag));
let mut operation_a = operation_with_clock.clone();
let mut operation_b = operation_with_clock.clone();
let (_, _) = a_wal.lock_and_write(&mut operation_a).await.unwrap();
let (_, _) = b_wal.lock_and_write(&mut operation_b).await.unwrap();
d_clock_0.advance_to(operation_a.clock_tag.unwrap().clock_tick);
d_clock_0.advance_to(operation_b.clock_tag.unwrap().clock_tick);
}
let op5: CollectionUpdateOperations = mock_operation(5);
// Node D sends an update to both A and B, both successfully written
{
let mut d_clock_0 = d_clock_set.get_clock();
d_clock_0.advance_to(0);
let clock_tick = d_clock_0.tick_once();
let clock_tag = ClockTag::new(node_d_peer_id, d_clock_0.id(), clock_tick);
let operation_with_clock = OperationWithClockTag::new(op5, Some(clock_tag));
let mut operation_a = operation_with_clock.clone();
let mut operation_b = operation_with_clock.clone();
let (_, _) = a_wal.lock_and_write(&mut operation_a).await.unwrap();
let (_, _) = b_wal.lock_and_write(&mut operation_b).await.unwrap();
d_clock_0.advance_to(operation_a.clock_tag.unwrap().clock_tick);
d_clock_0.advance_to(operation_b.clock_tag.unwrap().clock_tick);
}
// Now B is reported as failed and we need to recover it from A
let b_recovery_point = b_wal.recovery_point().await;
let delta_from = a_wal
.resolve_wal_delta(b_recovery_point.clone())
.await
.unwrap()
.unwrap();
// Operations 0 and 1 are written to both and do not need to be recovered
// All further operations have to be written to B
assert_eq!(delta_from, 2);
// But instead of recovering from WAL, we will check full streaming transfer
{
let op1 = mock_operation(1);
let op1_with_clock = OperationWithClockTag::new(op1, None);
let (_, _) = b_wal
.lock_and_write(&mut op1_with_clock.clone())
.await
.unwrap();
let op2 = mock_operation(2);
let op2_with_clock = OperationWithClockTag::new(op2, None);
let (_, _) = b_wal
.lock_and_write(&mut op2_with_clock.clone())
.await
.unwrap();
}
let op6: CollectionUpdateOperations = mock_operation(6);
// In the middle of the recovery, we get a new update from C
// It is written to both A and B, and also forwarded to B via the forward proxy
{
let mut c_clock_0 = c_clock_set.get_clock();
c_clock_0.advance_to(0);
let clock_tick = c_clock_0.tick_once();
let clock_tag = ClockTag::new(node_c_peer_id, c_clock_0.id(), clock_tick);
let operation_with_clock = OperationWithClockTag::new(op6, Some(clock_tag));
let mut operation_a = operation_with_clock.clone();
let mut operation_b = operation_with_clock.clone();
let mut operation_b_forward = operation_with_clock.clone();
let (_, _) = a_wal.lock_and_write(&mut operation_a).await.unwrap();
let (_, _) = b_wal.lock_and_write(&mut operation_b).await.unwrap();
let (_, _) = b_wal
.lock_and_write(&mut operation_b_forward)
.await
.unwrap();
c_clock_0.advance_to(operation_a.clock_tag.unwrap().clock_tick);
c_clock_0.advance_to(operation_b.clock_tag.unwrap().clock_tick);
}
// Continue recovery
{
let op3 = mock_operation(3);
let op3_with_clock = OperationWithClockTag::new(op3, None);
let (_, _) = b_wal
.lock_and_write(&mut op3_with_clock.clone())
.await
.unwrap();
let op30 = mock_operation(30);
let op30_with_clock = OperationWithClockTag::new(op30, None);
let (_, _) = b_wal
.lock_and_write(&mut op30_with_clock.clone())
.await
.unwrap();
let op4 = mock_operation(4);
let op4_with_clock = OperationWithClockTag::new(op4, None);
let (_, _) = b_wal
.lock_and_write(&mut op4_with_clock.clone())
.await
.unwrap();
let op5 = mock_operation(5);
let op5_with_clock = OperationWithClockTag::new(op5, None);
let (_, _) = b_wal
.lock_and_write(&mut op5_with_clock.clone())
.await
.unwrap();
// Once the full transfer is done, we update the cutoff point on B to the last seen clocks of A
b_wal.update_cutoff(&a_wal.recovery_point().await).await;
}
// Try to recover E from B
let e_recovery_point = e_wal.recovery_point().await;
// Cannot recover E from B, because B has a high cutoff point due to the full transfer
let delta_from = b_wal.resolve_wal_delta(e_recovery_point.clone()).await;
assert_eq!(delta_from.unwrap_err(), WalDeltaError::Cutoff);
// Try to recover A from B
// Which should also fail, because B has a high cutoff point due to the full transfer
let a_recovery_point = a_wal.recovery_point().await;
/*
a_recovery_point = RecoveryPoint {
clocks: {
C_1: 1,
C: 4,
D: 2,
},
}
cutoff_point_b = RecoveryPoint {
clocks: {
C_1: 1,
C: 4,
D: 2,
},
}
*/
let delta_from = b_wal.resolve_wal_delta(a_recovery_point.clone()).await;
// No diff expected
assert_eq!(delta_from, Ok(None));
let op7 = mock_operation(7);
// Add operation to B but not A
{
// Node D is sending updates to B
let mut d_clock = d_clock_set.get_clock();
d_clock.advance_to(0);
let clock_tick = d_clock.tick_once();
let clock_tag = ClockTag::new(node_d_peer_id, d_clock.id(), clock_tick);
let operation_with_clock = OperationWithClockTag::new(op7, Some(clock_tag));
let mut operation_b = operation_with_clock.clone();
let (_, _) = b_wal.lock_and_write(&mut operation_b).await.unwrap();
d_clock.advance_to(operation_b.clock_tag.unwrap().clock_tick);
}
let a_recovery_point = a_wal.recovery_point().await;
let delta_from = b_wal
.resolve_wal_delta(a_recovery_point.clone())
.await
.unwrap()
.unwrap();
// Diff expected
assert_eq!(b_wal.wal.lock().read(delta_from).count(), 1);
assert_wal_ordering_property(&a_wal, false).await;
assert_wal_ordering_property(&b_wal, false).await;
assert_wal_ordering_property(&e_wal, false).await;
}
/// A randomized and more extensive test for resolving a WAL delta.
///
/// This tests configurations from 2 up to 10 nodes.
///
/// This randomizes:
/// - The number of operations
/// - What node is used as entry point
/// - What node dies
///
/// This test does the following 25 times:
/// - insert a random number of operations on all nodes
/// - randomly kill a number of nodes (or rather, mark them as killed)
/// - write a random number of operations to all other nodes
/// - recover the killed nodes
/// - assert correctness
///
/// See: <https://www.notion.so/qdrant/Testing-suite-4e28a978ec05476080ff26ed07757def?pvs=4>
#[rstest]
#[case::two_nodes(2, 0..2)]
#[case::three_nodes(3, 0..3)]
#[case::four_nodes(4, 1..4)]
#[case::five_nodes(5, 1..5)]
#[case::six_nodes(6, 1..6)]
#[case::seven_nodes(7, 1..7)]
#[case::eight_nodes(8, 1..8)]
#[case::nine_nodes(9, 1..9)]
#[case::ten_nodes(10, 1..10)]
#[tokio::test]
async fn test_resolve_wal_delta_randomized(
#[case] node_count: usize,
#[case] dead_nodes_range: Range<usize>,
) {
let mut rng = StdRng::seed_from_u64(42);
let mut point_id_source = 1..;
// Create WALs, clock sets and clock maps for each node
let mut wals = std::iter::repeat_with(fixture_empty_wal)
.take(node_count)
.collect::<Vec<_>>();
let mut clock_sets = std::iter::repeat_with(ClockSet::new)
.take(node_count)
.collect::<Vec<_>>();
// A list of clocks we don't release for some iterations
let mut kept_clocks = vec![];
// 25 times:
// - insert a random number of operations on all nodes
// - randomly kill a number of nodes (or rather, mark them as killed)
// - write a random number of operations to all other nodes
// - recover the killed nodes
// - assert correctness
for _ in 0..25 {
// Insert a random number of operations on all nodes
for _ in 0..rng.gen_range(0..10) {
let entrypoint = rng.gen_range(0..node_count);
let mut clock = clock_sets[entrypoint].get_clock();
clock.advance_to(0);
let clock_tick = clock.tick_once();
let clock_tag = ClockTag::new(entrypoint as u64, clock.id(), clock_tick);
let bare_operation =
CollectionUpdateOperations::PointOperation(PointOperations::UpsertPoints(
PointInsertOperationsInternal::PointsList(vec![PointStructPersisted {
id: point_id_source.next().unwrap().into(),
vector: VectorStructInternal::from(
std::iter::repeat_with(|| rng.gen::<f32>())
.take(3)
.collect::<Vec<_>>(),
)
.into(),
payload: None,
}]),
));
let operation = OperationWithClockTag::new(bare_operation, Some(clock_tag));
// Write operations to all WALs
for (wal, _wal_dir) in wals.iter_mut() {
let mut operation = operation.clone();
let (_, _) = wal.lock_and_write(&mut operation).await.unwrap();
clock.advance_to(operation.clock_tag.unwrap().clock_tick);
}
// Maybe keep the clock for some iterations
let keep_clock_for = rng.gen_range(0..3);
if keep_clock_for > 0 {
kept_clocks.push((keep_clock_for, clock));
}
}
// Make a random list of alive and dead nodes
let mut alive_nodes = (0..node_count).collect::<Vec<_>>();
alive_nodes.shuffle(&mut rng);
let dead_nodes = alive_nodes
.drain(0..rng.gen_range(dead_nodes_range.clone()))
.collect::<HashSet<_>>();
// Insert a random number of operations into all alive nodes
let operation_count = rng.gen_range(0..100);
for _ in 0..operation_count {
let entrypoint = *alive_nodes.choose(&mut rng).unwrap();
let mut clock = clock_sets[entrypoint].get_clock();
clock.advance_to(0);
let clock_tick = clock.tick_once();
let clock_tag = ClockTag::new(entrypoint as u64, clock.id(), clock_tick);
let bare_operation =
CollectionUpdateOperations::PointOperation(PointOperations::UpsertPoints(
PointInsertOperationsInternal::PointsList(vec![PointStructPersisted {
id: point_id_source.next().unwrap().into(),
vector: VectorStructInternal::from(
std::iter::repeat_with(|| rng.gen::<f32>())
.take(3)
.collect::<Vec<_>>(),
)
.into(),
payload: None,
}]),
));
let operation = OperationWithClockTag::new(bare_operation, Some(clock_tag));
// Write operations to all WALs and clock maps on alive nodes
for &alive_node in &alive_nodes {
let mut operation = operation.clone();
let (_, _) = wals[alive_node]
.0
.lock_and_write(&mut operation)
.await
.unwrap();
clock.advance_to(operation.clock_tag.unwrap().clock_tick);
}
// Maybe keep the clock for some iterations
let keep_clock_for = rng.gen_range(0..10);
if keep_clock_for > 0 {
kept_clocks.push((keep_clock_for, clock));
}
}
// Recover dead nodes
for dead_node in dead_nodes {
// Resolve WAL on every alive node, to recover the dead node
let recovery_point = wals[dead_node].0.recovery_point().await;
let mut from_deltas = HashSet::new();
for &alive_node in &alive_nodes {
let delta_from = wals[alive_node]
.0
.resolve_wal_delta(recovery_point.clone())
.await
.expect("failed to resolve WAL delta on alive node");
from_deltas.insert(delta_from);
}
assert_eq!(from_deltas.len(), 1, "found different delta starting points in different WALs, while all should be the same");
let delta_from = from_deltas.into_iter().next().unwrap();
assert_eq!(
delta_from.is_some(),
operation_count > 0,
"if we had operations to some node, we must find a delta, otherwise not",
);
// Recover WAL on the dead node from a random alive node
if let Some(delta_from) = delta_from {
let alive_node = *alive_nodes.choose(&mut rng).unwrap();
wals[dead_node]
.0
.append_from(&wals[alive_node].0, delta_from)
.await
.unwrap();
}
}
// All WALs must be equal, having exactly the same entries
wals.iter()
.map(|wal| wal.0.wal.lock())
.collect::<Vec<_>>()
.windows(2)
.for_each(|wals| {
assert!(
wals[0].read(0).eq(wals[1].read(0)),
"all WALs must have the same entries",
);
});
// Release some kept clocks
kept_clocks.retain(|(mut keep_for, _)| {
keep_for -= 1;
keep_for > 0
});
}
for (wal, _) in wals {
assert_wal_ordering_property(&wal, false).await;
}
}
/// We cannot resolve a WAL delta if the slice contains records without a clock tag.
#[tokio::test]
async fn test_cannot_resolve_delta_over_untagged_record() {
let (wal, _wal_dir) = fixture_empty_wal();
// Insert 3 operations with clocks
let (_, _) = wal
.lock_and_write(&mut OperationWithClockTag::new(
mock_operation(1),
Some(ClockTag::new(1, 0, 1)),
))
.await
.unwrap();
let (_, _) = wal
.lock_and_write(&mut OperationWithClockTag::new(
mock_operation(2),
Some(ClockTag::new(1, 0, 2)),
))
.await
.unwrap();
let (_, _) = wal
.lock_and_write(&mut OperationWithClockTag::new(
mock_operation(3),
Some(ClockTag::new(1, 0, 3)),
))
.await
.unwrap();
// Can resolve a delta for the last clock
let mut recovery_point = RecoveryPoint::default();
recovery_point.insert(1, 0, 2);
let resolve_result = wal.resolve_wal_delta(recovery_point).await.unwrap();
assert_eq!(resolve_result, Some(2));
// Insert operations 4 and 5, where operation 4 does not have a clock tag
let (_, _) = wal
.lock_and_write(&mut OperationWithClockTag::new(mock_operation(4), None))
.await
.unwrap();
let (_, _) = wal
.lock_and_write(&mut OperationWithClockTag::new(
mock_operation(5),
Some(ClockTag::new(1, 0, 4)),
))
.await
.unwrap();
// Can still resolve a delta for the last clock
let mut recovery_point = RecoveryPoint::default();
recovery_point.insert(1, 0, 4);
let resolve_result = wal.resolve_wal_delta(recovery_point).await.unwrap();
assert_eq!(resolve_result, None);
// Cannot resolve a delta for our previous clock, it now has an untagged record after it
let mut recovery_point = RecoveryPoint::default();
recovery_point.insert(1, 0, 2);
let resolve_result = wal.resolve_wal_delta(recovery_point).await;
assert_eq!(resolve_result.unwrap_err(), WalDeltaError::UntaggedRecords);
}
/// Empty recovery point should not resolve any diff.
#[test]
fn test_empty_recovery_point() {
let (wal, _wal_dir) = fixture_empty_wal();
// Empty recovery points should not resolve any diff
let recovery_point = RecoveryPoint::default();
let newest_clocks = RecoveryPoint::default();
let resolve_result = resolve_wal_delta(
wal.wal
.lock()
.read_all(true)
.map(|(op_num, op)| (op_num, op.clock_tag)),
recovery_point,
newest_clocks,
RecoveryPoint::default(),
);
assert_eq!(resolve_result.unwrap_err(), WalDeltaError::Empty);
}
/// Recovery point with a clock our source does not know about cannot resolve a diff.
#[test]
fn test_recover_point_has_unknown_clock() {
let (wal, _wal_dir) = fixture_empty_wal();
let mut recovery_point = RecoveryPoint::default();
let mut newest_clocks = RecoveryPoint::default();
// Recovery point has a clock our source does not know about
recovery_point.insert(1, 0, 15);
recovery_point.insert(1, 1, 8);
recovery_point.insert(2, 1, 5);
newest_clocks.insert(1, 0, 20);
newest_clocks.insert(1, 1, 8);
let resolve_result = resolve_wal_delta(
wal.wal
.lock()
.read_all(true)
.map(|(op_num, op)| (op_num, op.clock_tag)),
recovery_point,
newest_clocks,
RecoveryPoint::default(),
);
assert_eq!(resolve_result.unwrap_err(), WalDeltaError::UnknownClocks);
}
/// Recovery point with higher clocks than the source cannot resolve a diff.
#[test]
fn test_recover_point_higher_than_source() {
let (wal, _wal_dir) = fixture_empty_wal();
let mut recovery_point = RecoveryPoint::default();
let mut newest_clocks = RecoveryPoint::default();
// Recovery point asks tick 10, but source only has tick 8
recovery_point.insert(1, 0, 15);
recovery_point.insert(1, 1, 10);
newest_clocks.insert(1, 0, 20);
newest_clocks.insert(1, 1, 8);
let resolve_result = resolve_wal_delta(
wal.wal
.lock()
.read_all(true)
.map(|(op_num, op)| (op_num, op.clock_tag)),
recovery_point,
newest_clocks,
RecoveryPoint::default(),
);
assert_eq!(
resolve_result.unwrap_err(),
WalDeltaError::HigherThanCurrent
);
}
/// Recovery point requests clocks that are already truncated
#[test]
fn test_recover_point_cutoff() {
let (wal, _wal_dir) = fixture_empty_wal();
let mut recovery_point = RecoveryPoint::default();
let mut newest_clocks = RecoveryPoint::default();
let mut oldest_clocks = RecoveryPoint::default();
// Recovery point asks clock tick that has been truncated already
recovery_point.insert(1, 0, 15);
recovery_point.insert(1, 1, 10);
newest_clocks.insert(1, 0, 20);
newest_clocks.insert(1, 1, 12);
oldest_clocks.insert(1, 0, 16);
let resolve_result = resolve_wal_delta(
wal.wal
.lock()
.read_all(true)
.map(|(op_num, op)| (op_num, op.clock_tag)),
recovery_point,
newest_clocks,
oldest_clocks,
);
assert_eq!(resolve_result.unwrap_err(), WalDeltaError::Cutoff);
}
/// Recovery point operations are not in our WAL.
#[test]
fn test_recover_point_not_in_wal() {
let (wal, _wal_dir) = fixture_empty_wal();
let mut recovery_point = RecoveryPoint::default();
let mut newest_clocks = RecoveryPoint::default();
// Recovery point asks for operations that are not in our (empty) WAL
recovery_point.insert(1, 0, 15);
recovery_point.insert(1, 1, 10);
newest_clocks.insert(1, 0, 20);
newest_clocks.insert(1, 1, 12);
let resolve_result = resolve_wal_delta(
wal.wal
.lock()
.read_all(true)
.map(|(op_num, op)| (op_num, op.clock_tag)),
recovery_point,
newest_clocks,
RecoveryPoint::default(),
);
assert_eq!(resolve_result.unwrap_err(), WalDeltaError::NotFound);
}
/// Assert that `check_clock_tag_ordering_property` holds for the WAL.
async fn assert_wal_ordering_property(wal: &RecoverableWal, allow_gaps: bool) {
// Grab list of clock tags from WAL records, skipping missing tags and tags below the cutoff
let clock_tags = {
let cutoff = wal.oldest_clocks.lock().await;
wal.wal
.lock()
.read(0)
// Only take records with clock tags
.filter_map(|(_, operation)| operation.clock_tag)
// Clock tags must be equal or higher to cutoff point
.filter(|clock_tag| {
cutoff
.current_tick(clock_tag.peer_id, clock_tag.clock_id)
.map_or(true, |cutoff_tick| clock_tag.clock_tick >= cutoff_tick)
})
.collect::<Vec<_>>()
};
check_clock_tag_ordering_property(&clock_tags, allow_gaps)
.expect("WAL ordering property violated");
}
/// Test that we satisfy the clock ordering property, allowing WAL recovery resolution.
///
/// Property:
/// For each operation with peer+clock tick X, all following operations having the same
/// peer+clock must cover ticks X+1, X+2, ..., X+n in order up to the highest tick value of
/// that peer+clock in the WAL.
///
/// More specifically, this tests the property against every clock tag. The result of this
/// check is that the sequence always ends in order, going up to the highest clock tick.
///
/// `allow_gaps` specifies whether gaps in the clock tick sequences are allowed.
///
/// This logic is validated with examples in `validate_clock_tag_ordering_property`.
///
/// This property may not be valid if a diff transfer has not been resolved correctly or
/// completely, or if the WAL got malformed in another way.
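///
/// For example (illustrative, see `validate_clock_tag_ordering_property` below for more):
///
/// ```text
/// [1:0:0, 1:0:1, 1:0:2, 1:0:0, 1:0:1, 1:0:2]  ->  Ok   (replay ends in order at tick 2)
/// [1:0:0, 1:0:1, 1:0:2, 1:0:0, 1:0:1]         ->  Err  (replay never reaches tick 2 again)
/// ```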
fn check_clock_tag_ordering_property(
clock_tags: &[ClockTag],
allow_gaps: bool,
) -> Result<(), String> {
// Get the highest clock value for each clock+peer
let mut highest_clocks = HashMap::new();
for clock_tag in clock_tags {
highest_clocks
.entry((clock_tag.peer_id, clock_tag.clock_id))
.and_modify(|highest| *highest = clock_tag.clock_tick.max(*highest))
.or_insert(clock_tag.clock_tick);
}
// Test each clock tag for the ordering property
for (i, clock_tag) in clock_tags.iter().enumerate() {
let key = (clock_tag.peer_id, clock_tag.clock_id);
let highest = highest_clocks[&key];
// An ordered list of ticks we must see for this peer+clock
let mut must_see_ticks =
((clock_tag.clock_tick + 1)..=highest).collect::<VecDeque<_>>();
// For all the following clock tags of the same peer+clock, remove their tick value
for newer in clock_tags.iter().skip(i + 1) {
// Skip other peer and clock pairs
if (newer.peer_id, newer.clock_id) != key {
continue;
}
// Keep removing ticks we must see from the beginning of the list
// If we don't allow gaps, we only remove this exact tick from the beginning
// If we do allow gaps, we remove this tick and all lower ones from the beginning
while {
must_see_ticks.front().map_or(false, |&tick| {
if allow_gaps {
tick <= newer.clock_tick
} else {
tick == newer.clock_tick
}
})
} {
must_see_ticks.pop_front().unwrap();
}
}
// If the list is not empty, we have not seen all ticks
if !must_see_ticks.is_empty() {
return Err(format!(
"following clock tags did not cover ticks [{}] in order (peer_id: {}, clock_id: {}, max_tick: {highest})",
must_see_ticks.into_iter().map(|tick| tick.to_string()).collect::<Vec<_>>().join(", "),
clock_tag.peer_id,
clock_tag.clock_id,
));
}
}
Ok(())
}
/// Validate that `check_clock_tag_ordering_property` works as expected.
///
/// Yes, this is a test for a test for a test. (⌐■_■)
#[rstest]
fn validate_clock_tag_ordering_property(#[values(false, true)] allow_gaps: bool) {
// Empty is fine
check_clock_tag_ordering_property(&[], allow_gaps).unwrap();
// Any one clock tag is fine
check_clock_tag_ordering_property(&[ClockTag::new(1, 2, 3)], allow_gaps).unwrap();
// Clock tags in order are allowed
check_clock_tag_ordering_property(
&[
ClockTag::new(1, 0, 0),
ClockTag::new(1, 0, 1),
ClockTag::new(1, 0, 2),
ClockTag::new(1, 0, 3),
],
allow_gaps,
)
.unwrap();
// Clock tags in order with gaps are only allowed if specified
let result = check_clock_tag_ordering_property(
&[
ClockTag::new(1, 0, 0),
ClockTag::new(1, 0, 1),
ClockTag::new(1, 0, 2),
// Misses 1:0:3-9
ClockTag::new(1, 0, 10),
ClockTag::new(1, 0, 11),
],
allow_gaps,
);
assert_eq!(result.is_ok(), allow_gaps);
// Not starting at zero (truncated) is allowed
check_clock_tag_ordering_property(
&[
// Truncated
ClockTag::new(1, 0, 2),
ClockTag::new(1, 0, 3),
ClockTag::new(1, 0, 4),
],
allow_gaps,
)
.unwrap();
// Repeated clock tags are allowed
check_clock_tag_ordering_property(
&[
ClockTag::new(1, 0, 2),
ClockTag::new(1, 0, 2),
ClockTag::new(1, 0, 2),
],
allow_gaps,
)
.unwrap();
// Repeating clock tag sequence is allowed
check_clock_tag_ordering_property(
&[
ClockTag::new(1, 0, 0),
ClockTag::new(1, 0, 1),
ClockTag::new(1, 0, 2),
// Repeats 1:0:0-2 two more times
ClockTag::new(1, 0, 0),
ClockTag::new(1, 0, 1),
ClockTag::new(1, 0, 2),
ClockTag::new(1, 0, 0),
ClockTag::new(1, 0, 1),
ClockTag::new(1, 0, 2),
],
allow_gaps,
)
.unwrap();
// Repeating part of clock tag sequence is allowed
check_clock_tag_ordering_property(
&[
ClockTag::new(1, 0, 0),
ClockTag::new(1, 0, 1),
ClockTag::new(1, 0, 2),
ClockTag::new(1, 0, 3),
// Repeats 1:0:2-3 two more times
ClockTag::new(1, 0, 2),
ClockTag::new(1, 0, 3),
ClockTag::new(1, 0, 2),
ClockTag::new(1, 0, 3),
],
allow_gaps,
)
.unwrap();
// Repeating clock tag sequence with new ones at the end is allowed
check_clock_tag_ordering_property(
&[
ClockTag::new(1, 0, 0),
ClockTag::new(1, 0, 1),
ClockTag::new(1, 0, 2),
// Repeats 1:0:0-2 one more time
ClockTag::new(1, 0, 0),
ClockTag::new(1, 0, 1),
ClockTag::new(1, 0, 2),
// Adds 1:0:3-6 on top of it
ClockTag::new(1, 0, 3),
ClockTag::new(1, 0, 4),
ClockTag::new(1, 0, 5),
ClockTag::new(1, 0, 6),
],
allow_gaps,
)
.unwrap();
// Repeating clock tags in random order is allowed, as long as the end is in order
check_clock_tag_ordering_property(
&[
ClockTag::new(1, 0, 0),
ClockTag::new(1, 0, 1),
ClockTag::new(1, 0, 2),
// Repeats 1:0:0-2 a few more times in random order
ClockTag::new(1, 0, 1),
ClockTag::new(1, 0, 2),
ClockTag::new(1, 0, 2),
ClockTag::new(1, 0, 2),
ClockTag::new(1, 0, 2),
ClockTag::new(1, 0, 0),
ClockTag::new(1, 0, 2),
ClockTag::new(1, 0, 0),
ClockTag::new(1, 0, 2),
ClockTag::new(1, 0, 1),
ClockTag::new(1, 0, 2),
// Adds 1:0:3 on top of it
ClockTag::new(1, 0, 3),
],
allow_gaps,
)
.unwrap();
// Repeating clock tag sequence must not miss clock tags at the end
check_clock_tag_ordering_property(
&[
ClockTag::new(1, 0, 0),
ClockTag::new(1, 0, 1),
ClockTag::new(1, 0, 2),
ClockTag::new(1, 0, 0),
ClockTag::new(1, 0, 1),
// Misses 1:0:2
],
allow_gaps,
)
.unwrap_err();
// Repeating clock tag sequence must only miss clock tags in the middle if specified
let result = check_clock_tag_ordering_property(
&[
ClockTag::new(1, 0, 0),
ClockTag::new(1, 0, 1),
ClockTag::new(1, 0, 2),
ClockTag::new(1, 0, 0),
// Misses 1:0:1
ClockTag::new(1, 0, 2),
],
allow_gaps,
);
assert_eq!(result.is_ok(), allow_gaps);
// Skipping a clock ID is allowed
check_clock_tag_ordering_property(
&[
ClockTag::new(1, 0, 0),
ClockTag::new(1, 0, 1),
ClockTag::new(1, 0, 2),
// Skipped clock ID 1
ClockTag::new(1, 2, 10),
ClockTag::new(1, 2, 11),
ClockTag::new(1, 2, 12),
],
allow_gaps,
)
.unwrap();
// Intermixed repeating clock tag sequence is allowed
check_clock_tag_ordering_property(
&[
ClockTag::new(1, 0, 0),
ClockTag::new(1, 0, 1),
ClockTag::new(1, 0, 2),
ClockTag::new(1, 1, 0),
ClockTag::new(2, 0, 0),
ClockTag::new(2, 0, 1),
ClockTag::new(2, 0, 2),
ClockTag::new(1, 1, 1),
ClockTag::new(2, 0, 0),
ClockTag::new(1, 0, 0),
ClockTag::new(1, 1, 2),
ClockTag::new(1, 1, 3),
ClockTag::new(2, 0, 1),
ClockTag::new(1, 0, 1),
ClockTag::new(2, 0, 2),
ClockTag::new(1, 1, 4),
ClockTag::new(1, 0, 2),
ClockTag::new(1, 1, 5),
],
allow_gaps,
)
.unwrap();
// Intermixed clock tag sequence where one tick for peer 2 is missing is only allowed if specified
let result = check_clock_tag_ordering_property(
&[
ClockTag::new(1, 0, 0),
ClockTag::new(2, 0, 0),
ClockTag::new(1, 0, 1),
ClockTag::new(2, 0, 1),
ClockTag::new(1, 0, 2),
// Misses 2:0:2
ClockTag::new(1, 0, 3),
ClockTag::new(2, 0, 3),
],
allow_gaps,
);
assert_eq!(result.is_ok(), allow_gaps);
// Intermixed clock tag sequence where one tick for clock ID 1 is missing is only allowed if specified
let result = check_clock_tag_ordering_property(
&[
ClockTag::new(1, 0, 0),
ClockTag::new(1, 1, 0),
ClockTag::new(1, 0, 1),
ClockTag::new(1, 1, 1),
ClockTag::new(1, 0, 2),
// Misses 1:1:2
ClockTag::new(1, 0, 3),
ClockTag::new(1, 1, 3),
],
allow_gaps,
);
assert_eq!(result.is_ok(), allow_gaps);
// Intermixed clock tag sequence where one tick is missing is not allowed
check_clock_tag_ordering_property(
&[
ClockTag::new(1, 0, 0),
ClockTag::new(2, 0, 0),
ClockTag::new(3, 0, 0),
ClockTag::new(3, 0, 1),
ClockTag::new(1, 0, 1),
ClockTag::new(2, 0, 1),
ClockTag::new(3, 0, 2),
ClockTag::new(2, 0, 2),
ClockTag::new(1, 0, 2),
// Peer 2 only partially recovering here, missing 2:0:2
ClockTag::new(2, 0, 0),
ClockTag::new(2, 0, 1),
// Peer 1 and 3 continue
ClockTag::new(1, 0, 3),
ClockTag::new(1, 0, 4),
ClockTag::new(3, 0, 3),
ClockTag::new(3, 0, 4),
],
allow_gaps,
)
.unwrap_err();
}
}