#[cfg(feature = "web")] mod actix; mod common; mod consensus; mod greeting; mod issues_setup; mod migrations; mod settings; mod snapshots; mod startup; mod tonic; mod tracing; use std::io::Error; use std::sync::Arc; use std::thread; use std::thread::JoinHandle; use std::time::Duration; use ::common::cpu::{get_cpu_budget, CpuBudget}; use ::tonic::transport::Uri; use api::grpc::transport_channel_pool::TransportChannelPool; use clap::Parser; use collection::shards::channel_service::ChannelService; use consensus::Consensus; use slog::Drain; use startup::setup_panic_hook; use storage::content_manager::consensus::operation_sender::OperationSender; use storage::content_manager::consensus::persistent::Persistent; use storage::content_manager::consensus_manager::{ConsensusManager, ConsensusStateRef}; use storage::content_manager::toc::dispatcher::TocDispatcher; use storage::content_manager::toc::TableOfContent; use storage::dispatcher::Dispatcher; use storage::rbac::Access; #[cfg(all( not(target_env = "msvc"), any(target_arch = "x86_64", target_arch = "aarch64") ))] use tikv_jemallocator::Jemalloc; use crate::common::helpers::{ create_general_purpose_runtime, create_search_runtime, create_update_runtime, load_tls_client_config, }; use crate::common::inference::service::InferenceService; use crate::common::telemetry::TelemetryCollector; use crate::common::telemetry_reporting::TelemetryReporter; use crate::greeting::welcome; use crate::migrations::single_to_cluster::handle_existing_collections; use crate::settings::Settings; use crate::snapshots::{recover_full_snapshot, recover_snapshots}; use crate::startup::{remove_started_file_indicator, touch_started_file_indicator}; #[cfg(all( not(target_env = "msvc"), any(target_arch = "x86_64", target_arch = "aarch64") ))] #[global_allocator] static GLOBAL: Jemalloc = Jemalloc; const FULL_ACCESS: Access = Access::full("For main"); /// Qdrant (read: quadrant ) is a vector similarity search engine. /// It provides a production-ready service with a convenient API to store, search, and manage points - vectors with an additional payload. /// /// This CLI starts a Qdrant peer/server. #[derive(Parser, Debug)] #[command(version, about)] struct Args { /// Uri of the peer to bootstrap from in case of multi-peer deployment. /// If not specified - this peer will be considered as a first in a new deployment. #[arg(long, value_parser, value_name = "URI")] bootstrap: Option, /// Uri of this peer. /// Other peers should be able to reach it by this uri. /// /// This value has to be supplied if this is the first peer in a new deployment. /// /// In case this is not the first peer and it bootstraps the value is optional. /// If not supplied then qdrant will take internal grpc port from config and derive the IP address of this peer on bootstrap peer (receiving side) #[arg(long, value_parser, value_name = "URI")] uri: Option, /// Force snapshot re-creation /// If provided - existing collections will be replaced with snapshots. /// Default is to not recreate from snapshots. #[arg(short, long, action, default_value_t = false)] force_snapshot: bool, /// List of paths to snapshot files. /// Format: : /// /// WARN: Do not use this option if you are recovering collection in existing distributed cluster. /// Use `/collections//snapshots/recover` API instead. #[arg(long, value_name = "PATH:NAME", alias = "collection-snapshot")] snapshot: Option>, /// Path to snapshot of multiple collections. /// Format: /// /// WARN: Do not use this option if you are recovering collection in existing distributed cluster. /// Use `/collections//snapshots/recover` API instead. #[arg(long, value_name = "PATH")] storage_snapshot: Option, /// Path to an alternative configuration file. /// Format: /// /// Default path: config/config.yaml #[arg(long, value_name = "PATH")] config_path: Option, /// Disable telemetry sending to developers /// If provided - telemetry collection will be disabled. /// Read more: #[arg(long, action, default_value_t = false)] disable_telemetry: bool, /// Run stacktrace collector. Used for debugging. #[arg(long, action, default_value_t = false)] stacktrace: bool, /// Reinit consensus state. /// When enabled, the service will assume the consensus should be reinitialized. /// The exact behavior depends on if this current node has bootstrap URI or not. /// If it has - it'll remove current consensus state and consensus WAL (while keeping peer ID) /// and will try to receive state from the bootstrap peer. /// If it doesn't have - it'll remove other peers from voters promote /// the current peer to the leader and the single member of the cluster. /// It'll also compact consensus WAL to force snapshot #[arg(long, action, default_value_t = false)] reinit: bool, } fn main() -> anyhow::Result<()> { let args = Args::parse(); // Run backtrace collector, expected to used by `rstack` crate if args.stacktrace { #[cfg(all(target_os = "linux", feature = "stacktrace"))] { let _ = rstack_self::child(); } return Ok(()); } remove_started_file_indicator(); let settings = Settings::new(args.config_path)?; let reporting_enabled = !settings.telemetry_disabled && !args.disable_telemetry; let reporting_id = TelemetryCollector::generate_id(); let logger_handle = tracing::setup( settings .logger .with_top_level_directive(settings.log_level.clone()), )?; setup_panic_hook(reporting_enabled, reporting_id.to_string()); memory::madvise::set_global(settings.storage.mmap_advice); segment::vector_storage::common::set_async_scorer( settings .storage .performance .async_scorer .unwrap_or_default(), ); welcome(&settings); if let Some(recovery_warning) = &settings.storage.recovery_mode { log::warn!("Qdrant is loaded in recovery mode: {}", recovery_warning); log::warn!( "Read more: https://qdrant.tech/documentation/guides/administration/#recovery-mode" ); } // Validate as soon as possible, but we must initialize logging first settings.validate_and_warn(); // Saved state of the consensus. let persistent_consensus_state = Persistent::load_or_init( &settings.storage.storage_path, args.bootstrap.is_none(), args.reinit, )?; let is_distributed_deployment = settings.cluster.enabled; let temp_path = settings.storage.temp_path.as_deref(); let restored_collections = if let Some(full_snapshot) = args.storage_snapshot { recover_full_snapshot( temp_path, &full_snapshot, &settings.storage.storage_path, args.force_snapshot, persistent_consensus_state.this_peer_id(), is_distributed_deployment, ) } else if let Some(snapshots) = args.snapshot { // recover from snapshots recover_snapshots( &snapshots, args.force_snapshot, temp_path, &settings.storage.storage_path, persistent_consensus_state.this_peer_id(), is_distributed_deployment, ) } else { vec![] }; // Create and own search runtime out of the scope of async context to ensure correct // destruction of it let search_runtime = create_search_runtime(settings.storage.performance.max_search_threads) .expect("Can't search create runtime."); let update_runtime = create_update_runtime(settings.storage.performance.max_optimization_threads) .expect("Can't optimizer create runtime."); let general_runtime = create_general_purpose_runtime().expect("Can't optimizer general purpose runtime."); let runtime_handle = general_runtime.handle().clone(); // Use global CPU budget for optimizations based on settings let optimizer_cpu_budget = CpuBudget::new(get_cpu_budget( settings.storage.performance.optimizer_cpu_budget, )); // Create a signal sender and receiver. It is used to communicate with the consensus thread. let (propose_sender, propose_receiver) = std::sync::mpsc::channel(); let propose_operation_sender = if settings.cluster.enabled { // High-level channel which could be used to send User-space consensus operations Some(OperationSender::new(propose_sender)) } else { // We don't need sender for the single-node mode None }; // Channel service is used to manage connections between peers. // It allocates required number of channels and manages proper reconnection handling let mut channel_service = ChannelService::new(settings.service.http_port, settings.service.api_key.clone()); if is_distributed_deployment { // We only need channel_service in case if cluster is enabled. // So we initialize it with real values here let p2p_grpc_timeout = Duration::from_millis(settings.cluster.grpc_timeout_ms); let connection_timeout = Duration::from_millis(settings.cluster.connection_timeout_ms); let tls_config = load_tls_client_config(&settings)?; channel_service.channel_pool = Arc::new(TransportChannelPool::new( p2p_grpc_timeout, connection_timeout, settings.cluster.p2p.connection_pool_size, tls_config, )); channel_service.id_to_address = persistent_consensus_state.peer_address_by_id.clone(); channel_service.id_to_metadata = persistent_consensus_state.peer_metadata_by_id.clone(); } // Table of content manages the list of collections. // It is a main entry point for the storage. let toc = TableOfContent::new( &settings.storage, search_runtime, update_runtime, general_runtime, optimizer_cpu_budget, channel_service.clone(), persistent_consensus_state.this_peer_id(), propose_operation_sender.clone(), ); toc.clear_all_tmp_directories()?; // Here we load all stored collections. runtime_handle.block_on(async { for collection in toc.all_collections(&FULL_ACCESS).await { log::debug!("Loaded collection: {collection}"); } }); let toc_arc = Arc::new(toc); let storage_path = toc_arc.storage_path(); // Holder for all actively running threads of the service: web, gPRC, consensus, etc. let mut handles: Vec>> = vec![]; // Router for external queries. // It decides if query should go directly to the ToC or through the consensus. let mut dispatcher = Dispatcher::new(toc_arc.clone()); let (telemetry_collector, dispatcher_arc, health_checker) = if is_distributed_deployment { let consensus_state: ConsensusStateRef = ConsensusManager::new( persistent_consensus_state, toc_arc.clone(), propose_operation_sender.unwrap(), storage_path, ) .into(); let is_new_deployment = consensus_state.is_new_deployment(); dispatcher = dispatcher.with_consensus(consensus_state.clone()); let toc_dispatcher = TocDispatcher::new(Arc::downgrade(&toc_arc), consensus_state.clone()); toc_arc.with_toc_dispatcher(toc_dispatcher); let dispatcher_arc = Arc::new(dispatcher); // Monitoring and telemetry. let telemetry_collector = TelemetryCollector::new(settings.clone(), dispatcher_arc.clone(), reporting_id); let tonic_telemetry_collector = telemetry_collector.tonic_telemetry_collector.clone(); // `raft` crate uses `slog` crate so it is needed to use `slog_stdlog::StdLog` to forward // logs from it to `log` crate let slog_logger = slog::Logger::root(slog_stdlog::StdLog.fuse(), slog::o!()); // Runs raft consensus in a separate thread. // Create a pipe `message_sender` to communicate with the consensus let health_checker = Arc::new(common::health::HealthChecker::spawn( toc_arc.clone(), consensus_state.clone(), &runtime_handle, // NOTE: `wait_for_bootstrap` should be calculated *before* starting `Consensus` thread consensus_state.is_new_deployment() && args.bootstrap.is_some(), )); let handle = Consensus::run( &slog_logger, consensus_state.clone(), args.bootstrap, args.uri.map(|uri| uri.to_string()), settings.clone(), channel_service, propose_receiver, tonic_telemetry_collector, toc_arc.clone(), runtime_handle.clone(), args.reinit, ) .expect("Can't initialize consensus"); handles.push(handle); let toc_arc_clone = toc_arc.clone(); let consensus_state_clone = consensus_state.clone(); let _cancel_transfer_handle = runtime_handle.spawn(async move { consensus_state_clone.is_leader_established.await_ready(); match toc_arc_clone .cancel_outgoing_all_transfers("Source peer restarted") .await { Ok(_) => { log::debug!("All transfers if any cancelled"); } Err(err) => { log::error!("Can't cancel outgoing transfers: {}", err); } } }); // TODO(resharding): Remove resharding driver? // // runtime_handle.block_on(async { // toc_arc.resume_resharding_tasks().await; // }); let collections_to_recover_in_consensus = if is_new_deployment { let existing_collections = runtime_handle.block_on(toc_arc.all_collections(&FULL_ACCESS)); existing_collections .into_iter() .map(|pass| pass.name().to_string()) .collect() } else { restored_collections }; if !collections_to_recover_in_consensus.is_empty() { runtime_handle.block_on(handle_existing_collections( toc_arc.clone(), consensus_state.clone(), dispatcher_arc.clone(), consensus_state.this_peer_id(), collections_to_recover_in_consensus, )); } (telemetry_collector, dispatcher_arc, Some(health_checker)) } else { log::info!("Distributed mode disabled"); let dispatcher_arc = Arc::new(dispatcher); // Monitoring and telemetry. let telemetry_collector = TelemetryCollector::new(settings.clone(), dispatcher_arc.clone(), reporting_id); (telemetry_collector, dispatcher_arc, None) }; let tonic_telemetry_collector = telemetry_collector.tonic_telemetry_collector.clone(); // // Telemetry reporting // let reporting_id = telemetry_collector.reporting_id(); let telemetry_collector = Arc::new(tokio::sync::Mutex::new(telemetry_collector)); if reporting_enabled { log::info!("Telemetry reporting enabled, id: {}", reporting_id); runtime_handle.spawn(TelemetryReporter::run(telemetry_collector.clone())); } else { log::info!("Telemetry reporting disabled"); } // Setup subscribers to listen for issue-able events issues_setup::setup_subscribers(&settings); // Helper to better log start errors let log_err_if_any = |server_name, result| match result { Err(err) => { log::error!("Error while starting {} server: {}", server_name, err); Err(err) } ok => ok, }; // // Inference Service // if let Some(inference_config) = settings.inference.clone() { match InferenceService::init_global(inference_config) { Ok(_) => { log::info!("Inference service is configured."); } Err(err) => { log::error!("{err}"); } } } else { log::info!("Inference service is not configured."); } // // REST API server // #[cfg(feature = "web")] { let dispatcher_arc = dispatcher_arc.clone(); let settings = settings.clone(); let handle = thread::Builder::new() .name("web".to_string()) .spawn(move || { log_err_if_any( "REST", actix::init( dispatcher_arc.clone(), telemetry_collector, health_checker, settings, logger_handle, ), ) }) .unwrap(); handles.push(handle); } // // gRPC server // if let Some(grpc_port) = settings.service.grpc_port { let settings = settings.clone(); let handle = thread::Builder::new() .name("grpc".to_string()) .spawn(move || { log_err_if_any( "gRPC", tonic::init( dispatcher_arc, tonic_telemetry_collector, settings, grpc_port, runtime_handle, ), ) }) .unwrap(); handles.push(handle); } else { log::info!("gRPC endpoint disabled"); } #[cfg(feature = "service_debug")] { use std::fmt::Write; use parking_lot::deadlock; const DEADLOCK_CHECK_PERIOD: Duration = Duration::from_secs(10); thread::Builder::new() .name("deadlock_checker".to_string()) .spawn(move || loop { thread::sleep(DEADLOCK_CHECK_PERIOD); let deadlocks = deadlock::check_deadlock(); if deadlocks.is_empty() { continue; } let mut error = format!("{} deadlocks detected\n", deadlocks.len()); for (i, threads) in deadlocks.iter().enumerate() { writeln!(error, "Deadlock #{i}").expect("fail to writeln!"); for t in threads { writeln!( error, "Thread Id {:#?}\n{:#?}", t.thread_id(), t.backtrace() ) .expect("fail to writeln!"); } } log::error!("{}", error); }) .unwrap(); } touch_started_file_indicator(); for handle in handles { log::debug!( "Waiting for thread {} to finish", handle.thread().name().unwrap() ); handle.join().expect("thread is not panicking")?; } drop(toc_arc); drop(settings); Ok(()) }