use std::ffi::CString;
use std::sync::Arc;

use ash::vk;
use gpu_allocator::vulkan::{Allocation, AllocationCreateDesc, Allocator, AllocatorCreateDesc};
use parking_lot::Mutex;

use crate::*;

/// GPU device structure.
/// It's a wrapper around a Vulkan device.
pub struct Device {
    /// Instance that owns the device.
    instance: Arc<Instance>,

    /// Native Vulkan device handle.
    vk_device: ash::Device,

    /// GPU memory allocator from the `gpu-allocator` crate.
    /// It's an `Option` because of drop order: we need to drop it before the device,
    /// but `Allocator` is destroyed by its own `Drop` impl.
    gpu_allocator: Option<Mutex<Allocator>>,

    /// All found compute queues.
    compute_queues: Vec<Queue>,

    /// All found transfer queues.
    _transfer_queues: Vec<Queue>,

    /// GPU subgroup (warp in CUDA terms) size.
    subgroup_size: usize,

    /// Whether the subgroup (warp) size is dynamic.
    /// If true, we need additional subgroup size control in the pipeline
    /// and the Vulkan extension that allows setting the subgroup size.
    is_dynamic_subgroup_size: bool,

    /// Maximum compute work group count per dispatch dimension.
    /// It's used for bounds checking in `Context`.
    max_compute_work_group_count: [usize; 3],

    /// Selected queue index to use.
    queue_index: usize,
}

/// GPU execution queue.
#[derive(Clone)]
pub struct Queue {
    /// Native Vulkan queue handle.
    pub vk_queue: vk::Queue,

    /// Queue family index for the native Vulkan queue.
    pub vk_queue_family_index: usize,

    /// Index in the family for the native Vulkan queue.
    pub vk_queue_index: usize,
}

impl Device {
    pub fn new(
        instance: Arc<Instance>,
        vk_physical_device: &PhysicalDevice,
    ) -> GpuResult<Arc<Self>> {
        Self::new_with_queue_index(instance, vk_physical_device, 0)
    }

    pub fn new_with_queue_index(
        instance: Arc<Instance>,
        vk_physical_device: &PhysicalDevice,
        queue_index: usize,
    ) -> GpuResult<Arc<Self>> {
        #[allow(unused_mut)]
        let mut extensions_cstr: Vec<CString> = vec![CString::from(ash::khr::maintenance1::NAME)];
        #[cfg(target_os = "macos")]
        {
            extensions_cstr.push(CString::from(ash::khr::portability_subset::NAME));
        }

        let vk_queue_families = unsafe {
            instance
                .vk_instance()
                .get_physical_device_queue_family_properties(vk_physical_device.vk_physical_device)
        };

        let max_queue_priorities_count = vk_queue_families
            .iter()
            .map(|vk_queue_family| vk_queue_family.queue_count as usize)
            .max()
            .ok_or_else(|| GpuError::Other("No queue families found".to_string()))?;
        let queue_priorities = vec![0.; max_queue_priorities_count];

        let queue_create_infos: Vec<vk::DeviceQueueCreateInfo> = (0..vk_queue_families.len())
            .map(|queue_family_index| {
                vk::DeviceQueueCreateInfo::default()
                    .flags(vk::DeviceQueueCreateFlags::empty())
                    .queue_family_index(queue_family_index as u32)
                    .queue_priorities(queue_priorities.as_slice())
            })
            .collect();

        let physical_device_features = vk::PhysicalDeviceFeatures::default();

        // TODO(gpu): check presence of features
        // Define Vulkan features that we need.
        let mut enabled_physical_device_features_1_1 =
            vk::PhysicalDeviceVulkan11Features::default();
        let mut enabled_physical_device_features_1_2 =
            vk::PhysicalDeviceVulkan12Features::default();
        let mut enabled_physical_device_features_1_3 =
            vk::PhysicalDeviceVulkan13Features::default();
        let mut enabled_physical_devices_features = vk::PhysicalDeviceFeatures2::default()
            .push_next(&mut enabled_physical_device_features_1_1)
            .push_next(&mut enabled_physical_device_features_1_2)
            .push_next(&mut enabled_physical_device_features_1_3);
        unsafe {
            instance.vk_instance().get_physical_device_features2(
                vk_physical_device.vk_physical_device,
                &mut enabled_physical_devices_features,
            );
        };

        // From Vulkan 1.1 we need storage buffer 16-bit access.
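        // Note: the queried feature fields are `vk::Bool32` values (plain `u32`),
        // where `0` (`vk::FALSE`) means the feature is unsupported. The checks below
        // therefore compare against zero directly; Rust's `!` on a `u32` would be a
        // bitwise NOT (`!1u32 == 0xFFFF_FFFE`), never a logical negation.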
        if enabled_physical_device_features_1_1.storage_buffer16_bit_access == 0 {
            return Err(GpuError::NotSupported(
                "Storage buffer 16 bit access is not supported".to_string(),
            ));
        }
        let mut physical_device_features_1_1 =
            vk::PhysicalDeviceVulkan11Features::default().storage_buffer16_bit_access(true);

        // From Vulkan 1.2 we need int8/float16 support.
        if enabled_physical_device_features_1_2.shader_int8 == 0 {
            return Err(GpuError::NotSupported("Int8 is not supported".to_string()));
        }
        if enabled_physical_device_features_1_2.shader_float16 == 0 {
            return Err(GpuError::NotSupported(
                "Float16 is not supported".to_string(),
            ));
        }
        if enabled_physical_device_features_1_2.storage_buffer8_bit_access == 0 {
            return Err(GpuError::NotSupported(
                "Storage buffer 8 bit access is not supported".to_string(),
            ));
        }
        let mut physical_device_features_1_2 = vk::PhysicalDeviceVulkan12Features::default()
            .shader_int8(true)
            .shader_float16(true)
            .storage_buffer8_bit_access(true);

        // From Vulkan 1.3 we need subgroup size control if it's dynamic.
        let mut physical_device_features_1_3 = vk::PhysicalDeviceVulkan13Features::default();

        let max_compute_work_group_count;
        let mut is_dynamic_subgroup_size = false;
        let subgroup_size = unsafe {
            let props = instance
                .vk_instance()
                .get_physical_device_properties(vk_physical_device.vk_physical_device);
            max_compute_work_group_count = [
                props.limits.max_compute_work_group_count[0] as usize,
                props.limits.max_compute_work_group_count[1] as usize,
                props.limits.max_compute_work_group_count[2] as usize,
            ];

            let mut subgroup_properties = vk::PhysicalDeviceSubgroupProperties::default();
            let mut vulkan_1_3_properties = vk::PhysicalDeviceVulkan13Properties::default();
            let mut props2 = vk::PhysicalDeviceProperties2::default()
                .push_next(&mut subgroup_properties)
                .push_next(&mut vulkan_1_3_properties);
            instance.vk_instance().get_physical_device_properties2(
                vk_physical_device.vk_physical_device,
                &mut props2,
            );

            let subgroup_size = if vulkan_1_3_properties.min_subgroup_size
                != vulkan_1_3_properties.max_subgroup_size
            {
                if enabled_physical_device_features_1_3.subgroup_size_control == 0 {
                    return Err(GpuError::NotSupported(
                        "Subgroup size control is not supported".to_string(),
                    ));
                }
                physical_device_features_1_3 =
                    physical_device_features_1_3.subgroup_size_control(true);
                if !vulkan_1_3_properties
                    .required_subgroup_size_stages
                    .contains(vk::ShaderStageFlags::COMPUTE)
                {
                    // A strange situation where the subgroup size can differ, but we cannot set it.
                    // We cannot handle this case (we have to know the subgroup size), so skip device creation.
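                    // (`required_subgroup_size_stages` is the set of shader stages that accept
                    // a required subgroup size via `PipelineShaderStageRequiredSubgroupSizeCreateInfo`;
                    // without COMPUTE in that mask, the size cannot be pinned for compute pipelines.)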
                    return Err(GpuError::NotSupported(
                        "Subgroup size is dynamic but not supported for compute stage".to_string(),
                    ));
                }
                is_dynamic_subgroup_size = true;
                // Prefer the max subgroup size.
                vulkan_1_3_properties.max_subgroup_size as usize
            } else {
                subgroup_properties.subgroup_size as usize
            };

            log::info!("Create GPU device {}", vk_physical_device.name);
            log::debug!("GPU subgroup size: {subgroup_size}");
            subgroup_size
        };

        Self::check_extensions_list(
            &instance,
            vk_physical_device.vk_physical_device,
            &extensions_cstr,
        )?;

        // Convert extension names to raw pointers to provide to Vulkan.
        let extension_names_raw: Vec<*const i8> = extensions_cstr
            .iter()
            .map(|raw_name| raw_name.as_ptr())
            .collect();

        let device_create_info = vk::DeviceCreateInfo::default()
            .flags(vk::DeviceCreateFlags::empty())
            .queue_create_infos(&queue_create_infos)
            .enabled_extension_names(&extension_names_raw)
            .enabled_features(&physical_device_features)
            .push_next(&mut physical_device_features_1_1)
            .push_next(&mut physical_device_features_1_2)
            .push_next(&mut physical_device_features_1_3);

        let vk_device_result = unsafe {
            instance.vk_instance().create_device(
                vk_physical_device.vk_physical_device,
                &device_create_info,
                instance.cpu_allocation_callbacks(),
            )
        };
        let vk_device = match vk_device_result {
            Ok(vk_device) => vk_device,
            Err(e) => return Err(GpuError::from(e)),
        };

        let mut compute_queues = Vec::new();
        let mut transfer_queues = Vec::new();
        for (vk_queue_family_index, vk_queue_family) in vk_queue_families.iter().enumerate() {
            for vk_queue_index in 0..vk_queue_family.queue_count as usize {
                let vk_queue = unsafe {
                    vk_device.get_device_queue(vk_queue_family_index as u32, vk_queue_index as u32)
                };
                let queue = Queue {
                    vk_queue,
                    vk_queue_index,
                    vk_queue_family_index,
                };

                let queue_flags = vk_queue_family.queue_flags;
                if vk_queue != vk::Queue::null() {
                    if queue_flags.contains(vk::QueueFlags::TRANSFER) {
                        transfer_queues.push(queue.clone());
                    }
                    if queue_flags.contains(vk::QueueFlags::COMPUTE) {
                        compute_queues.push(queue);
                    }
                }
            }
        }

        let gpu_allocator_result = Allocator::new(&AllocatorCreateDesc {
            instance: instance.vk_instance().clone(),
            device: vk_device.clone(),
            physical_device: vk_physical_device.vk_physical_device,
            debug_settings: Default::default(),
            buffer_device_address: false,
            allocation_sizes: Default::default(),
        });
        let gpu_allocator = match gpu_allocator_result {
            Ok(gpu_allocator) => Some(Mutex::new(gpu_allocator)),
            Err(e) => {
                // Destroy the already-created device before bailing out.
                unsafe {
                    vk_device.destroy_device(instance.cpu_allocation_callbacks());
                }
                return Err(GpuError::from(e));
            }
        };

        Ok(Arc::new(Device {
            instance: instance.clone(),
            vk_device,
            gpu_allocator,
            compute_queues,
            _transfer_queues: transfer_queues,
            subgroup_size,
            max_compute_work_group_count,
            is_dynamic_subgroup_size,
            queue_index,
        }))
    }

    /// Get CPU allocation callbacks.
    pub fn cpu_allocation_callbacks(&self) -> Option<&vk::AllocationCallbacks> {
        self.instance.cpu_allocation_callbacks()
    }

    /// Allocate GPU memory.
    pub fn allocate(&self, allocation_desc: &AllocationCreateDesc) -> GpuResult<Allocation> {
        if let Some(gpu_allocator) = &self.gpu_allocator {
            let mut gpu_allocator = gpu_allocator.lock();
            gpu_allocator
                .allocate(allocation_desc)
                .map_err(GpuError::from)
        } else {
            Err(GpuError::Other(
                "GPU allocator is not available".to_string(),
            ))
        }
    }

    /// Free GPU memory.
    pub fn free(&self, allocation: Allocation) {
        if let Some(gpu_allocator) = &self.gpu_allocator {
            let mut gpu_allocator = gpu_allocator.lock();
            if let Err(e) = gpu_allocator.free(allocation) {
                // Log the error because `free` is called from `Drop`.
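                // (Errors cannot propagate out of `Drop`, and panicking there during
                // unwinding would abort the process.)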
log::error!("Failed to free GPU memory: {:?}", e); } } else { log::error!("GPU allocator is not available"); } } /// Get subgroup size (warp in terms of CUDA). pub fn subgroup_size(&self) -> usize { self.subgroup_size } pub fn instance(&self) -> Arc { self.instance.clone() } pub fn vk_device(&self) -> &ash::Device { &self.vk_device } pub fn is_dynamic_subgroup_size(&self) -> bool { self.is_dynamic_subgroup_size } pub fn max_compute_work_group_count(&self) -> [usize; 3] { self.max_compute_work_group_count } pub fn compute_queue(&self) -> &Queue { &self.compute_queues[self.queue_index % self.compute_queues.len()] } fn check_extensions_list( instance: &Instance, vk_physical_device: vk::PhysicalDevice, required_extensions: &[CString], ) -> GpuResult<()> { let available_extensions = unsafe { instance .vk_instance() .enumerate_device_extension_properties(vk_physical_device)? }; for required_extension in required_extensions { let is_extension_available = available_extensions.iter().any(|extension| { let extension_name = unsafe { std::ffi::CStr::from_ptr(extension.extension_name.as_ptr()) }; extension_name == required_extension.as_c_str() }); if !is_extension_available { return Err(GpuError::NotSupported(format!( "Extension {:?} is not supported", required_extension ))); } } Ok(()) } } impl Drop for Device { fn drop(&mut self) { self.gpu_allocator = None; unsafe { // For now, we don't need to wait for device idle. // It doesn't have timeout, so it can hang the application. // Moreover, we control all execution by Context and catch timeout. // It we have infinity loops in shader we leak this device and let it running intil OS stops it. // self.vk_device.device_wait_idle().unwrap(); self.vk_device .destroy_device(self.cpu_allocation_callbacks()); } } }