use std::ffi::CString;
use std::sync::Arc;

use ash::vk;
use gpu_allocator::vulkan::{Allocation, AllocationCreateDesc, Allocator, AllocatorCreateDesc};
use parking_lot::Mutex;

use crate::*;

/// GPU device structure.
/// A wrapper around a Vulkan logical device.
pub struct Device {
    /// Instance that owns the device.
    instance: Arc<Instance>,

    /// Native Vulkan device handle.
    vk_device: ash::Device,

    /// GPU memory allocator from the `gpu-allocator` crate.
    /// It's an `Option` because of drop order: we need to drop it before the device,
    /// but `Allocator` is destroyed by its own `Drop` implementation.
    gpu_allocator: Option<Mutex<Allocator>>,

    /// All found compute queues.
    compute_queues: Vec<Queue>,

    /// All found transfer queues.
    _transfer_queues: Vec<Queue>,

    /// GPU subgroup (warp in CUDA terms) size.
    subgroup_size: usize,

    /// Whether the subgroup (warp) size is dynamic.
    /// If true, we need additional subgroup size control in the pipeline
    /// and the Vulkan feature that allows setting an explicit subgroup size.
    is_dynamic_subgroup_size: bool,

    /// Maximum number of compute work groups per dispatch dimension.
    /// It's used for bounds checking in `Context`.
    max_compute_work_group_count: [usize; 3],

    /// Selected queue index to use.
    queue_index: usize,
}

/// GPU execution queue.
#[derive(Clone)]
pub struct Queue {
    /// Native Vulkan queue handle.
    pub vk_queue: vk::Queue,

    /// Queue family index for the native Vulkan queue.
    pub vk_queue_family_index: usize,

    /// Index within the family for the native Vulkan queue.
    pub vk_queue_index: usize,
}

impl Device {
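    /// Create a device over the given physical device, using queue index 0.
    ///
    /// A minimal usage sketch, assuming an `Instance` and a `PhysicalDevice`
    /// were obtained elsewhere in this crate (the helper names below are
    /// hypothetical, shown only for illustration):
    ///
    /// ```ignore
    /// let instance: Arc<Instance> = create_instance()?; // hypothetical helper
    /// let physical_device = &enumerate_physical_devices(&instance)?[0]; // hypothetical helper
    /// let device = Device::new(instance.clone(), physical_device)?;
    /// log::debug!("GPU subgroup size: {}", device.subgroup_size());
    /// ```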
    pub fn new(
        instance: Arc<Instance>,
        vk_physical_device: &PhysicalDevice,
    ) -> GpuResult<Arc<Device>> {
        Self::new_with_queue_index(instance, vk_physical_device, 0)
    }

    pub fn new_with_queue_index(
        instance: Arc<Instance>,
        vk_physical_device: &PhysicalDevice,
        queue_index: usize,
    ) -> GpuResult<Arc<Device>> {
        #[allow(unused_mut)]
        let mut extensions_cstr: Vec<CString> = vec![CString::from(ash::khr::maintenance1::NAME)];
        #[cfg(target_os = "macos")]
        {
            extensions_cstr.push(CString::from(ash::khr::portability_subset::NAME));
        }

        let vk_queue_families = unsafe {
            instance
                .vk_instance()
                .get_physical_device_queue_family_properties(vk_physical_device.vk_physical_device)
        };

        let max_queue_priorities_count = vk_queue_families
            .iter()
            .map(|vk_queue_family| vk_queue_family.queue_count as usize)
            .max()
            .ok_or_else(|| GpuError::Other("No queue families found".to_string()))?;
        let queue_priorities = vec![0.; max_queue_priorities_count];

        let queue_create_infos: Vec<vk::DeviceQueueCreateInfo> = (0..vk_queue_families.len())
            .map(|queue_family_index| {
                vk::DeviceQueueCreateInfo::default()
                    .flags(vk::DeviceQueueCreateFlags::empty())
                    .queue_family_index(queue_family_index as u32)
                    .queue_priorities(queue_priorities.as_slice())
            })
            .collect();

        let physical_device_features = vk::PhysicalDeviceFeatures::default();

        // TODO(gpu): check presence of features

        // Query which Vulkan features the physical device supports.
        let mut enabled_physical_device_features_1_1 =
            vk::PhysicalDeviceVulkan11Features::default();
        let mut enabled_physical_device_features_1_2 =
            vk::PhysicalDeviceVulkan12Features::default();
        let mut enabled_physical_device_features_1_3 =
            vk::PhysicalDeviceVulkan13Features::default();
        let mut enabled_physical_devices_features = vk::PhysicalDeviceFeatures2::default()
            .push_next(&mut enabled_physical_device_features_1_1)
            .push_next(&mut enabled_physical_device_features_1_2)
            .push_next(&mut enabled_physical_device_features_1_3);
        unsafe {
            instance.vk_instance().get_physical_device_features2(
                vk_physical_device.vk_physical_device,
                &mut enabled_physical_devices_features,
            );
        };

        // From Vulkan 1.1 we need storage buffer 16 bit access.
        // Feature flags are `vk::Bool32` values, so `0` means unsupported.
        if enabled_physical_device_features_1_1.storage_buffer16_bit_access == 0 {
            return Err(GpuError::NotSupported(
                "Storage buffer 16 bit access is not supported".to_string(),
            ));
        }
        let mut physical_device_features_1_1 =
            vk::PhysicalDeviceVulkan11Features::default().storage_buffer16_bit_access(true);

        // From Vulkan 1.2 we need int8/float16 support.
        if enabled_physical_device_features_1_2.shader_int8 == 0 {
            return Err(GpuError::NotSupported("Int8 is not supported".to_string()));
        }
        if enabled_physical_device_features_1_2.shader_float16 == 0 {
            return Err(GpuError::NotSupported(
                "Float16 is not supported".to_string(),
            ));
        }
        if enabled_physical_device_features_1_2.storage_buffer8_bit_access == 0 {
            return Err(GpuError::NotSupported(
                "Storage buffer 8 bit access is not supported".to_string(),
            ));
        }
        let mut physical_device_features_1_2 = vk::PhysicalDeviceVulkan12Features::default()
            .shader_int8(true)
            .shader_float16(true)
            .storage_buffer8_bit_access(true);

        // From Vulkan 1.3 we need subgroup size control if it's dynamic.
        let mut physical_device_features_1_3 = vk::PhysicalDeviceVulkan13Features::default();

        let max_compute_work_group_count;
        let mut is_dynamic_subgroup_size = false;
        let subgroup_size = unsafe {
            let props = instance
                .vk_instance()
                .get_physical_device_properties(vk_physical_device.vk_physical_device);
            max_compute_work_group_count = [
                props.limits.max_compute_work_group_count[0] as usize,
                props.limits.max_compute_work_group_count[1] as usize,
                props.limits.max_compute_work_group_count[2] as usize,
            ];
            let mut subgroup_properties = vk::PhysicalDeviceSubgroupProperties::default();
            let mut vulkan_1_3_properties = vk::PhysicalDeviceVulkan13Properties::default();
            let mut props2 = vk::PhysicalDeviceProperties2::default()
                .push_next(&mut subgroup_properties)
                .push_next(&mut vulkan_1_3_properties);
            instance.vk_instance().get_physical_device_properties2(
                vk_physical_device.vk_physical_device,
                &mut props2,
            );

            let subgroup_size = if vulkan_1_3_properties.min_subgroup_size
                != vulkan_1_3_properties.max_subgroup_size
            {
                if enabled_physical_device_features_1_3.subgroup_size_control == 0 {
                    return Err(GpuError::NotSupported(
                        "Subgroup size control is not supported".to_string(),
                    ));
                }
                physical_device_features_1_3 =
                    physical_device_features_1_3.subgroup_size_control(true);

                if !vulkan_1_3_properties
                    .required_subgroup_size_stages
                    .contains(vk::ShaderStageFlags::COMPUTE)
                {
                    // A strange situation: the subgroup size can vary, but we cannot pin it for the compute stage.
                    // We cannot handle this case (we have to know the subgroup size), so skip device creation.
                    return Err(GpuError::NotSupported(
                        "Subgroup size is dynamic but not supported for compute stage".to_string(),
                    ));
                }
                is_dynamic_subgroup_size = true;
                // prefer max subgroup size
                vulkan_1_3_properties.max_subgroup_size as usize
            } else {
                subgroup_properties.subgroup_size as usize
            };

            log::info!("Create GPU device {}", vk_physical_device.name);
            log::debug!("GPU subgroup size: {subgroup_size}");
            subgroup_size
        };

        // Make sure all required extensions are available before creating the device.
        Self::check_extensions_list(
            &instance,
            vk_physical_device.vk_physical_device,
            &extensions_cstr,
        )?;

        // Convert extension names to raw pointers to pass to Vulkan.
        let extension_names_raw: Vec<*const i8> = extensions_cstr
            .iter()
            .map(|raw_name| raw_name.as_ptr())
            .collect();

        let device_create_info = vk::DeviceCreateInfo::default()
            .flags(vk::DeviceCreateFlags::empty())
            .queue_create_infos(&queue_create_infos)
            .enabled_extension_names(&extension_names_raw)
            .enabled_features(&physical_device_features)
            .push_next(&mut physical_device_features_1_1)
            .push_next(&mut physical_device_features_1_2)
            .push_next(&mut physical_device_features_1_3);

        let vk_device = unsafe {
            instance.vk_instance().create_device(
                vk_physical_device.vk_physical_device,
                &device_create_info,
                instance.cpu_allocation_callbacks(),
            )
        }
        .map_err(GpuError::from)?;

        let mut compute_queues = Vec::new();
        let mut transfer_queues = Vec::new();

        for (vk_queue_family_index, vk_queue_family) in vk_queue_families.iter().enumerate() {
            for vk_queue_index in 0..vk_queue_family.queue_count as usize {
                let vk_queue = unsafe {
                    vk_device.get_device_queue(vk_queue_family_index as u32, vk_queue_index as u32)
                };
                let queue = Queue {
                    vk_queue,
                    vk_queue_index,
                    vk_queue_family_index,
                };

                let queue_flags = vk_queue_family.queue_flags;
                if vk_queue != vk::Queue::null() {
                    if queue_flags.contains(vk::QueueFlags::TRANSFER) {
                        transfer_queues.push(queue.clone());
                    }
                    if queue_flags.contains(vk::QueueFlags::COMPUTE) {
                        compute_queues.push(queue);
                    }
                }
            }
        }

        let gpu_allocator_result = Allocator::new(&AllocatorCreateDesc {
            instance: instance.vk_instance().clone(),
            device: vk_device.clone(),
            physical_device: vk_physical_device.vk_physical_device,
            debug_settings: Default::default(),
            buffer_device_address: false,
            allocation_sizes: Default::default(),
        });

        let gpu_allocator = match gpu_allocator_result {
            Ok(gpu_allocator) => Some(Mutex::new(gpu_allocator)),
            Err(e) => {
                unsafe {
                    vk_device.destroy_device(instance.cpu_allocation_callbacks());
                }
                return Err(GpuError::from(e));
            }
        };

        Ok(Arc::new(Device {
            instance: instance.clone(),
            vk_device,
            gpu_allocator,
            compute_queues,
            _transfer_queues: transfer_queues,
            subgroup_size,
            max_compute_work_group_count,
            is_dynamic_subgroup_size,
            queue_index,
        }))
    }

    /// Get CPU allocation callbacks.
    pub fn cpu_allocation_callbacks(&self) -> Option<&vk::AllocationCallbacks> {
        self.instance.cpu_allocation_callbacks()
    }

    /// Allocate GPU memory.
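    ///
    /// A hedged sketch of the allocate/bind/free round trip; `buffer` is an
    /// illustrative `vk::Buffer` created beforehand, and the descriptor fields
    /// follow the `gpu-allocator` crate:
    ///
    /// ```ignore
    /// let requirements = unsafe { device.vk_device().get_buffer_memory_requirements(buffer) };
    /// let allocation = device.allocate(&AllocationCreateDesc {
    ///     name: "example_buffer",
    ///     requirements,
    ///     location: gpu_allocator::MemoryLocation::CpuToGpu,
    ///     linear: true,
    ///     allocation_scheme: gpu_allocator::vulkan::AllocationScheme::GpuAllocatorManaged,
    /// })?;
    /// unsafe {
    ///     device.vk_device().bind_buffer_memory(buffer, allocation.memory(), allocation.offset())?;
    /// }
    /// // ... use the buffer ...
    /// device.free(allocation);
    /// ```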
    pub fn allocate(&self, allocation_desc: &AllocationCreateDesc) -> GpuResult<Allocation> {
        if let Some(gpu_allocator) = &self.gpu_allocator {
            let mut gpu_allocator = gpu_allocator.lock();
            gpu_allocator
                .allocate(allocation_desc)
                .map_err(GpuError::from)
        } else {
            Err(GpuError::Other(
                "GPU allocator is not available".to_string(),
            ))
        }
    }

    /// Free GPU memory.
    pub fn free(&self, allocation: Allocation) {
        if let Some(gpu_allocator) = &self.gpu_allocator {
            let mut gpu_allocator = gpu_allocator.lock();
            if let Err(e) = gpu_allocator.free(allocation) {
                // Log error because free is called from Drop.
                log::error!("Failed to free GPU memory: {e:?}");
            }
        } else {
            log::error!("GPU allocator is not available");
        }
    }

    /// Get subgroup size (warp in terms of CUDA).
    pub fn subgroup_size(&self) -> usize {
        self.subgroup_size
    }

    pub fn instance(&self) -> Arc<Instance> {
        self.instance.clone()
    }

    pub fn vk_device(&self) -> &ash::Device {
        &self.vk_device
    }

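    /// Whether compute pipelines must pin the subgroup size explicitly.
    ///
    /// A hedged sketch of how a caller might honor this flag when building a
    /// shader stage (`stage` is an illustrative
    /// `vk::PipelineShaderStageCreateInfo` under construction):
    ///
    /// ```ignore
    /// let mut required_size = vk::PipelineShaderStageRequiredSubgroupSizeCreateInfo::default()
    ///     .required_subgroup_size(device.subgroup_size() as u32);
    /// let stage = if device.is_dynamic_subgroup_size() {
    ///     stage.push_next(&mut required_size)
    /// } else {
    ///     stage
    /// };
    /// ```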
    pub fn is_dynamic_subgroup_size(&self) -> bool {
        self.is_dynamic_subgroup_size
    }

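    /// Maximum number of work groups per dispatch dimension.
    ///
    /// A hedged sketch of the bounds check a caller might perform before
    /// dispatching (`groups` is an illustrative `[usize; 3]`):
    ///
    /// ```ignore
    /// let max = device.max_compute_work_group_count();
    /// assert!(groups.iter().zip(max).all(|(&g, m)| g <= m));
    /// ```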
    pub fn max_compute_work_group_count(&self) -> [usize; 3] {
        self.max_compute_work_group_count
    }

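    /// Return the compute queue selected by `queue_index`.
    /// Panics if no compute queues were found during device creation.
    ///
    /// A hedged submission sketch (`cmd` is an illustrative, already recorded
    /// `vk::CommandBuffer`):
    ///
    /// ```ignore
    /// let queue = device.compute_queue();
    /// let command_buffers = [cmd];
    /// let submit = vk::SubmitInfo::default().command_buffers(&command_buffers);
    /// unsafe {
    ///     device.vk_device().queue_submit(queue.vk_queue, &[submit], vk::Fence::null())?;
    /// }
    /// ```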
    pub fn compute_queue(&self) -> &Queue {
        &self.compute_queues[self.queue_index % self.compute_queues.len()]
    }

    fn check_extensions_list(
        instance: &Instance,
        vk_physical_device: vk::PhysicalDevice,
        required_extensions: &[CString],
    ) -> GpuResult<()> {
        let available_extensions = unsafe {
            instance
                .vk_instance()
                .enumerate_device_extension_properties(vk_physical_device)?
        };

        for required_extension in required_extensions {
            let is_extension_available = available_extensions.iter().any(|extension| {
                let extension_name =
                    unsafe { std::ffi::CStr::from_ptr(extension.extension_name.as_ptr()) };
                extension_name == required_extension.as_c_str()
            });

            if !is_extension_available {
                return Err(GpuError::NotSupported(format!(
                    "Extension {:?} is not supported",
                    required_extension
                )));
            }
        }

        Ok(())
    }
}

impl Drop for Device {
    fn drop(&mut self) {
        // Drop the allocator before destroying the device.
        self.gpu_allocator = None;
        unsafe {
            // For now, we don't wait for the device to become idle.
            // `device_wait_idle` has no timeout, so it could hang the application.
            // Moreover, we control all execution through `Context` and catch timeouts there.
            // If a shader contains an infinite loop, we leak this device and leave it running until the OS stops it.
            // self.vk_device.device_wait_idle().unwrap();
            self.vk_device
                .destroy_device(self.cpu_allocation_callbacks());
        }
    }
}