// SPDX-License-Identifier: Apache-2.0

#include "gtest/gtest.h"

#include <chrono>

#include "kompute/Kompute.hpp"
#include "kompute/logger/Logger.hpp"

#include "shaders/Utils.hpp"

TEST(TestAsyncOperations, TestManagerParallelExecution)
{
    // This test is built for the NVIDIA GTX 1650. It assumes:
    // * Queue families 0 and 2 have compute capabilities
    // * The GPU is able to process parallel shader code across different
    //   queue families

    uint32_t size = 10;

    uint32_t numParallel = 2;

    std::string shader(R"(
        #version 450

        layout (local_size_x = 1) in;

        layout(set = 0, binding = 0) buffer b { float pb[]; };

        shared uint sharedTotal[1];

        void main() {
            uint index = gl_GlobalInvocationID.x;

            sharedTotal[0] = 0;

            for (int i = 0; i < 100000000; i++)
            {
                atomicAdd(sharedTotal[0], 1);
            }

            pb[index] = sharedTotal[0];
        }
    )");

    std::vector<uint32_t> spirv = compileSource(shader);

    std::vector<float> data(size, 0.0);
    std::vector<float> resultSync(size, 100000000);
    std::vector<float> resultAsync(size, 100000000);

    kp::Manager mgr;

    std::shared_ptr<kp::Sequence> sq = mgr.sequence();

    std::vector<std::shared_ptr<kp::Tensor>> inputsSyncB;
    std::vector<std::shared_ptr<kp::Algorithm>> algorithms;

    for (uint32_t i = 0; i < numParallel; i++) {
        inputsSyncB.push_back(mgr.tensor(data));
        algorithms.push_back(mgr.algorithm({ inputsSyncB[i] }, spirv));
    }

    // Ensure the input data is copied to the device before timing starts
    sq->eval<kp::OpTensorSyncDevice>(inputsSyncB);
    mgr.sequence()->eval<kp::OpTensorSyncDevice>(inputsSyncB);

    auto startSync = std::chrono::high_resolution_clock::now();

    for (uint32_t i = 0; i < numParallel; i++) {
        sq->eval<kp::OpAlgoDispatch>(algorithms[i]);
    }

    auto endSync = std::chrono::high_resolution_clock::now();
    auto durationSync =
      std::chrono::duration_cast<std::chrono::microseconds>(endSync -
                                                            startSync)
        .count();

    sq->eval<kp::OpTensorSyncLocal>(inputsSyncB);

    for (uint32_t i = 0; i < numParallel; i++) {
        EXPECT_EQ(inputsSyncB[i]->vector<float>(), resultSync);
    }

    kp::Manager mgrAsync(0, { 0, 2 });

    std::vector<std::shared_ptr<kp::Tensor>> inputsAsyncB;
    std::vector<std::shared_ptr<kp::Algorithm>> algosAsync;

    for (uint32_t i = 0; i < numParallel; i++) {
        inputsAsyncB.push_back(mgrAsync.tensor(data));
        algosAsync.push_back(mgrAsync.algorithm({ inputsAsyncB[i] }, spirv));
    }

    std::vector<std::shared_ptr<kp::Sequence>> sqs;

    for (uint32_t i = 0; i < numParallel; i++) {
        sqs.push_back(mgrAsync.sequence(i));
    }

    auto startAsync = std::chrono::high_resolution_clock::now();

    for (uint32_t i = 0; i < numParallel; i++) {
        sqs[i]->evalAsync<kp::OpAlgoDispatch>(algosAsync[i]);
    }

    for (uint32_t i = 0; i < numParallel; i++) {
        sqs[i]->evalAwait();
    }

    auto endAsync = std::chrono::high_resolution_clock::now();
    auto durationAsync =
      std::chrono::duration_cast<std::chrono::microseconds>(endAsync -
                                                            startAsync)
        .count();

    mgrAsync.sequence()->eval<kp::OpTensorSyncLocal>(inputsAsyncB);

    for (uint32_t i = 0; i < numParallel; i++) {
        EXPECT_EQ(inputsAsyncB[i]->vector<float>(), resultAsync);
    }

    // The speedup should be at least 40%
    EXPECT_LT(durationAsync, durationSync * 0.6);
}

TEST(TestAsyncOperations, TestManagerAsyncExecution)
{
    uint32_t size = 10;

    std::string shader(R"(
        #version 450

        layout (local_size_x = 1) in;

        layout(set = 0, binding = 0) buffer b { float pb[]; };

        shared uint sharedTotal[1];

        void main() {
            uint index = gl_GlobalInvocationID.x;

            sharedTotal[0] = 0;

            for (int i = 0; i < 100000000; i++)
            {
                atomicAdd(sharedTotal[0], 1);
            }

            pb[index] = sharedTotal[0];
        }
    )");

    std::vector<uint32_t> spirv = compileSource(shader);

    std::vector<float> data(size, 0.0);
    std::vector<float> resultAsync(size, 100000000);

    kp::Manager mgr;

    std::shared_ptr<kp::TensorT<float>> tensorA = mgr.tensor(data);
    std::shared_ptr<kp::TensorT<float>> tensorB = mgr.tensor(data);

    std::shared_ptr<kp::Sequence> sq1 = mgr.sequence();
    std::shared_ptr<kp::Sequence> sq2 = mgr.sequence();

    sq1->eval<kp::OpTensorSyncDevice>({ tensorA, tensorB });

    std::shared_ptr<kp::Algorithm> algo1 = mgr.algorithm({ tensorA }, spirv);
    std::shared_ptr<kp::Algorithm> algo2 = mgr.algorithm({ tensorB }, spirv);

    // AMD drivers on Windows may report an error on this line due to a
    // timeout. Fixing this requires a change to the Windows registry.
    // More details on this can be found here:
    // https://docs.substance3d.com/spdoc/gpu-drivers-crash-with-long-computations-128745489.html
    // Context on the solution is discussed on GitHub:
    // https://github.com/KomputeProject/kompute/issues/196#issuecomment-808866505
    sq1->evalAsync<kp::OpAlgoDispatch>(algo1);
    sq2->evalAsync<kp::OpAlgoDispatch>(algo2);

    sq1->evalAwait();
    sq2->evalAwait();

    sq1->evalAsync<kp::OpTensorSyncLocal>({ tensorA, tensorB });
    sq1->evalAwait();

    EXPECT_EQ(tensorA->vector(), resultAsync);
    EXPECT_EQ(tensorB->vector(), resultAsync);
}

TEST(TestAsyncOperations, TestManagerAsyncExecutionTimeout)
{
    uint32_t size = 10;

    std::string shader(R"(
        #version 450

        layout (local_size_x = 1) in;

        layout(set = 0, binding = 0) buffer b { float pb[]; };

        shared uint sharedTotal[1];

        void main() {
            uint index = gl_GlobalInvocationID.x;

            sharedTotal[0] = 0;

            for (int i = 0; i < 100000000; i++)
            {
                atomicAdd(sharedTotal[0], 1);
            }

            pb[index] = sharedTotal[0];
        }
    )");

    std::vector<uint32_t> spirv = compileSource(shader);

    std::vector<float> data(size, 0.0);
    std::vector<float> resultAsync(size, 100000000);

    kp::Manager mgr;

    std::shared_ptr<kp::TensorT<float>> tensorA = mgr.tensor(data);
    std::shared_ptr<kp::TensorT<float>> tensorB = mgr.tensor(data);

    std::shared_ptr<kp::Sequence> sq1 = mgr.sequence();
    std::shared_ptr<kp::Sequence> sq2 = mgr.sequence();

    sq1->eval<kp::OpTensorSyncDevice>({ tensorA, tensorB });

    std::shared_ptr<kp::Algorithm> algo1 = mgr.algorithm({ tensorA }, spirv);
    std::shared_ptr<kp::Algorithm> algo2 = mgr.algorithm({ tensorB }, spirv);

    auto startSync = std::chrono::high_resolution_clock::now();

    // AMD drivers on Windows may report an error on this line due to a
    // timeout. Fixing this requires a change to the Windows registry.
    // More details on this can be found here:
    // https://docs.substance3d.com/spdoc/gpu-drivers-crash-with-long-computations-128745489.html
    // Context on the solution is discussed on GitHub:
    // https://github.com/KomputeProject/kompute/issues/196#issuecomment-808866505
    sq1->evalAsync<kp::OpAlgoDispatch>(algo1);
    sq2->evalAsync<kp::OpAlgoDispatch>(algo2);

    sq1->evalAwait(1);
    sq2->evalAwait(1);

    auto endSync = std::chrono::high_resolution_clock::now();
    auto duration =
      std::chrono::duration_cast<std::chrono::microseconds>(endSync -
                                                            startSync)
        .count();

    // Because the waits above time out almost immediately, this should return
    // several orders of magnitude faster than the full computation would take
    // (here well under 100k microseconds).
    EXPECT_LT(duration, 100000);

    sq1->evalAsync<kp::OpTensorSyncLocal>({ tensorA, tensorB });
    sq1->evalAwait();

    EXPECT_EQ(tensorA->vector(), resultAsync);
    EXPECT_EQ(tensorB->vector(), resultAsync);
}
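
// The test below is not part of the original suite; it is a minimal sketch of
// the same asynchronous API exercised above, assuming only the public Kompute
// calls already used in this file (mgr.tensor, mgr.algorithm, mgr.sequence,
// record/evalAsync/evalAwait) plus the compileSource() helper from
// shaders/Utils.hpp. It records the sync/dispatch/sync pipeline once and then
// replays the pre-recorded sequence asynchronously several times. The test
// name and the small increment shader are illustrative additions.
TEST(TestAsyncOperations, TestRecordedSequenceAsyncReplaySketch)
{
    std::string shader(R"(
        #version 450

        layout (local_size_x = 1) in;

        layout(set = 0, binding = 0) buffer b { float pb[]; };

        void main() {
            uint index = gl_GlobalInvocationID.x;
            pb[index] = pb[index] + 1.0;
        }
    )");

    std::vector<uint32_t> spirv = compileSource(shader);

    kp::Manager mgr;

    std::vector<float> initial(3, 0.0);

    std::shared_ptr<kp::TensorT<float>> tensor = mgr.tensor(initial);
    std::shared_ptr<kp::Algorithm> algo = mgr.algorithm({ tensor }, spirv);

    // Record the whole pipeline once: host->device copy, dispatch,
    // device->host copy.
    std::shared_ptr<kp::Sequence> sq =
      mgr.sequence()
        ->record<kp::OpTensorSyncDevice>({ tensor })
        ->record<kp::OpAlgoDispatch>(algo)
        ->record<kp::OpTensorSyncLocal>({ tensor });

    // Replay the recorded sequence asynchronously; each replay adds 1 to
    // every element of the tensor.
    for (uint32_t i = 0; i < 3; i++) {
        sq->evalAsync();
        sq->evalAwait();
    }

    std::vector<float> expected(3, 3.0);

    EXPECT_EQ(tensor->vector(), expected);
}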