From a8a2526128970dbe47bc25c28b8d2bfb52ac4a26 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Thu, 25 Jun 2020 17:12:33 -0300 Subject: [PATCH] gl_arb_decompiler: Use NV_shader_buffer_{load,store} on assembly shaders NV_shader_buffer_{load,store} is a 2010 extension that allows GL applications to use what in Vulkan is known as physical pointers, this is basically C pointers. On GLASM these is exposed through the LOAD/STORE/ATOM instructions. Up until now, assembly shaders were using NV_shader_storage_buffer_object. These work fine, but have a (probably unintended) limitation that forces us to have the limit of a single stage for all shader stages. In contrast, with NV_shader_buffer_{load,store} we can pass GPU addresses to the shader through local parameters (GLASM equivalent uniform constants, or push constants on Vulkan). Local parameters have the advantage of being per stage, allowing us to generate code without worrying about binding overlaps. --- .../renderer_opengl/gl_arb_decompiler.cpp | 84 +++++++++----- .../renderer_opengl/gl_buffer_cache.cpp | 2 +- .../renderer_opengl/gl_rasterizer.cpp | 103 +++++++++++------- .../renderer_opengl/gl_rasterizer.h | 4 +- .../renderer_opengl/gl_shader_manager.cpp | 71 +++++++----- .../renderer_opengl/gl_shader_manager.h | 17 +-- .../renderer_opengl/gl_stream_buffer.cpp | 2 +- 7 files changed, 173 insertions(+), 110 deletions(-) diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp index eb5158407d..4489abf618 100644 --- a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp @@ -185,10 +185,6 @@ std::string TextureType(const MetaTexture& meta) { return type; } -std::string GlobalMemoryName(const GlobalMemoryBase& base) { - return fmt::format("gmem{}_{}", base.cbuf_index, base.cbuf_offset); -} - class ARBDecompiler final { public: explicit ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, @@ -199,6 +195,8 @@ public: } private: + void DefineGlobalMemory(); + void DeclareHeader(); void DeclareVertex(); void DeclareGeometry(); @@ -228,6 +226,7 @@ private: std::pair BuildCoords(Operation); std::string BuildAoffi(Operation); + std::string GlobalMemoryPointer(const GmemNode& gmem); void Exit(); std::string Assign(Operation); @@ -378,10 +377,8 @@ private: std::string address; std::string_view opname; if (const auto gmem = std::get_if(&*operation[0])) { - AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()), - Visit(gmem->GetBaseAddress())); - address = fmt::format("{}[{}]", GlobalMemoryName(gmem->GetDescriptor()), temporary); - opname = "ATOMB"; + address = GlobalMemoryPointer(*gmem); + opname = "ATOM"; } else if (const auto smem = std::get_if(&*operation[0])) { address = fmt::format("shared_mem[{}]", Visit(smem->GetAddress())); opname = "ATOMS"; @@ -456,9 +453,13 @@ private: shader_source += '\n'; } - std::string AllocTemporary() { - max_temporaries = std::max(max_temporaries, num_temporaries + 1); - return fmt::format("T{}.x", num_temporaries++); + std::string AllocLongVectorTemporary() { + max_long_temporaries = std::max(max_long_temporaries, num_long_temporaries + 1); + return fmt::format("L{}", num_long_temporaries++); + } + + std::string AllocLongTemporary() { + return fmt::format("{}.x", AllocLongVectorTemporary()); } std::string AllocVectorTemporary() { @@ -466,8 +467,13 @@ private: return fmt::format("T{}", num_temporaries++); } + std::string AllocTemporary() { + return fmt::format("{}.x", AllocVectorTemporary()); + } + void ResetTemporaries() noexcept { num_temporaries = 0; + num_long_temporaries = 0; } const Device& device; @@ -478,6 +484,11 @@ private: std::size_t num_temporaries = 0; std::size_t max_temporaries = 0; + std::size_t num_long_temporaries = 0; + std::size_t max_long_temporaries = 0; + + std::map global_memory_names; + std::string shader_source; static constexpr std::string_view ADD_F32 = "ADD.F32"; @@ -784,6 +795,8 @@ private: ARBDecompiler::ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, ShaderType stage, std::string_view identifier) : device{device}, ir{ir}, registry{registry}, stage{stage} { + DefineGlobalMemory(); + AddLine("TEMP RC;"); AddLine("TEMP FSWZA[4];"); AddLine("TEMP FSWZB[4];"); @@ -829,12 +842,20 @@ std::string_view HeaderStageName(ShaderType stage) { } } +void ARBDecompiler::DefineGlobalMemory() { + u32 binding = 0; + for (const auto& pair : ir.GetGlobalMemory()) { + const GlobalMemoryBase base = pair.first; + global_memory_names.emplace(base, binding); + ++binding; + } +} + void ARBDecompiler::DeclareHeader() { AddLine("!!NV{}5.0", HeaderStageName(stage)); // Enabling this allows us to cheat on some instructions like TXL with SHADOWARRAY2D AddLine("OPTION NV_internal;"); AddLine("OPTION NV_gpu_program_fp64;"); - AddLine("OPTION NV_shader_storage_buffer;"); AddLine("OPTION NV_shader_thread_group;"); if (ir.UsesWarps() && device.HasWarpIntrinsics()) { AddLine("OPTION NV_shader_thread_shuffle;"); @@ -951,11 +972,10 @@ void ARBDecompiler::DeclareLocalMemory() { } void ARBDecompiler::DeclareGlobalMemory() { - u32 binding = 0; // device.GetBaseBindings(stage).shader_storage_buffer; - for (const auto& pair : ir.GetGlobalMemory()) { - const auto& base = pair.first; - AddLine("STORAGE {}[] = {{ program.storage[{}] }};", GlobalMemoryName(base), binding); - ++binding; + const std::size_t num_entries = ir.GetGlobalMemory().size(); + if (num_entries > 0) { + const std::size_t num_vectors = Common::AlignUp(num_entries, 2) / 2; + AddLine("PARAM c[{}] = {{ program.local[0..{}] }};", num_vectors, num_vectors - 1); } } @@ -977,6 +997,9 @@ void ARBDecompiler::DeclareTemporaries() { for (std::size_t i = 0; i < max_temporaries; ++i) { AddLine("TEMP T{};", i); } + for (std::size_t i = 0; i < max_long_temporaries; ++i) { + AddLine("LONG TEMP L{};", i); + } } void ARBDecompiler::DeclarePredicates() { @@ -1339,10 +1362,7 @@ std::string ARBDecompiler::Visit(const Node& node) { if (const auto gmem = std::get_if(&*node)) { std::string temporary = AllocTemporary(); - AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()), - Visit(gmem->GetBaseAddress())); - AddLine("LDB.U32 {}, {}[{}];", temporary, GlobalMemoryName(gmem->GetDescriptor()), - temporary); + AddLine("LOAD.U32 {}, {};", temporary, GlobalMemoryPointer(*gmem)); return temporary; } @@ -1419,6 +1439,22 @@ std::string ARBDecompiler::BuildAoffi(Operation operation) { return fmt::format(", offset({})", temporary); } +std::string ARBDecompiler::GlobalMemoryPointer(const GmemNode& gmem) { + const u32 binding = global_memory_names.at(gmem.GetDescriptor()); + const char result_swizzle = binding % 2 == 0 ? 'x' : 'y'; + + const std::string pointer = AllocLongVectorTemporary(); + std::string temporary = AllocTemporary(); + + const u32 local_index = binding / 2; + AddLine("PK64.U {}, c[{}];", pointer, local_index); + AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem.GetRealAddress()), + Visit(gmem.GetBaseAddress())); + AddLine("CVT.U64.U32 {}.z, {};", pointer, temporary); + AddLine("ADD.U64 {}.x, {}.{}, {}.z;", pointer, pointer, result_swizzle, pointer); + return fmt::format("{}.x", pointer); +} + void ARBDecompiler::Exit() { if (stage != ShaderType::Fragment) { AddLine("RET;"); @@ -1515,11 +1551,7 @@ std::string ARBDecompiler::Assign(Operation operation) { ResetTemporaries(); return {}; } else if (const auto gmem = std::get_if(&*dest)) { - const std::string temporary = AllocTemporary(); - AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()), - Visit(gmem->GetBaseAddress())); - AddLine("STB.U32 {}, {}[{}];", Visit(src), GlobalMemoryName(gmem->GetDescriptor()), - temporary); + AddLine("STORE.U32 {}, {};", Visit(src), GlobalMemoryPointer(*gmem)); ResetTemporaries(); return {}; } else { diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index e461e4c70a..e866d8f2f4 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -26,7 +26,7 @@ Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size) : VideoCommon::BufferBlock{cpu_addr, size} { gl_buffer.Create(); glNamedBufferData(gl_buffer.handle, static_cast(size), nullptr, GL_DYNAMIC_DRAW); - if (device.HasVertexBufferUnifiedMemory()) { + if (device.UseAssemblyShaders() || device.HasVertexBufferUnifiedMemory()) { glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE); glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address); } diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index c3fad563cc..03e82c5992 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -139,6 +139,18 @@ void oglEnable(GLenum cap, bool state) { (state ? glEnable : glDisable)(cap); } +void UpdateBindlessPointers(GLenum target, GLuint64EXT* pointers, std::size_t num_entries) { + if (num_entries == 0) { + return; + } + if (num_entries % 2 == 1) { + pointers[num_entries] = 0; + } + const GLsizei num_vectors = static_cast((num_entries + 1) / 2); + glProgramLocalParametersI4uivNV(target, 0, num_vectors, + reinterpret_cast(pointers)); +} + } // Anonymous namespace RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window, @@ -324,7 +336,6 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() { void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { MICROPROFILE_SCOPE(OpenGL_Shader); auto& gpu = system.GPU().Maxwell3D(); - std::size_t num_ssbos = 0; u32 clip_distances = 0; for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { @@ -347,29 +358,13 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { } // Currently this stages are not supported in the OpenGL backend. - // Todo(Blinkhawk): Port tesselation shaders from Vulkan to OpenGL - if (program == Maxwell::ShaderProgram::TesselationControl) { - continue; - } else if (program == Maxwell::ShaderProgram::TesselationEval) { + // TODO(Blinkhawk): Port tesselation shaders from Vulkan to OpenGL + if (program == Maxwell::ShaderProgram::TesselationControl || + program == Maxwell::ShaderProgram::TesselationEval) { continue; } - Shader* shader = shader_cache.GetStageProgram(program, async_shaders); - - if (device.UseAssemblyShaders()) { - // Check for ARB limitation. We only have 16 SSBOs per context state. To workaround this - // all stages share the same bindings. - const std::size_t num_stage_ssbos = shader->GetEntries().global_memory_entries.size(); - ASSERT_MSG(num_stage_ssbos == 0 || num_ssbos == 0, "SSBOs on more than one stage"); - num_ssbos += num_stage_ssbos; - } - - // Stage indices are 0 - 5 - const std::size_t stage = index == 0 ? 0 : index - 1; - SetupDrawConstBuffers(stage, shader); - SetupDrawGlobalMemory(stage, shader); - SetupDrawTextures(stage, shader); - SetupDrawImages(stage, shader); + Shader* const shader = shader_cache.GetStageProgram(program, async_shaders); const GLuint program_handle = shader->IsBuilt() ? shader->GetHandle() : 0; switch (program) { @@ -388,6 +383,13 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { shader_config.enable.Value(), shader_config.offset); } + // Stage indices are 0 - 5 + const std::size_t stage = index == 0 ? 0 : index - 1; + SetupDrawConstBuffers(stage, shader); + SetupDrawGlobalMemory(stage, shader); + SetupDrawTextures(stage, shader); + SetupDrawImages(stage, shader); + // Workaround for Intel drivers. // When a clip distance is enabled but not set in the shader it crops parts of the screen // (sometimes it's half the screen, sometimes three quarters). To avoid this, enable the @@ -749,6 +751,8 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { current_cbuf = 0; auto kernel = shader_cache.GetComputeKernel(code_addr); + program_manager.BindCompute(kernel->GetHandle()); + SetupComputeTextures(kernel); SetupComputeImages(kernel); @@ -763,7 +767,6 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { buffer_cache.Unmap(); const auto& launch_desc = system.GPU().KeplerCompute().launch_description; - program_manager.BindCompute(kernel->GetHandle()); glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z); ++num_queued_commands; } @@ -1023,40 +1026,66 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding, } void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader) { + static constexpr std::array TARGET_LUT = { + GL_VERTEX_PROGRAM_NV, GL_TESS_CONTROL_PROGRAM_NV, GL_TESS_EVALUATION_PROGRAM_NV, + GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV, + }; + auto& gpu{system.GPU()}; auto& memory_manager{gpu.MemoryManager()}; - const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]}; + const auto& cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]}; + const auto& entries{shader->GetEntries().global_memory_entries}; - u32 binding = - device.UseAssemblyShaders() ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer; - for (const auto& entry : shader->GetEntries().global_memory_entries) { + std::array pointers; + ASSERT(entries.size() < pointers.size()); + + const bool assembly_shaders = device.UseAssemblyShaders(); + u32 binding = assembly_shaders ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer; + for (const auto& entry : entries) { const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset}; const GPUVAddr gpu_addr{memory_manager.Read(addr)}; const u32 size{memory_manager.Read(addr + 8)}; - SetupGlobalMemory(binding++, entry, gpu_addr, size); + SetupGlobalMemory(binding, entry, gpu_addr, size, &pointers[binding]); + ++binding; + } + if (assembly_shaders) { + UpdateBindlessPointers(TARGET_LUT[stage_index], pointers.data(), entries.size()); } } void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) { auto& gpu{system.GPU()}; auto& memory_manager{gpu.MemoryManager()}; - const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config}; + const auto& cbufs{gpu.KeplerCompute().launch_description.const_buffer_config}; + const auto& entries{kernel->GetEntries().global_memory_entries}; + + std::array pointers; + ASSERT(entries.size() < pointers.size()); u32 binding = 0; - for (const auto& entry : kernel->GetEntries().global_memory_entries) { - const auto addr{cbufs[entry.cbuf_index].Address() + entry.cbuf_offset}; - const auto gpu_addr{memory_manager.Read(addr)}; - const auto size{memory_manager.Read(addr + 8)}; - SetupGlobalMemory(binding++, entry, gpu_addr, size); + for (const auto& entry : entries) { + const GPUVAddr addr{cbufs[entry.cbuf_index].Address() + entry.cbuf_offset}; + const GPUVAddr gpu_addr{memory_manager.Read(addr)}; + const u32 size{memory_manager.Read(addr + 8)}; + SetupGlobalMemory(binding, entry, gpu_addr, size, &pointers[binding]); + ++binding; + } + if (device.UseAssemblyShaders()) { + UpdateBindlessPointers(GL_COMPUTE_PROGRAM_NV, pointers.data(), entries.size()); } } void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, - GPUVAddr gpu_addr, std::size_t size) { - const auto alignment{device.GetShaderStorageBufferAlignment()}; + GPUVAddr gpu_addr, std::size_t size, + GLuint64EXT* pointer) { + const std::size_t alignment{device.GetShaderStorageBufferAlignment()}; const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset, - static_cast(size)); + if (device.UseAssemblyShaders()) { + *pointer = info.address + info.offset; + } else { + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset, + static_cast(size)); + } } void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, Shader* shader) { diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index a956469364..ccc6f50f60 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -124,9 +124,9 @@ private: /// Configures the current global memory entries to use for the kernel invocation. void SetupComputeGlobalMemory(Shader* kernel); - /// Configures a constant buffer. + /// Configures a global memory buffer. void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr, - std::size_t size); + std::size_t size, GLuint64EXT* pointer); /// Configures the current textures to use for the draw command. void SetupDrawTextures(std::size_t stage_index, Shader* shader); diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp index 8e754fa900..691c6c79bc 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.cpp +++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp @@ -11,8 +11,30 @@ namespace OpenGL { -ProgramManager::ProgramManager(const Device& device) { - use_assembly_programs = device.UseAssemblyShaders(); +namespace { + +void BindProgram(GLenum stage, GLuint current, GLuint old, bool& enabled) { + if (current == old) { + return; + } + if (current == 0) { + if (enabled) { + enabled = false; + glDisable(stage); + } + return; + } + if (!enabled) { + enabled = true; + glEnable(stage); + } + glBindProgramARB(stage, current); +} + +} // Anonymous namespace + +ProgramManager::ProgramManager(const Device& device) + : use_assembly_programs{device.UseAssemblyShaders()} { if (use_assembly_programs) { glEnable(GL_COMPUTE_PROGRAM_NV); } else { @@ -33,9 +55,7 @@ void ProgramManager::BindCompute(GLuint program) { } void ProgramManager::BindGraphicsPipeline() { - if (use_assembly_programs) { - UpdateAssemblyPrograms(); - } else { + if (!use_assembly_programs) { UpdateSourcePrograms(); } } @@ -63,32 +83,25 @@ void ProgramManager::RestoreGuestPipeline() { } } -void ProgramManager::UpdateAssemblyPrograms() { - const auto update_state = [](GLenum stage, bool& enabled, GLuint current, GLuint old) { - if (current == old) { - return; - } - if (current == 0) { - if (enabled) { - enabled = false; - glDisable(stage); - } - return; - } - if (!enabled) { - enabled = true; - glEnable(stage); - } - glBindProgramARB(stage, current); - }; +void ProgramManager::UseVertexShader(GLuint program) { + if (use_assembly_programs) { + BindProgram(GL_VERTEX_PROGRAM_NV, program, current_state.vertex, vertex_enabled); + } + current_state.vertex = program; +} - update_state(GL_VERTEX_PROGRAM_NV, vertex_enabled, current_state.vertex, old_state.vertex); - update_state(GL_GEOMETRY_PROGRAM_NV, geometry_enabled, current_state.geometry, - old_state.geometry); - update_state(GL_FRAGMENT_PROGRAM_NV, fragment_enabled, current_state.fragment, - old_state.fragment); +void ProgramManager::UseGeometryShader(GLuint program) { + if (use_assembly_programs) { + BindProgram(GL_GEOMETRY_PROGRAM_NV, program, current_state.vertex, geometry_enabled); + } + current_state.geometry = program; +} - old_state = current_state; +void ProgramManager::UseFragmentShader(GLuint program) { + if (use_assembly_programs) { + BindProgram(GL_FRAGMENT_PROGRAM_NV, program, current_state.vertex, fragment_enabled); + } + current_state.fragment = program; } void ProgramManager::UpdateSourcePrograms() { diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h index 0f03b4f123..950e0dfcbe 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.h +++ b/src/video_core/renderer_opengl/gl_shader_manager.h @@ -45,17 +45,9 @@ public: /// Rewinds BindHostPipeline state changes. void RestoreGuestPipeline(); - void UseVertexShader(GLuint program) { - current_state.vertex = program; - } - - void UseGeometryShader(GLuint program) { - current_state.geometry = program; - } - - void UseFragmentShader(GLuint program) { - current_state.fragment = program; - } + void UseVertexShader(GLuint program); + void UseGeometryShader(GLuint program); + void UseFragmentShader(GLuint program); private: struct PipelineState { @@ -64,9 +56,6 @@ private: GLuint fragment = 0; }; - /// Update NV_gpu_program5 programs. - void UpdateAssemblyPrograms(); - /// Update GLSL programs. void UpdateSourcePrograms(); diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp index 3655ff6292..887995cf4d 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp +++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp @@ -35,7 +35,7 @@ OGLStreamBuffer::OGLStreamBuffer(const Device& device, GLsizeiptr size, bool ver mapped_ptr = static_cast( glMapNamedBufferRange(gl_buffer.handle, 0, buffer_size, flags | GL_MAP_FLUSH_EXPLICIT_BIT)); - if (device.HasVertexBufferUnifiedMemory()) { + if (device.UseAssemblyShaders() || device.HasVertexBufferUnifiedMemory()) { glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_ONLY); glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address); }