forked from suyu/suyu
shader: Split SSY and PBK stack
Hardware testing revealed that SSY and PBK push to different stacks, allowing code like this: SSY label1; PBK label2; SYNC; label1: BRK; label2: EXIT;
This commit is contained in:
parent
cd2d9628c9
commit
fe8e6618f2
4 changed files with 78 additions and 27 deletions
|
@ -143,6 +143,24 @@ u32 GetGenericAttributeIndex(Attribute::Index index) {
|
|||
return static_cast<u32>(index) - static_cast<u32>(Attribute::Index::Attribute_0);
|
||||
}
|
||||
|
||||
/// Returns the identifier prefix used when naming the GLSL variables of a flow stack class.
/// @param stack Flow stack class to name.
/// @return "ssy" for MetaStackClass::Ssy and "pbk" for MetaStackClass::Pbk. For any other value a
///         non-null "invalid" marker is returned, so that callers which forward the result to
///         fmt::format never pass it a null string pointer.
constexpr const char* GetFlowStackPrefix(MetaStackClass stack) {
    switch (stack) {
    case MetaStackClass::Ssy:
        return "ssy";
    case MetaStackClass::Pbk:
        return "pbk";
    }
    // Only reachable with an out-of-range enumerator value. A value-initialized const char* would
    // be nullptr here, which is not a valid "{}" string argument for the name builders.
    return "invalid";
}
|
||||
|
||||
/// Builds the name of the GLSL array variable that stores the entries of a flow stack class.
/// @param stack Flow stack class to name.
/// @return "<prefix>_flow_stack", where <prefix> comes from GetFlowStackPrefix.
std::string FlowStackName(MetaStackClass stack) {
    const char* const prefix = GetFlowStackPrefix(stack);
    return fmt::format("{}_flow_stack", prefix);
}
|
||||
|
||||
/// Builds the name of the GLSL variable that holds the top index of a flow stack class.
/// @param stack Flow stack class to name.
/// @return "<prefix>_flow_stack_top", where <prefix> comes from GetFlowStackPrefix.
std::string FlowStackTopName(MetaStackClass stack) {
    const char* const prefix = GetFlowStackPrefix(stack);
    return fmt::format("{}_flow_stack_top", prefix);
}
|
||||
|
||||
class GLSLDecompiler final {
|
||||
public:
|
||||
explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderStage stage,
|
||||
|
@ -173,8 +191,10 @@ public:
|
|||
// TODO(Subv): Figure out the actual depth of the flow stack, for now it seems
|
||||
// unlikely that shaders will use 20 nested SSYs and PBKs.
|
||||
constexpr u32 FLOW_STACK_SIZE = 20;
|
||||
code.AddLine("uint flow_stack[{}];", FLOW_STACK_SIZE);
|
||||
code.AddLine("uint flow_stack_top = 0u;");
|
||||
for (const auto stack : std::array{MetaStackClass::Ssy, MetaStackClass::Pbk}) {
|
||||
code.AddLine("uint {}[{}];", FlowStackName(stack), FLOW_STACK_SIZE);
|
||||
code.AddLine("uint {} = 0u;", FlowStackTopName(stack));
|
||||
}
|
||||
|
||||
code.AddLine("while (true) {{");
|
||||
++code.scope;
|
||||
|
@ -1438,15 +1458,18 @@ private:
|
|||
}
|
||||
|
||||
// Emits GLSL that pushes a branch target onto a flow stack.
// The stack class is read from the operation's meta (MetaStackClass) and the target from operand 0,
// which is expected to be an ImmediateNode (UNIMPLEMENTED_IF fires otherwise). Returns an empty
// string; the statement is written through code.AddLine.
// NOTE(review): this listing contains two AddLine calls, one on the shared "flow_stack" and one on
// the per-class stack. Presumably the shared-stack line is the one this change replaces; confirm
// against the marked diff.
std::string PushFlowStack(Operation operation) {
|
||||
const auto stack = std::get<MetaStackClass>(operation.GetMeta());
|
||||
const auto target = std::get_if<ImmediateNode>(&*operation[0]);
|
||||
UNIMPLEMENTED_IF(!target);
|
||||
|
||||
code.AddLine("flow_stack[flow_stack_top++] = 0x{:x}u;", target->GetValue());
|
||||
code.AddLine("{}[{}++] = 0x{:x}u;", FlowStackName(stack), FlowStackTopName(stack),
|
||||
target->GetValue());
|
||||
return {};
|
||||
}
|
||||
|
||||
// Emits GLSL that pops the top entry of a flow stack into jmp_to and then breaks.
// The stack class is read from the operation's meta (MetaStackClass). Returns an empty string; the
// statements are written through code.AddLine.
// NOTE(review): this listing contains two assignments to jmp_to, one from the shared "flow_stack"
// and one from the per-class stack. Presumably the shared-stack line is the one this change
// replaces; confirm against the marked diff.
std::string PopFlowStack(Operation operation) {
|
||||
code.AddLine("jmp_to = flow_stack[--flow_stack_top];");
|
||||
const auto stack = std::get<MetaStackClass>(operation.GetMeta());
|
||||
code.AddLine("jmp_to = {}[--{}];", FlowStackName(stack), FlowStackTopName(stack));
|
||||
code.AddLine("break;");
|
||||
return {};
|
||||
}
|
||||
|
|
|
@ -132,20 +132,16 @@ public:
|
|||
branch_labels.push_back(label);
|
||||
}
|
||||
|
||||
// TODO(Rodrigo): Figure out the actual depth of the flow stack, for now it seems unlikely
|
||||
// that shaders will use 20 nested SSYs and PBKs.
|
||||
constexpr u32 FLOW_STACK_SIZE = 20;
|
||||
const Id flow_stack_type = TypeArray(t_uint, Constant(t_uint, FLOW_STACK_SIZE));
|
||||
jmp_to = Emit(OpVariable(TypePointer(spv::StorageClass::Function, t_uint),
|
||||
spv::StorageClass::Function, Constant(t_uint, first_address)));
|
||||
flow_stack = Emit(OpVariable(TypePointer(spv::StorageClass::Function, flow_stack_type),
|
||||
spv::StorageClass::Function, ConstantNull(flow_stack_type)));
|
||||
flow_stack_top =
|
||||
Emit(OpVariable(t_func_uint, spv::StorageClass::Function, Constant(t_uint, 0)));
|
||||
std::tie(ssy_flow_stack, ssy_flow_stack_top) = CreateFlowStack();
|
||||
std::tie(pbk_flow_stack, pbk_flow_stack_top) = CreateFlowStack();
|
||||
|
||||
Name(jmp_to, "jmp_to");
|
||||
Name(flow_stack, "flow_stack");
|
||||
Name(flow_stack_top, "flow_stack_top");
|
||||
Name(ssy_flow_stack, "ssy_flow_stack");
|
||||
Name(ssy_flow_stack_top, "ssy_flow_stack_top");
|
||||
Name(pbk_flow_stack, "pbk_flow_stack");
|
||||
Name(pbk_flow_stack_top, "pbk_flow_stack_top");
|
||||
|
||||
Emit(OpBranch(loop_label));
|
||||
Emit(loop_label);
|
||||
|
@ -952,6 +948,7 @@ private:
|
|||
const auto target = std::get_if<ImmediateNode>(&*operation[0]);
|
||||
ASSERT(target);
|
||||
|
||||
const auto [flow_stack, flow_stack_top] = GetFlowStack(operation);
|
||||
const Id current = Emit(OpLoad(t_uint, flow_stack_top));
|
||||
const Id next = Emit(OpIAdd(t_uint, current, Constant(t_uint, 1)));
|
||||
const Id access = Emit(OpAccessChain(t_func_uint, flow_stack, current));
|
||||
|
@ -962,6 +959,7 @@ private:
|
|||
}
|
||||
|
||||
Id PopFlowStack(Operation operation) {
|
||||
const auto [flow_stack, flow_stack_top] = GetFlowStack(operation);
|
||||
const Id current = Emit(OpLoad(t_uint, flow_stack_top));
|
||||
const Id previous = Emit(OpISub(t_uint, current, Constant(t_uint, 1)));
|
||||
const Id access = Emit(OpAccessChain(t_func_uint, flow_stack, previous));
|
||||
|
@ -1172,6 +1170,31 @@ private:
|
|||
Emit(skip_label);
|
||||
}
|
||||
|
||||
std::tuple<Id, Id> CreateFlowStack() {
|
||||
// TODO(Rodrigo): Figure out the actual depth of the flow stack, for now it seems unlikely
|
||||
// that shaders will use 20 nested SSYs and PBKs.
|
||||
constexpr u32 FLOW_STACK_SIZE = 20;
|
||||
constexpr auto storage_class = spv::StorageClass::Function;
|
||||
|
||||
const Id flow_stack_type = TypeArray(t_uint, Constant(t_uint, FLOW_STACK_SIZE));
|
||||
const Id stack = Emit(OpVariable(TypePointer(storage_class, flow_stack_type), storage_class,
|
||||
ConstantNull(flow_stack_type)));
|
||||
const Id top = Emit(OpVariable(t_func_uint, storage_class, Constant(t_uint, 0)));
|
||||
return std::tie(stack, top);
|
||||
}
|
||||
|
||||
/// Picks the flow stack variables an operation works on.
/// The stack class is taken from the MetaStackClass stored in the operation's meta.
/// @param operation Push/pop operation carrying a MetaStackClass meta.
/// @return {stack array id, top index id} of the SSY or PBK stack.
std::pair<Id, Id> GetFlowStack(Operation operation) {
    const MetaStackClass which = std::get<MetaStackClass>(operation.GetMeta());
    if (which == MetaStackClass::Ssy) {
        return {ssy_flow_stack, ssy_flow_stack_top};
    }
    if (which == MetaStackClass::Pbk) {
        return {pbk_flow_stack, pbk_flow_stack_top};
    }
    // No other stack class exists; an empty pair is returned only to satisfy the signature.
    UNREACHABLE();
    return {};
}
|
||||
|
||||
static constexpr OperationDecompilersArray operation_decompilers = {
|
||||
&SPIRVDecompiler::Assign,
|
||||
|
||||
|
@ -1414,8 +1437,10 @@ private:
|
|||
|
||||
Id execute_function{};
|
||||
Id jmp_to{};
|
||||
Id flow_stack_top{};
|
||||
Id flow_stack{};
|
||||
Id ssy_flow_stack_top{};
|
||||
Id pbk_flow_stack_top{};
|
||||
Id ssy_flow_stack{};
|
||||
Id pbk_flow_stack{};
|
||||
Id continue_label{};
|
||||
std::map<u32, Id> labels;
|
||||
};
|
||||
|
|
|
@ -109,22 +109,20 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
|
|||
UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
|
||||
"Constant buffer flow is not supported");
|
||||
|
||||
// The SSY opcode tells the GPU where to re-converge divergent execution paths, it sets the
|
||||
// target of the jump that the SYNC instruction will make. The SSY opcode has a similar
|
||||
// structure to the BRA opcode.
|
||||
// The SSY opcode tells the GPU where to re-converge divergent execution paths with SYNC.
|
||||
const u32 target = pc + instr.bra.GetBranchTarget();
|
||||
bb.push_back(Operation(OperationCode::PushFlowStack, Immediate(target)));
|
||||
bb.push_back(
|
||||
Operation(OperationCode::PushFlowStack, MetaStackClass::Ssy, Immediate(target)));
|
||||
break;
|
||||
}
|
||||
case OpCode::Id::PBK: {
|
||||
UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
|
||||
"Constant buffer PBK is not supported");
|
||||
|
||||
// PBK pushes to a stack the address where BRK will jump to. This shares stack with SSY but
|
||||
// using SYNC on a PBK address will kill the shader execution. We don't emulate this because
|
||||
// it's very unlikely a driver will emit such invalid shader.
|
||||
// PBK pushes to a stack the address where BRK will jump to.
|
||||
const u32 target = pc + instr.bra.GetBranchTarget();
|
||||
bb.push_back(Operation(OperationCode::PushFlowStack, Immediate(target)));
|
||||
bb.push_back(
|
||||
Operation(OperationCode::PushFlowStack, MetaStackClass::Pbk, Immediate(target)));
|
||||
break;
|
||||
}
|
||||
case OpCode::Id::SYNC: {
|
||||
|
@ -133,7 +131,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
|
|||
static_cast<u32>(cc));
|
||||
|
||||
// The SYNC opcode jumps to the address previously set by the SSY opcode
|
||||
bb.push_back(Operation(OperationCode::PopFlowStack));
|
||||
bb.push_back(Operation(OperationCode::PopFlowStack, MetaStackClass::Ssy));
|
||||
break;
|
||||
}
|
||||
case OpCode::Id::BRK: {
|
||||
|
@ -142,7 +140,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
|
|||
static_cast<u32>(cc));
|
||||
|
||||
// The BRK opcode jumps to the address previously set by the PBK opcode
|
||||
bb.push_back(Operation(OperationCode::PopFlowStack));
|
||||
bb.push_back(Operation(OperationCode::PopFlowStack, MetaStackClass::Pbk));
|
||||
break;
|
||||
}
|
||||
case OpCode::Id::IPA: {
|
||||
|
|
|
@ -174,6 +174,11 @@ enum class InternalFlag {
|
|||
Amount = 4,
|
||||
};
|
||||
|
||||
/// Selects which flow stack a PushFlowStack/PopFlowStack operation targets.
enum class MetaStackClass {
|
||||
Ssy, ///< Stack pushed by SSY and popped by SYNC
|
||||
Pbk, ///< Stack pushed by PBK and popped by BRK
|
||||
};
|
||||
|
||||
class OperationNode;
|
||||
class ConditionalNode;
|
||||
class GprNode;
|
||||
|
@ -285,7 +290,7 @@ struct MetaTexture {
|
|||
};
|
||||
|
||||
/// Parameters that modify an operation but are not part of any particular operand
|
||||
using Meta = std::variant<MetaArithmetic, MetaTexture, Tegra::Shader::HalfType>;
|
||||
using Meta = std::variant<MetaArithmetic, MetaTexture, MetaStackClass, Tegra::Shader::HalfType>;
|
||||
|
||||
/// Holds any kind of operation that can be done in the IR
|
||||
class OperationNode final {
|
||||
|
|
Loading…
Reference in a new issue