Shrink DrawData from 128 to 64 bytes!

- Last row of transform matrix is unused, make it 4x3
  - Requires funny row-major packing due to vec3 std140 padding.
  - Teach spirv parser to tolerate non-square matrix types, though
    they aren't supported anywhere else yet.
- Compute the cofactor for the normal matrix in the shader (ALU is free),
  which optimizes out many terms; remove mat4_cofactor from maf.
- Take out complex UBO alignment logic since the sizes involved are powers of two (PO2) these days.

This was a common bottleneck for some workloads, so there are measurable
performance gains (up to 2x faster pass submission on CPU).  GPU time is
identical, at least on desktop.
This commit is contained in:
bjorn 2023-08-01 18:45:37 -07:00
parent 90994d023a
commit 339e6cf94b
5 changed files with 61 additions and 58 deletions

View File

@ -28,14 +28,13 @@ struct Camera {
};
struct Draw {
mat4 transform;
mat3 normalMatrix;
mat4x3 transform;
vec4 color;
};
layout(set = 0, binding = 0) uniform Globals { vec2 Resolution; float Time; };
layout(set = 0, binding = 1) uniform CameraBuffer { Camera Cameras[6]; };
layout(set = 0, binding = 2) uniform DrawBuffer { Draw Draws[256]; };
layout(set = 0, binding = 2) uniform DrawBuffer { layout(row_major) Draw Draws[256]; };
layout(set = 0, binding = 3) uniform sampler Sampler;
layout(set = 1, binding = 0) uniform MaterialBuffer {
@ -154,8 +153,8 @@ layout(location = 14) in vec3 Tangent;
#ifdef GL_VERTEX_SHADER
#define DrawID gl_BaseInstance
#define Transform Draws[DrawID].transform
#define NormalMatrix Draws[DrawID].normalMatrix
#define Transform mat4(Draws[DrawID].transform)
#define NormalMatrix (cofactor3(Draws[DrawID].transform))
#define PassColor Draws[DrawID].color
#define ClipFromLocal (ViewProjection * Transform)
#define ClipFromWorld (ViewProjection)
@ -167,6 +166,20 @@ layout(location = 14) in vec3 Tangent;
#define WorldFromView (inverse(View))
#define WorldFromClip (inverse(ViewProjection))
#define DefaultPosition (ClipFromLocal * VertexPosition)
// Returns the cofactor matrix of the 3x3 block formed by the first three
// columns of m; the fourth column is never read.  The NormalMatrix macro
// evaluates this on the draw's transform.
//
// Each output column is the cross product of the other two input columns
// (b x c, c x a, a x b), written out term by term.
mat3 cofactor3(mat4x3 m) {
  vec3 a = m[0];
  vec3 b = m[1];
  vec3 c = m[2];

  vec3 col0 = vec3(
    (b.y * c.z - c.y * b.z),
    -(b.x * c.z - c.x * b.z),
    (b.x * c.y - c.x * b.y));

  vec3 col1 = vec3(
    -(a.y * c.z - c.y * a.z),
    (a.x * c.z - c.x * a.z),
    -(a.x * c.y - c.x * a.y));

  vec3 col2 = vec3(
    (a.y * b.z - b.y * a.z),
    -(a.x * b.z - b.x * a.z),
    (a.x * b.y - b.x * a.y));

  return mat3(col0, col1, col2);
}
#endif
#ifdef GL_FRAGMENT_SHADER

View File

@ -616,30 +616,6 @@ MAF mat4 mat4_invert(mat4 m) {
return m;
}
// Overwrites the 16 floats of m with the cofactor matrix of m and returns m.
// Every output entry is the signed 3x3 minor obtained by deleting that entry's
// row and column, expanded along its first row.
MAF mat4 mat4_cofactor(mat4 m) {
  // Snapshot the input first, because m is overwritten entry by entry below.
  float a[16];
  for (int i = 0; i < 16; i++) {
    a[i] = m[i];
  }

  // Shared 2x2 determinants.  The rXYcZW names read a[col * 4 + row] as
  // (row, col); that labelling is only a naming aid for this function.
  float r23c23 = a[10] * a[15] - a[11] * a[14];
  float r23c13 = a[6] * a[15] - a[7] * a[14];
  float r23c12 = a[6] * a[11] - a[7] * a[10];
  float r23c03 = a[2] * a[15] - a[3] * a[14];
  float r23c02 = a[2] * a[11] - a[3] * a[10];
  float r23c01 = a[2] * a[7] - a[3] * a[6];

  float r13c23 = a[9] * a[15] - a[11] * a[13];
  float r13c13 = a[5] * a[15] - a[7] * a[13];
  float r13c12 = a[5] * a[11] - a[7] * a[9];
  float r13c03 = a[1] * a[15] - a[3] * a[13];
  float r13c02 = a[1] * a[11] - a[3] * a[9];
  float r13c01 = a[1] * a[7] - a[3] * a[5];

  float r12c23 = a[9] * a[14] - a[10] * a[13];
  float r12c13 = a[5] * a[14] - a[6] * a[13];
  float r12c12 = a[5] * a[10] - a[6] * a[9];
  float r12c03 = a[1] * a[14] - a[2] * a[13];
  float r12c02 = a[1] * a[10] - a[2] * a[9];
  float r12c01 = a[1] * a[6] - a[2] * a[5];

  // Signs follow the usual checkerboard pattern.
  m[0] = (a[5] * r23c23 - a[9] * r23c13 + a[13] * r23c12);
  m[1] = -(a[4] * r23c23 - a[8] * r23c13 + a[12] * r23c12);
  m[2] = (a[4] * r13c23 - a[8] * r13c13 + a[12] * r13c12);
  m[3] = -(a[4] * r12c23 - a[8] * r12c13 + a[12] * r12c12);

  m[4] = -(a[1] * r23c23 - a[9] * r23c03 + a[13] * r23c02);
  m[5] = (a[0] * r23c23 - a[8] * r23c03 + a[12] * r23c02);
  m[6] = -(a[0] * r13c23 - a[8] * r13c03 + a[12] * r13c02);
  m[7] = (a[0] * r12c23 - a[8] * r12c03 + a[12] * r12c02);

  m[8] = (a[1] * r23c13 - a[5] * r23c03 + a[13] * r23c01);
  m[9] = -(a[0] * r23c13 - a[4] * r23c03 + a[12] * r23c01);
  m[10] = (a[0] * r13c13 - a[4] * r13c03 + a[12] * r13c01);
  m[11] = -(a[0] * r12c13 - a[4] * r12c03 + a[12] * r12c01);

  m[12] = -(a[1] * r23c12 - a[5] * r23c02 + a[9] * r23c01);
  m[13] = (a[0] * r23c12 - a[4] * r23c02 + a[8] * r23c01);
  m[14] = -(a[0] * r13c12 - a[4] * r13c02 + a[8] * r13c01);
  m[15] = (a[0] * r12c12 - a[4] * r12c02 + a[8] * r12c01);

  return m;
}
// Calculate matrix equivalent to "apply n, then m"
MAF mat4 mat4_mul(mat4 m, mat4 n) {
float m00 = m[0], m01 = m[1], m02 = m[2], m03 = m[3],

View File

@ -610,8 +610,8 @@ static spv_result spv_parse_field(spv_context* spv, const uint32_t* word, spv_fi
}
if (OP_CODE(word) == 22 && word[2] == 32) { // OpTypeFloat
if (columnCount >= 2 && columnCount <= 4 && componentCount == columnCount) {
field->type = SPV_MAT2 + columnCount - 2;
if (columnCount >= 2 && columnCount <= 4 && componentCount >= 2 && componentCount <= 4) {
field->type = SPV_MAT2x2 + (columnCount - 2) * 3 + (componentCount - 2);
} else if (columnCount == 1 && componentCount >= 2 && componentCount <= 4) {
field->type = SPV_F32x2 + componentCount - 2;
} else if (columnCount == 1 && componentCount == 1) {

View File

@ -17,9 +17,15 @@ typedef enum {
SPV_F32x2,
SPV_F32x3,
SPV_F32x4,
SPV_MAT2,
SPV_MAT3,
SPV_MAT4,
SPV_MAT2x2,
SPV_MAT2x3,
SPV_MAT2x4,
SPV_MAT3x2,
SPV_MAT3x3,
SPV_MAT3x4,
SPV_MAT4x2,
SPV_MAT4x3,
SPV_MAT4x4,
SPV_STRUCT
} spv_type;

View File

@ -194,8 +194,7 @@ struct Mesh {
};
typedef struct {
float transform[16];
float cofactor[12];
float transform[12];
float color[4];
} DrawData;
@ -1277,27 +1276,29 @@ static void recordRenderPass(Pass* pass, gpu_stream* stream) {
// DrawData
uint32_t drawPageCount = (activeDrawCount + 255) / 256;
uint32_t drawPageSize = (uint32_t) ALIGN(256 * sizeof(DrawData), align);
mapped = mapBuffer(&state.streamBuffers, drawPageCount * drawPageSize, align);
builtins[2].buffer = (gpu_buffer_binding) { mapped.buffer, mapped.offset, drawPageSize };
mapped = mapBuffer(&state.streamBuffers, activeDrawCount * sizeof(DrawData), align);
builtins[2].buffer = (gpu_buffer_binding) { mapped.buffer, mapped.offset, 256 * sizeof(DrawData) };
DrawData* data = mapped.pointer;
for (uint32_t i = 0; i < activeDrawCount; i++, data++) {
Draw* draw = &pass->draws[activeDraws[i] >> 8][activeDraws[i] & 0xff];
if ((i & 0xff) == 0) {
data = (DrawData*) ALIGN(data, align);
}
float cofactor[16];
mat4_init(cofactor, draw->transform);
memcpy(cofactor + 12, (float[4]) { 0.f, 0.f, 0.f, 1.f }, 4 * sizeof(float));
mat4_cofactor(cofactor);
memcpy(data->transform, draw->transform, 16 * sizeof(float));
memcpy(data->cofactor, cofactor, 12 * sizeof(float));
memcpy(data->color, draw->color, 4 * sizeof(float));
// transform is provided as 4x3 row-major matrix for packing reasons, need to transpose
data->transform[0] = draw->transform[0];
data->transform[1] = draw->transform[4];
data->transform[2] = draw->transform[8];
data->transform[3] = draw->transform[12];
data->transform[4] = draw->transform[1];
data->transform[5] = draw->transform[5];
data->transform[6] = draw->transform[9];
data->transform[7] = draw->transform[13];
data->transform[8] = draw->transform[2];
data->transform[9] = draw->transform[6];
data->transform[10] = draw->transform[10];
data->transform[11] = draw->transform[14];
data->color[0] = draw->color[0];
data->color[1] = draw->color[1];
data->color[2] = draw->color[2];
data->color[3] = draw->color[3];
}
gpu_bundle_write(&builtinBundle, &builtinfo, 1);
@ -1426,7 +1427,7 @@ static void recordRenderPass(Pass* pass, gpu_stream* stream) {
}
if ((i & 0xff) == 0 || draw->camera != cameraIndex || constantsDirty) {
uint32_t dynamicOffsets[] = { draw->camera * canvas->views * sizeof(Camera), (i >> 8) * drawPageSize };
uint32_t dynamicOffsets[] = { draw->camera * canvas->views * sizeof(Camera), (i >> 8) * 256 * sizeof(DrawData) };
gpu_bind_bundles(stream, draw->shader->gpu, &builtinBundle, 0, 1, dynamicOffsets, COUNTOF(dynamicOffsets));
cameraIndex = draw->camera;
}
@ -2792,9 +2793,16 @@ Shader* lovrShaderCreate(const ShaderInfo* info) {
[SPV_F32x2] = TYPE_F32x2,
[SPV_F32x3] = TYPE_F32x3,
[SPV_F32x4] = TYPE_F32x4,
[SPV_MAT2] = TYPE_MAT2,
[SPV_MAT3] = TYPE_MAT3,
[SPV_MAT4] = TYPE_MAT4
[SPV_MAT2x2] = TYPE_MAT2,
[SPV_MAT2x3] = ~0u,
[SPV_MAT2x4] = ~0u,
[SPV_MAT3x2] = ~0u,
[SPV_MAT3x3] = TYPE_MAT3,
[SPV_MAT3x4] = ~0u,
[SPV_MAT4x2] = ~0u,
[SPV_MAT4x3] = ~0u,
[SPV_MAT4x4] = TYPE_MAT4,
[SPV_STRUCT] = ~0u
};
spv_field* field = &spv[s].fields[i];
@ -2802,7 +2810,7 @@ Shader* lovrShaderCreate(const ShaderInfo* info) {
uint32_t base = s == 1 ? spv[0].fieldCount : 0;
shader->fields[base + i] = (DataField) {
.type = field->type == SPV_STRUCT ? ~0u : dataTypes[field->type],
.type = dataTypes[field->type],
.offset = field->offset,
.length = field->arrayLength,
.stride = field->arrayStride,