Implement buddy allocator to be able to use a single buffer for all geometry on GPU

This will allow me to draw everything in a single CPU draw call
This commit is contained in:
sergeypdev 2024-07-20 17:50:25 +04:00
parent 23f5c6836a
commit 2c385c7656
5 changed files with 399 additions and 89 deletions

View File

@ -247,7 +247,7 @@ fn buildAssetCompiler(b: *Build, optimize: std.builtin.OptimizeMode, assets_mod:
assetc.linkLibC();
assetc.linkLibCpp();
assetc.addCSourceFile(.{ .file = .{ .src_path = .{ .owner = b, .sub_path = "libs/stb/stb_image.c" } }, .flags = &.{"-std=c99"} });
assetc.addCSourceFile(.{ .file = b.path("libs/stb/stb_image.c"), .flags = &.{"-std=c99"} });
assetc.addIncludePath(.{ .src_path = .{ .owner = b, .sub_path = "libs/stb" } });
return assetc;

View File

@ -21,7 +21,7 @@ const formats = @import("formats.zig");
const asset_manifest = @import("asset_manifest");
const assets = @import("assets");
const checkGLError = @import("Render.zig").checkGLError;
// const basisu = @import("mach-basisu");
const BuddyAllocator = @import("BuddyAllocator.zig");
const Vec2 = @import("zalgebra").Vec2;
const Vec3 = @import("zalgebra").Vec3;
@ -50,6 +50,8 @@ dependencies: std.AutoHashMapUnmanaged(AssetId, std.SegmentedList(AssetId, 4)) =
dependees: std.AutoHashMapUnmanaged(AssetId, std.SegmentedList(AssetId, 4)) = .{},
loaded_assets: std.AutoHashMapUnmanaged(AssetId, LoadedAsset) = .{},
vertex_heap: VertexBufferHeap,
pub fn init(allocator: std.mem.Allocator, frame_arena: std.mem.Allocator) AssetManager {
// basisu.init_transcoder();
@ -61,6 +63,7 @@ pub fn init(allocator: std.mem.Allocator, frame_arena: std.mem.Allocator) AssetM
.allocator = allocator,
.frame_arena = frame_arena,
.exe_dir = exe_dir,
.vertex_heap = VertexBufferHeap.init(allocator) catch @panic("OOM"),
};
}
@ -259,6 +262,7 @@ const NullShaderProgram = LoadedShaderProgram{
const NullMesh = LoadedMesh{
.aabb = .{},
.heap_handle = .{},
.positions = BufferSlice{
.buffer = 0,
.offset = 0,
@ -284,6 +288,7 @@ const NullMesh = LoadedMesh{
.offset = 0,
.count = 0,
.type = gl.UNSIGNED_SHORT,
.base_vertex = 0,
},
.material = .{},
};
@ -313,83 +318,56 @@ fn loadMeshErr(self: *AssetManager, id: AssetId) !LoadedMesh {
defer self.frame_arena.free(data.bytes);
const mesh = formats.Mesh.fromBuffer(data.bytes);
var bufs = [_]gl.GLuint{ 0, 0, 0, 0, 0 };
gl.createBuffers(bufs.len, &bufs);
errdefer gl.deleteBuffers(bufs.len, &bufs);
const vertices_len = mesh.vertices.len;
const allocation = try self.vertex_heap.alloc(vertices_len, mesh.indices.len);
const vertices = bufs[0];
std.debug.assert(vertices != 0);
const normals = bufs[1];
std.debug.assert(normals != 0);
const tangents = bufs[2];
std.debug.assert(tangents != 0);
const uvs = bufs[3];
std.debug.assert(uvs != 0);
const indices = bufs[4];
std.debug.assert(indices != 0);
const vertex_offset = allocation.vertex.offset;
gl.namedBufferStorage(
vertices,
@intCast(mesh.vertices.len * @sizeOf(formats.Vector3)),
@ptrCast(mesh.vertices.ptr),
0,
);
gl.namedBufferStorage(
normals,
@intCast(mesh.normals.len * @sizeOf(formats.Vector3)),
@ptrCast(mesh.normals.ptr),
0,
);
gl.namedBufferStorage(
tangents,
@intCast(mesh.tangents.len * @sizeOf(formats.Vector3)),
@ptrCast(mesh.tangents.ptr),
0,
);
gl.namedBufferStorage(
uvs,
@intCast(mesh.uvs.len * @sizeOf(formats.Vector2)),
@ptrCast(mesh.uvs.ptr),
0,
);
gl.namedBufferStorage(
indices,
@intCast(mesh.indices.len * @sizeOf(formats.Index)),
@ptrCast(mesh.indices.ptr),
0,
);
gl.namedBufferSubData(self.vertex_heap.vertices, @intCast(vertex_offset * @sizeOf(formats.Vector3)), @intCast(vertices_len * @sizeOf(formats.Vector3)), @ptrCast(mesh.vertices.ptr));
checkGLError();
gl.namedBufferSubData(self.vertex_heap.normals, @intCast(vertex_offset * @sizeOf(formats.Vector3)), @intCast(vertices_len * @sizeOf(formats.Vector3)), @ptrCast(mesh.normals.ptr));
checkGLError();
gl.namedBufferSubData(self.vertex_heap.tangents, @intCast(vertex_offset * @sizeOf(formats.Vector3)), @intCast(vertices_len * @sizeOf(formats.Vector3)), @ptrCast(mesh.tangents.ptr));
checkGLError();
gl.namedBufferSubData(self.vertex_heap.uvs, @intCast(vertex_offset * @sizeOf(formats.Vector2)), @intCast(vertices_len * @sizeOf(formats.Vector2)), @ptrCast(mesh.uvs.ptr));
checkGLError();
const index_offset = allocation.index.offset;
gl.namedBufferSubData(self.vertex_heap.indices, @intCast(index_offset * @sizeOf(formats.Index)), @intCast(mesh.indices.len * @sizeOf(formats.Index)), @ptrCast(mesh.indices.ptr));
const loaded_mesh = LoadedMesh{
.aabb = .{
.min = Vec3.new(mesh.aabb.min.x, mesh.aabb.min.y, mesh.aabb.min.z),
.max = Vec3.new(mesh.aabb.max.x, mesh.aabb.max.y, mesh.aabb.max.z),
},
.heap_handle = allocation,
.material = mesh.material,
.positions = .{
.buffer = vertices,
.offset = 0,
.buffer = self.vertex_heap.vertices,
.offset = @intCast(vertex_offset * @sizeOf(formats.Vector3)),
.stride = @sizeOf(formats.Vector3),
},
.normals = .{
.buffer = normals,
.offset = 0,
.buffer = self.vertex_heap.normals,
.offset = @intCast(vertex_offset * @sizeOf(formats.Vector3)),
.stride = @sizeOf(formats.Vector3),
},
.tangents = .{
.buffer = tangents,
.offset = 0,
.buffer = self.vertex_heap.tangents,
.offset = @intCast(vertex_offset * @sizeOf(formats.Vector3)),
.stride = @sizeOf(formats.Vector3),
},
.uvs = .{
.buffer = uvs,
.offset = 0,
.buffer = self.vertex_heap.uvs,
.offset = @intCast(vertex_offset * @sizeOf(formats.Vector2)),
.stride = @sizeOf(formats.Vector2),
},
.indices = .{
.buffer = indices,
.offset = 0,
.buffer = self.vertex_heap.indices,
.offset = @intCast(index_offset * @sizeOf(formats.Index)),
.count = @intCast(mesh.indices.len),
.type = gl.UNSIGNED_INT,
.base_vertex = @intCast(vertex_offset),
},
};
@ -550,6 +528,7 @@ const LoadedShaderProgram = struct {
const LoadedMesh = struct {
aabb: AABB,
heap_handle: VertexBufferHeap.Alloc,
positions: BufferSlice,
normals: BufferSlice,
tangents: BufferSlice,
@ -581,7 +560,7 @@ pub const BufferSlice = struct {
stride: gl.GLsizei,
pub fn bind(self: *const BufferSlice, index: gl.GLuint) void {
gl.bindVertexBuffer(index, self.buffer, self.offset, self.stride);
gl.bindVertexBuffer(index, self.buffer, 0, self.stride);
}
};
@ -590,6 +569,7 @@ pub const IndexSlice = struct {
offset: gl.GLuint,
count: gl.GLsizei,
type: gl.GLenum,
base_vertex: gl.GLint,
pub fn bind(self: *const IndexSlice) void {
gl.bindBuffer(gl.ELEMENT_ARRAY_BUFFER, self.buffer);
@ -717,7 +697,7 @@ fn unloadAssetWithDependees(self: *AssetManager, id: AssetId) void {
switch (asset.*) {
.mesh => |*mesh| {
gl.deleteBuffers(5, &[_]gl.GLuint{ mesh.positions.buffer, mesh.normals.buffer, mesh.tangents.buffer, mesh.uvs.buffer, mesh.indices.buffer });
self.vertex_heap.free(mesh.heap_handle);
},
.shader => |*shader| {
self.allocator.free(shader.source);
@ -740,3 +720,114 @@ fn unloadAssetWithDependees(self: *AssetManager, id: AssetId) void {
_ = self.dependees.remove(id);
_ = self.dependencies.remove(id);
}
/// Shared GPU storage for all mesh geometry.
///
/// Owns five GL buffer objects (positions, normals, tangents, uvs and indices)
/// that are created once at a fixed size. Ranges inside them are handed out by
/// two buddy allocators: one tracks vertex ranges (shared by the four
/// per-vertex attribute buffers), the other tracks index ranges.
/// Offsets returned by the allocators are element counts, not bytes.
const VertexBufferHeap = struct {
    const Self = @This();

    /// Handle for the storage of one mesh: a vertex range plus an index range.
    pub const Alloc = struct {
        vertex: BuddyAllocator.Alloc = .{},
        index: BuddyAllocator.Alloc = .{},
    };

    /// Sub-allocator for the per-vertex attribute buffers.
    vertex_buddy: BuddyAllocator,
    /// Sub-allocator for the index buffer.
    index_buddy: BuddyAllocator,
    /// GL buffer names.
    vertices: gl.GLuint,
    normals: gl.GLuint,
    tangents: gl.GLuint,
    uvs: gl.GLuint,
    indices: gl.GLuint,

    /// Creates both buddy allocators and the five GL buffers.
    ///
    /// Capacity is `4096 << 12` = 16,777,216 elements for vertices and the same
    /// for indices. Per the original author's estimate (11 four-byte components
    /// per vertex) the vertex attributes take about 704 MiB, indices excluded.
    ///
    /// Fails if a buddy allocator cannot be created or if GL hands back a zero
    /// buffer name (`error.BufferAllocationFailed`).
    pub fn init(allocator: std.mem.Allocator) !Self {
        var vertex_buddy = try BuddyAllocator.init(allocator, 4096, 12);
        errdefer vertex_buddy.deinit();
        var index_buddy = try BuddyAllocator.init(allocator, 4096, 12);
        errdefer index_buddy.deinit();

        const vertex_capacity = vertex_buddy.getSize();
        const index_capacity = index_buddy.getSize();

        // Order: vertices, normals, tangents, uvs, indices.
        var names = [_]gl.GLuint{ 0, 0, 0, 0, 0 };
        gl.createBuffers(names.len, &names);
        errdefer gl.deleteBuffers(names.len, &names);

        for (names) |name| {
            if (name == 0) return error.BufferAllocationFailed;
        }

        // Byte size of every buffer, in the same order as `names`.
        const byte_sizes = [_]usize{
            vertex_capacity * @sizeOf(formats.Vector3),
            vertex_capacity * @sizeOf(formats.Vector3),
            vertex_capacity * @sizeOf(formats.Vector3),
            vertex_capacity * @sizeOf(formats.Vector2),
            index_capacity * @sizeOf(formats.Index),
        };

        // Storage is allocated without initial data; DYNAMIC_STORAGE_BIT is what
        // permits filling it later through namedBufferSubData.
        for (names, byte_sizes) |name, byte_size| {
            gl.namedBufferStorage(name, @intCast(byte_size), null, gl.DYNAMIC_STORAGE_BIT);
        }

        return .{
            .vertex_buddy = vertex_buddy,
            .index_buddy = index_buddy,
            .vertices = names[0],
            .normals = names[1],
            .tangents = names[2],
            .uvs = names[3],
            .indices = names[4],
        };
    }

    /// Releases the allocator bookkeeping and deletes the five GL buffers.
    pub fn deinit(self: *Self) void {
        self.index_buddy.deinit();
        self.vertex_buddy.deinit();

        const names = [_]gl.GLuint{
            self.vertices,
            self.normals,
            self.tangents,
            self.uvs,
            self.indices,
        };
        gl.deleteBuffers(names.len, &names);
    }

    /// Reserves room for `vertex_len` vertices and `index_len` indices.
    ///
    /// If the index range cannot be reserved, the vertex range obtained just
    /// before is returned to its allocator and the error is propagated.
    pub fn alloc(self: *Self, vertex_len: usize, index_len: usize) !Alloc {
        const vertex_range = try self.vertex_buddy.alloc(vertex_len);
        errdefer self.vertex_buddy.free(vertex_range);

        const index_range = try self.index_buddy.alloc(index_len);

        return Alloc{ .vertex = vertex_range, .index = index_range };
    }

    /// Returns both ranges of `allocation` to their allocators.
    pub fn free(self: *Self, allocation: Alloc) void {
        self.vertex_buddy.free(allocation.vertex);
        self.index_buddy.free(allocation.index);
    }
};

213
src/BuddyAllocator.zig Normal file
View File

@ -0,0 +1,213 @@
const std = @import("std");
pub const BuddyAllocator = @This();
/// Handle describing one allocated block: produced by `alloc`, consumed by `free`.
pub const Alloc = struct {
/// Start of the block, measured from the beginning of the managed range,
/// in the same units as `min_block_size`.
offset: u32 = 0,
/// Depth of the block in the binary tree (0 is the root, i.e. the whole range).
/// Together with `offset` it identifies the tree node when freeing.
depth: u6 = 0,
};
/// State of a single node (block) of the binary tree.
const BlockState = enum(u2) {
/// This block is available; it is the initial state of every node
Free,
/// This block is actually allocated
Allocated,
/// When small block is allocated out of a large block, the large block and all its parents are split
Split,
};
// NOTE(review): BlockStateArray is not referenced anywhere in the visible code;
// presumably it was meant for a packed 2-bit state storage - confirm or remove.
const BlockStateArray = std.PackedIntSlice(u2);
/// Allocator that owns `states`; used again in `deinit` to release it.
allocator: std.mem.Allocator,
/// Depth of the leaf level of the tree (the root is at depth 0).
depth: u6,
/// Size of a leaf block; blocks one level up are twice as large, and so on.
min_block_size: usize,
/// Node states of the implicit binary tree: the root is at index 0 and the
/// children of node `n` are at `2n + 1` and `2n + 2`.
states: []BlockState,
/// Creates a buddy allocator managing a range of `min_block_size << depth` units.
///
/// min_block_size - is the size of leaf blocks at the bottom of the binary tree,
///                  must be a non-zero power of two
/// depth - depth of the binary tree (root is at depth 0, leaves are at `depth`)
///
/// Largest block in the buffer will be: 2^(log2(min_block_size) + depth)
/// or more simply put (min_block_size << depth)
///
/// e.g.
/// min_block_size = 64
/// depth = 2
///               256
///              /   \
///             /     \
///            /       \
///         128         128
///        /   \       /   \
///       /     \     /     \
///     64      64   64      64
///
/// Only the bookkeeping (one state per tree node) is allocated here, the managed
/// range itself lives elsewhere. Returns an error if that allocation fails.
/// Release with `deinit`.
pub fn init(allocator: std.mem.Allocator, min_block_size: usize, depth: u6) !BuddyAllocator {
    std.debug.assert(min_block_size > 0 and (min_block_size & (min_block_size - 1)) == 0);

    // One state per tree node. A complete binary tree with levels 0..depth has
    // 2^(depth + 1) - 1 nodes, no matter how large the leaf blocks are, so the
    // array must not be sized by the number of managed units.
    const node_count = (@as(usize, 2) << depth) - 1;

    const states = try allocator.alloc(BlockState, node_count);
    @memset(states, BlockState.Free);

    return .{
        .allocator = allocator,
        .depth = depth,
        .min_block_size = min_block_size,
        .states = states,
    };
}
/// Releases the node-state array using the allocator given to `init`.
/// The allocator must not be used afterwards.
pub fn deinit(self: *BuddyAllocator) void {
self.allocator.free(self.states);
}
/// Reserves the best fitting free block that can hold `size` units.
///
/// Returns a handle carrying the block's offset inside the managed range and
/// its depth in the tree; pass it to `free` to release the block.
/// Returns `error.OutOfMemory` when no free block is large enough.
pub fn alloc(self: *BuddyAllocator, size: usize) !Alloc {
    var search = SearchContext{};
    self.findBlock(size, 0, false, &search);
    if (!search.found) return error.OutOfMemory;

    const node = search.node;
    self.states[node] = BlockState.Allocated;

    // Every ancestor now contains an allocation, so the whole chain up to the
    // root is marked as split.
    var cursor = node;
    while (cursor != 0) {
        cursor = (cursor - 1) / 2;
        self.states[cursor] = BlockState.Split;
    }

    const depth = getNodeDepth(node);
    // Nodes of one depth are stored contiguously; this is the index of the
    // first of them, so the difference is the block's position on its level.
    const first_node_at_depth = (@as(usize, 1) << depth) - 1;
    const index_in_level = node - first_node_at_depth;

    return .{
        .offset = @intCast(index_in_level * self.getDepthSize(depth)),
        .depth = depth,
    };
}
/// Releases a block previously returned by `alloc` and merges it with its
/// buddy (and then their parent with its buddy, and so on) for as long as
/// both halves are free.
///
/// `allocation` must be a live handle from this allocator: in safe builds a
/// misaligned offset or a node that is not currently allocated (e.g. a
/// double free) trips an assertion.
pub fn free(self: *BuddyAllocator, allocation: Alloc) void {
    const block_size = self.getDepthSize(allocation.depth);
    const offset: usize = allocation.offset;
    std.debug.assert(offset % block_size == 0);

    // Nodes of one depth are stored contiguously starting at 2^depth - 1.
    // For depth 0 this resolves to the root (node 0).
    const first_node_at_depth = (@as(usize, 1) << allocation.depth) - 1;
    var node = first_node_at_depth + offset / block_size;

    std.debug.assert(self.states[node] == BlockState.Allocated);
    self.states[node] = BlockState.Free;

    // Merge
    while (node != 0) {
        const parent = (node - 1) / 2;
        const left = parent * 2 + 1;
        const right = left + 1;
        // Once a parent still has a non-free child it stays split, and so does
        // every ancestor above it - there is nothing left to merge.
        if (self.states[left] != BlockState.Free or self.states[right] != BlockState.Free) break;
        self.states[parent] = BlockState.Free;
        node = parent;
    }
}
/// Best candidate found so far while `findBlock` walks the tree.
const SearchContext = struct {
/// Ranking of the current candidate; a lower key is a better candidate.
sort_key: usize = std.math.maxInt(usize),
/// Tree index of the current candidate; only meaningful when `found` is true.
node: usize = 0,
/// Whether any suitable free block has been seen yet.
found: bool = false,
};
/// Depth-first search for the best free block that can hold `size`,
/// starting at `node`. The result is accumulated in `ctx`.
///
/// A candidate is ranked by its block size, doubled unless its parent is a
/// split block (`under_split`). So the smallest fitting block wins, and among
/// equally sized blocks one carved out of an already split parent is preferred,
/// which leaves as many large blocks intact as possible. Ties keep the
/// candidate that was visited first (left child before right child).
///
/// e.g.
/// min_block_size = 64
/// depth = 2
///               256
///              /   \
///             /     \
///            /       \
///         128         128
///        /   \       /   \
///       /     \     /     \
///     64      64   64      64
fn findBlock(self: *BuddyAllocator, size: usize, node: usize, under_split: bool, ctx: *SearchContext) void {
    const depth = getNodeDepth(node);
    // Size of the block on the current level
    const block_size = self.getDepthSize(depth);

    // Too small already - everything below is smaller still.
    if (size > block_size) return;

    const state: BlockState = self.states[node];
    // An allocated block is taken as a whole, nothing beneath it is available.
    if (state == BlockState.Allocated) return;

    if (state == BlockState.Free) {
        const sort_key = if (under_split) block_size else block_size << 1;
        if (!ctx.found or sort_key < ctx.sort_key) {
            ctx.* = .{ .sort_key = sort_key, .node = node, .found = true };
        }
    }

    // Leaves have no children to descend into.
    if (depth >= self.depth) return;

    // The root never passes the preference on to its children, even when split.
    const children_under_split = state == BlockState.Split and node != 0;
    findBlock(self, size, node * 2 + 1, children_under_split, ctx);
    findBlock(self, size, node * 2 + 2, children_under_split, ctx);
}
/// Returns the depth of the node stored at index `node` (the root, index 0, is at depth 0).
fn getNodeDepth(node: usize) u6 {
    // Depth d occupies indices 2^d - 1 ..= 2^(d + 1) - 2, so shifting the index
    // by one makes the depth equal to floor(log2).
    const one_based = node + 1;
    return std.math.log2_int(usize, one_based);
}
/// Returns the size of a block at `depth`, in the same units as `min_block_size`.
/// `depth` must not exceed the depth of the tree.
fn getDepthSize(self: *const BuddyAllocator, depth: u6) usize {
    const levels_above_leaf = self.depth - depth;
    // Every level up doubles the number of leaf blocks covered by one block.
    const leaf_blocks_per_block = @as(usize, 1) << levels_above_leaf;
    return leaf_blocks_per_block * self.min_block_size;
}
/// Returns the size of the root block, i.e. the total size of the managed
/// range, in the same units as `min_block_size`.
pub fn getSize(self: *const BuddyAllocator) usize {
return self.getDepthSize(0);
}
// Walks one allocator (256 units total: two blocks of 128, four leaves of 64)
// through a sequence of allocations and frees, checking the chosen block each time.
test "Buddy Allocator" {
var buddy = try BuddyAllocator.init(std.testing.allocator, 64, 2);
defer buddy.deinit();
// Exactly one leaf: the first leaf is taken.
const al1 = try buddy.alloc(64);
try std.testing.expectEqual(Alloc{ .offset = 0, .depth = 2 }, al1);
// Does not fit a leaf; the left 128 block is already split, so the right one is used.
const al2 = try buddy.alloc(65);
try std.testing.expectEqual(Alloc{ .offset = 128, .depth = 1 }, al2);
// Smaller than a leaf still takes a whole leaf: the buddy of al1.
const al3 = try buddy.alloc(32);
try std.testing.expectEqual(Alloc{ .offset = 64, .depth = 2 }, al3);
// With the right 128 block free again, its two leaves are handed out in order.
buddy.free(al2);
const al4 = try buddy.alloc(32);
try std.testing.expectEqual(Alloc{ .offset = 128, .depth = 2 }, al4);
const al5 = try buddy.alloc(32);
try std.testing.expectEqual(Alloc{ .offset = 192, .depth = 2 }, al5);
// Left half is completely free and merged, right half is still split by al4.
buddy.free(al1);
buddy.free(al3);
buddy.free(al5);
// The leaf under the already split block is preferred over splitting the free left half.
const al6 = try buddy.alloc(32);
try std.testing.expectEqual(Alloc{ .offset = 192, .depth = 2 }, al6);
buddy.free(al4);
buddy.free(al6);
// Larger than 128: only the root block fits.
const al7 = try buddy.alloc(129);
try std.testing.expectEqual(Alloc{ .offset = 0, .depth = 0 }, al7);
buddy.free(al7);
// After everything is freed the whole tree must be merged back.
try std.testing.expect(std.mem.allEqual(BlockState, buddy.states, BlockState.Free));
}

View File

@ -831,16 +831,23 @@ pub fn finish(self: *Render) void {
gl.GL_ARB_bindless_texture.uniformHandleui64ARB(Uniform.ShadowMapCube.value(), self.cube_shadow_texture_handle);
mesh.positions.bind(Render.Attrib.Position.value());
checkGLError();
mesh.normals.bind(Render.Attrib.Normal.value());
checkGLError();
mesh.tangents.bind(Render.Attrib.Tangent.value());
checkGLError();
mesh.uvs.bind(Render.Attrib.UV.value());
checkGLError();
gl.bindBuffer(gl.ELEMENT_ARRAY_BUFFER, mesh.indices.buffer);
gl.drawElements(
checkGLError();
gl.drawElementsBaseVertex(
gl.TRIANGLES,
mesh.indices.count,
mesh.indices.type,
@ptrFromInt(mesh.indices.offset),
mesh.indices.base_vertex,
);
checkGLError();
}
gl.disable(gl.BLEND);
@ -873,11 +880,12 @@ pub fn finish(self: *Render) void {
for (view_proj_matrices) |frustum_view_proj| {
const frustum_model_mat = frustum_view_proj.inv().mul(model);
gl.uniformMatrix4fv(Uniform.ModelMatrix.value(), 1, gl.FALSE, @ptrCast(&frustum_model_mat.data));
gl.drawElements(
gl.drawElementsBaseVertex(
gl.TRIANGLES,
mesh.indices.count,
mesh.indices.type,
@ptrFromInt(mesh.indices.offset),
mesh.indices.base_vertex,
);
}
}
@ -893,7 +901,7 @@ pub fn finish(self: *Render) void {
for (self.world_view_frustum_corners[split_idx]) |corner| {
const model = Mat4.fromTranslate(corner);
gl.uniformMatrix4fv(Uniform.ModelMatrix.value(), 1, gl.FALSE, @ptrCast(&model.data));
gl.drawElements(gl.TRIANGLES, mesh.indices.count, mesh.indices.type, @ptrFromInt(mesh.indices.offset));
gl.drawElementsBaseVertex(gl.TRIANGLES, mesh.indices.count, mesh.indices.type, @ptrFromInt(mesh.indices.offset), mesh.indices.base_vertex);
}
}
}
@ -931,11 +939,12 @@ pub fn finish(self: *Render) void {
gl.viewport(0, 0, size.x(), size.y());
gl.uniform1i(Uniform.SRCMipLevel.value(), @intCast(src_mip_level));
gl.drawElements(
gl.drawElementsBaseVertex(
gl.TRIANGLES,
quad.indices.count,
quad.indices.type,
@ptrFromInt(quad.indices.offset),
quad.indices.base_vertex,
);
}
}
@ -957,11 +966,12 @@ pub fn finish(self: *Render) void {
gl.uniform1i(Uniform.SRCMipLevel.value(), @intCast(src_mip_level));
gl.uniform1f(Uniform.BloomStrength.value(), if (dst_mip_level == 0) 0.04 else 1);
gl.drawElements(
gl.drawElementsBaseVertex(
gl.TRIANGLES,
quad.indices.count,
quad.indices.type,
@ptrFromInt(quad.indices.offset),
quad.indices.base_vertex,
);
}
}
@ -978,12 +988,7 @@ pub fn finish(self: *Render) void {
gl.bindTextureUnit(0, self.screen_color_texture);
defer gl.bindTextureUnit(0, 0);
gl.drawElements(
gl.TRIANGLES,
quad.indices.count,
quad.indices.type,
@ptrFromInt(quad.indices.offset),
);
gl.drawElementsBaseVertex(gl.TRIANGLES, quad.indices.count, quad.indices.type, @ptrFromInt(quad.indices.offset), quad.indices.base_vertex);
}
self.gl_fences[self.tripple_buffer_index] = gl.fenceSync(gl.SYNC_GPU_COMMANDS_COMPLETE, 0);
@ -1051,11 +1056,12 @@ fn renderShadow(self: *Render, frustum: *const math.Frustum) void {
mesh.positions.bind(Render.Attrib.Position.value());
gl.bindBuffer(gl.ELEMENT_ARRAY_BUFFER, mesh.indices.buffer);
gl.drawElements(
gl.drawElementsBaseVertex(
gl.TRIANGLES,
mesh.indices.count,
mesh.indices.type,
@ptrFromInt(mesh.indices.offset),
mesh.indices.base_vertex,
);
}
}

View File

@ -265,26 +265,26 @@ pub fn writeTexture(writer: anytype, value: Texture, endian: std.builtin.Endian)
}
}
test "texture write/parse" {
var data = [_]u8{ 'h', 'e', 'l', 'l', 'o' };
const source = Texture{
.header = .{
.format = .bc7,
.width = 123,
.height = 234,
.mip_count = 1,
},
.data = &.{&data},
};
var buf: [@sizeOf(Texture.Header) + data.len + 4]u8 = undefined;
var stream = std.io.fixedBufferStream(&buf);
try writeTexture(stream.writer(), source, native_endian);
var decoded = try Texture.fromBuffer(std.testing.allocator, &buf);
defer decoded.free(std.testing.allocator);
try std.testing.expectEqualDeep(source, decoded);
}
// test "texture write/parse" {
// var data = [_]u8{ 'h', 'e', 'l', 'l', 'o' };
// const source = Texture{
// .header = .{
// .format = .bc7,
// .width = 123,
// .height = 234,
// .mip_count = 1,
// },
// .data = &.{&data},
// };
//
// var buf: [@sizeOf(Texture.Header) + data.len + 4]u8 = undefined;
// var stream = std.io.fixedBufferStream(&buf);
// try writeTexture(stream.writer(), source, native_endian);
//
// var decoded = try Texture.fromBuffer(std.testing.allocator, &buf);
// defer decoded.free(std.testing.allocator);
// try std.testing.expectEqualDeep(source, decoded);
// }
pub const Scene = struct {
pub const Header = extern struct {