engine/src/Render2.zig
sergeypdev 4fd797c048 Slowly building out the rendering framework
- Ditch VMA for per-frame data
- Add a basic global descriptor set and all the boilerplate to manage it
- Add a global uniform buffer that only holds camera matrices right now
- Implement a per-frame GPU memory arena: one large buffer that wraps around, holding data for all frames in flight
- Get the free-look camera working again
2024-12-08 21:39:09 +04:00

const std = @import("std");
const GraphicsContext = @import("GraphicsContext.zig");
const AssetManager = @import("AssetManager.zig");
const ShaderManager = @import("ShaderManager.zig");
const vk = @import("vk");
const a = @import("asset_manifest");
const za = @import("zalgebra");
const Vec3 = za.Vec3;
const Mat4 = za.Mat4;
const Render2 = @This();
// TODO: support ortho
pub const Camera = struct {
    pos: Vec3 = Vec3.zero(),
    fovy: f32 = 60,
    aspect: f32 = 1,
    near: f32 = 0.1,
    far: f32 = 10,
    view_mat: Mat4 = Mat4.identity(),

    pub fn projection(self: *const Camera) Mat4 {
        return za.perspective(self.fovy, self.aspect, self.near, self.far);
    }
};
var default_camera: Camera = .{};
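
// A minimal sanity check for the camera math above. This is a sketch: it assumes
// zalgebra's za.perspective/Mat4.mul semantics and a `.data: [4][4]f32` field, and it
// only verifies that composing projection and view yields a non-degenerate matrix,
// not any particular clip-space convention.
test "Camera composes projection and view into world_to_clip" {
    const cam = Camera{ .aspect = 16.0 / 9.0 };
    // With the default identity view, world_to_clip is just the projection matrix
    const world_to_clip = cam.projection().mul(cam.view_mat);
    try std.testing.expect(world_to_clip.data[0][0] != 0);
}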
const MAX_FRAME_LAG = 3;
const PER_FRAME_ARENA_SIZE = 64 * 1024 * 1024; // 64 MiB. TODO: should I handle cases when even 64 MiB is not available?

gc: *GraphicsContext,
shaderman: *ShaderManager,
assetman: *AssetManager,
command_pool: GraphicsContext.CommandPool,
vulkan_frame_arena: VulkanPerFrameArena,
camera: *Camera = &default_camera,
frame: u32 = 0,
frame_data: [MAX_FRAME_LAG]FrameData = undefined,
// Ring buffer/arena for per-frame data
pub const VulkanPerFrameArena = struct {
    const Self = @This();

    pub const FrameRegion = struct {
        start: u64 = 0,
        end: u64 = 0,

        pub fn init(start: u64, end: u64) FrameRegion {
            return FrameRegion{ .start = start, .end = end };
        }

        // If the region is wrapping (end < start), returns 2 non-wrapping regions
        pub fn unwrap(self: *const FrameRegion, len: u64, out_non_wrapping_regions: []FrameRegion) []FrameRegion {
            std.debug.assert(out_non_wrapping_regions.len >= 2);
            if (self.end < self.start) {
                out_non_wrapping_regions[0].start = self.start;
                out_non_wrapping_regions[0].end = len;
                out_non_wrapping_regions[1].start = 0;
                out_non_wrapping_regions[1].end = self.end;
                return out_non_wrapping_regions[0..2];
            } else {
                out_non_wrapping_regions[0] = self.*;
                return out_non_wrapping_regions[0..1];
            }
        }

        pub fn intersectsNonWrapping(self: *const FrameRegion, other: *const FrameRegion) bool {
            return !(other.start > self.end or self.start > other.end);
        }

        pub fn intersectsWrapping(self: *const FrameRegion, other: *const FrameRegion, len: u64) bool {
            var buf_a: [2]FrameRegion = undefined;
            var buf_b: [2]FrameRegion = undefined;
            const non_wrapping_regions_a = self.unwrap(len, &buf_a);
            const non_wrapping_regions_b = other.unwrap(len, &buf_b);
            for (non_wrapping_regions_a) |region_a| {
                for (non_wrapping_regions_b) |region_b| {
                    if (region_a.intersectsNonWrapping(&region_b)) {
                        return true;
                    }
                }
            }
            return false;
        }
    };
    memory: vk.DeviceMemory,
    size: u64,
    tail: u64 = 0,
    frame: u32 = 0,
    // Tracks where the start offset for each frame is.
    // Allocations will fail if they would overlap a region still owned by an in-flight frame.
    // NOTE: bug in zig? Tried to use [MAX_FRAME_LAG]?u64 here, but optional checks pass even when value is null, wtf??
    frame_regions: [MAX_FRAME_LAG]?FrameRegion = [_]?FrameRegion{null} ** MAX_FRAME_LAG,
    // Tracking allocated resources per frame, unfortunately we have to wait for the frame to finish before we can destroy them :(
    buffers: [MAX_FRAME_LAG][1024]vk.Buffer = undefined,
    buffer_counts: [MAX_FRAME_LAG]u16 = [_]u16{0} ** MAX_FRAME_LAG,
    pub fn init(memory: vk.DeviceMemory, size: u64) Self {
        return Self{
            .memory = memory,
            .size = size,
        };
    }

    pub fn startFrame(self: *VulkanPerFrameArena, device: GraphicsContext.Device, frame_index: u32) void {
        // TODO: tail pointer should be aligned to nonCoherentAtomSize to avoid accidentally flushing memory being used by previous frames
        // if we end up allocating right up until the previous frame's head
        // Close off the previous frame's region at the current tail
        if (self.frame_regions[self.frame]) |*cur_region| {
            cur_region.end = self.tail;
        }
        self.frame = frame_index;
        // Record the start position of this frame
        self.frame_regions[self.frame] = FrameRegion.init(self.tail, self.tail);
        for (self.buffers[self.frame][0..self.buffer_counts[self.frame]]) |buf| {
            device.destroyBuffer(buf, null);
        }
        self.buffer_counts[self.frame] = 0;
    }

    // Caller guarantees that memory from the given frame can be safely stomped, buffers destroyed etc.
    pub fn resetFrame(self: *VulkanPerFrameArena, frame_index: u32) void {
        self.frame_regions[frame_index] = null;
    }
    pub fn getModifiedMemoryRanges(self: *VulkanPerFrameArena, out_ranges: []vk.MappedMemoryRange) []const vk.MappedMemoryRange {
        std.debug.assert(out_ranges.len >= 2);
        std.debug.assert(self.frame_regions[self.frame] != null);
        const region = self.frame_regions[self.frame].?;
        // We wrapped, use two ranges
        if (self.tail < region.start) {
            out_ranges[0] = vk.MappedMemoryRange{
                .memory = self.memory,
                .offset = region.start,
                .size = self.size - region.start,
            };
            out_ranges[1] = vk.MappedMemoryRange{
                .memory = self.memory,
                .offset = 0,
                .size = self.tail,
            };
            return out_ranges[0..2];
        } else {
            out_ranges[0] = vk.MappedMemoryRange{
                .memory = self.memory,
                .offset = region.start,
                .size = self.tail - region.start,
            };
            return out_ranges[0..1];
        }
    }
    // Finds an offset where the allocation can be placed; handles wrapping, doesn't handle inter-frame stomping
    fn findSlotOptimistic(self: *const Self, size: u64, alignment: u64) !u64 {
        const offset = std.mem.alignForward(u64, self.tail, alignment);
        if (offset + size <= self.size) {
            return offset;
        } else if (size <= self.size) {
            // Doesn't fit at the tail, wrap around to the start of the buffer
            return 0;
        } else {
            return error.OutOfMemory;
        }
    }

    fn findSlotChecked(self: *const Self, size: u64, alignment: u64) !u64 {
        const next_frame = (self.frame + 1) % MAX_FRAME_LAG;
        const offset = try self.findSlotOptimistic(size, alignment);
        if (self.frame_regions[next_frame]) |next_frame_region| {
            const allocated_region = FrameRegion.init(offset, offset + size);
            if (next_frame_region.intersectsWrapping(&allocated_region, self.size)) {
                return error.OverlapsPreviousFrame;
            }
        }
        return offset;
    }

    pub fn allocate(self: *Self, size: u64, alignment: u64) !u64 {
        const offset = try self.findSlotChecked(size, alignment);
        self.tail = offset + size;
        return offset;
    }
    pub fn createBufferRaw(self: *Self, device: GraphicsContext.Device, usage: vk.BufferUsageFlags, size: u64, out_addr: *u64) !vk.Buffer {
        // NOTE: Allocating buffers just in time, hopefully the vulkan impl is smart about allocation here and not doing new each time...
        const buffer = try device.createBuffer(&vk.BufferCreateInfo{
            .flags = .{},
            .usage = usage,
            .size = size,
            .sharing_mode = .exclusive,
        }, null);
        errdefer device.destroyBuffer(buffer, null);

        const mem_reqs = device.getBufferMemoryRequirements(buffer);
        out_addr.* = try self.allocate(mem_reqs.size, mem_reqs.alignment);
        try device.bindBufferMemory(buffer, self.memory, out_addr.*);

        std.debug.assert(self.buffer_counts[self.frame] < self.buffers[self.frame].len);
        self.buffers[self.frame][self.buffer_counts[self.frame]] = buffer;
        self.buffer_counts[self.frame] += 1;

        return buffer;
    }
    pub fn reset(self: *Self) void {
        self.tail = 0;
    }
};
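
// A few host-side sanity tests for the arena logic above. These are sketches: they
// assume the vulkan-zig style `.null_handle` sentinel for vk.DeviceMemory and only
// exercise the offset bookkeeping, never touching a real device or queue.
test "FrameRegion wrapping intersection" {
    const wrapping = VulkanPerFrameArena.FrameRegion.init(900, 100);
    const overlapping = VulkanPerFrameArena.FrameRegion.init(0, 50);
    const disjoint = VulkanPerFrameArena.FrameRegion.init(200, 800);
    try std.testing.expect(wrapping.intersectsWrapping(&overlapping, 1024));
    try std.testing.expect(!wrapping.intersectsWrapping(&disjoint, 1024));
}

test "VulkanPerFrameArena wraps allocations back to offset 0" {
    var arena = VulkanPerFrameArena.init(.null_handle, 1024);
    try std.testing.expectEqual(@as(u64, 0), try arena.allocate(512, 256));
    try std.testing.expectEqual(@as(u64, 512), try arena.allocate(512, 256));
    // The buffer is now full, so the next allocation wraps to the start
    try std.testing.expectEqual(@as(u64, 0), try arena.allocate(512, 256));
    // A request larger than the whole arena can never succeed
    try std.testing.expectError(error.OutOfMemory, arena.allocate(2048, 256));
}

test "getModifiedMemoryRanges splits a wrapped frame into two flush ranges" {
    var arena = VulkanPerFrameArena.init(.null_handle, 1024);
    arena.frame_regions[0] = VulkanPerFrameArena.FrameRegion.init(900, 900);
    arena.tail = 100; // tail wrapped past the end of the buffer
    var buf: [2]vk.MappedMemoryRange = undefined;
    const ranges = arena.getModifiedMemoryRanges(&buf);
    try std.testing.expectEqual(@as(usize, 2), ranges.len);
    try std.testing.expectEqual(@as(u64, 900), ranges[0].offset);
    try std.testing.expectEqual(@as(u64, 124), ranges[0].size);
    try std.testing.expectEqual(@as(u64, 100), ranges[1].size);
}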
pub fn init(self: *Render2, gc: *GraphicsContext, shaderman: *ShaderManager, assetman: *AssetManager) !void {
    const per_frame_upload_memory = try gc.device.allocateMemory(&.{
        .memory_type_index = gc.memory_config.cpu_to_gpu.type_index,
        .allocation_size = PER_FRAME_ARENA_SIZE,
    }, null);
    errdefer gc.device.freeMemory(per_frame_upload_memory, null);

    self.* = Render2{
        .gc = gc,
        .shaderman = shaderman,
        .assetman = assetman,
        .command_pool = try gc.queues.graphics.createCommandPool(.{ .reset_command_buffer_bit = true }),
        .vulkan_frame_arena = VulkanPerFrameArena.init(per_frame_upload_memory, PER_FRAME_ARENA_SIZE),
    };
    errdefer self.command_pool.deinit();

    // NOTE: TEST
    for (0..MAX_FRAME_LAG) |i| {
        self.frame_data[i] = try FrameData.init(gc, self.command_pool);
    }
}
fn createPerFrameBuffer(self: *Render2, usage: vk.BufferUsageFlags, size: u64, out_addr: *u64) !vk.Buffer {
    while (true) {
        if (self.vulkan_frame_arena.createBufferRaw(self.gc.device, usage, size, out_addr)) |buffer| {
            return buffer;
        } else |err| switch (err) {
            error.OverlapsPreviousFrame => {
                const overlapped_frame = (self.frame + 1) % MAX_FRAME_LAG;
                std.debug.print("Vulkan Frame Allocator overlapped frame {}, waiting for it to finish...\n", .{overlapped_frame});
                try self.frame_data[overlapped_frame].waitForDrawAndReset(self.gc.device);
                self.vulkan_frame_arena.resetFrame(overlapped_frame);
            },
            else => return err,
        }
    }
}
fn frameAllocMemReqs(self: *Render2, mem_reqs: vk.MemoryRequirements) !u64 {
    return self.vulkan_frame_arena.allocate(mem_reqs.size, mem_reqs.alignment);
}
pub fn draw(self: *Render2) !void {
    const device = self.gc.device;
    const frame = &self.frame_data[self.frame];

    try frame.waitForDrawAndReset(self.gc.device);
    self.vulkan_frame_arena.resetFrame(self.frame);
    self.vulkan_frame_arena.startFrame(self.gc.device, self.frame);

    const mapped_ptr = (try device.mapMemory(self.vulkan_frame_arena.memory, 0, self.vulkan_frame_arena.size, .{})).?;
    const frame_arena_mem: []u8 = @as([*]u8, @ptrCast(mapped_ptr))[0..self.vulkan_frame_arena.size];

    var global_buffer_addr: u64 = 0;
    const global_uniform_buffer = try self.createPerFrameBuffer(.{ .uniform_buffer_bit = true }, @sizeOf(GlobalUniform), &global_buffer_addr);
    {
        const global_uniform: *align(1) GlobalUniform = std.mem.bytesAsValue(GlobalUniform, frame_arena_mem[global_buffer_addr .. global_buffer_addr + @sizeOf(GlobalUniform)]);
        const view = self.camera.view_mat;
        // const fwidth: f32 = @floatFromInt(self.gc.swapchain_extent.width);
        // const fheight: f32 = @floatFromInt(self.gc.swapchain_extent.height);
        const projection = self.camera.projection();
        const view_projection = projection.mul(view);
        global_uniform.* = .{
            .view = .{
                .world_to_view = view,
                .view_to_clip = projection,
                .world_to_clip = view_projection,
            },
        };
    }

    // TODO: move this out into a separate function
    const swapchain_image_index: u32 = try self.gc.acquireSwapchainImage(frame.acquire_swapchain_image);
    var current_image = GraphicsContext.Image{ .handle = self.gc.swapchain_images[swapchain_image_index], .mip_count = 1, .layer_count = 1, .format = .r8g8b8a8_unorm };
    const current_image_view = try current_image.createView(self.gc.device, .{ .color_bit = true });
    defer self.gc.device.destroyImageView(current_image_view, null);

    const cmds = frame.command_buffer;
    try cmds.beginCommandBuffer(&.{});
    {
        // Transition global uniform buffer
        cmds.pipelineBarrier2(&vk.DependencyInfo{
            .buffer_memory_barrier_count = 1,
            .p_buffer_memory_barriers = &.{
                vk.BufferMemoryBarrier2{
                    .buffer = global_uniform_buffer,
                    .src_stage_mask = .{ .host_bit = true },
                    .src_access_mask = .{ .host_write_bit = true },
                    .dst_stage_mask = .{ .vertex_shader_bit = true },
                    .dst_access_mask = .{ .shader_read_bit = true },
                    .offset = 0,
                    .size = @sizeOf(GlobalUniform),
                    .src_queue_family_index = vk.QUEUE_FAMILY_IGNORED,
                    .dst_queue_family_index = vk.QUEUE_FAMILY_IGNORED,
                },
            },
        });

        const global_descriptor_set = try frame.allocateDescriptorSet(device, self.shaderman.descriptor_set_layouts.global);
        device.updateDescriptorSets(1, &.{
            vk.WriteDescriptorSet{
                .dst_set = global_descriptor_set,
                .dst_binding = 0,
                .dst_array_element = 0,
                .descriptor_type = .uniform_buffer,
                .descriptor_count = 1,
                .p_buffer_info = &.{
                    vk.DescriptorBufferInfo{
                        .buffer = global_uniform_buffer,
                        .offset = 0,
                        .range = @sizeOf(GlobalUniform),
                    },
                },
                .p_image_info = &[_]vk.DescriptorImageInfo{},
                .p_texel_buffer_view = &[_]vk.BufferView{},
            },
        }, 0, null);
        try current_image.sync(cmds, .{ .stage_mask = .{ .color_attachment_output_bit = true }, .access_mask = .{ .color_attachment_write_bit = true } }, .attachment_optimal);
        {
            cmds.beginRendering(&.{
                .render_area = vk.Rect2D{ .offset = .{ .x = 0, .y = 0 }, .extent = self.gc.swapchain_extent },
                .layer_count = 1,
                .view_mask = 0,
                .color_attachment_count = 1,
                .p_color_attachments = &.{
                    vk.RenderingAttachmentInfo{
                        .clear_value = .{ .color = .{ .float_32 = .{ 0.8, 0.7, 0.6, 1.0 } } },
                        .load_op = .clear,
                        .store_op = .store,
                        .image_layout = .attachment_optimal,
                        .image_view = current_image_view,
                        .resolve_image_layout = .attachment_optimal,
                        .resolve_mode = .{},
                    },
                },
            });
            defer cmds.endRendering();

            const triangle = self.assetman.resolveShaderProgram(a.ShaderPrograms.shaders.triangle);
            cmds.bindPipeline(.graphics, triangle.pipeline);
            cmds.bindDescriptorSets(.graphics, triangle.layout, 0, 1, &.{global_descriptor_set}, 0, null);
            cmds.setViewportWithCount(1, &.{vk.Viewport{
                .x = 0,
                .y = 0,
                .width = @floatFromInt(self.gc.swapchain_extent.width),
                .height = @floatFromInt(self.gc.swapchain_extent.height),
                .min_depth = 0,
                .max_depth = 1,
            }});
            cmds.setScissorWithCount(1, &.{vk.Rect2D{
                .offset = .{ .x = 0, .y = 0 },
                .extent = self.gc.swapchain_extent,
            }});
            cmds.draw(3, 1, 0, 0);
        }
        try current_image.sync(cmds, .{}, .present_src_khr);
    }
    try cmds.endCommandBuffer();

    var vulkan_frame_arena_modified_ranges_buf: [2]vk.MappedMemoryRange = undefined;
    const vulkan_frame_arena_modified_ranges = self.vulkan_frame_arena.getModifiedMemoryRanges(&vulkan_frame_arena_modified_ranges_buf);
    try device.flushMappedMemoryRanges(@intCast(vulkan_frame_arena_modified_ranges.len), vulkan_frame_arena_modified_ranges.ptr);
    // NOTE: Unmap DEVICE_LOCAL, HOST_VISIBLE memory before submit as it can be slow on Windows (according to Reddit...)
    device.unmapMemory(self.vulkan_frame_arena.memory);

    try self.gc.queues.graphics.submit(
        &GraphicsContext.SubmitInfo{
            .wait_semaphores = &.{frame.acquire_swapchain_image},
            // Block color attachment output on the swapchain image acquire; an empty stage mask here is invalid
            .wait_dst_stage_mask = &.{vk.PipelineStageFlags{ .color_attachment_output_bit = true }},
            .command_buffers = &.{cmds.handle},
            .signal_semaphores = &.{frame.draw_sema},
        },
        frame.draw_fence,
    );
    _ = try self.gc.device.queuePresentKHR(self.gc.queues.graphics.handle, &.{
        .swapchain_count = 1,
        .wait_semaphore_count = 1,
        .p_wait_semaphores = &.{frame.draw_sema},
        .p_swapchains = &.{self.gc.swapchain},
        .p_image_indices = &.{swapchain_image_index},
    });

    self.frame = (self.frame + 1) % MAX_FRAME_LAG;
}
// NOTE: not wired up yet; relies on upload_buffer / upload_buffer_cursor staging state that Render2 doesn't carry yet
fn uploadData(self: *Render2, cmds: GraphicsContext.CommandBuffer, dst: GraphicsContext.Buffer, dst_offset: usize, len: usize) !void {
    cmds.copyBuffer2(&.{
        .src_buffer = self.upload_buffer.handle,
        .dst_buffer = dst.handle,
        .region_count = 1,
        .p_regions = &.{
            vk.BufferCopy2{
                .src_offset = self.upload_buffer_cursor,
                .dst_offset = dst_offset,
                .size = len,
            },
        },
    });
    self.upload_buffer_cursor += len;
}
// Per frame stuff
const FrameData = struct {
    // Sync
    acquire_swapchain_image: vk.Semaphore,
    draw_sema: vk.Semaphore,
    draw_fence: vk.Fence,
    command_buffer: GraphicsContext.CommandBuffer,
    descriptor_pool: vk.DescriptorPool = .null_handle,

    pub fn init(gc: *GraphicsContext, command_pool: GraphicsContext.CommandPool) !FrameData {
        return FrameData{
            .acquire_swapchain_image = try gc.device.createSemaphore(&.{}, null),
            .draw_sema = try gc.device.createSemaphore(&.{}, null),
            .draw_fence = try gc.device.createFence(&.{ .flags = .{ .signaled_bit = true } }, null),
            .command_buffer = try command_pool.allocateCommandBuffer(),
            .descriptor_pool = try gc.device.createDescriptorPool(&vk.DescriptorPoolCreateInfo{
                .max_sets = 1024,
                .p_pool_sizes = &.{
                    vk.DescriptorPoolSize{
                        .type = .uniform_buffer,
                        .descriptor_count = 8,
                    },
                },
                .pool_size_count = 1,
            }, null),
            // TODO: maybe cache memory requirements?
        };
    }

    pub fn allocateDescriptorSet(self: *FrameData, device: GraphicsContext.Device, layout: vk.DescriptorSetLayout) !vk.DescriptorSet {
        var result: [1]vk.DescriptorSet = .{.null_handle};
        try device.allocateDescriptorSets(&vk.DescriptorSetAllocateInfo{
            .descriptor_pool = self.descriptor_pool,
            .descriptor_set_count = 1,
            .p_set_layouts = &.{layout},
        }, &result);
        return result[0];
    }

    pub fn waitForDrawAndReset(self: *FrameData, device: GraphicsContext.Device) !void {
        _ = try device.waitForFences(1, &.{self.draw_fence}, vk.TRUE, std.math.maxInt(u64));
        try device.resetFences(1, &.{self.draw_fence});
        try self.command_buffer.resetCommandBuffer(.{ .release_resources_bit = true });
        try device.resetDescriptorPool(self.descriptor_pool, .{});
    }
};
const GlobalUniform = extern struct {
    pub const View = extern struct {
        world_to_clip: Mat4,
        view_to_clip: Mat4,
        world_to_view: Mat4,
    };
    view: View,
};
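
// A small layout guard for the uniform block above. This is a sketch: it assumes the
// shader-side block is three tightly packed column-major mat4s (3 * 64 bytes) and that
// zalgebra's Mat4 has a defined extern layout; if the GLSL block changes, change this too.
test "GlobalUniform is three tightly packed Mat4s" {
    try std.testing.expectEqual(@as(usize, 3 * 64), @sizeOf(GlobalUniform));
    try std.testing.expectEqual(@as(usize, 0), @offsetOf(GlobalUniform.View, "world_to_clip"));
    try std.testing.expectEqual(@as(usize, 64), @offsetOf(GlobalUniform.View, "view_to_clip"));
    try std.testing.expectEqual(@as(usize, 128), @offsetOf(GlobalUniform.View, "world_to_view"));
}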