From 48ef1fb4b7ec8e2156ec8b9f36398841d75be124 Mon Sep 17 00:00:00 2001 From: sergeypdev Date: Sun, 10 Aug 2025 20:26:22 +0400 Subject: [PATCH] A bunch of optimizations and benchmarks for XARR, almost as fast as a simple slice now --- common/container/xarr/xarr.odin | 90 +++++--- common/container/xarr/xarr_test.odin | 313 ++++++++++++++++++++++++++- test.sh | 3 +- 3 files changed, 375 insertions(+), 31 deletions(-) diff --git a/common/container/xarr/xarr.odin b/common/container/xarr/xarr.odin index df41cf8..2331e2f 100644 --- a/common/container/xarr/xarr.odin +++ b/common/container/xarr/xarr.odin @@ -9,35 +9,35 @@ BASE_CHUNK_SHIFT :: BASE_CHUNK_SIZE_LOG2 - 1 NUM_CHUNKS :: 30 Xarr :: struct($T: typeid, $SOA := false) { - chunks: ([NUM_CHUNKS]#soa[]T when SOA else [NUM_CHUNKS][^]T), len: int, allocated_chunks_mask: u32, + chunks: ([NUM_CHUNKS]#soa[]T when SOA else [NUM_CHUNKS][^]T), } UINT_BITS :: size_of(uint) * 8 -msb :: #force_inline proc "contextless" (#any_int idx: uint) -> i8 { - return i8(UINT_BITS - intrinsics.count_leading_zeros(idx)) - 1 +msb :: #force_inline proc "contextless" (#any_int idx: uint) -> i32 { + return i32(UINT_BITS - intrinsics.count_leading_zeros(idx)) - 1 } -chunk_by_index :: #force_inline proc "contextless" (#any_int idx: uint) -> (chunk: i8) { +chunk_by_index :: #force_inline proc "contextless" (#any_int idx: uint) -> (chunk: i32) { return max(msb(idx) - BASE_CHUNK_SHIFT, 0) } -chunk_size :: #force_inline proc "contextless" (chunk_idx: i8) -> uint { - return BASE_CHUNK_SIZE << u32(max(chunk_idx - 1, 0)) +chunk_size :: #force_inline proc "contextless" (chunk_idx: i32) -> uint { + return BASE_CHUNK_SIZE << intrinsics.saturating_sub(u32(chunk_idx), 1) } get_chunk_slice_scalar :: #force_inline proc "contextless" ( a: $T/Xarr($E, false), - chunk_idx: i8, + chunk_idx: i32, ) -> []E { return a.chunks[chunk_idx][:chunk_size(chunk_idx)] } get_chunk_slice_soa :: #force_inline proc "contextless" ( a: $T/Xarr($E, true), - chunk_idx: i8, + chunk_idx: 
i32, ) -> #soa[]E { return a.chunks[chunk_idx] } @@ -47,24 +47,26 @@ get_chunk_slice :: proc { get_chunk_slice_soa, } -capacity_from_allocated_mask :: #force_inline proc(allocated_mask: uint) -> uint { +capacity_from_allocated_mask :: #force_inline proc "contextless" (allocated_mask: u32) -> uint { return( - (allocated_mask >> 1) << BASE_CHUNK_SIZE_LOG2 + - (allocated_mask & 1) << BASE_CHUNK_SIZE_LOG2 \ + uint(allocated_mask >> 1) << BASE_CHUNK_SIZE_LOG2 + + uint(allocated_mask & 1) << BASE_CHUNK_SIZE_LOG2 \ ) } -capacity :: #force_inline proc(a: $T/Xarr($E, $SOA)) -> u32 { +capacity :: #force_inline proc "contextless" (a: $T/Xarr($E, $SOA)) -> uint { allocated_mask := a.allocated_chunks_mask return capacity_from_allocated_mask(allocated_mask) } -reserve :: proc(a: $T/^Xarr($E, $SOA), cap: int, allocator := context.allocator) { +reserve :: proc(a: $T/^Xarr($E, $SOA), cap: int, allocator := context.allocator) #no_bounds_check { allocated_mask := a.allocated_chunks_mask current_chunk := msb(allocated_mask) required_chunks := chunk_by_index(max(cap - 1, 0)) + 1 + assert(required_chunks <= NUM_CHUNKS) + for i := current_chunk + 1; i < required_chunks; i += 1 { when SOA { chunk_slice := make_soa_slice(#soa[]E, chunk_size(i), allocator) @@ -77,24 +79,40 @@ reserve :: proc(a: $T/^Xarr($E, $SOA), cap: int, allocator := context.allocator) } } -append :: proc(a: $T/^Xarr($E, $SOA), elems: ..E, allocator := context.allocator) { +append_elem :: proc(a: $T/^Xarr($E, $SOA), elem: E, allocator := context.allocator) { + if capacity(a^) <= uint(a.len + 1) { + reserve(a, a.len + 1) + } + #no_bounds_check { + chunk_idx, idx_within_chunk := translate_index(a.len) + a.chunks[chunk_idx][idx_within_chunk] = elem + } + a.len += 1 +} + +append_elems :: proc(a: $T/^Xarr($E, $SOA), elems: ..E, allocator := context.allocator) { if len(elems) == 0 { return } - reserve(a, a.len + len(elems)) + if capacity(a^) < uint(a.len + len(elems)) { + reserve(a, a.len + len(elems)) + } 
set_elems_assume_allocated(a, elems) a.len += len(elems) } -translate_index :: #force_inline proc( +append :: proc { + append_elem, + append_elems, +} + +translate_index :: #force_inline proc "contextless" ( #any_int idx: int, ) -> ( - chunk_idx: i8, + chunk_idx: i32, idx_within_chunk: uint, ) { - assert(idx >= 0) - chunk_idx = chunk_by_index(idx) idx_within_chunk = uint(idx) & (chunk_size(chunk_idx) - 1) @@ -102,13 +120,23 @@ translate_index :: #force_inline proc( } @(private = "file") -set_elems_assume_allocated :: proc(a: $T/^Xarr($E, $SOA), elems: []E) { +set_elems_assume_allocated :: proc "contextless" ( + a: $T/^Xarr($E, $SOA), + elems: []E, +) #no_bounds_check { for &e, i in elems { idx := a.len + i chunk_idx, idx_within_chunk := translate_index(idx) - assert(a.chunks[chunk_idx] != nil) - a.chunks[chunk_idx][idx_within_chunk] = e + when SOA { + a.chunks[chunk_idx][idx_within_chunk] = e + } else { + intrinsics.mem_copy_non_overlapping( + &a.chunks[chunk_idx][idx_within_chunk], + &e, + size_of(E), + ) + } } } @@ -145,7 +173,7 @@ clear :: proc "contextless" (a: $T/^Xarr($E, $SOA)) { delete :: proc(a: $T/^Xarr($E, $SOA), allocator := context.allocator) { for i in 0 ..< len(a.chunks) { - builtin.delete(get_chunk_slice(a^, i8(i)), allocator) + builtin.delete(get_chunk_slice(a^, i32(i)), allocator) } a^ = Xarr(E, SOA){} @@ -176,7 +204,7 @@ iterator_next :: proc(it: ^Iterator($E, $SOA)) -> (e: ^E, idx: int, ok: bool) { Chunk_Iterator :: struct($E: typeid, $SOA: bool) { xarr: ^Xarr(E, SOA), base_element_idx: int, - chunk_idx: i8, + chunk_idx: i32, } chunk_iterator :: proc(a: $T/^Xarr($E, $SOA)) -> Chunk_Iterator(E, SOA) { @@ -190,15 +218,17 @@ chunk_iterator_next_scalar :: proc( base_element_idx: int, ok: bool, ) { - if (it.xarr.allocated_chunks_mask & (u32(1) << it.idx)) == 0 { + if (it.xarr.allocated_chunks_mask & (u32(1) << u32(it.chunk_idx))) == 0 { return nil, 0, false } - chunk = get_chunk_slice_scalar(it.xarr, it.idx) + chunk = get_chunk_slice_scalar(it.xarr^, 
it.chunk_idx) + // Limit the chunk to the length so user code doesn't have to worry about this base_element_idx = it.base_element_idx + chunk = chunk[:min(len(chunk), it.xarr.len - base_element_idx)] ok = true - base_element_idx += chunk_size(it.chunk_idx) + base_element_idx += int(chunk_size(it.chunk_idx)) it.chunk_idx += 1 return } @@ -210,12 +240,14 @@ chunk_iterator_next_soa :: proc( base_element_idx: int, ok: bool, ) { - if (it.xarr.allocated_chunks_mask & (u32(1) << it.idx)) == 0 { + if (it.xarr.allocated_chunks_mask & (u32(1) << it.chunk_idx)) == 0 { return nil, 0, false } - chunk = get_chunk_slice_soa(it.xarr, it.idx) + chunk = get_chunk_slice_soa(it.xarr^, it.chunk_idx) + // Limit the chunk to the length so user code doesn't have to worry about this base_element_idx = it.base_element_idx + chunk = chunk[:min(len(chunk), it.xarr.len - base_element_idx)] ok = true base_element_idx += chunk_size(it.chunk_idx) diff --git a/common/container/xarr/xarr_test.odin b/common/container/xarr/xarr_test.odin index 6e87105..28406db 100644 --- a/common/container/xarr/xarr_test.odin +++ b/common/container/xarr/xarr_test.odin @@ -1,6 +1,11 @@ package xarr +import "base:runtime" +import "core:fmt" +import "core:mem/virtual" +import "core:strings" import "core:testing" +import "core:time" @(test) test_msb :: proc(t: ^testing.T) { @@ -132,5 +137,311 @@ test_soa :: proc(t: ^testing.T) { append(&a, My_Struct{x = 1, y = 2, z = 3}) testing.expect_value(t, get(a, 0), My_Struct{x = 1, y = 2, z = 3}) - testing.expect_value(t, size_of(Xarr(My_Struct, false)), 0) +} + +@(test) +benchmark_dyn_array_append :: proc(t: ^testing.T) { + str: strings.Builder + strings.builder_init(&str, context.allocator) + defer { + fmt.println(strings.to_string(str)) + strings.builder_destroy(&str) + } + + + { + arena: virtual.Arena + arena_err := virtual.arena_init_static(&arena) + testing.expect_value(t, arena_err, nil) + defer virtual.arena_destroy(&arena) + + name := "Dynamic Array Append" + options := 
&time.Benchmark_Options { + rounds = 10_000, + bytes = 100_000, + setup = setup_bench, + bench = benchmark_dynamic_array_append, + // teardown = teardown_bench, + } + err := time.benchmark(options, virtual.arena_allocator(&arena)) + testing.expect_value(t, err, nil) + benchmark_print(&str, name, options) + } +} + +@(test) +benchmark_xarr_append :: proc(t: ^testing.T) { + str: strings.Builder + strings.builder_init(&str, context.allocator) + defer { + fmt.println(strings.to_string(str)) + strings.builder_destroy(&str) + } + + { + arena: virtual.Arena + arena_err := virtual.arena_init_static(&arena) + testing.expect_value(t, arena_err, nil) + defer virtual.arena_destroy(&arena) + + name := "Xarr Append" + options := &time.Benchmark_Options { + rounds = 10_000, + bytes = 1_000_000, + setup = setup_bench, + bench = benchmar_xarr_append, + } + err := time.benchmark(options, virtual.arena_allocator(&arena)) + testing.expect_value(t, err, nil) + benchmark_print(&str, name, options) + } +} + +ITERATION_ARRAY_NUM :: 1_000_000 +ITERATION_ROUNDS :: 10_000 + +@(test) +benchmark_xarr_index_iteration :: proc(t: ^testing.T) { + str: strings.Builder + strings.builder_init(&str, context.allocator) + defer { + fmt.println(strings.to_string(str)) + strings.builder_destroy(&str) + } + + arr: Xarr(int) + defer delete(&arr) + + for i in 0 ..< ITERATION_ARRAY_NUM { + append(&arr, i) + } + + total_sum: int + + diff: time.Duration + { + time.SCOPED_TICK_DURATION(&diff) + + for _ in 0 ..< ITERATION_ROUNDS { + + sum: int + for i in 0 ..< arr.len { + sum += get(arr, i) + } + + total_sum += sum + } + } + + options := &time.Benchmark_Options{rounds = 10_000, bytes = 1_000_000} + options.count = options.rounds + options.processed = size_of(int) * arr.len * options.rounds + + options.duration = diff + times_per_second := f64(time.Second) / f64(diff) + options.rounds_per_second = times_per_second * f64(options.count) + options.megabytes_per_second = f64(options.processed) / f64(1024 * 1024) * 
times_per_second + + benchmark_print(&str, "Xarr Index Iteration", options) +} + +@(test) +benchmark_xarr_chunk_iteration :: proc(t: ^testing.T) { + str: strings.Builder + strings.builder_init(&str, context.allocator) + defer { + fmt.println(strings.to_string(str)) + strings.builder_destroy(&str) + } + + arr: Xarr(int) + defer delete(&arr) + + for i in 0 ..< ITERATION_ARRAY_NUM { + append(&arr, i) + } + + total_sum: int + + diff: time.Duration + { + time.SCOPED_TICK_DURATION(&diff) + + for _ in 0 ..< ITERATION_ROUNDS { + sum: int + + it := chunk_iterator(&arr) + for chunk, base_idx in chunk_iterator_next(&it) { + for i in 0 ..< len(chunk) { + sum += chunk[i] + } + } + + total_sum += sum + } + } + + options := &time.Benchmark_Options{rounds = 10_000, bytes = 1_000_000} + options.count = options.rounds + options.processed = size_of(int) * arr.len * options.rounds + + options.duration = diff + times_per_second := f64(time.Second) / f64(diff) + options.rounds_per_second = times_per_second * f64(options.count) + options.megabytes_per_second = f64(options.processed) / f64(1024 * 1024) * times_per_second + + benchmark_print(&str, "Xarr Chunk Iteration", options) +} + +@(test) +benchmark_slice_index_iteration :: proc(t: ^testing.T) { + str: strings.Builder + strings.builder_init(&str, context.allocator) + defer { + fmt.println(strings.to_string(str)) + strings.builder_destroy(&str) + } + + slice := make([]int, ITERATION_ARRAY_NUM) + + for i in 0 ..< ITERATION_ARRAY_NUM { + slice[i] = i + } + + total_sum: int + + diff: time.Duration + { + time.SCOPED_TICK_DURATION(&diff) + + for _ in 0 ..< ITERATION_ROUNDS { + + sum: int + for i in 0 ..< len(slice) { + sum += slice[i] + } + + total_sum += sum + } + } + + options := &time.Benchmark_Options{rounds = 10_000, bytes = 1_000_000} + options.count = options.rounds + options.processed = size_of(int) * len(slice) * options.rounds + + options.duration = diff + times_per_second := f64(time.Second) / f64(diff) + 
options.rounds_per_second = times_per_second * f64(options.count) + options.megabytes_per_second = f64(options.processed) / f64(1024 * 1024) * times_per_second + + benchmark_print(&str, "Slice Index Iteration", options) +} + +setup_bench :: proc( + options: ^time.Benchmark_Options, + allocator := context.allocator, +) -> ( + err: time.Benchmark_Error, +) { + assert(options != nil) + + options.input = make([]u8, options.bytes, allocator) + for &b, i in options.input { + b = u8(i & 0xff) + } + return nil if len(options.input) == options.bytes else .Allocation_Error +} + +teardown_bench :: proc( + options: ^time.Benchmark_Options, + allocator := context.allocator, +) -> ( + err: time.Benchmark_Error, +) { + assert(options != nil) + + runtime.delete(options.input) + return nil +} + +benchmark_dynamic_array_append :: proc( + options: ^time.Benchmark_Options, + allocator := context.allocator, +) -> ( + err: time.Benchmark_Error, +) { + buf := options.input + + for _ in 0 ..< options.rounds { + arr: [dynamic]u8 + defer runtime.delete(arr) + + for byte in buf { + runtime.append(&arr, byte) + } + } + options.count = options.rounds + options.processed = options.rounds * options.bytes + + return nil +} + +benchmar_xarr_append :: proc( + options: ^time.Benchmark_Options, + allocator := context.allocator, +) -> ( + err: time.Benchmark_Error, +) { + buf := options.input + + for _ in 0 ..< options.rounds { + arr: Xarr(u8) + defer delete(&arr) + + for byte in buf { + append(&arr, byte) + } + } + options.count = options.rounds + options.processed = options.rounds * options.bytes + return nil +} + +benchmar_xarr_iterate :: proc( + options: ^time.Benchmark_Options, + allocator := context.allocator, +) -> ( + err: time.Benchmark_Error, +) { + buf := options.input + + for _ in 0 ..< options.rounds { + arr: Xarr(u8) + defer delete(&arr) + + for byte in buf { + append(&arr, byte) + } + } + options.count = options.rounds + options.processed = options.rounds * options.bytes + return nil +} 
+ +benchmark_print :: proc( + str: ^strings.Builder, + name: string, + options: ^time.Benchmark_Options, + loc := #caller_location, +) { + fmt.sbprintfln( + str, + "[%v] %v rounds, %v bytes processed in %v ns\n\t\t%5.3f rounds/s, %5.3f MiB/s\n", + name, + options.rounds, + options.processed, + time.duration_nanoseconds(options.duration), + options.rounds_per_second, + options.megabytes_per_second, + ) } diff --git a/test.sh b/test.sh index 8de8fbd..8af38b8 100755 --- a/test.sh +++ b/test.sh @@ -1,3 +1,4 @@ #!/usr/bin/env bash -odin test common/container/xarr -collection:common=./common -collection:game=./game -collection:libs=./libs -strict-style -vet -sanitize:memory +odin build common/container/xarr -build-mode:test -collection:common=./common -collection:game=./game -collection:libs=./libs -strict-style -o:speed -debug "$@" +# odin test common/container/xarr -collection:common=./common -collection:game=./game -collection:libs=./libs -strict-style -vet -o:speed -debug