A bunch of optimizations and benchmarks for XARR, almost as fast as a simple slice now

This commit is contained in:
sergeypdev 2025-08-10 20:26:22 +04:00
parent 250f86ac2d
commit 48ef1fb4b7
3 changed files with 375 additions and 31 deletions
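
For orientation before the diff: an Xarr spreads its elements over up to NUM_CHUNKS separately allocated chunks; the first two chunks are the base size and every later chunk doubles, so a flat index translates to a (chunk, offset) pair with one count-leading-zeros and a mask, and existing elements stay in place as the array grows. The sketch below reproduces that mapping in isolation; the concrete BASE_CHUNK_SIZE_LOG2 value is not part of this diff, so the base chunk size of 32 used here is only an assumption for illustration.

package xarr_index_sketch

import "base:intrinsics"
import "core:fmt"

// Assumed value, for illustration only; the real constant lives elsewhere in the package.
BASE_CHUNK_SIZE_LOG2 :: 5
BASE_CHUNK_SIZE :: 1 << BASE_CHUNK_SIZE_LOG2
BASE_CHUNK_SHIFT :: BASE_CHUNK_SIZE_LOG2 - 1
UINT_BITS :: size_of(uint) * 8

// Index of the most significant set bit, -1 for zero.
msb :: proc "contextless" (idx: uint) -> i32 {
    return i32(UINT_BITS - intrinsics.count_leading_zeros(idx)) - 1
}

// Chunk 0 holds BASE_CHUNK_SIZE elements; chunk k (k >= 1) holds
// BASE_CHUNK_SIZE << (k - 1), so capacity roughly doubles with each extra chunk.
chunk_by_index :: proc "contextless" (idx: uint) -> i32 {
    return max(msb(idx) - BASE_CHUNK_SHIFT, 0)
}

chunk_size :: proc "contextless" (chunk_idx: i32) -> uint {
    return BASE_CHUNK_SIZE << intrinsics.saturating_sub(u32(chunk_idx), 1)
}

main :: proc() {
    indices := [?]uint{0, 31, 32, 63, 64, 127, 128}
    for idx in indices {
        chunk := chunk_by_index(idx)
        // Chunk sizes are powers of two, so the offset is just the low bits.
        offset := idx & (chunk_size(chunk) - 1)
        fmt.printf("idx %3d -> chunk %d, offset %2d\n", idx, chunk, offset)
    }
}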

View File

@@ -9,35 +9,35 @@ BASE_CHUNK_SHIFT :: BASE_CHUNK_SIZE_LOG2 - 1
 NUM_CHUNKS :: 30
 Xarr :: struct($T: typeid, $SOA := false) {
+    chunks: ([NUM_CHUNKS]#soa[]T when SOA else [NUM_CHUNKS][^]T),
     len: int,
     allocated_chunks_mask: u32,
-    chunks: ([NUM_CHUNKS]#soa[]T when SOA else [NUM_CHUNKS][^]T),
 }
 UINT_BITS :: size_of(uint) * 8
-msb :: #force_inline proc "contextless" (#any_int idx: uint) -> i8 {
-    return i8(UINT_BITS - intrinsics.count_leading_zeros(idx)) - 1
+msb :: #force_inline proc "contextless" (#any_int idx: uint) -> i32 {
+    return i32(UINT_BITS - intrinsics.count_leading_zeros(idx)) - 1
 }
-chunk_by_index :: #force_inline proc "contextless" (#any_int idx: uint) -> (chunk: i8) {
+chunk_by_index :: #force_inline proc "contextless" (#any_int idx: uint) -> (chunk: i32) {
     return max(msb(idx) - BASE_CHUNK_SHIFT, 0)
 }
-chunk_size :: #force_inline proc "contextless" (chunk_idx: i8) -> uint {
-    return BASE_CHUNK_SIZE << u32(max(chunk_idx - 1, 0))
+chunk_size :: #force_inline proc "contextless" (chunk_idx: i32) -> uint {
+    return BASE_CHUNK_SIZE << intrinsics.saturating_sub(u32(chunk_idx), 1)
 }
 get_chunk_slice_scalar :: #force_inline proc "contextless" (
     a: $T/Xarr($E, false),
-    chunk_idx: i8,
+    chunk_idx: i32,
 ) -> []E {
     return a.chunks[chunk_idx][:chunk_size(chunk_idx)]
 }
 get_chunk_slice_soa :: #force_inline proc "contextless" (
     a: $T/Xarr($E, true),
-    chunk_idx: i8,
+    chunk_idx: i32,
 ) -> #soa[]E {
     return a.chunks[chunk_idx]
 }
@@ -47,24 +47,26 @@ get_chunk_slice :: proc {
     get_chunk_slice_soa,
 }
-capacity_from_allocated_mask :: #force_inline proc(allocated_mask: uint) -> uint {
+capacity_from_allocated_mask :: #force_inline proc "contextless" (allocated_mask: u32) -> uint {
     return(
-        (allocated_mask >> 1) << BASE_CHUNK_SIZE_LOG2 +
-        (allocated_mask & 1) << BASE_CHUNK_SIZE_LOG2 \
+        uint(allocated_mask >> 1) << BASE_CHUNK_SIZE_LOG2 +
+        uint(allocated_mask & 1) << BASE_CHUNK_SIZE_LOG2 \
     )
 }
-capacity :: #force_inline proc(a: $T/Xarr($E, $SOA)) -> u32 {
+capacity :: #force_inline proc "contextless" (a: $T/Xarr($E, $SOA)) -> uint {
     allocated_mask := a.allocated_chunks_mask
     return capacity_from_allocated_mask(allocated_mask)
 }
-reserve :: proc(a: $T/^Xarr($E, $SOA), cap: int, allocator := context.allocator) {
+reserve :: proc(a: $T/^Xarr($E, $SOA), cap: int, allocator := context.allocator) #no_bounds_check {
     allocated_mask := a.allocated_chunks_mask
     current_chunk := msb(allocated_mask)
     required_chunks := chunk_by_index(max(cap - 1, 0)) + 1
+    assert(required_chunks <= NUM_CHUNKS)
     for i := current_chunk + 1; i < required_chunks; i += 1 {
         when SOA {
             chunk_slice := make_soa_slice(#soa[]E, chunk_size(i), allocator)
@@ -77,24 +79,40 @@ reserve :: proc(a: $T/^Xarr($E, $SOA), cap: int, allocator := context.allocator)
     }
 }
-append :: proc(a: $T/^Xarr($E, $SOA), elems: ..E, allocator := context.allocator) {
+append_elem :: proc(a: $T/^Xarr($E, $SOA), elem: E, allocator := context.allocator) {
+    if capacity(a^) <= uint(a.len + 1) {
+        reserve(a, a.len + 1)
+    }
+    #no_bounds_check {
+        chunk_idx, idx_within_chunk := translate_index(a.len)
+        a.chunks[chunk_idx][idx_within_chunk] = elem
+    }
+    a.len += 1
+}
+append_elems :: proc(a: $T/^Xarr($E, $SOA), elems: ..E, allocator := context.allocator) {
     if len(elems) == 0 {
         return
     }
-    reserve(a, a.len + len(elems))
+    if capacity(a^) < uint(a.len + len(elems)) {
+        reserve(a, a.len + len(elems))
+    }
     set_elems_assume_allocated(a, elems)
     a.len += len(elems)
 }
-translate_index :: #force_inline proc(
+append :: proc {
+    append_elem,
+    append_elems,
+}
+translate_index :: #force_inline proc "contextless" (
     #any_int idx: int,
 ) -> (
-    chunk_idx: i8,
+    chunk_idx: i32,
     idx_within_chunk: uint,
 ) {
-    assert(idx >= 0)
     chunk_idx = chunk_by_index(idx)
     idx_within_chunk = uint(idx) & (chunk_size(chunk_idx) - 1)
@@ -102,13 +120,23 @@ translate_index :: #force_inline proc(
 }
 @(private = "file")
-set_elems_assume_allocated :: proc(a: $T/^Xarr($E, $SOA), elems: []E) {
+set_elems_assume_allocated :: proc "contextless" (
+    a: $T/^Xarr($E, $SOA),
+    elems: []E,
+) #no_bounds_check {
     for &e, i in elems {
         idx := a.len + i
         chunk_idx, idx_within_chunk := translate_index(idx)
-        assert(a.chunks[chunk_idx] != nil)
+        when SOA {
             a.chunks[chunk_idx][idx_within_chunk] = e
+        } else {
+            intrinsics.mem_copy_non_overlapping(
+                &a.chunks[chunk_idx][idx_within_chunk],
+                &e,
+                size_of(E),
+            )
+        }
     }
 }
@@ -145,7 +173,7 @@ clear :: proc "contextless" (a: $T/^Xarr($E, $SOA)) {
 delete :: proc(a: $T/^Xarr($E, $SOA), allocator := context.allocator) {
     for i in 0 ..< len(a.chunks) {
-        builtin.delete(get_chunk_slice(a^, i8(i)), allocator)
+        builtin.delete(get_chunk_slice(a^, i32(i)), allocator)
     }
     a^ = Xarr(E, SOA){}
@@ -176,7 +204,7 @@ iterator_next :: proc(it: ^Iterator($E, $SOA)) -> (e: ^E, idx: int, ok: bool) {
 Chunk_Iterator :: struct($E: typeid, $SOA: bool) {
     xarr: ^Xarr(E, SOA),
     base_element_idx: int,
-    chunk_idx: i8,
+    chunk_idx: i32,
 }
 chunk_iterator :: proc(a: $T/^Xarr($E, $SOA)) -> Chunk_Iterator(E, SOA) {
@@ -190,15 +218,17 @@ chunk_iterator_next_scalar :: proc(
     base_element_idx: int,
     ok: bool,
 ) {
-    if (it.xarr.allocated_chunks_mask & (u32(1) << it.idx)) == 0 {
+    if (it.xarr.allocated_chunks_mask & (u32(1) << u32(it.chunk_idx))) == 0 {
         return nil, 0, false
     }
-    chunk = get_chunk_slice_scalar(it.xarr, it.idx)
+    chunk = get_chunk_slice_scalar(it.xarr^, it.chunk_idx)
+    // Limit the chunk to the length so user code doesn't have to worry about this
     base_element_idx = it.base_element_idx
+    chunk = chunk[:min(len(chunk), it.xarr.len - base_element_idx)]
     ok = true
-    base_element_idx += chunk_size(it.chunk_idx)
+    base_element_idx += int(chunk_size(it.chunk_idx))
     it.chunk_idx += 1
     return
 }
@@ -210,12 +240,14 @@ chunk_iterator_next_soa :: proc(
     base_element_idx: int,
     ok: bool,
 ) {
-    if (it.xarr.allocated_chunks_mask & (u32(1) << it.idx)) == 0 {
+    if (it.xarr.allocated_chunks_mask & (u32(1) << it.chunk_idx)) == 0 {
         return nil, 0, false
     }
-    chunk = get_chunk_slice_soa(it.xarr, it.idx)
+    chunk = get_chunk_slice_soa(it.xarr^, it.chunk_idx)
+    // Limit the chunk to the length so user code doesn't have to worry about this
     base_element_idx = it.base_element_idx
+    chunk = chunk[:min(len(chunk), it.xarr.len - base_element_idx)]
     ok = true
     base_element_idx += chunk_size(it.chunk_idx)
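
Usage note on the iterator change above: chunk_iterator_next_* now clips each returned chunk to the live length, so a consumer can scan whole chunks without tracking the array length itself, which is why the chunk-iteration benchmark in the next file can approach a flat slice loop. A minimal consumer sketch follows; the import path is an assumption based on the common collection used by the build script, and only procs visible in this diff are relied on.

package xarr_usage_sketch

// Hypothetical import path, assumed from the -collection:common flag in the build script.
import xarr "common:container/xarr"

// Sum every element by walking the array chunk by chunk; each chunk yielded by
// chunk_iterator_next is already limited to the array's length.
sum_ints :: proc(a: ^xarr.Xarr(int)) -> (sum: int) {
    it := xarr.chunk_iterator(a)
    for chunk, base_idx in xarr.chunk_iterator_next(&it) {
        _ = base_idx // absolute index of chunk[0], unused here
        for v in chunk {
            sum += v
        }
    }
    return
}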

View File

@@ -1,6 +1,11 @@
 package xarr
+import "base:runtime"
+import "core:fmt"
+import "core:mem/virtual"
+import "core:strings"
 import "core:testing"
+import "core:time"
 @(test)
 test_msb :: proc(t: ^testing.T) {
@@ -132,5 +137,311 @@ test_soa :: proc(t: ^testing.T) {
     append(&a, My_Struct{x = 1, y = 2, z = 3})
     testing.expect_value(t, get(a, 0), My_Struct{x = 1, y = 2, z = 3})
-    testing.expect_value(t, size_of(Xarr(My_Struct, false)), 0)
 }
+
+@(test)
+benchmark_dyn_array_append :: proc(t: ^testing.T) {
+    str: strings.Builder
+    strings.builder_init(&str, context.allocator)
+    defer {
+        fmt.println(strings.to_string(str))
+        strings.builder_destroy(&str)
+    }
+    {
+        arena: virtual.Arena
+        arena_err := virtual.arena_init_static(&arena)
+        testing.expect_value(t, arena_err, nil)
+        defer virtual.arena_destroy(&arena)
+        name := "Dynamic Array Append"
+        options := &time.Benchmark_Options {
+            rounds = 10_000,
+            bytes = 100_000,
+            setup = setup_bench,
+            bench = benchmark_dynamic_array_append,
+            // teardown = teardown_bench,
+        }
+        err := time.benchmark(options, virtual.arena_allocator(&arena))
+        testing.expect_value(t, err, nil)
+        benchmark_print(&str, name, options)
+    }
+}
+
+@(test)
+benchmark_xarr_append :: proc(t: ^testing.T) {
+    str: strings.Builder
+    strings.builder_init(&str, context.allocator)
+    defer {
+        fmt.println(strings.to_string(str))
+        strings.builder_destroy(&str)
+    }
+    {
+        arena: virtual.Arena
+        arena_err := virtual.arena_init_static(&arena)
+        testing.expect_value(t, arena_err, nil)
+        defer virtual.arena_destroy(&arena)
+        name := "Xarr Append"
+        options := &time.Benchmark_Options {
+            rounds = 10_000,
+            bytes = 1_000_000,
+            setup = setup_bench,
+            bench = benchmar_xarr_append,
+        }
+        err := time.benchmark(options, virtual.arena_allocator(&arena))
+        testing.expect_value(t, err, nil)
+        benchmark_print(&str, name, options)
+    }
+}
+
+ITERATION_ARRAY_NUM :: 1_000_000
+ITERATION_ROUNDS :: 10_000
+
+@(test)
+benchmark_xarr_index_iteration :: proc(t: ^testing.T) {
+    str: strings.Builder
+    strings.builder_init(&str, context.allocator)
+    defer {
+        fmt.println(strings.to_string(str))
+        strings.builder_destroy(&str)
+    }
+    arr: Xarr(int)
+    defer delete(&arr)
+    for i in 0 ..< ITERATION_ARRAY_NUM {
+        append(&arr, i)
+    }
+    total_sum: int
+    diff: time.Duration
+    {
+        time.SCOPED_TICK_DURATION(&diff)
+        for _ in 0 ..< ITERATION_ROUNDS {
+            sum: int
+            for i in 0 ..< arr.len {
+                sum += get(arr, i)
+            }
+            total_sum += sum
+        }
+    }
+    options := &time.Benchmark_Options{rounds = 10_000, bytes = 1_000_000}
+    options.count = options.rounds
+    options.processed = size_of(int) * arr.len * options.rounds
+    options.duration = diff
+    times_per_second := f64(time.Second) / f64(diff)
+    options.rounds_per_second = times_per_second * f64(options.count)
+    options.megabytes_per_second = f64(options.processed) / f64(1024 * 1024) * times_per_second
+    benchmark_print(&str, "Xarr Index Iteration", options)
+}
+
+@(test)
+benchmark_xarr_chunk_iteration :: proc(t: ^testing.T) {
+    str: strings.Builder
+    strings.builder_init(&str, context.allocator)
+    defer {
+        fmt.println(strings.to_string(str))
+        strings.builder_destroy(&str)
+    }
+    arr: Xarr(int)
+    defer delete(&arr)
+    for i in 0 ..< ITERATION_ARRAY_NUM {
+        append(&arr, i)
+    }
+    total_sum: int
+    diff: time.Duration
+    {
+        time.SCOPED_TICK_DURATION(&diff)
+        for _ in 0 ..< ITERATION_ROUNDS {
+            sum: int
+            it := chunk_iterator(&arr)
+            for chunk, base_idx in chunk_iterator_next(&it) {
+                for i in 0 ..< len(chunk) {
+                    sum += chunk[i]
+                }
+            }
+            total_sum += sum
+        }
+    }
+    options := &time.Benchmark_Options{rounds = 10_000, bytes = 1_000_000}
+    options.count = options.rounds
+    options.processed = size_of(int) * arr.len * options.rounds
+    options.duration = diff
+    times_per_second := f64(time.Second) / f64(diff)
+    options.rounds_per_second = times_per_second * f64(options.count)
+    options.megabytes_per_second = f64(options.processed) / f64(1024 * 1024) * times_per_second
+    benchmark_print(&str, "Xarr Chunk Iteration", options)
+}
+
+@(test)
+benchmark_slice_index_iteration :: proc(t: ^testing.T) {
+    str: strings.Builder
+    strings.builder_init(&str, context.allocator)
+    defer {
+        fmt.println(strings.to_string(str))
+        strings.builder_destroy(&str)
+    }
+    slice := make([]int, ITERATION_ARRAY_NUM)
+    for i in 0 ..< ITERATION_ARRAY_NUM {
+        slice[i] = i
+    }
+    total_sum: int
+    diff: time.Duration
+    {
+        time.SCOPED_TICK_DURATION(&diff)
+        for _ in 0 ..< ITERATION_ROUNDS {
+            sum: int
+            for i in 0 ..< len(slice) {
+                sum += slice[i]
+            }
+            total_sum += sum
+        }
+    }
+    options := &time.Benchmark_Options{rounds = 10_000, bytes = 1_000_000}
+    options.count = options.rounds
+    options.processed = size_of(int) * len(slice) * options.rounds
+    options.duration = diff
+    times_per_second := f64(time.Second) / f64(diff)
+    options.rounds_per_second = times_per_second * f64(options.count)
+    options.megabytes_per_second = f64(options.processed) / f64(1024 * 1024) * times_per_second
+    benchmark_print(&str, "Slice Index Iteration", options)
+}
+
+setup_bench :: proc(
+    options: ^time.Benchmark_Options,
+    allocator := context.allocator,
+) -> (
+    err: time.Benchmark_Error,
+) {
+    assert(options != nil)
+    options.input = make([]u8, options.bytes, allocator)
+    for &b, i in options.input {
+        b = u8(i & 0xff)
+    }
+    return nil if len(options.input) == options.bytes else .Allocation_Error
+}
+
+teardown_bench :: proc(
+    options: ^time.Benchmark_Options,
+    allocator := context.allocator,
+) -> (
+    err: time.Benchmark_Error,
+) {
+    assert(options != nil)
+    runtime.delete(options.input)
+    return nil
+}
+
+benchmark_dynamic_array_append :: proc(
+    options: ^time.Benchmark_Options,
+    allocator := context.allocator,
+) -> (
+    err: time.Benchmark_Error,
+) {
+    buf := options.input
+    for _ in 0 ..< options.rounds {
+        arr: [dynamic]u8
+        defer runtime.delete(arr)
+        for byte in buf {
+            runtime.append(&arr, byte)
+        }
+    }
+    options.count = options.rounds
+    options.processed = options.rounds * options.bytes
+    return nil
+}
+
+benchmar_xarr_append :: proc(
+    options: ^time.Benchmark_Options,
+    allocator := context.allocator,
+) -> (
+    err: time.Benchmark_Error,
+) {
+    buf := options.input
+    for _ in 0 ..< options.rounds {
+        arr: Xarr(u8)
+        defer delete(&arr)
+        for byte in buf {
+            append(&arr, byte)
+        }
+    }
+    options.count = options.rounds
+    options.processed = options.rounds * options.bytes
+    return nil
+}
+
+benchmar_xarr_iterate :: proc(
+    options: ^time.Benchmark_Options,
+    allocator := context.allocator,
+) -> (
+    err: time.Benchmark_Error,
+) {
+    buf := options.input
+    for _ in 0 ..< options.rounds {
+        arr: Xarr(u8)
+        defer delete(&arr)
+        for byte in buf {
+            append(&arr, byte)
+        }
+    }
+    options.count = options.rounds
+    options.processed = options.rounds * options.bytes
+    return nil
+}
+
+benchmark_print :: proc(
+    str: ^strings.Builder,
+    name: string,
+    options: ^time.Benchmark_Options,
+    loc := #caller_location,
+) {
+    fmt.sbprintfln(
+        str,
+        "[%v] %v rounds, %v bytes processed in %v ns\n\t\t%5.3f rounds/s, %5.3f MiB/s\n",
+        name,
+        options.rounds,
+        options.processed,
+        time.duration_nanoseconds(options.duration),
+        options.rounds_per_second,
+        options.megabytes_per_second,
+    )
+}

View File

@@ -1,3 +1,4 @@
 #!/usr/bin/env bash
-odin test common/container/xarr -collection:common=./common -collection:game=./game -collection:libs=./libs -strict-style -vet -sanitize:memory
+odin build common/container/xarr -build-mode:test -collection:common=./common -collection:game=./game -collection:libs=./libs -strict-style -o:speed -debug "$@"
+# odin test common/container/xarr -collection:common=./common -collection:game=./game -collection:libs=./libs -strict-style -vet -o:speed -debug