A bunch of optimizations and benchmarks for XARR; it is now almost as fast as a simple slice
This commit is contained in:
parent
250f86ac2d
commit
48ef1fb4b7
@ -9,35 +9,35 @@ BASE_CHUNK_SHIFT :: BASE_CHUNK_SIZE_LOG2 - 1
|
||||
NUM_CHUNKS :: 30
|
||||
|
||||
Xarr :: struct($T: typeid, $SOA := false) {
|
||||
chunks: ([NUM_CHUNKS]#soa[]T when SOA else [NUM_CHUNKS][^]T),
|
||||
len: int,
|
||||
allocated_chunks_mask: u32,
|
||||
chunks: ([NUM_CHUNKS]#soa[]T when SOA else [NUM_CHUNKS][^]T),
|
||||
}
|
||||
|
||||
UINT_BITS :: size_of(uint) * 8
|
||||
|
||||
msb :: #force_inline proc "contextless" (#any_int idx: uint) -> i8 {
|
||||
return i8(UINT_BITS - intrinsics.count_leading_zeros(idx)) - 1
|
||||
msb :: #force_inline proc "contextless" (#any_int idx: uint) -> i32 {
|
||||
return i32(UINT_BITS - intrinsics.count_leading_zeros(idx)) - 1
|
||||
}
|
||||
|
||||
chunk_by_index :: #force_inline proc "contextless" (#any_int idx: uint) -> (chunk: i8) {
|
||||
chunk_by_index :: #force_inline proc "contextless" (#any_int idx: uint) -> (chunk: i32) {
|
||||
return max(msb(idx) - BASE_CHUNK_SHIFT, 0)
|
||||
}
|
||||
|
||||
chunk_size :: #force_inline proc "contextless" (chunk_idx: i8) -> uint {
|
||||
return BASE_CHUNK_SIZE << u32(max(chunk_idx - 1, 0))
|
||||
chunk_size :: #force_inline proc "contextless" (chunk_idx: i32) -> uint {
|
||||
return BASE_CHUNK_SIZE << intrinsics.saturating_sub(u32(chunk_idx), 1)
|
||||
}
|
||||
|
||||
get_chunk_slice_scalar :: #force_inline proc "contextless" (
|
||||
a: $T/Xarr($E, false),
|
||||
chunk_idx: i8,
|
||||
chunk_idx: i32,
|
||||
) -> []E {
|
||||
return a.chunks[chunk_idx][:chunk_size(chunk_idx)]
|
||||
}
|
||||
|
||||
get_chunk_slice_soa :: #force_inline proc "contextless" (
|
||||
a: $T/Xarr($E, true),
|
||||
chunk_idx: i8,
|
||||
chunk_idx: i32,
|
||||
) -> #soa[]E {
|
||||
return a.chunks[chunk_idx]
|
||||
}
|
||||
@ -47,24 +47,26 @@ get_chunk_slice :: proc {
|
||||
get_chunk_slice_soa,
|
||||
}
|
||||
|
||||
capacity_from_allocated_mask :: #force_inline proc(allocated_mask: uint) -> uint {
|
||||
capacity_from_allocated_mask :: #force_inline proc "contextless" (allocated_mask: u32) -> uint {
|
||||
return(
|
||||
(allocated_mask >> 1) << BASE_CHUNK_SIZE_LOG2 +
|
||||
(allocated_mask & 1) << BASE_CHUNK_SIZE_LOG2 \
|
||||
uint(allocated_mask >> 1) << BASE_CHUNK_SIZE_LOG2 +
|
||||
uint(allocated_mask & 1) << BASE_CHUNK_SIZE_LOG2 \
|
||||
)
|
||||
}
|
||||
|
||||
capacity :: #force_inline proc(a: $T/Xarr($E, $SOA)) -> u32 {
|
||||
capacity :: #force_inline proc "contextless" (a: $T/Xarr($E, $SOA)) -> uint {
|
||||
allocated_mask := a.allocated_chunks_mask
|
||||
return capacity_from_allocated_mask(allocated_mask)
|
||||
}
|
||||
|
||||
reserve :: proc(a: $T/^Xarr($E, $SOA), cap: int, allocator := context.allocator) {
|
||||
reserve :: proc(a: $T/^Xarr($E, $SOA), cap: int, allocator := context.allocator) #no_bounds_check {
|
||||
allocated_mask := a.allocated_chunks_mask
|
||||
|
||||
current_chunk := msb(allocated_mask)
|
||||
required_chunks := chunk_by_index(max(cap - 1, 0)) + 1
|
||||
|
||||
assert(required_chunks <= NUM_CHUNKS)
|
||||
|
||||
for i := current_chunk + 1; i < required_chunks; i += 1 {
|
||||
when SOA {
|
||||
chunk_slice := make_soa_slice(#soa[]E, chunk_size(i), allocator)
|
||||
@ -77,24 +79,40 @@ reserve :: proc(a: $T/^Xarr($E, $SOA), cap: int, allocator := context.allocator)
|
||||
}
|
||||
}
|
||||
|
||||
append :: proc(a: $T/^Xarr($E, $SOA), elems: ..E, allocator := context.allocator) {
|
||||
// Appends a single element to the end of the array.
//
// Inputs:
// - a: the array to append to
// - elem: the value to store at index `a.len`
// - allocator: forwarded to `reserve` for any chunk that has to be created
//
// The chunk table only grows when the new element does not fit into the
// capacity reported by `capacity`.
append_elem :: proc(a: $T/^Xarr($E, $SOA), elem: E, allocator := context.allocator) {
	// Index `a.len` is addressable exactly when capacity >= a.len + 1, so the
	// comparison is strict: no reserve call when the slot already exists.
	if capacity(a^) < uint(a.len + 1) {
		// Forward the caller's allocator; otherwise new chunks would silently
		// come from context.allocator.
		reserve(a, a.len + 1, allocator)
	}
	#no_bounds_check {
		chunk_idx, idx_within_chunk := translate_index(a.len)
		a.chunks[chunk_idx][idx_within_chunk] = elem
	}
	a.len += 1
}
|
||||
|
||||
// Appends every value in `elems` to the end of the array, in order.
//
// Inputs:
// - a: the array to append to
// - elems: values to append; an empty list is a no-op
// - allocator: forwarded to `reserve` for any chunk that has to be created
append_elems :: proc(a: $T/^Xarr($E, $SOA), elems: ..E, allocator := context.allocator) {
	if len(elems) == 0 {
		return
	}

	required := a.len + len(elems)

	// Reserve once, and only when the current capacity is insufficient. The
	// caller's allocator is forwarded so new chunks do not silently come from
	// context.allocator.
	if capacity(a^) < uint(required) {
		reserve(a, required, allocator)
	}

	set_elems_assume_allocated(a, elems)
	a.len += len(elems)
}
|
||||
|
||||
translate_index :: #force_inline proc(
|
||||
// Procedure group exposing both append_elem and append_elems under the single
// name `append`.
append :: proc {
|
||||
append_elem,
|
||||
append_elems,
|
||||
}
|
||||
|
||||
translate_index :: #force_inline proc "contextless" (
|
||||
#any_int idx: int,
|
||||
) -> (
|
||||
chunk_idx: i8,
|
||||
chunk_idx: i32,
|
||||
idx_within_chunk: uint,
|
||||
) {
|
||||
assert(idx >= 0)
|
||||
|
||||
chunk_idx = chunk_by_index(idx)
|
||||
idx_within_chunk = uint(idx) & (chunk_size(chunk_idx) - 1)
|
||||
|
||||
@ -102,13 +120,23 @@ translate_index :: #force_inline proc(
|
||||
}
|
||||
|
||||
@(private = "file")
|
||||
set_elems_assume_allocated :: proc(a: $T/^Xarr($E, $SOA), elems: []E) {
|
||||
set_elems_assume_allocated :: proc "contextless" (
|
||||
a: $T/^Xarr($E, $SOA),
|
||||
elems: []E,
|
||||
) #no_bounds_check {
|
||||
for &e, i in elems {
|
||||
idx := a.len + i
|
||||
chunk_idx, idx_within_chunk := translate_index(idx)
|
||||
assert(a.chunks[chunk_idx] != nil)
|
||||
|
||||
a.chunks[chunk_idx][idx_within_chunk] = e
|
||||
when SOA {
|
||||
a.chunks[chunk_idx][idx_within_chunk] = e
|
||||
} else {
|
||||
intrinsics.mem_copy_non_overlapping(
|
||||
&a.chunks[chunk_idx][idx_within_chunk],
|
||||
&e,
|
||||
size_of(E),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -145,7 +173,7 @@ clear :: proc "contextless" (a: $T/^Xarr($E, $SOA)) {
|
||||
|
||||
delete :: proc(a: $T/^Xarr($E, $SOA), allocator := context.allocator) {
|
||||
for i in 0 ..< len(a.chunks) {
|
||||
builtin.delete(get_chunk_slice(a^, i8(i)), allocator)
|
||||
builtin.delete(get_chunk_slice(a^, i32(i)), allocator)
|
||||
}
|
||||
|
||||
a^ = Xarr(E, SOA){}
|
||||
@ -176,7 +204,7 @@ iterator_next :: proc(it: ^Iterator($E, $SOA)) -> (e: ^E, idx: int, ok: bool) {
|
||||
Chunk_Iterator :: struct($E: typeid, $SOA: bool) {
|
||||
xarr: ^Xarr(E, SOA),
|
||||
base_element_idx: int,
|
||||
chunk_idx: i8,
|
||||
chunk_idx: i32,
|
||||
}
|
||||
|
||||
chunk_iterator :: proc(a: $T/^Xarr($E, $SOA)) -> Chunk_Iterator(E, SOA) {
|
||||
@ -190,15 +218,17 @@ chunk_iterator_next_scalar :: proc(
|
||||
base_element_idx: int,
|
||||
ok: bool,
|
||||
) {
|
||||
if (it.xarr.allocated_chunks_mask & (u32(1) << it.idx)) == 0 {
|
||||
if (it.xarr.allocated_chunks_mask & (u32(1) << u32(it.chunk_idx))) == 0 {
|
||||
return nil, 0, false
|
||||
}
|
||||
|
||||
chunk = get_chunk_slice_scalar(it.xarr, it.idx)
|
||||
chunk = get_chunk_slice_scalar(it.xarr^, it.chunk_idx)
|
||||
// Limit the chunk to the length so user code doesn't have to worry about this
|
||||
base_element_idx = it.base_element_idx
|
||||
chunk = chunk[:min(len(chunk), it.xarr.len - base_element_idx)]
|
||||
ok = true
|
||||
|
||||
base_element_idx += chunk_size(it.chunk_idx)
|
||||
base_element_idx += int(chunk_size(it.chunk_idx))
|
||||
it.chunk_idx += 1
|
||||
return
|
||||
}
|
||||
@ -210,12 +240,14 @@ chunk_iterator_next_soa :: proc(
|
||||
base_element_idx: int,
|
||||
ok: bool,
|
||||
) {
|
||||
if (it.xarr.allocated_chunks_mask & (u32(1) << it.idx)) == 0 {
|
||||
if (it.xarr.allocated_chunks_mask & (u32(1) << it.chunk_idx)) == 0 {
|
||||
return nil, 0, false
|
||||
}
|
||||
|
||||
chunk = get_chunk_slice_soa(it.xarr, it.idx)
|
||||
chunk = get_chunk_slice_soa(it.xarr^, it.chunk_idx)
|
||||
// Limit the chunk to the length so user code doesn't have to worry about this
|
||||
base_element_idx = it.base_element_idx
|
||||
chunk = chunk[:min(len(chunk), it.xarr.len - base_element_idx)]
|
||||
ok = true
|
||||
|
||||
base_element_idx += chunk_size(it.chunk_idx)
|
||||
|
@ -1,6 +1,11 @@
|
||||
package xarr
|
||||
|
||||
import "base:runtime"
|
||||
import "core:fmt"
|
||||
import "core:mem/virtual"
|
||||
import "core:strings"
|
||||
import "core:testing"
|
||||
import "core:time"
|
||||
|
||||
@(test)
|
||||
test_msb :: proc(t: ^testing.T) {
|
||||
@ -132,5 +137,311 @@ test_soa :: proc(t: ^testing.T) {
|
||||
append(&a, My_Struct{x = 1, y = 2, z = 3})
|
||||
|
||||
testing.expect_value(t, get(a, 0), My_Struct{x = 1, y = 2, z = 3})
|
||||
testing.expect_value(t, size_of(Xarr(My_Struct, false)), 0)
|
||||
}
|
||||
|
||||
// Benchmarks appending bytes one at a time to a built-in `[dynamic]u8` and
// prints the resulting statistics when the test finishes.
@(test)
benchmark_dyn_array_append :: proc(t: ^testing.T) {
	str: strings.Builder
	strings.builder_init(&str, context.allocator)
	defer {
		fmt.println(strings.to_string(str))
		strings.builder_destroy(&str)
	}

	arena: virtual.Arena
	arena_err := virtual.arena_init_static(&arena)
	// Without a usable arena the benchmark input cannot be allocated, so stop
	// here instead of running on an uninitialised allocator.
	if !testing.expect_value(t, arena_err, nil) {
		return
	}
	defer virtual.arena_destroy(&arena)

	name := "Dynamic Array Append"
	options := &time.Benchmark_Options {
		rounds = 10_000,
		bytes = 100_000,
		setup = setup_bench,
		bench = benchmark_dynamic_array_append,
		// teardown = teardown_bench,
	}

	err := time.benchmark(options, virtual.arena_allocator(&arena))
	testing.expect_value(t, err, nil)
	benchmark_print(&str, name, options)
}
|
||||
|
||||
// Benchmarks appending bytes one at a time to an `Xarr(u8)` and prints the
// resulting statistics when the test finishes.
@(test)
benchmark_xarr_append :: proc(t: ^testing.T) {
	str: strings.Builder
	strings.builder_init(&str, context.allocator)
	defer {
		fmt.println(strings.to_string(str))
		strings.builder_destroy(&str)
	}

	arena: virtual.Arena
	arena_err := virtual.arena_init_static(&arena)
	// Without a usable arena the benchmark input cannot be allocated, so stop
	// here instead of running on an uninitialised allocator.
	if !testing.expect_value(t, arena_err, nil) {
		return
	}
	defer virtual.arena_destroy(&arena)

	name := "Xarr Append"
	options := &time.Benchmark_Options {
		rounds = 10_000,
		bytes = 1_000_000,
		setup = setup_bench,
		bench = benchmar_xarr_append,
	}

	err := time.benchmark(options, virtual.arena_allocator(&arena))
	testing.expect_value(t, err, nil)
	benchmark_print(&str, name, options)
}
|
||||
|
||||
// Number of elements stored in the container by the iteration benchmarks.
ITERATION_ARRAY_NUM :: 1_000_000
|
||||
// Number of full passes each iteration benchmark makes over its container.
ITERATION_ROUNDS :: 10_000
|
||||
|
||||
// Benchmarks summing an `Xarr(int)` by index through `get`, repeated
// ITERATION_ROUNDS times, and prints the resulting statistics.
@(test)
benchmark_xarr_index_iteration :: proc(t: ^testing.T) {
	str: strings.Builder
	strings.builder_init(&str, context.allocator)
	defer {
		fmt.println(strings.to_string(str))
		strings.builder_destroy(&str)
	}

	arr: Xarr(int)
	defer delete(&arr)

	for i in 0 ..< ITERATION_ARRAY_NUM {
		append(&arr, i)
	}

	total_sum: int
	diff: time.Duration
	{
		time.SCOPED_TICK_DURATION(&diff)

		for _ in 0 ..< ITERATION_ROUNDS {
			sum: int
			for i in 0 ..< arr.len {
				sum += get(arr, i)
			}
			total_sum += sum
		}
	}

	// Check the result: this validates the traversal and keeps the timed loop
	// observable so it cannot be discarded as dead code.
	// Each round sums 0 + 1 + ... + (N - 1) = N * (N - 1) / 2.
	EXPECTED_SUM: int : ITERATION_ROUNDS * (ITERATION_ARRAY_NUM * (ITERATION_ARRAY_NUM - 1) / 2)
	testing.expect_value(t, total_sum, EXPECTED_SUM)

	// Derive the reported numbers from the same constants that drive the loops.
	options := &time.Benchmark_Options {
		rounds = ITERATION_ROUNDS,
		bytes = size_of(int) * ITERATION_ARRAY_NUM,
	}
	options.count = options.rounds
	options.processed = size_of(int) * arr.len * options.rounds

	options.duration = diff
	times_per_second := f64(time.Second) / f64(diff)
	options.rounds_per_second = times_per_second * f64(options.count)
	options.megabytes_per_second = f64(options.processed) / f64(1024 * 1024) * times_per_second

	benchmark_print(&str, "Xarr Index Iteration", options)
}
|
||||
|
||||
// Benchmarks summing an `Xarr(int)` chunk by chunk through the chunk iterator,
// repeated ITERATION_ROUNDS times, and prints the resulting statistics.
@(test)
benchmark_xarr_chunk_iteration :: proc(t: ^testing.T) {
	str: strings.Builder
	strings.builder_init(&str, context.allocator)
	defer {
		fmt.println(strings.to_string(str))
		strings.builder_destroy(&str)
	}

	arr: Xarr(int)
	defer delete(&arr)

	for i in 0 ..< ITERATION_ARRAY_NUM {
		append(&arr, i)
	}

	total_sum: int
	diff: time.Duration
	{
		time.SCOPED_TICK_DURATION(&diff)

		for _ in 0 ..< ITERATION_ROUNDS {
			sum: int

			it := chunk_iterator(&arr)
			// Only the chunk contents are needed; the base index is unused.
			for chunk in chunk_iterator_next(&it) {
				for i in 0 ..< len(chunk) {
					sum += chunk[i]
				}
			}

			total_sum += sum
		}
	}

	// Check the result: this validates the traversal and keeps the timed loop
	// observable so it cannot be discarded as dead code.
	// Each round sums 0 + 1 + ... + (N - 1) = N * (N - 1) / 2.
	EXPECTED_SUM: int : ITERATION_ROUNDS * (ITERATION_ARRAY_NUM * (ITERATION_ARRAY_NUM - 1) / 2)
	testing.expect_value(t, total_sum, EXPECTED_SUM)

	// Derive the reported numbers from the same constants that drive the loops.
	options := &time.Benchmark_Options {
		rounds = ITERATION_ROUNDS,
		bytes = size_of(int) * ITERATION_ARRAY_NUM,
	}
	options.count = options.rounds
	options.processed = size_of(int) * arr.len * options.rounds

	options.duration = diff
	times_per_second := f64(time.Second) / f64(diff)
	options.rounds_per_second = times_per_second * f64(options.count)
	options.megabytes_per_second = f64(options.processed) / f64(1024 * 1024) * times_per_second

	benchmark_print(&str, "Xarr Chunk Iteration", options)
}
|
||||
|
||||
// Baseline benchmark: sums a plain `[]int` by index, repeated ITERATION_ROUNDS
// times, and prints the resulting statistics.
@(test)
benchmark_slice_index_iteration :: proc(t: ^testing.T) {
	str: strings.Builder
	strings.builder_init(&str, context.allocator)
	defer {
		fmt.println(strings.to_string(str))
		strings.builder_destroy(&str)
	}

	slice := make([]int, ITERATION_ARRAY_NUM)
	// Release the buffer when the test ends. `runtime.delete` is used because
	// this package declares its own `delete`.
	defer runtime.delete(slice)

	for i in 0 ..< ITERATION_ARRAY_NUM {
		slice[i] = i
	}

	total_sum: int
	diff: time.Duration
	{
		time.SCOPED_TICK_DURATION(&diff)

		for _ in 0 ..< ITERATION_ROUNDS {
			sum: int
			for i in 0 ..< len(slice) {
				sum += slice[i]
			}
			total_sum += sum
		}
	}

	// Check the result: this validates the traversal and keeps the timed loop
	// observable so it cannot be discarded as dead code.
	// Each round sums 0 + 1 + ... + (N - 1) = N * (N - 1) / 2.
	EXPECTED_SUM: int : ITERATION_ROUNDS * (ITERATION_ARRAY_NUM * (ITERATION_ARRAY_NUM - 1) / 2)
	testing.expect_value(t, total_sum, EXPECTED_SUM)

	// Derive the reported numbers from the same constants that drive the loops.
	options := &time.Benchmark_Options {
		rounds = ITERATION_ROUNDS,
		bytes = size_of(int) * ITERATION_ARRAY_NUM,
	}
	options.count = options.rounds
	options.processed = size_of(int) * len(slice) * options.rounds

	options.duration = diff
	times_per_second := f64(time.Second) / f64(diff)
	options.rounds_per_second = times_per_second * f64(options.count)
	options.megabytes_per_second = f64(options.processed) / f64(1024 * 1024) * times_per_second

	benchmark_print(&str, "Slice Index Iteration", options)
}
|
||||
|
||||
// Benchmark setup hook: allocates `options.bytes` bytes of input from
// `allocator` and fills them with the repeating pattern 0, 1, ..., 255.
//
// Returns `.Allocation_Error` when the resulting buffer does not have the
// requested length, `nil` otherwise.
setup_bench :: proc(
	options: ^time.Benchmark_Options,
	allocator := context.allocator,
) -> (
	err: time.Benchmark_Error,
) {
	assert(options != nil)

	options.input = make([]u8, options.bytes, allocator)
	for pos in 0 ..< len(options.input) {
		// Keep only the low byte of the position.
		options.input[pos] = u8(pos & 0xff)
	}

	if len(options.input) != options.bytes {
		return .Allocation_Error
	}
	return nil
}
|
||||
|
||||
// Benchmark teardown hook: releases the input buffer created by `setup_bench`.
//
// The buffer is freed through `allocator`, the same allocator parameter that
// `setup_bench` allocates from, and `options.input` is cleared so no dangling
// slice is left behind. Always returns `nil`.
teardown_bench :: proc(
	options: ^time.Benchmark_Options,
	allocator := context.allocator,
) -> (
	err: time.Benchmark_Error,
) {
	assert(options != nil)

	runtime.delete(options.input, allocator)
	options.input = nil
	return nil
}
|
||||
|
||||
// Benchmark body: for each round, builds a fresh `[dynamic]u8` by appending
// every byte of `options.input` one at a time, then frees it.
//
// Records the number of rounds and the total bytes appended in `options`.
// Always returns `nil`.
benchmark_dynamic_array_append :: proc(
	options: ^time.Benchmark_Options,
	allocator := context.allocator,
) -> (
	err: time.Benchmark_Error,
) {
	source := options.input

	for _ in 0 ..< options.rounds {
		target: [dynamic]u8

		for pos in 0 ..< len(source) {
			runtime.append(&target, source[pos])
		}

		// Each round starts from an empty array again.
		runtime.delete(target)
	}

	options.processed = options.rounds * options.bytes
	options.count = options.rounds
	return nil
}
|
||||
|
||||
// Benchmark body: for each round, builds a fresh `Xarr(u8)` by appending every
// byte of `options.input` one at a time, then deletes it.
//
// Records the number of rounds and the total bytes appended in `options`.
// Always returns `nil`.
benchmar_xarr_append :: proc(
	options: ^time.Benchmark_Options,
	allocator := context.allocator,
) -> (
	err: time.Benchmark_Error,
) {
	source := options.input

	for _ in 0 ..< options.rounds {
		target: Xarr(u8)

		for pos in 0 ..< len(source) {
			append(&target, source[pos])
		}

		// Each round starts from an empty array again.
		delete(&target)
	}

	options.processed = options.rounds * options.bytes
	options.count = options.rounds
	return nil
}
|
||||
|
||||
// Benchmark body: for each round, fills a fresh `Xarr(u8)` with every byte of
// `options.input`, then deletes it.
//
// NOTE(review): despite the name, nothing here reads the array back; the body
// only appends. Confirm whether an iteration pass was intended.
//
// Records the number of rounds and the total bytes appended in `options`.
// Always returns `nil`.
benchmar_xarr_iterate :: proc(
	options: ^time.Benchmark_Options,
	allocator := context.allocator,
) -> (
	err: time.Benchmark_Error,
) {
	source := options.input
	round_count := options.rounds

	for _ in 0 ..< round_count {
		filled: Xarr(u8)

		for pos in 0 ..< len(source) {
			append(&filled, source[pos])
		}

		delete(&filled)
	}

	options.count = round_count
	options.processed = round_count * options.bytes
	return nil
}
|
||||
|
||||
// Writes a two-line summary of a finished benchmark into `str`: the name,
// round count, processed bytes and elapsed nanoseconds, followed by the
// rounds-per-second and MiB-per-second rates.
benchmark_print :: proc(
	str: ^strings.Builder,
	name: string,
	options: ^time.Benchmark_Options,
	loc := #caller_location,
) {
	elapsed_ns := time.duration_nanoseconds(options.duration)
	rounds_rate := options.rounds_per_second
	mib_rate := options.megabytes_per_second

	fmt.sbprintfln(
		str,
		"[%v] %v rounds, %v bytes processed in %v ns\n\t\t%5.3f rounds/s, %5.3f MiB/s\n",
		name,
		options.rounds,
		options.processed,
		elapsed_ns,
		rounds_rate,
		mib_rate,
	)
}
|
||||
|
3
test.sh
3
test.sh
@ -1,3 +1,4 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
odin test common/container/xarr -collection:common=./common -collection:game=./game -collection:libs=./libs -strict-style -vet -sanitize:memory
|
||||
odin build common/container/xarr -build-mode:test -collection:common=./common -collection:game=./game -collection:libs=./libs -strict-style -o:speed -debug "$@"
|
||||
# odin test common/container/xarr -collection:common=./common -collection:game=./game -collection:libs=./libs -strict-style -vet -o:speed -debug
|
||||
|
Loading…
x
Reference in New Issue
Block a user