medfall


commit b303462d738b75ae6e329babcf9b45074c281168
parent 3560f10648bd404186a099fd065294ef8d7a4ba9
Author: Michael Savage <mikejsavage@gmail.com>
Date:   Sat Aug 20 20:51:40 +0100

Make memory orderings explicit in atomic ops
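
The diff swaps the old __sync-based helpers (which act as full barriers) and the
explicit write_barrier() calls for macros that name their ordering at the call
site. As a minimal sketch of the new call style (illustrative only: the macros
and atomic_u32 come from platform_atomic.h below, while num_ready, publish_one
and ready_count are stand-ins):

    #include "platform_atomic.h"

    static atomic_u32 num_ready;    // zero-initialised at static init

    // publisher: release, so writes made before the increment are visible
    // to any thread that observes the new count with an acquire load
    void publish_one() {
        fetch_add_release( &num_ready, 1 );
    }

    // reader: acquire, pairs with the release increment above
    u32 ready_count() {
        return load_acquire( &num_ready );
    }

In a normal build these expand to the GCC/Clang __atomic builtins with the
matching __ATOMIC_* ordering; under RL_TEST they expand to std::atomic-style
calls so relacy can model-check the same code.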

Diffstat:
benchmark.cc | 19+++++++++----------
platform_atomic.h | 170+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
unix_atomic.h | 60------------------------------------------------------------
work_queue.cc | 37++++++++++++++++++++-----------------
work_queue.h | 8++++----
5 files changed, 131 insertions(+), 163 deletions(-)
diff --git a/benchmark.cc b/benchmark.cc
@@ -26,22 +26,20 @@ ScopedTimer::ScopedTimer( u32 idx ) {
 
 ScopedTimer::~ScopedTimer() {
 	u64 dt = __rdtsc() - initial_clock;
-	atomic_fetch_add_u64( &timers[ timer_idx ].total_clocks, dt );
-	atomic_fetch_add_u64( &timers[ timer_idx ].num_calls, 1 );
+	fetch_add_relaxed( &timers[ timer_idx ].total_clocks, dt );
+	fetch_add_relaxed( &timers[ timer_idx ].num_calls, 1 );
 }
 
 u32 benchmark_new_timer( const char * fn, const char * file, int line ) {
-	u32 idx = atomic_fetch_add_u32( &num_timers, 1 );
+	u32 idx = fetch_add_relaxed( &num_timers, 1 );
 	assert( idx < MAX_TIMERS );
 
 	Timer info;
 	info.fn = fn;
 	info.file = file;
 	info.line = line;
-	atomic_set_u64( &info.total_clocks, 0 );
-	atomic_set_u64( &info.num_calls, 0 );
-
-	write_barrier();
+	store_relaxed( &info.total_clocks, 0 );
+	store_release( &info.num_calls, 0 );
 
 	timers[ idx ] = info;
 
@@ -49,9 +47,10 @@ u32 benchmark_new_timer( const char * fn, const char * file, int line ) {
 }
 
 void benchmark_print_timers() {
-	for( u64 i = 0; i < atomic_get_u32( &num_timers ); i++ ) {
-		u64 clocks = atomic_get( &timers[ i ].total_clocks );
-		u64 calls = atomic_get( &timers[ i ].num_calls );
+	u32 n = load_acquire( &num_timers );
+	for( u32 i = 0; i < n; i++ ) {
+		u64 clocks = load_relaxed( &timers[ i ].total_clocks );
+		u64 calls = load_relaxed( &timers[ i ].num_calls );
 
 		printf( "%s (%s:%d) called %lu times, %lu clocks, %lu avg\n",
 			timers[ i ].fn, timers[ i ].file, timers[ i ].line,
diff --git a/platform_atomic.h b/platform_atomic.h
@@ -3,79 +3,105 @@
 
 #include "intrinsics.h"
 
-#define ATOMIC_STRUCT_DEF( T ) \
-	struct atomic_##T { \
-		volatile T v; \
-	}
-
-#define ATOMIC_FUNCTION_DEF( T ) \
-	inline T atomic_fetch_add( atomic_##T * atom, T x ) { \
-		return atomic_fetch_add_##T( atom, x ); \
-	} \
-	inline T atomic_fetch_sub( atomic_##T * atom, T x ) { \
-		return atomic_fetch_sub_##T( atom, x ); \
-	} \
-	inline T atomic_swap( atomic_##T * atom, T newval ) { \
-		return atomic_swap_##T( atom, newval ); \
-	} \
-	inline bool atomic_cas( atomic_##T * atom, T oldval, T newval ) { \
-		return atomic_cas_##T( atom, oldval, newval ); \
-	} \
-	inline T atomic_get( atomic_##T * atom ) { \
-		return atomic_get_##T( atom ); \
-	} \
-	inline void atomic_set( atomic_##T * atom, T x ) { \
-		atomic_set_##T( atom, x ); \
-	}
-
-ATOMIC_STRUCT_DEF( s8 );
-ATOMIC_STRUCT_DEF( s16 );
-ATOMIC_STRUCT_DEF( s32 );
-ATOMIC_STRUCT_DEF( s64 );
-ATOMIC_STRUCT_DEF( u8 );
-ATOMIC_STRUCT_DEF( u16 );
-ATOMIC_STRUCT_DEF( u32 );
-ATOMIC_STRUCT_DEF( u64 );
-
-template< typename T >
-struct atomic_ptr {
-	T * volatile v;
-};
-
-#if defined( __linux__ ) || defined( __APPLE__ )
-#include "unix_atomic.h"
+#ifdef RL_TEST
+
+#define nonatomic( T ) VAR_T( T )
+// relacy defines VAR( x )
+
+#define atomic_s32 std::atomic< s32 >
+#define atomic_s64 std::atomic< s64 >
+#define atomic_u32 std::atomic< u32 >
+#define atomic_u64 std::atomic< u64 >
+
+#define load_relaxed( atom ) ( atom )->load( std::memory_order_relaxed )
+#define store_relaxed( atom, x ) ( atom )->store( x, std::memory_order_relaxed )
+#define fetch_add_relaxed( atom, x ) ( atom )->fetch_add( x, std::memory_order_relaxed )
+#define exchange_relaxed( atom, x ) ( atom )->exchange( x, std::memory_order_relaxed )
+// TODO: check compare_swap_strong updates before to be consistent w/ the c11 version
+#define compare_exchange_relaxed( atom, before, after ) ( atom )->compare_swap_strong( before, after, std::memory_order_relaxed, std::memory_order_relaxed )
+
+#define load_acquire( atom ) ( atom )->load( std::memory_order_acquire )
+#define fetch_add_acquire( atom, x ) ( atom )->fetch_add( x, std::memory_order_acquire )
+#define exchange_acquire( atom, x ) ( atom )->exchange( x, std::memory_order_acquire )
+#define compare_exchange_acquire( atom, before, after ) ( atom )->compare_swap_strong( before, after, std::memory_order_acquire, std::memory_order_acquire )
+
+#define store_release( atom, x ) ( atom )->store( x, std::memory_order_release )
+#define fetch_add_release( atom, x ) ( atom )->fetch_add( x, std::memory_order_release )
+#define exchange_release( atom, x ) ( atom )->exchange( x, std::memory_order_release )
+#define compare_exchange_release( atom, before, after ) ( atom )->compare_swap_strong( before, after, std::memory_order_release, std::memory_order_relaxed )
+
+#define fetch_add_acqrel( atom, x ) ( atom )->fetch_add( x, std::memory_order_acq_rel )
+#define exchange_acqrel( atom, x ) ( atom )->exchange( x, std::memory_order_acq_rel )
+#define compare_exchange_acqrel( atom, before, after ) ( atom )->compare_swap_strong( before, after, std::memory_order_acq_rel, std::memory_order_acquire )
+
+#define load_seqcst( atom ) ( atom )->load( std::memory_order_seq_cst )
+#define store_seqcst( atom, x ) ( atom )->store( x, std::memory_order_seq_cst )
+#define fetch_add_seqcst( atom, x ) ( atom )->fetch_add( x, std::memory_order_seq_cst )
+#define exchange_seqcst( atom, x ) ( atom )->exchange( x, std::memory_order_seq_cst )
+#define compare_exchange_seqcst( atom, before, after ) ( atom )->compare_swap_strong( before, after, std::memory_order_seq_cst, std::memory_order_seq_cst )
+
+// relacy requires C++98 which doesn't have static_assert
+// TODO: maybe this should go somewhere else?
+#if __cplusplus < 201103L
+#define static_assert( p ) static int CONCAT( static_assert, __COUNTER__ )[ int( p ) - 1 ]
 #endif
 
-ATOMIC_FUNCTION_DEF( s8 );
-ATOMIC_FUNCTION_DEF( s16 );
-ATOMIC_FUNCTION_DEF( s32 );
-ATOMIC_FUNCTION_DEF( s64 );
-ATOMIC_FUNCTION_DEF( u8 );
-ATOMIC_FUNCTION_DEF( u16 );
-ATOMIC_FUNCTION_DEF( u32 );
-ATOMIC_FUNCTION_DEF( u64 );
-
-template< typename T >
-inline T atomic_swap( atomic_ptr< T > * atom, T * x ) {
-	return atomic_swap_ptr( atom, x );
-}
-
-template< typename T >
-inline bool atomic_cas( atomic_ptr< T > * atom, T * oldval, T * newval ) {
-	return atomic_cas_ptr( atom, oldval, newval );
-}
-
-template< typename T >
-inline T atomic_get( atomic_ptr< T > * atom ) {
-	return atomic_get_ptr( atom );
-}
-
-template< typename T >
-inline void atomic_set( atomic_ptr< T > * atom, T * x ) {
-	atomic_set_ptr( atom, x );
-}
-
-#undef ATOMIC_STRUCT_DEF
-#undef ATOMIC_FUNCTION_DEF
+#else
+
+#define nonatomic( T ) T
+#define VAR( x ) ( x )
+
+struct atomic_s32 { volatile s32 v; };
+struct atomic_s64 { volatile s64 v; };
+struct atomic_u32 { volatile u32 v; };
+struct atomic_u64 { volatile u64 v; };
+
+#define load_relaxed( atom ) __atomic_load_n( &( atom )->v, __ATOMIC_RELAXED )
+#define store_relaxed( atom, x ) __atomic_store_n( &( atom )->v, ( x ), __ATOMIC_RELAXED )
+#define fetch_add_relaxed( atom, x ) __atomic_fetch_add( &( atom )->v, ( x ), __ATOMIC_RELAXED )
+#define exchange_relaxed( atom, x ) __atomic_exchange_n( &( atom )->v, ( x ), __ATOMIC_RELAXED )
+#define compare_exchange_relaxed( atom, before, after ) __atomic_compare_exchange_n( &( atom )->v, ( before ), ( after ), false, __ATOMIC_RELAXED, __ATOMIC_RELAXED )
+
+#define load_acquire( atom ) __atomic_load_n( &( atom )->v, __ATOMIC_ACQUIRE )
+#define fetch_add_acquire( atom, x ) __atomic_fetch_add( &( atom )->v, ( x ), __ATOMIC_ACQUIRE )
+#define exchange_acquire( atom, x ) __atomic_exchange_n( &( atom )->v, ( x ), __ATOMIC_ACQUIRE )
+#define compare_exchange_acquire( atom, before, after ) __atomic_compare_exchange_n( &( atom )->v, ( before ), ( after ), false, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE )
+
+#define store_release( atom, x ) __atomic_store_n( &( atom )->v, ( x ), __ATOMIC_RELEASE )
+#define fetch_add_release( atom, x ) __atomic_fetch_add( &( atom )->v, ( x ), __ATOMIC_RELEASE )
+#define exchange_release( atom, x ) __atomic_exchange_n( &( atom )->v, ( x ), __ATOMIC_RELEASE )
+#define compare_exchange_release( atom, before, after ) __atomic_compare_exchange_n( &( atom )->v, ( before ), ( after ), false, __ATOMIC_RELEASE, __ATOMIC_RELAXED )
+
+#define fetch_add_acqrel( atom, x ) __atomic_fetch_add( &( atom )->v, ( x ), __ATOMIC_ACQ_REL )
+#define exchange_acqrel( atom, x ) __atomic_exchange_n( &( atom )->v, ( x ), __ATOMIC_ACQ_REL )
+#define compare_exchange_acqrel( atom, before, after ) __atomic_compare_exchange_n( &( atom )->v, ( before ), ( after ), false, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE )
+
+#define load_seqcst( atom ) __atomic_load_n( &( atom )->v, __ATOMIC_SEQ_CST )
+#define store_seqcst( atom, x ) __atomic_store_n( &( atom )->v, ( x ), __ATOMIC_SEQ_CST )
+#define fetch_add_seqcst( atom, x ) __atomic_fetch_add( &( atom )->v, ( x ), __ATOMIC_SEQ_CST )
+#define exchange_seqcst( atom, x ) __atomic_exchange_n( &( atom )->v, ( x ), __ATOMIC_SEQ_CST )
+#define compare_exchange_seqcst( atom, before, after ) __atomic_compare_exchange_n( &( atom )->v, ( before ), ( after ), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST )
+
+// TODO: maybe use these instead?
+// s32 load_relaxed( atomic_s32 * atom ) {
+// 	return __atomic_load_n( &atom->v, __ATOMIC_RELAXED );
+// }
+//
+// s64 load_relaxed( atomic_s64 * atom ) {
+// 	return __atomic_load_n( &atom->v, __ATOMIC_RELAXED );
+// }
+//
+// u32 load_relaxed( atomic_u32 * atom ) {
+// 	return __atomic_load_n( &atom->v, __ATOMIC_RELAXED );
+// }
+//
+// u64 load_relaxed( atomic_u64 * atom ) {
+// 	return __atomic_load_n( &atom->v, __ATOMIC_RELAXED );
+// }
//
+// bool compare_exchange_relaxed( atomic_s32 * atom, s32 * before, s32 after ) {
+// 	return __atomic_compare_exchange_n( &atom->v, before, after, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED );
+// }
+#endif // RL_TEST
 
 #endif // _PLATFORM_ATOMIC_H_
diff --git a/unix_atomic.h b/unix_atomic.h
@@ -1,60 +0,0 @@
-#ifndef _UNIX_ATOMIC_H_
-#define _UNIX_ATOMIC_H_
-
-#include "intrinsics.h"
-
-#define read_barrier() asm volatile ( "" ::: "memory" )
-#define write_barrier() asm volatile ( "" ::: "memory" )
-
-#define ATOMIC_DEFS( T ) \
-	inline T atomic_fetch_add_##T( atomic_##T * atom, T x ) { \
-		return __sync_fetch_and_add( &atom->v, x ); \
-	} \
-	inline T atomic_fetch_sub_##T( atomic_##T * atom, T x ) { \
-		return __sync_fetch_and_sub( &atom->v, x ); \
-	} \
-	inline T atomic_swap_##T( atomic_##T * atom, T x ) { \
-		return __sync_lock_test_and_set( &atom->v, x ); \
-	} \
-	inline T atomic_cas_##T( atomic_##T * atom, T oldval, T newval ) { \
-		return __sync_bool_compare_and_swap( &atom->v, oldval, newval ); \
-	} \
-	inline T atomic_get_##T( atomic_##T * atom ) { \
-		return atom->v; \
-	} \
-	inline void atomic_set_##T( atomic_##T * atom, T x ) { \
-		atom->v = x; \
-	}
-
-ATOMIC_DEFS( s8 );
-ATOMIC_DEFS( s16 );
-ATOMIC_DEFS( s32 );
-ATOMIC_DEFS( s64 );
-ATOMIC_DEFS( u8 );
-ATOMIC_DEFS( u16 );
-ATOMIC_DEFS( u32 );
-ATOMIC_DEFS( u64 );
-
-#undef ATOMIC_DEFS
-
-template< typename T >
-inline bool atomic_swap_pointer( atomic_ptr< T > * atom, T * x ) {
-	return __sync_lock_test_and_set( &atom->v, x );
-}
-
-template< typename T >
-inline bool atomic_cas_pointer( atomic_ptr< T > * atom, T * oldval, T * newval ) {
-	return __sync_lock_compare_and_swap( &atom->v, oldval, newval );
-}
-
-template< typename T >
-inline T atomic_get_ptr( atomic_ptr< T > * atom ) {
-	return atom->v;
-}
-
-template< typename T >
-inline void atomic_set_ptr( atomic_ptr< T > * atom, T * x ) {
-	atom->v = x;
-}
-
-#endif // _UNIX_ATOMIC_H_
diff --git a/work_queue.cc b/work_queue.cc
@@ -1,3 +1,7 @@
+/*
+ * TODO XXX FIXME: the threading code is probably fucked because i rushed through.
+ */
+
 #include "intrinsics.h"
 #include "work_queue.h"
 #include "platform_atomic.h"
@@ -11,15 +15,15 @@ struct ThreadInfo {
 };
 
 static bool workqueue_step( u32 thread_id, WorkQueue * queue ) {
-	u16 current_head = atomic_get_u16( &queue->head );
-	u16 new_head = ( current_head + 1 ) % array_count( queue->jobs );
+	u32 current_head = load_seqcst( &queue->head );
+	u32 new_head = current_head + 1;
 
-	if( current_head != atomic_get_u16( &queue->tail ) ) {
-		if( atomic_cas_u16( &queue->head, current_head, new_head ) ) {
-			const Job & job = queue->jobs[ current_head ];
+	if( current_head != load_seqcst( &queue->tail ) ) {
+		if( compare_exchange_seqcst( &queue->head, &current_head, new_head ) ) {
+			const Job & job = queue->jobs[ current_head % array_count( queue->jobs ) ];
 			job.callback( job.data, &queue->arenas[ thread_id ] );
-			atomic_fetch_add_u16( &queue->jobs_completed, 1 );
+			fetch_add_seqcst( &queue->jobs_completed, 1 );
 		}
 
 		return true;
@@ -34,8 +38,7 @@ static THREAD( workqueue_worker ) {
 	WorkQueue * queue = info->queue;
 	u32 thread_id = info->thread_id;
 
-	write_barrier();
-	atomic_fetch_add_u32( info->started_threads, 1 );
+	fetch_add_acqrel( info->started_threads, 1 );
 
 	for( ;; ) {
 		if( !workqueue_step( thread_id, queue ) ) {
@@ -57,11 +60,12 @@ void workqueue_init( WorkQueue * queue, MemoryArena * arena, u32 num_threads ) {
 		queue->arenas[ i ] = memarena_push_arena( arena, megabytes( 1 ) );
 	}
 
+	// TODO: if we comment this out we can get rid of the wait at the bottom
 	MEMARENA_SCOPED_CHECKPOINT( arena );
 	ThreadInfo * infos = memarena_push_many( arena, ThreadInfo, num_threads );
 
 	atomic_u32 started_threads;
-	atomic_set_u32( &started_threads, 0 );
+	store_release( &started_threads, 0 );
 
 	for( u32 i = 0; i < num_threads; i++ ) {
 		infos[ i ] = { i, queue, &started_threads };
@@ -71,7 +75,9 @@ void workqueue_init( WorkQueue * queue, MemoryArena * arena, u32 num_threads ) {
 	}
 
 	// wait until all threads have a local copy of ThreadInfo
-	while( atomic_get_u32( &started_threads ) < num_threads ) {
+	// TODO: do this properly
+	// TODO: do we really need to do this?
+	while( load_acquire( &started_threads ) < num_threads ) {
 		continue;
 	}
 }
@@ -81,22 +87,19 @@ void workqueue_enqueue( WorkQueue * queue, WorkQueueCallback * callback, void *
 
 	Job job = { callback, data };
 
-	u16 tail = atomic_get_u16( &queue->tail );
+	u32 tail = fetch_add_seqcst( &queue->tail, 1 );
 
-	queue->jobs[ tail ] = job;
+	queue->jobs[ tail % array_count( queue->jobs ) ] = job;
 	queue->jobs_queued++;
 
-	write_barrier();
-	atomic_set_u16( &queue->tail, ( tail + 1 ) % array_count( queue->jobs ) );
-
 	semaphore_signal( &queue->sem );
 }
 
 void workqueue_exhaust( WorkQueue * queue ) {
-	while( atomic_get( &queue->jobs_completed ) < queue->jobs_queued ) {
+	while( load_seqcst( &queue->jobs_completed ) < queue->jobs_queued ) {
 		workqueue_step( queue->num_threads, queue );
 	}
 
 	queue->jobs_queued = 0;
-	atomic_set_u16( &queue->jobs_completed, 0 );
+	store_seqcst( &queue->jobs_completed, 0 );
 }
diff --git a/work_queue.h b/work_queue.h
@@ -20,11 +20,11 @@ struct WorkQueue {
 	Semaphore sem;
 
 	// using head/length means we need to an atomic pair which is a pain
-	atomic_u16 head;
-	atomic_u16 tail;
+	atomic_u32 head;
+	atomic_u32 tail;
 
-	u16 jobs_queued;
-	atomic_u16 jobs_completed;
+	u32 jobs_queued;
+	atomic_u32 jobs_completed;
 
 	u32 num_threads;
 	MemoryArena * arenas
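
For context, the queue in work_queue.cc is a fixed-size ring: workqueue_enqueue
claims a slot by bumping tail with fetch_add_seqcst, workers race to advance
head with compare_exchange_seqcst, and both counters are free-running u32s that
are only reduced modulo array_count( queue->jobs ) when indexing the jobs array.
workqueue_exhaust spins until jobs_completed catches up with jobs_queued,
running jobs on the calling thread while it waits. A rough usage sketch,
assuming only the signatures visible in this diff (count_one, run_jobs and done
are made up, the callback's shape is inferred from
job.callback( job.data, &queue->arenas[ thread_id ] ), and the sketch ignores
the ring's capacity):

    static void count_one( void * data, MemoryArena * arena ) {
        // data points at a counter shared with the enqueuing thread;
        // the per-thread arena is unused here
        atomic_u32 * counter = ( atomic_u32 * ) data;
        fetch_add_relaxed( counter, 1 );
    }

    void run_jobs( MemoryArena * arena ) {
        WorkQueue queue;
        workqueue_init( &queue, arena, 4 );    // 4 worker threads

        atomic_u32 done;
        store_relaxed( &done, 0 );

        for( u32 i = 0; i < 16; i++ ) {
            workqueue_enqueue( &queue, count_one, &done );
        }

        // returns once every queued job has run, then resets the counters
        workqueue_exhaust( &queue );
    }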